├── .gitignore
├── .project
├── .pydevproject
├── LICENSE
├── README.md
├── docs
│   ├── Design.pages
│   ├── SESSA.pdf
│   ├── TREO.pdf
│   ├── graph-keyword-search-poster-PPT.pptx
│   ├── graph-keyword-search-poster.key
│   └── graph-keyword-search-poster.pdf
└── src
    ├── colorAssignment.py
    ├── dig
    │   ├── README.txt
    │   ├── SteinerTree.py
    │   ├── config.ini
    │   ├── esMapping-dig-ht-DT.json
    │   ├── graph.py
    │   ├── harvest.py
    │   ├── main.py
    │   ├── outline.py
    │   ├── prep.sh
    │   ├── prep2.sh
    │   ├── prep_ht.sh
    │   ├── query.py
    │   ├── synonym.py
    │   ├── test.py
    │   ├── util.py
    │   └── z-attic
    │       └── wordSimilarity.py
    ├── graphSearch.py
    ├── ngramTree.py
    ├── ngramsEngine.py
    ├── pivotEntityRecognition.py
    ├── queries.txt
    ├── resourceGraph.py
    ├── saq-2015_training_set.xml
    ├── sparqlClient.py
    ├── testSparqlEndPoint.py
    ├── wordSimilarity.py
    └── ws4j.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 |
25 | # PyInstaller
26 | # Usually these files are written by a python script from a template
27 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
28 | *.manifest
29 | *.spec
30 |
31 | # Installer logs
32 | pip-log.txt
33 | pip-delete-this-directory.txt
34 |
35 | # Unit test / coverage reports
36 | htmlcov/
37 | .tox/
38 | .coverage
39 | .cache
40 | nosetests.xml
41 | coverage.xml
42 |
43 | # Translations
44 | *.mo
45 | *.pot
46 |
47 | # Django stuff:
48 | *.log
49 |
50 | # Sphinx documentation
51 | docs/_build/
52 |
53 | # PyBuilder
54 | target/
55 |
56 | # emacs
57 | *~
58 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <projectDescription>
3 |     <name>graph-keyword-search</name>
4 |     <comment></comment>
5 |     <projects>
6 |     </projects>
7 |     <buildSpec>
8 |         <buildCommand>
9 |             <name>org.python.pydev.PyDevBuilder</name>
10 |             <arguments>
11 |             </arguments>
12 |         </buildCommand>
13 |     </buildSpec>
14 |     <natures>
15 |         <nature>org.python.pydev.pythonNature</nature>
16 |     </natures>
17 | </projectDescription>
18 |
--------------------------------------------------------------------------------
/.pydevproject:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
2 | <?eclipse-pydev version="1.0"?><pydev_project>
3 | <pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
4 | <path>/${PROJECT_DIR_NAME}</path>
5 | <path>/${PROJECT_DIR_NAME}/src</path>
6 | <path>/${PROJECT_DIR_NAME}/src/dig</path>
7 | </pydev_pathproperty>
8 | <pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 3.0</pydev_property>
9 | <pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">gks python 3</pydev_property>
10 | <pydev_pathproperty name="org.python.pydev.PROJECT_EXTERNAL_SOURCE_PATH">
11 | <path>/opt/dig/venv/gks/lib/python3.5/site-packages/Levenshtein</path>
12 | <path>/opt/dig/venv/gks/lib/python3.5/site-packages/word2vec</path>
13 | <path>/opt/dig/venv/gks/lib/python3.5/site-packages/word2vec-0.8-py3.5.egg-info</path>
14 | <path>${PROJ}/hybrid-jaccard</path>
15 | <path>/opt/dig/venv/gks/lib/python3.5/site-packages/python_Levenshtein-0.12.0-py3.5.egg-info/PKG-INFO</path>
16 | <path>/opt/dig/venv/gks/lib/python3.5/site-packages/python_Levenshtein-0.12.0-py3.5.egg-info</path>
17 | <path>/opt/dig/venv/gks/lib/python3.5/site-packages/python_Levenshtein-0.12.0-py3.5-macosx-10.10-x86_64.egg</path>
18 | <path>/opt/dig/venv/gks/lib/python3.5/site-packages/python_Levenshtein-0.12.0-py3.5-macosx-10.10-x86_64.egg/Levenshtein</path>
19 | </pydev_pathproperty>
20 | <pydev_variables_property name="org.python.pydev.PROJECT_VARIABLE_SUBSTITUTION">
21 | <key>PROJ</key>
22 | <value>/Users/philpot/Documents/project</value>
23 | </pydev_variables_property>
24 | </pydev_project>
25 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
203 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # graph-keyword-search
2 |
3 | 1. Download and install the latest version of Python 3 - https://www.python.org/downloads/
4 | 2. Check the Python 3 installation by opening a shell (Terminal or command prompt) and typing python3 at the prompt.
5 | 3. Download and install nltk by running 'pip3 install nltk' at the shell
6 | 4. Download and install the inflection library by running 'pip3 install inflection' at the shell
7 | 5. Download and install the SPARQLWrapper library by running 'pip3 install SPARQLWrapper' at the shell
8 |
9 | Running the program :
10 |
11 | python3 graphSearch.py
12 |
13 |
--------------------------------------------------------------------------------
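Before running graphSearch.py, a quick sanity check that the dependencies listed above import cleanly (a minimal sketch; it assumes nothing beyond the libraries named in the README):

    import sys
    assert sys.version_info >= (3, 0), "Python 3 is required"

    import nltk            # from step 3
    import inflection      # from step 4
    import SPARQLWrapper   # from step 5

    print("nltk", nltk.__version__)
    print("all dependencies imported OK")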
/docs/Design.pages:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usc-isi-i2/graph-keyword-search/28cd46f561a2b41a12548a09818d0628d931d2cd/docs/Design.pages
--------------------------------------------------------------------------------
/docs/SESSA.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usc-isi-i2/graph-keyword-search/28cd46f561a2b41a12548a09818d0628d931d2cd/docs/SESSA.pdf
--------------------------------------------------------------------------------
/docs/TREO.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usc-isi-i2/graph-keyword-search/28cd46f561a2b41a12548a09818d0628d931d2cd/docs/TREO.pdf
--------------------------------------------------------------------------------
/docs/graph-keyword-search-poster-PPT.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usc-isi-i2/graph-keyword-search/28cd46f561a2b41a12548a09818d0628d931d2cd/docs/graph-keyword-search-poster-PPT.pptx
--------------------------------------------------------------------------------
/docs/graph-keyword-search-poster.key:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usc-isi-i2/graph-keyword-search/28cd46f561a2b41a12548a09818d0628d931d2cd/docs/graph-keyword-search-poster.key
--------------------------------------------------------------------------------
/docs/graph-keyword-search-poster.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usc-isi-i2/graph-keyword-search/28cd46f561a2b41a12548a09818d0628d931d2cd/docs/graph-keyword-search-poster.pdf
--------------------------------------------------------------------------------
/src/colorAssignment.py:
--------------------------------------------------------------------------------
1 | from ngramTree import *
2 |
3 | # This class is responsible for assignment of colors to the nodes in the ngram tree
4 | class ColorAssignment:
5 |
6 | colorDictionary = {} # This dictionary stores the individual tokens ['a', 'b', 'c', 'd'] and their color values
7 |
8 | # lookupList - [['a', 'b', 'c', 'd'], ['a b', 'b c', 'c d'], ['a b c', 'b c d'], ['a b c d']]
9 | def assignInitialColors(self,rootNode,lookupList):
10 |
11 | if rootNode and len(lookupList)>=1:
12 | oneGrams = lookupList[0] # Gets the one grams
13 |
14 | for index in range(len(oneGrams)):
15 | if(oneGrams[index] not in self.colorDictionary):
16 | self.colorDictionary[oneGrams[index]] = index # This assigns the color values to each token
17 |
18 |
19 | stack = []
20 | stack.append(rootNode) # Using stack for DFS
21 |
22 | while(stack):
23 | currNode = stack.pop()
24 |
25 | # Assign colors to this node based on the presence of tokens
26 | if not currNode.isVisited: # If a node repeats, do not initialize color again
27 |
28 | currNode.isVisited = True
29 |
30 | tokens = currNode.data.split(' ') # Check for individual tokens
31 |
32 | for token in tokens:
33 | if(token in self.colorDictionary):
34 | currNode.color.append(self.colorDictionary[token]) # Assign colors
35 |
36 | for childNodes in currNode.children: # Add children to the stack
37 | stack.append(childNodes)
38 |
39 |
40 | #return rootNode
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
--------------------------------------------------------------------------------
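A minimal usage sketch for ColorAssignment above. StubNode is a hypothetical stand-in for the real ngramTree node and mirrors only the attributes the code touches (data, children, color, isVisited):

    from colorAssignment import ColorAssignment

    class StubNode:
        """Hypothetical stand-in for an ngramTree node."""
        def __init__(self, data):
            self.data = data        # the ngram text, e.g. 'a b'
            self.children = []      # child nodes
            self.color = []         # filled in by assignInitialColors
            self.isVisited = False

    # tiny tree for the query 'a b': root 'a b' with 1-gram children 'a' and 'b'
    root = StubNode('a b')
    root.children = [StubNode('a'), StubNode('b')]

    lookupList = [['a', 'b'], ['a b']]      # 1-grams first, as assignInitialColors expects
    ColorAssignment().assignInitialColors(root, lookupList)

    print(root.color)               # [0, 1] -- root contains both colored tokens
    print(root.children[0].color)   # [0]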
/src/dig/README.txt:
--------------------------------------------------------------------------------
1 | Per-document queries such as _termvector and _mtermvectors seem to
2 | be concerned mostly with multiple occurrences of the (presumed)
3 | repeated word in a single analyzed field.
4 |
5 | POST http://localhost:9200/twitter/tweet/3/_termvector
6 | {
7 | }
8 |
9 | looks good
10 |
11 | Multi-term queries require that we specify all document IDs in the body
12 |
13 | POST http://localhost:9200/twitter/tweet/_mtermvectors
14 | {"ids" : ["1", "2"],
15 | "parameters": {
16 | "fields": ["text"],
17 | "term_statistics": "false"
18 | }
19 | }
20 |
21 | looks good
22 |
23 | Not as meaningful for our more nominal/enumerable values.
24 |
25 |
26 | https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-terms-aggregation.html
27 | Term aggregation seems reasonable for our needs:
28 |
29 | POST http://localhost:9200/twitter/tweet/_search?search_type=count
30 | {
31 | "query": {
32 | "match_all": {}
33 | },
34 | "aggs": {
35 | "eyecolors": {
36 | "terms": {
37 | "field": "eyecolor"
38 | }
39 | }
40 | }
41 | }
42 |
43 | This is the nested equivalent:
44 |
45 | POST http://localhost:9200/twitter2/tweet/_search?search_type=count
46 | {
47 | "query": {
48 | "match_all": {}
49 | },
50 | "aggs": {
51 | "schools": {
52 | "nested": {
53 | "path": "children"
54 | },
55 | "aggs": {
56 | "aggname": {
57 | "terms": {
58 | "field": "children.school"}
59 | }
60 | }
61 | }
62 | }
63 | }
64 |
65 | giving result such as
66 |
67 | {
68 | "took": 1,
69 | "timed_out": false,
70 | "_shards": {
71 | "total": 1,
72 | "successful": 1,
73 | "failed": 0
74 | },
75 | "hits": {
76 | "total": 3,
77 | "max_score": 0,
78 | "hits": []
79 | },
80 | "aggregations": {
81 | "schools": {
82 | "doc_count": 7,
83 | "aggname": {
84 | "doc_count_error_upper_bound": 0,
85 | "sum_other_doc_count": 0,
86 | "buckets": [
87 | {
88 | "key": "Aardvark",
89 | "doc_count": 4
90 | },
91 | {
92 | "key": "Badger",
93 | "doc_count": 1
94 | },
95 | {
96 | "key": "Factory",
97 | "doc_count": 1
98 | },
99 | {
100 | "key": "Junkyard",
101 | "doc_count": 1
102 | }
103 | ]
104 | }
105 | }
106 | }
107 | }
108 |
109 | For our domain, the query looks like:
110 |
111 | POST https://darpamemex:darpamemex@esc.memexproxy.com/dig-ht-latest/offer/_search?search_type=count
112 | {
113 | "query": {
114 | "match_all": {}
115 | },
116 | "aggs": {
117 | "itemOfferedAgg": {
118 | "nested": {
119 | "path": "itemOffered"
120 | },
121 | "aggs": {
122 | "termsSubAgg": {
123 | "terms": {
124 | "field": "itemOffered.hairColor",
125 | "size" : 20
126 | }
127 | }
128 | }
129 | }
130 | }
131 | }
132 |
133 | yielding
134 |
135 | {
136 | "took": 1492,
137 | "timed_out": false,
138 | "_shards": {
139 | "total": 20,
140 | "successful": 20,
141 | "failed": 0
142 | },
143 | "hits": {
144 | "total": 19134836,
145 | "max_score": 0,
146 | "hits": []
147 | },
148 | "aggregations": {
149 | "itemOfferedAgg": {
150 | "doc_count": 19134836,
151 | "termsSubAgg": {
152 | "doc_count_error_upper_bound": 0,
153 | "sum_other_doc_count": 345,
154 | "buckets": [
155 | {
156 | "key": "blond",
157 | "doc_count": 813715
158 | },
159 | {
160 | "key": "brown",
161 | "doc_count": 605642
162 | },
163 | {
164 | "key": "NONE",
165 | "doc_count": 295217
166 | },
167 | {
168 | "key": "black",
169 | "doc_count": 199892
170 | },
171 | {
172 | "key": "red",
173 | "doc_count": 142948
174 | },
175 | {
176 | "key": "blonde",
177 | "doc_count": 27069
178 | },
179 | {
180 | "key": "auburn",
181 | "doc_count": 14732
182 | },
183 | {
184 | "key": "gray",
185 | "doc_count": 6624
186 | },
187 | {
188 | "key": "brunette",
189 | "doc_count": 3396
190 | },
191 | {
192 | "key": "light brown",
193 | "doc_count": 1813
194 | },
195 | {
196 | "key": "dark brown",
197 | "doc_count": 1350
198 | },
199 | {
200 | "key": "other",
201 | "doc_count": 862
202 | },
203 | {
204 | "key": "chestnut",
205 | "doc_count": 735
206 | },
207 | {
208 | "key": "dirty brown",
209 | "doc_count": 345
210 | },
211 | {
212 | "key": "auburn red",
213 | "doc_count": 259
214 | },
215 | {
216 | "key": "auburnred",
217 | "doc_count": 142
218 | },
219 | {
220 | "key": "strawberry blonde",
221 | "doc_count": 142
222 | },
223 | {
224 | "key": "white",
225 | "doc_count": 29
226 | },
227 | {
228 | "key": "long",
229 | "doc_count": 23
230 | },
231 | {
232 | "key": "long brown",
233 | "doc_count": 18
234 | }
235 | ]
236 | }
237 | }
238 | }
239 | }
240 |
241 | For the counts to make sense, should the filter portion be specified to include only those documents that have the aggregated field?
242 |
243 | POST https://darpamemex:darpamemex@esc.memexproxy.com/dig-ht-latest/offer/_search?search_type=count
244 | {
245 | "query": {
246 | "filtered": {
247 | "query": {
248 | "match_all": {}
249 | },
250 | "filter": {
251 | "nested": {
252 | "path": "itemOffered",
253 | "filter": {
254 | "exists": {
255 | "field": "eyeColor"
256 | }
257 | }
258 | }
259 | }
260 | }
261 | },
262 |
263 | "aggs": {
264 | "itemOfferedAgg": {
265 | "nested": {
266 | "path": "itemOffered"
267 | },
268 | "aggs": {
269 | "termsSubAgg": {
270 | "terms": {
271 | "field": "itemOffered.eyeColor",
272 | "size" : 100
273 | }
274 | }
275 | }
276 | }
277 | }
278 | }
279 |
280 | TOPLEVEL objects in our ES are (with useful attributes):
281 | webpage (root WebPage)
282 | mainEntity* -> offer
283 | publisher.name [y]
284 | adultservice (root AdultService)
285 | eyeColor
286 | hairColor
287 | name
288 | personAge
289 | offers* -> offer
290 | offer (root Offer)
291 | availableAtOrFrom* -> place/address
292 | itemOffered* -> adultservice
293 | priceSpecification.price [x]
294 | priceSpecification.billingIncrement [x]
295 | priceSpecification.unitCode [x]
296 | priceSpecification.name [x]
297 | phone (root PhoneNumber)
298 | seller (root PersonOrOrganization)
299 | email (root EmailAddress)
300 |
301 | Thus non-toplevel objects include:
302 | address
303 | geo
304 | priceSpecification
305 | publisher
306 |
307 |
--------------------------------------------------------------------------------
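The same nested terms aggregation can be issued from Python; a minimal sketch with the elasticsearch client, using the endpoint, index, and aggregation body shown above (harvest.py below does essentially this, with certificate verification added):

    from elasticsearch import Elasticsearch
    from pprint import pprint

    es = Elasticsearch(['https://darpamemex:darpamemex@esc.memexproxy.com/'])

    body = {
        "query": {"match_all": {}},
        "aggs": {
            "itemOfferedAgg": {
                "nested": {"path": "itemOffered"},
                "aggs": {
                    "termsSubAgg": {
                        "terms": {"field": "itemOffered.hairColor", "size": 20}
                    }
                }
            }
        }
    }

    result = es.search(index="dig-ht-latest", doc_type="offer", body=body, search_type="count")
    pprint(result['aggregations']['itemOfferedAgg']['termsSubAgg']['buckets'])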
/src/dig/SteinerTree.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Sep 7 12:57:57 2013
4 |
5 | @author: xinghualu
6 | @modified: philpot
7 | """
8 |
9 | # This is a generalized implementation of the Kou algorithm for creating Steiner Trees.
10 | # It can be used with any networkx weighted graph.
11 |
12 | from heapq import heappush, heappop
13 | from networkx import Graph, bidirectional_dijkstra, has_path
14 |
15 | # import json
16 | # def hashableDict(d):
17 | # return json.dumps(d, sort_keys=True)
18 | #
19 | # def unhashableDict(hashableDict):
20 | # return json.loads(hashableDict)
21 |
22 | ## Extract a Steiner tree from a weighted graph, given a list of vertices of interest
23 | # @param G A Graph with weighted edges
24 | # @param voi A list of vertices of interest
25 | # @param generator A method to make a new Graph instance (in the case that you've extended Graph)
26 | # \returns a new graph if no errors, None otherwise
27 | def make_steiner_tree(G, voi, generator=None):
28 | mst = Graph()
29 | for v in voi:
30 | if not v in G:
31 | raise ValueError("make_steiner_tree(): Vertex {} not in original graph".format(v))
32 | if len(voi) == 0:
33 | return mst
34 | if len(voi) == 1:
35 | mst.add_node(voi[0])
36 | return mst
37 |
38 | # Initially, use (a version of) Kruskal's algorithm to extract a minimal spanning tree
39 | # from a weighted graph. This algorithm differs in that only a subset of vertices is
40 | # going to be present in the final subgraph (which is not truly an MST - we must use
41 | # Prim's algorithm later).
42 |
43 | # extract all shortest paths among the voi
44 | heapq = []
45 | paths = {}
46 |
47 | # load all the paths between the Steiner vertices. Store them in a heap queue
48 | # and reconstruct the MST of the complete graph using Kruskal's algorithm
49 | for i in range(len(voi) - 1):
50 | v1 = voi[i]
51 | for v2 in voi[i+1:]:
52 | result = bidirectional_dijkstra(G, v1, v2)
53 | if result == False:
54 | raise RuntimeError("The two vertices given (%s, %s) don't exist on the same connected graph" % (v1, v2))
55 | #print "The two vertices given (%s, %s) don't exist on the same connected graph" % (v1, v2)
56 | distance, vertList = result
57 | keys = [v1, v2]
58 | keys.sort()
59 | key = "%s:%s" % tuple(keys)
60 | paths[key] = (vertList)
61 | heappush(heapq, (distance, v1, v2))
62 |
63 | # construct the minimum spanning tree of the complete graph
64 | while heapq:
65 | w, v1, v2 = heappop(heapq)
66 | # if no path exists yet between v1 and v2, add this one
67 | if v1 not in mst or v2 not in mst or not has_path(mst, v1, v2):
68 | mst.add_edge(v1, v2, weight=w)
69 |
70 | # check that the spanning tree covers exactly the Steiner vertices
71 | sTree = set(mst.nodes())
72 | sSteiner = set(voi)
73 | if sTree ^ sSteiner:
74 | raise RuntimeError('Failed to construct MST spanning tree')
75 |
76 | # reconstruct subgraph of origGraph using the paths
77 | if generator is None:
78 | subgraph = Graph()
79 | else:
80 | subgraph = generator()
81 | for edge in mst.edges_iter(data=True):
82 | keys = [edge[0],edge[1]]
83 | keys.sort()
84 | key = "%s:%s" % tuple(keys)
85 | vList = paths[key]
86 | for i in range(len(vList) - 1):
87 | v1 = vList[i]
88 | v2 = vList[i+1]
89 | w = G[v1][v2]
90 | subgraph.add_edge(v1, v2, w)
91 | # get rid of possible loops - result will be a true MST
92 | subgraph = make_prim_mst(subgraph, generator)
93 |
94 | # remove intermediate nodes in paths that are not in list of voi
95 | return _trimTree(subgraph, voi)
96 |
97 |
98 | ## remove intermediate nodes in paths that are not in list of voi in given graph
99 | # @param graph A weighted Graph
100 | # @param voi
101 | #/return graph An updated version of the Graph
102 | def _trimTree(graph, voi):
103 | trimKeepTrack = []
104 | firstNode = voi[0]
105 | if len(graph.neighbors(firstNode)) < 2:
106 | trimKeepTrack.append(firstNode)
107 | firstNeighbor = graph.neighbors(firstNode)[0]
108 | trimKeepTrack.append(firstNeighbor)
109 | graph = _trim(firstNeighbor, graph, trimKeepTrack, voi)
110 | else:
111 | trimKeepTrack.append(firstNode)
112 | graph = _trim(firstNode, graph, trimKeepTrack, voi)
113 | return graph
114 |
115 | def _trim(node, graph, trimKeepTrack, voi):
116 | if len(list(graph.adj[node].keys())) > 1:
117 | for nodeNeighbor in list(graph.adj[node].keys()):
118 | if nodeNeighbor not in trimKeepTrack:
119 | trimKeepTrack.append(nodeNeighbor)
120 | graph = _trim(nodeNeighbor, graph, trimKeepTrack, voi)
121 | if len(list(graph.adj[node].keys())) < 2:
122 | if node not in voi:
123 | graph.remove_node(node)
124 | return graph
125 |
126 |
127 | """
128 | Prim's algorithm: constructs the minimum spanning tree (MST) from an instance of weighted Graph
129 | @param G An weighted Graph()
130 | @param generator A method to make a new Graph instance (in the case that you've extended Graph)
131 | \returns A MST version of G
132 | """
133 | ## generate the Prim's algorithm MST
134 | # @param G A weighted Graph
135 | # @param generator A method to make a new Graph instance (may be None)
136 | # /return mst Returns the created MST
137 | def make_prim_mst(G, generator=None):
138 | if generator is None:
139 | mst = Graph()
140 | else:
141 | mst = generator()
142 | # priorityQ is a list of list (the reverse of the edge tuple with the weight in the front)
143 | priorityQ = []
144 | firstNode = G.nodes()[0]
145 | mst.add_node(firstNode)
146 | for edge in G.edges_iter(firstNode, data=True):
147 | if len(edge) != 3 or edge[2] is None:
148 | raise ValueError("make_prim_mst accepts a weighted graph only (with numerical weights)")
149 | heappush(priorityQ, (edge[2]['weight'], edge))
150 | while len(mst.edges()) < (G.order()-1):
151 | _, minEdge = heappop(priorityQ)
152 | if len(minEdge) != 3 or minEdge[2] is None:
153 | raise ValueError("make_prim_mst accepts a weighted graph only (with numerical weights)")
154 | v1, v2, _ = minEdge
155 | if v1 not in mst:
156 | for edge in G.edges_iter(v1, data=True):
157 | if edge == minEdge:
158 | continue
159 | heappush(priorityQ, (edge[2]['weight'], edge))
160 | elif v2 not in mst:
161 | for edge in G.edges_iter(v2, data=True):
162 | if edge == minEdge:
163 | continue
164 | heappush(priorityQ, (edge[2]['weight'], edge))
165 | else:
166 | # non-crossing edge
167 | continue
168 | mst.add_edge(minEdge[0],minEdge[1],minEdge[2])
169 | return mst
170 |
--------------------------------------------------------------------------------
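A small usage sketch for make_steiner_tree, assuming the networkx 1.x API that the module's edges_iter()/nodes() calls require:

    from networkx import Graph
    from SteinerTree import make_steiner_tree

    # weighted graph: a cheap path a - b - c and an expensive detour a - d - c
    g = Graph()
    g.add_edge('a', 'b', weight=1)
    g.add_edge('b', 'c', weight=1)
    g.add_edge('a', 'd', weight=5)
    g.add_edge('d', 'c', weight=5)

    # Steiner tree connecting the vertices of interest 'a' and 'c'
    st = make_steiner_tree(g, ['a', 'c'])
    print(st.nodes())   # ['a', 'b', 'c'] in some order; 'b' is kept as an intermediate vertex
    print(st.edges())   # the a-b and b-c edges of the cheap path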
/src/dig/config.ini:
--------------------------------------------------------------------------------
1 | [direct]
2 | enable = true
3 |
4 | [wordnet]
5 | enable = true
6 | lemma_min_count = 1
7 | n_enable = true
8 | n_self_factor = 1.0
9 | n_hypernym_factor = 0.5
10 | n_hyponym_factor = 0.5
11 | v_enable = true
12 | v_self_factor = 1.0
13 | v_hypernym_factor = 0.5
14 | v_hyponym_factor = 0.5
15 | a_enable = true
16 | a_self_factor = 1.0
17 | a_hypernym_factor = 0
18 | a_hyponym_factor = 0.5
19 | r_enable = true
20 | r_self_factor = 1.0
21 | r_hypernym_factor = 0
22 | r_hyponym_factor = 0.5
23 |
24 | [word2vec]
25 | enable = true
26 | data_dir = /opt/word2vec/data
27 | data_file = text8-phrases.bin
28 | size = 10
29 | minimum_score = 0.5
30 |
31 | [swoogle]
32 | enable = false
33 | uri_template = http://swoogle.umbc.edu/StsService/GetStsSim?operation=api&phrase1={}&phrase2={}
34 |
35 | [easyesa]
36 | enable = false
37 |
38 | [levenshtein]
39 | enable = true
40 | above_score = 0.0
41 | within_score = 1.0
42 |
43 | [hybridjaccard]
44 | enable = true
45 | allowexact_enable = false
46 |
--------------------------------------------------------------------------------
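A minimal sketch of reading these settings with the standard-library configparser, assuming it is run from src/dig where config.ini lives (section and option names as above; how the rest of the package consumes them is not shown in this excerpt):

    import configparser

    config = configparser.ConfigParser()
    config.read('config.ini')

    if config.getboolean('wordnet', 'enable'):
        # per part-of-speech weighting of WordNet neighbors, e.g. noun hypernyms at 0.5
        n_hypernym = config.getfloat('wordnet', 'n_hypernym_factor')
        print('noun hypernym factor:', n_hypernym)

    if config.getboolean('levenshtein', 'enable'):
        above = config.getfloat('levenshtein', 'above_score')    # 0.0
        within = config.getfloat('levenshtein', 'within_score')  # 1.0
        print('accept edit distances in ({}, {}]'.format(above, within))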
/src/dig/graph.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from collections import defaultdict
4 | from collections import namedtuple
5 | import json
6 | from queue import Queue
7 | import re
8 | import sys, os
9 | from pprint import pprint
10 |
11 | from Levenshtein import distance
12 | # from StringMatcher import distance
13 | # from Levenshtein.StringMatcher import distance
14 | from networkx import Graph, DiGraph
15 |
16 | from SteinerTree import make_steiner_tree
17 | from hybridJaccard import HybridJaccard
18 |
19 | LEAF_VOCAB_CACHE = "/Users/philpot/Documents/project/graph-keyword-search/src/dig/data/cache"
20 |
21 | def loadLeafVocab(pathdesc, root=LEAF_VOCAB_CACHE):
22 | pathname = os.path.join(root, pathdesc + ".json")
23 | with open(pathname, 'r') as f:
24 | j = json.load(f)
25 | # dict of (value, count)
26 | byCount = sorted([(v,q) for (q,v) in j['histo'].items()], reverse=True)
27 | return [t[1] for t in byCount]
28 |
29 | def localPath(suffix):
30 | return os.path.join(os.path.dirname(__file__), suffix)
31 |
32 | # http://stackoverflow.com/a/9283563/2077242
33 | def camelCaseWords(label):
34 | label = re.sub(r'((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))', r' \1', label)
35 | return label
36 |
37 | class KGraph(DiGraph):
38 |
39 | def __init__(self, domainType=None):
40 | super(KGraph, self).__init__()
41 | self.installDomain(domainType=domainType)
42 |
43 | def __repr__(self):
44 | return "<{}: {} nodes, {} edges>".format(type(self).__name__,
45 | self.number_of_nodes(),
46 | self.number_of_edges())
47 |
48 | def installDomain(self, domainType=None):
49 | if domainType == 'ht':
50 | self.add_node('seller', nodeType='Class', className='PersonOrOrganization', indexRoot='seller')
51 |
52 | self.add_node('phone', nodeType='Class', className='PhoneNumber', indexRoot='phone')
53 | self.add_edge('seller', 'phone', edgeType='ObjectProperty', relationName='telephone')
54 |
55 | self.add_node('phone.name', nodeType='leaf', vocabDescriptor='seller_telephone_name')
56 | self.add_edge('phone', 'phone.name', edgeType='DataProperty', relationName='name')
57 |
58 | self.add_node('email', nodeType='Class', className='EmailAddress', indexRoot='email')
59 | self.add_edge('seller', 'email', edgeType='ObjectProperty', relationName='email')
60 | # for now this ES query doesn't work
61 | # self.add_node('email.name', nodeType='leaf', values=loadLeafVocab('seller_email_name'), vocabDescriptor='seller_email_name')
62 | # so use flat data instead
63 | self.add_node('email.name', nodeType='leaf', vocabDescriptor='email_name')
64 | self.add_edge('email', 'email.name', edgeType='DataProperty', relationName='name')
65 |
66 | self.add_node('offer', nodeType='Class', className='Offer', indexRoot='offer')
67 | self.add_edge('offer', 'seller', edgeType='ObjectProperty', relationName='seller')
68 | self.add_edge('seller', 'offer', edgeType='ObjectProperty', relationName='makesOffer')
69 |
70 | self.add_node('priceSpecification', nodeType='Class', className='PriceSpecification')
71 | self.add_node('priceSpecification.billingIncrement', nodeType='leaf', vocabDescriptor='offer_priceSpecification_billingIncrement')
72 | self.add_edge('priceSpecification', 'priceSpecification.billingIncrement', edgeType='DataProperty', relationName='billingIncrement')
73 | self.add_node('priceSpecification.price', nodeType='leaf', vocabDescriptor='offer_priceSpecification_price')
74 | self.add_edge('priceSpecification', 'priceSpecification.price', edgeType='DataProperty', relationName='price')
75 | self.add_node('priceSpecification.name', nodeType='leaf', vocabDescriptor='offer_priceSpecification_name')
76 | self.add_edge('priceSpecification', 'priceSpecification.name', edgeType='DataProperty', relationName='name')
77 | self.add_node('priceSpecification.unitCode', nodeType='leaf', vocabDescriptor='offer_priceSpecification_unitCode')
78 | self.add_edge('priceSpecification', 'priceSpecification.unitCode', edgeType='DataProperty', relationName='unitCode')
79 |
80 | self.add_edge('offer', 'priceSpecification', edgeType='ObjectProperty', relationName='priceSpecification')
81 |
82 | self.add_node('adultservice', nodeType='Class', className='AdultService', indexRoot='adultservice')
83 | self.add_node('adultservice.eyeColor', nodeType='leaf',
84 | vocabDescriptor='adultservice_eyeColor',
85 | matcherDescriptor=HybridJaccard(ref_path=localPath("data/config/hybridJaccard/eyeColor_reference_wiki.txt"),
86 | config_path=localPath("data/config/hybridJaccard/eyeColor_config.txt")))
87 | self.add_edge('adultservice', 'adultservice.eyeColor', edgeType='DataProperty', relationName='eyeColor')
88 |
89 | self.add_node('adultservice.hairColor', nodeType='leaf',
90 | vocabDescriptor='adultservice_hairColor',
91 | matcherDescriptor=HybridJaccard(ref_path=localPath("data/config/hybridJaccard/hairColor_reference_wiki.txt"),
92 | config_path=localPath("data/config/hybridJaccard/hairColor_config.txt")))
93 | self.add_edge('adultservice', 'adultservice.hairColor', edgeType='DataProperty', relationName='hairColor')
94 | self.add_node('adultservice.name', nodeType='leaf', vocabDescriptor='adultservice_name')
95 | self.add_edge('adultservice', 'adultservice.name', edgeType='DataProperty', relationName='name')
96 | self.add_node('adultservice.personAge', nodeType='leaf', vocabDescriptor='adultservice_personAge')
97 | self.add_edge('adultservice', 'adultservice.personAge', edgeType='DataProperty', relationName='personAge')
98 |
99 | self.add_edge('offer', 'adultservice', edgeType='ObjectProperty', relationName='itemOffered')
100 | self.add_edge('adultservice', 'offer', edgeType='ObjectProperty', relationName='offers')
101 |
102 | self.add_node('place', nodeType='Class', className='Place')
103 | self.add_node('postaladdress', nodeType='Class', className='PostalAddress')
104 |
105 | self.add_edge('offer', 'place', edgeType='ObjectProperty', relationName='availableAtOrFrom')
106 | self.add_edge('place', 'postaladdress', edgeType='ObjectProperty', relationName='address')
107 |
108 | self.add_node('postaladdress.addressLocality', nodeType='leaf', vocabDescriptor='offer_availableAtOrFrom_address_addressLocality')
109 | self.add_edge('postaladdress', 'postaladdress.addressLocality', edgeType='DataProperty', relationName='addressLocality')
110 | self.add_node('postaladdress.addressRegion', nodeType='leaf', vocabDescriptor='offer_availableAtOrFrom_address_addressRegion')
111 | self.add_edge('postaladdress', 'postaladdress.addressRegion', edgeType='DataProperty', relationName='addressRegion')
112 | self.add_node('postaladdress.addressCountry', nodeType='leaf', vocabDescriptor='offer_availableAtOrFrom_address_addressCountry')
113 | self.add_edge('postaladdress', 'postaladdress.addressCountry', edgeType='DataProperty', relationName='addressCountry')
114 |
115 | self.add_node('webpage', nodeType='Class', className='WebPage', indexRoot='webpage')
116 | self.add_edge('offer', 'webpage', edgeType='ObjectProperty', relationName='mainEntityOfPage')
117 | self.add_edge('webpage', 'offer', edgeType='ObjectProperty', relationName='mainEntity')
118 | self.add_node('publisher', nodeType='Class', className='Organization')
119 | self.add_edge('webpage', 'publisher', edgeType='ObjectProperty', relationName='publisher')
120 | self.add_node('publisher.name', nodeType='leaf', vocabDescriptor='webpage_publisher_name')
121 | self.add_edge('publisher', 'publisher.name', edgeType='DataProperty', relationName='name')
122 |
123 | def labelInGraph(self, nodeOrEdge):
124 | try:
125 | return self.node[nodeOrEdge]['className']
126 | except:
127 | try:
128 | return self.edge[nodeOrEdge[0]][nodeOrEdge[1]]['relationName']
129 | except:
130 | return None
131 |
132 | def populateValues(self, nodeOrEdge):
133 | try:
134 | node = nodeOrEdge
135 | nodeType = self.node[node]['nodeType']
136 | if nodeType == 'leaf':
137 | self.populateLeafNode(node)
138 | elif nodeType == 'Class':
139 | self.populateClassNode(node)
140 | except Exception as _:
141 | edge = nodeOrEdge
142 | (node1, node2) = edge
143 | edgeType = self.edge[node1][node2]['edgeType']
144 | if edgeType == 'ObjectProperty':
145 | self.populateRelationEdge(edge)
146 | elif edgeType == 'DataProperty':
147 | self.populateAttributeEdge(edge)
148 |
149 | # The problem is that "values" is too general. We can associate values with nodes and edges via a variety of semantics:
150 | # (1) instances from ES, presumably only for leaf nodes
151 | # (2) ontology labels, ontology descriptions, presumably only for edges and interior nodes
152 |
153 | def populateLeafNode(self, node):
154 | self.node[node]['values'] = loadLeafVocab(self.node[node]['vocabDescriptor'])
155 | self.node[node]['valueOrigin'] = 'leafVocab'
156 |
157 | # The next three probably should use the same methodology/same code
158 | def populateClassNode(self, node):
159 | self.node[node]['values'] = list(set([node, self.node[node]['className']]))
160 | self.node[node]['valueOrigin'] = 'ontology'
161 |
162 | def populateRelationEdge(self, edge):
163 | (node1, node2) = edge
164 | self.edge[node1][node2]['values'] = [camelCaseWords(self.edge[node1][node2]['relationName'])]
165 | self.edge[node1][node2]['valueOrigin'] = 'ontology'
166 |
167 | def populateAttributeEdge(self, edge):
168 | (node1, node2) = edge
169 | self.edge[node1][node2]['values'] = [camelCaseWords(self.edge[node1][node2]['relationName'])]
170 | self.edge[node1][node2]['valueOrigin'] = 'ontology'
171 |
172 | def isLeaf(self, nodeOrEdge):
173 | try:
174 | return self.node[nodeOrEdge]['valueOrigin'] == 'leafVocab'
175 | except:
176 | return False
177 |
178 | def populateAll(self):
179 | for node in self.nodes():
180 | self.populateValues(node)
181 | for edge in self.edges():
182 | self.populateValues(edge)
183 |
184 | def nodeMatch(self, node, label):
185 | """list generator"""
186 | return label.lower().replace('_', ' ') in (value.lower() for value in self.node[node]['values'])
187 |
188 | def edgeMatch(self, edge, label):
189 | """list generator"""
190 | return label.lower().replace('_', ' ') in (value.lower() for value in self.edge[edge[0]][edge[1]]['values'])
191 |
192 | def nodeEditWithin(self, node, label, within=1, above=None):
193 | """set above=0 to avoid matching node value exactly identical to label
194 | Does not find closest node values, just any values within interval"""
195 | l = label.lower().replace('_', ' ')
196 | for value in self.node[node]['values']:
197 | value = value.lower().replace('_', ' ')
198 | actual = distance(l, value)
199 | if (above==None or actual>above) and actual <= within:
200 | # if levenshtein is 0, return true value 0.0
201 | # return actual or 0.0
202 | return(value, actual)
203 |
204 | def edgeEditWithin(self, edge, label, within=1, above=None):
205 | """set above=0 to avoid matching edge value exactly identical to label"""
206 | l = label.lower().replace('_', ' ')
207 | for value in self.edge[edge[0]][edge[1]]['values']:
208 | value = value.lower().replace('_', ' ')
209 | actual = distance(l, value)
210 | if (not above or actual>above) and actual <= within:
211 | # if levenshtein is 0, return true value 0.0
212 | return actual or 0.0
213 |
214 | def nodeNearMatch(self, node, label, allowExact=False):
215 | """set allowExact to True to look up values directly here"""
216 | label = label.lower().replace('_', ' ')
217 | # print(self.node[node])
218 | try:
219 | hjMatcher = self.node[node]['matcherDescriptor']
220 | best = hjMatcher.findBestMatch(label)
221 | if best != "NONE":
222 | for value in self.node[node]['values']:
223 | value = value.lower().replace('_', ' ')
224 | if ((label != value) or allowExact) and (best==value):
225 | # HJ(label)== a value from node and
226 | # either we allow exact or see that label is not exactly the retrieved value
227 | # print(best)
228 | return best
229 | except KeyError:
230 | pass
231 |
232 | def edgeNearMatch(self, edge, label, allowExact=False):
233 | """set allowExact to True to look up values directly here"""
234 | label = label.lower().replace('_', ' ')
235 | try:
236 | hjMatcher = self.edge[edge[0]][edge[1]]['matcherDescriptor']
237 | best = hjMatcher.findBestMatch(label)
238 | if best != "NONE":
239 | for value in self.edge[edge[0]][edge[1]]['values']:
240 | value = value.lower().replace('_', ' ')
241 | if ((label != value) or allowExact) and (best==value):
242 | # HJ(label)== a value from edge and
243 | # either we allow exact or see that label is not exactly the retrieved value
244 | return best
245 | except KeyError:
246 | pass
247 |
248 | def generateSubgraph(self, node):
249 | seen = set()
250 | def visitNode(n1):
251 | if n1 in seen:
252 | pass
253 | else:
254 | yield(("node",n1))
255 | seen.add(n1)
256 | for n2 in self.edge[n1]:
257 | yield from visitEdge((n1,n2))
258 | def visitEdge(e):
259 | (_,n2) = e
260 | if e in seen:
261 | pass
262 | else:
263 | yield(("edge",e))
264 | seen.add(e)
265 | yield from visitNode(n2)
266 | return visitNode(node)
267 |
268 | """SPECS=[ {"docType": "adultservice", "fieldName": "eyeColor", "size": 10},
269 | {"docType": "adultservice", "fieldName": "hairColor", "size": 10},
270 | {"docType": "adultservice", "fieldName": "name", "size": 200},
271 | {"docType": "adultservice", "fieldName": "personAge", "size": 20},
272 |
273 | {"docType": "phone", "fieldName": "name", "size": 200},
274 |
275 | {"docType": "email", "fieldName": "name", "size": 200},
276 |
277 | {"docType": "webpage", "innerPath": "publisher", "fieldName": "name", "size": 200},
278 | # Ignore webpage.description, webpage.dateCreated
279 |
280 | # Ignore offer.identifier
281 | {"docType": "offer", "innerPath": "priceSpecification", "fieldName": "billingIncrement", "size": 10},
282 | {"docType": "offer", "innerPath": "priceSpecification", "fieldName": "price", "size": 200},
283 | {"docType": "offer", "innerPath": "priceSpecification", "fieldName": "name", "size": 200},
284 | {"docType": "offer", "innerPath": "priceSpecification", "fieldName": "unitCode", "size": 10},
285 | {"docType": "offer", "innerPath": "availableAtOrFrom.address", "fieldName": "addressLocality", "size": 200},
286 | {"docType": "offer", "innerPath": "availableAtOrFrom.address", "fieldName": "addressRegion", "size": 200},
287 | {"docType": "offer", "innerPath": "availableAtOrFrom.address", "fieldName": "addressCountry", "size": 200},
288 | # Ignore offer.availableAtOrFrom.name
289 | # Ignore offer.availableAtOrFrom.geo.lat, offer.availableAtOrFrom.geo.lon
290 | ]"""
291 |
292 | wg = None
293 |
294 | nodeDesig = namedtuple('nodeDesig', 'nodeType, nodeRefs')
295 |
296 | def truenodeDesig(node):
297 | """Render a kgraph node as a wgraph node"""
298 | return nodeDesig(nodeType='truenode', nodeRefs=(node,))
299 |
300 | def edgenodeDesig(edge):
301 | """Render a kgraph edge as a wgraph node"""
302 | return nodeDesig(nodeType='edgenode', nodeRefs=edge)
303 |
304 | class ImpossibleGraph(Exception):
305 | def __init__(self, message):
306 | # Call the base class constructor with the parameters it needs
307 | super(ImpossibleGraph, self).__init__(message)
308 |
309 | # PythonDecorators/entry_exit_class.py
310 |
311 | class entry_exit(object):
312 |
313 | def __init__(self, f):
314 | self.f = f
315 |
316 | def __call__(self, *args):
317 | print("Entering", self.f.__name__)
318 | r = self.f(*args)
319 | print("Exited", self.f.__name__)
320 | return(r)
321 |
322 | def minimalSubgraph(kgraph, root, query, verbose=False):
323 | # transform into weighted nondirected graph
324 | # all nodes become nodes ("truenode")
325 | # all edges also become nodes ("edgenode")
326 | # induce edge with weight 1 for each node/edge and edge/node
327 | # except: traverse starting at root, dropping any backlinks [?]
328 |
329 | # required contains nodes/edges from original kgraph
330 | required = defaultdict(list)
331 | # To start with, we don't know if root has any cands
332 | required[truenodeDesig(root)]=[]
333 | for a in query.ngrams.values():
334 | for cand in a["candidates"]:
335 | if cand.referentType == 'node':
336 | #required.add(truenodeDesig(cand.referent))
337 | required[truenodeDesig(cand.referent)].append(cand)
338 | elif cand.referentType == 'edge':
339 | #required.add(edgenodeDesig(cand.referent))
340 | required[edgenodeDesig(cand.referent)].append(cand)
341 | if verbose:
342 | print("Steiner tree must contain:")
343 | for n, c in required.items():
344 | print(" ", n.nodeRefs[0], c)
345 |
346 | # seen contains nodes/edges from original kgraph
347 | seen = set()
348 |
349 | # q contains nodes/edges from original kgraph
350 | q = Queue(maxsize=kgraph.number_of_nodes() + 3*kgraph.number_of_edges())
351 | q.put(root)
352 |
353 | # wg contains wgnodes, wgedges
354 | global wg
355 | wg = Graph()
356 |
357 | while not q.empty():
358 | # print("Queue size: {}; wg size {}".format(q.qsize(), len(wg)), file=sys.stderr)
359 | obj = q.get()
360 | # print("Dequeue {}".format(obj), file=sys.stderr)
361 | if not obj in seen:
362 | if isinstance(obj, (str)):
363 | # unseen kgraph node
364 | seen.add(obj)
365 | node = obj
366 | # print("wg: add true node {}".format(node), file=sys.stderr)
367 | wg.add_node(truenodeDesig(node))
368 | for node2 in kgraph.edge[node]:
369 | # print("Enqueue edge {} {}".format(node, node2), file=sys.stderr)
370 | q.put((node,node2))
371 | elif isinstance(obj, (list, tuple)) and len(obj)==2:
372 | # unseen kgraph edge
373 | seen.add(obj)
374 | # edge = obj
375 | # create a node representing original edge
376 | (node1, node2) = obj
377 | truenode1 = truenodeDesig(node1)
378 | truenode2 = truenodeDesig(node2)
379 | edge = obj
380 | # print("wg: add edge node {}".format(edge), file=sys.stderr)
381 | edgenode = edgenodeDesig(edge)
382 | wg.add_node(edgenode)
383 | wg.add_edge(truenode1, edgenode, weight=1)
384 | wg.add_edge(edgenode, truenode2, weight=1)
385 | # print("Enqueue node {}".format(node2), file=sys.stderr)
386 | q.put(node2)
387 | else:
388 | print("Unexpected {}".format(obj), file=sys.stderr)
389 | else:
390 | # print("Obj {} already seen".format(obj), file=sys.stderr)
391 | pass
392 |
393 | # print("Weighted non-directed graph")
394 | # pprint(wg.nodes())
395 | # return (None, wg)
396 | # generate minimal steiner tree
397 | try:
398 | requiredNodes = list(required.keys())
399 | st = make_steiner_tree(wg, requiredNodes)
400 | # convert back to directed graph
401 | neededTruenodes = [nd.nodeRefs[0] for nd in st.nodes() if nd.nodeType=='truenode']
402 | subg = kgraph.subgraph(neededTruenodes)
403 | return (st, wg, subg)
404 | except ValueError as ve:
405 | if "not in original graph" in str(ve):
406 | raise ImpossibleGraph("Cannot generate subgraph of {} containing {}".format("weightedGraph", requiredNodes))
407 | else:
408 | raise(ve)
409 |
410 | g = None
411 |
412 | def htGraph(**kwargs):
413 | global g
414 | g = KGraph(domainType='ht')
415 | g.populateAll()
416 | return g
417 |
--------------------------------------------------------------------------------
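A brief usage sketch for the domain graph above. It assumes the cached leaf vocabularies under LEAF_VOCAB_CACHE and the hybridJaccard config files referenced above are available locally, since populateAll() loads them:

    from graph import htGraph

    kg = htGraph()   # builds the 'ht' KGraph and populates node/edge values

    # ontology label match: 'hair_color' normalizes to the camelCase-split 'hair Color'
    print(kg.edgeMatch(('adultservice', 'adultservice.hairColor'), 'hair_color'))   # expected True

    # leaf-vocabulary match within a small edit distance
    print(kg.nodeEditWithin('adultservice.eyeColor', 'bleu', within=2))   # e.g. ('blue', 2) if 'blue' was harvested

    # hybrid-Jaccard near match maps a misspelling onto a harvested color (or None)
    print(kg.nodeNearMatch('adultservice.hairColor', 'blonde hair'))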
/src/dig/harvest.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import sys
4 | if sys.version_info < (3, 0):
5 | raise RuntimeError("must use python 3.0 or greater")
6 |
7 | import sys
8 | from elasticsearch import Elasticsearch
9 | from pprint import pprint
10 | import json
11 | from collections import OrderedDict
12 | import os
13 |
14 | CA_CERTS_PATH='/Users/philpot/aws/credentials/certs.pem'
15 |
16 | es = Elasticsearch(
17 | [
18 | 'https://darpamemex:darpamemex@esc.memexproxy.com/' # dig-ht-latest/offer
19 | # 'http://user:secret@localhost:9200/',
20 | ],
21 | # make sure we verify SSL certificates (off by default)
22 | verify_certs=True,
23 | # provide a path to CA certs on disk
24 | ca_certs=CA_CERTS_PATH
25 | )
26 |
27 | def makeBodyNested(fieldName="name", innerPath="itemOffered", size=10):
28 | return {
29 | "query": {
30 | "match_all": {}
31 | },
32 | "aggs": {
33 | "toplevelAgg": {
34 | "nested": {
35 | "path": innerPath
36 | },
37 | "aggs": {
38 | "termAgg": {
39 | "terms": {
40 | "field": "{}.{}".format(innerPath, fieldName),
41 | "size" : size
42 | }
43 | }
44 | }
45 | }
46 | }
47 | }
48 |
49 | def makeBodyDirect(fieldName="name", size=10):
50 | return {
51 | "query": {
52 | "match_all": {}
53 | },
54 | "aggs": {
55 | "termAgg": {
56 | "terms": {
57 | "field": fieldName,
58 | "size": size
59 | }
60 | }
61 | }
62 | }
63 |
64 | def makeBody(fieldName="name", innerPath="", size=10):
65 | if innerPath:
66 | return makeBodyNested(fieldName=fieldName,
67 | innerPath=innerPath,
68 | size=size)
69 | else:
70 | return makeBodyDirect(fieldName=fieldName,
71 | size=size)
72 |
73 | """
74 | {'_shards': {'failed': 0, 'successful': 20, 'total': 20},
75 | 'aggregations': {'toplevelAgg': {'doc_count': 19134836,
76 | 'termAgg': {'buckets': [{'doc_count': 18104,
77 | 'key': 'jessica'},
78 | {'doc_count': 15956,
79 | 'key': 'ashley'},
80 | {'doc_count': 12748,
81 | 'key': 'amber'},
82 | {'doc_count': 12037,
83 | 'key': 'tiffany'},
84 | {'doc_count': 11808,
85 | 'key': 'bella'},
86 | {'doc_count': 11628,
87 | 'key': 'mya'},
88 | {'doc_count': 11514,
89 | 'key': 'candy'},
90 | {'doc_count': 10963,
91 | 'key': 'nikki'},
92 | {'doc_count': 10932,
93 | 'key': 'diamond'},
94 | {'doc_count': 10808,
95 | 'key': 'lexi'}],
96 | 'doc_count_error_upper_bound': 2728,
97 | 'sum_other_doc_count': 1322532}}},
98 | 'hits': {'hits': [], 'max_score': 0.0, 'total': 19134836},
99 | 'timed_out': False,
100 | 'took': 1422}
101 | """
102 |
103 | def harvest(index="dig-ht-latest", docType="webpage", fieldName="addressCountry", innerPath="", size=10):
104 | nested = True if innerPath else False
105 | body=makeBody(fieldName=fieldName, innerPath=innerPath, size=size)
106 | result = es.search(index=index,
107 | doc_type=docType,
108 | body=body,
109 | search_type="count")
110 | agg = result['aggregations']['toplevelAgg']['termAgg'] if nested else result['aggregations']['termAgg']
111 | report = {"docType": docType,
112 | "fieldName": fieldName,
113 | "innerPath": innerPath,
114 | "size": size,
115 | # use 'result' later to get hitsTotal, sum_other_doc_count if needed
116 | "result": result,
117 | # collections.OrderedDict is serialized to JSON in the order keys were added
118 | # so preserves decreasing value order
119 | "histo": OrderedDict()
120 | }
121 | for bucket in agg['buckets']:
122 | report["histo"][bucket["key"]] = bucket["doc_count"]
123 | return report
124 |
125 | # def outputPathname(docType="webpage", innerPath="mainEntity.availableAtOrFrom.address", fieldName="addressCountry", root="/tmp", **kwargs):
126 | # return os.path.join(root, "{}_{}_{}.json".format(docType, innerPath.replace('.', '_').replace('__','_'), fieldName))
127 |
128 | OUTPUT_ROOT = "/Users/philpot/Documents/project/graph-keyword-search/src/dig/data/cache"
129 |
130 | def outputPathname(docType="webpage", innerPath="", fieldName="addressCountry", root=OUTPUT_ROOT, **kwargs):
131 | return os.path.join(root, "{}_{}_{}.json".format(docType, innerPath.replace('.', '_').replace('__','_'), fieldName))
132 |
133 | WORKING=[ # works
134 | {"docType": "offer", "innerPath": "itemOffered", "fieldName": "name", "size": 200},
135 | # works
136 | {"docType": "webpage", "innerPath": "publisher", "fieldName": "name", "size": 200},
137 | {"docType": "offer", "innerPath": "seller", "fieldName": "name", "size": 200},
138 | {"docType": "offer", "innerPath": "itemOffered", "fieldName": "personAge", "size": 20},
139 |
140 | {"docType": "offer", "innerPath": "mainEntityOfPage.publisher", "fieldName": "name", "size": 20},
141 | {"docType": "seller", "innerPath": "makesOffer.mainEntityOfPage.publisher", "fieldName": "name", "size": 20},
142 | {"docType": "phone", "innerPath": "owner.makesOffer.mainEntityOfPage.publisher", "fieldName": "name", "size": 20},
143 | {"docType": "offer", "innerPath": "priceSpecification", "fieldName": "billingIncrement", "size": 10},
144 | {"docType": "offer", "innerPath": "priceSpecification", "fieldName": "price", "size": 10},
145 | {"docType": "offer", "innerPath": "priceSpecification", "fieldName": "name", "size": 10},
146 | {"docType": "offer", "innerPath": "priceSpecification", "fieldName": "unitCode", "size": 10},
147 | {"docType": "offer", "innerPath": "priceSpecification", "fieldName": "billingIncrement", "size": 10},
148 |
149 | {"docType": "offer", "innerPath": "availableAtOrFrom", "fieldName": "name", "size": 10},
150 | {"docType": "offer", "innerPath": "availableAtOrFrom.geo", "fieldName": "lat", "size": 10},
151 | {"docType": "offer", "innerPath": "itemOffered", "fieldName": "hairColor", "size": 20},
152 | {"docType": "offer", "innerPath": "itemOffered", "fieldName": "eyeColor", "size": 20},
153 | {"docType": "offer", "innerPath": "itemOffered", "fieldName": "name", "size": 20},
154 | {"docType": "offer", "innerPath": "availableAtOrFrom.geo", "fieldName": "lat", "size": 10},
155 | {"docType": "seller", "innerPath": "telephone", "fieldName": "name", "size": 10},
156 | {"docType": "seller", "innerPath": "telephone", "fieldName": "a", "size": 10}
157 | ]
158 |
159 |
160 | SPECS=[ # {"docType": "webpage", "innerPath": "mainEntity.availableAtOrFrom.address", "fieldName": "addressCountry", "size": 200},
161 | # {"docType": "webpage", "innerPath": "mainEntity.availableAtOrFrom.address", "fieldName": "addressRegion", "size": 200},
162 | # {"docType": "webpage", "innerPath": "mainEntity.availableAtOrFrom.address", "fieldName": "addressLocality", "size": 200},
163 |
164 | ###{"docType": "seller", "innerPath": "email", "fieldName": "name", "size": 10},
165 | ###{"docType": "seller", "innerPath": "email", "fieldName": "a", "size": 10},
166 | ###{"docType": "offer", "innerPath": "seller.telephone", "fieldName": "a", "size": 10},
167 | # WORKS
168 | ###{"docType": "seller", "innerPath": "telephone", "fieldName": "name", "size": 10},
169 | # DOES NOT WORK in pyelasticsearch, only in sense/curl
170 | {"docType": "offer", "innerPath": "seller.telephone", "fieldName": "name", "size": 10},
171 | # WORKS
172 | {"docType": "webpage", "innerPath": "mainEntity.seller.telephone", "fieldName": "name", "size": 10}
173 |
174 | # Doesn't work
175 | # {"docType": "offer", "innerPath": "seller.telephone", "fieldName": "name", "size": 200},
176 | # ???
177 | # {"docType": "offer", "innerPath": "seller", "fieldName": "a", "size": 200},
178 | # {"docType": "offer", "innerPath": "itemOffered", "fieldName": "a", "size": 200},
179 |
180 | # {"docType": "seller", "innerPath": "telephone", "fieldName": "name", "size": 200}
181 | # bad syntax
182 | # {"docType": "address", "innerPath": "", "fieldName": "addressCountry", "size": 200}
183 | # doesn't work
184 | # probably the issue w.r.t. nested Pedro suggested
185 | # but sibling fields do work
186 | ]
187 |
188 | # SPECS=[ {"docType": "offer", "innerPath": "itemOffered", "fieldName": "name", "size": 10} ]
189 |
190 | SPECS=[ {"docType": "adultservice", "fieldName": "eyeColor", "size": 10} ]
191 |
192 | SPECS=[ {"docType": "adultservice", "fieldName": "eyeColor", "size": 10},
193 | {"docType": "adultservice", "fieldName": "hairColor", "size": 10},
194 | {"docType": "adultservice", "fieldName": "name", "size": 200},
195 | {"docType": "adultservice", "fieldName": "personAge", "size": 20},
196 |
197 |         # These are valid, but have flat distributions, so they are not useful for suggestion
198 | # {"docType": "phone", "fieldName": "name", "size": 200},
199 | # {"docType": "email", "fieldName": "name", "size": 200},
200 | # Instead seller-centric distribution
201 | {"docType": "seller", "innerPath": "telephone", "fieldName": "name", "size": 200},
202 | {"docType": "seller", "innerPath": "email", "fieldName": "name", "size": 200},
203 |
204 | {"docType": "webpage", "innerPath": "publisher", "fieldName": "name", "size": 200},
205 | # Ignore webpage.description, webpage.dateCreated
206 |
207 | # Ignore offer.identifier
208 | {"docType": "offer", "innerPath": "priceSpecification", "fieldName": "billingIncrement", "size": 10},
209 | {"docType": "offer", "innerPath": "priceSpecification", "fieldName": "price", "size": 200},
210 | {"docType": "offer", "innerPath": "priceSpecification", "fieldName": "name", "size": 200},
211 | {"docType": "offer", "innerPath": "priceSpecification", "fieldName": "unitCode", "size": 10},
212 | {"docType": "offer", "innerPath": "availableAtOrFrom.address", "fieldName": "addressLocality", "size": 200},
213 | {"docType": "offer", "innerPath": "availableAtOrFrom.address", "fieldName": "addressRegion", "size": 200},
214 | {"docType": "offer", "innerPath": "availableAtOrFrom.address", "fieldName": "addressCountry", "size": 200},
215 | # Ignore offer.availableAtOrFrom.name
216 | # Ignore offer.availableAtOrFrom.geo.lat, offer.availableAtOrFrom.geo.lon
217 |
218 | ]
219 |
220 | def harvestToFile(spec):
221 | outPath = None
222 | try:
223 | outPath = outputPathname(**spec)
224 | except:
225 | pass
226 | try:
227 | h = harvest(**spec)
228 | print("Harvest to {}".format(outPath), file=sys.stderr)
229 | with open(outPath, 'w') as f:
230 | # Don't use sort_keys here
231 | # We are counting on the behavior where collections.OrderedDict is
232 | # serialized in the order keys were added. If we add things in
233 | # order of decreasing counts, the order will stick, unless we use sort_keys.
234 | json.dump(h, f, indent=4)
235 | except Exception as e:
236 |         print("Error [{}] during processing of {}".format(e, outPath), file=sys.stderr)
237 |
238 | def generateAll ():
239 | for spec in SPECS:
240 | print()
241 | print(spec)
242 | # harvestToFile(spec)
243 | try:
244 | h = harvest(**spec)
245 | # pprint(h)
246 | l = -1
247 | try:
248 | try:
249 | # nested
250 | b = h["result"]["aggregations"]["toplevelAgg"]["termAgg"]["buckets"]
251 | except:
252 | # direct
253 | b = h["result"]["aggregations"]["termAgg"]["buckets"]
254 | l = len(b)
255 | if l>0:
256 | print("Success %d for %s" % (l, spec), file=sys.stderr)
257 | q = 5
258 | for i,v in zip(range(q+1), b[0:q]):
259 | print("value %d is %s" % (i, v))
260 | elif l==0:
261 | print("No data for %s" % (spec), file=sys.stderr)
262 | else:
263 | pass
264 | except Exception as e:
265 | print("Nothing happened for %s" % (spec), file=sys.stderr)
266 | print(e, file=sys.stderr)
267 | except Exception as e:
268 | print("Failed during %s" % (spec), file=sys.stderr)
269 | print(e, file=sys.stderr)
270 |
271 | """
272 |
273 | POST https://darpamemex:darpamemex@esc.memexproxy.com/dig-ht-latest/offer/_search?search_type=count
274 | {
275 | "query": {
276 | "filtered": {
277 | "query": {
278 | "match_all": {}
279 | },
280 | "filter": {
281 | "nested": {
282 | "path": "itemOffered",
283 | "filter": {
284 | "exists": {
285 | "field": "eyeColor"
286 | }
287 | }
288 | }
289 | }
290 | }
291 | },
292 |
293 | "aggs": {
294 | "toplevelAgg": {
295 | "nested": {
296 | "path": "itemOffered"
297 | },
298 | "aggs": {
299 | "termAgg": {
300 | "terms": {
301 | "field": "itemOffered.eyeColor",
302 | "size" : 100
303 | }
304 | }
305 | }
306 | }
307 | }
308 | }
309 | """
310 |
--------------------------------------------------------------------------------
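
The quoted POST body at the end of the harvest module above is the raw form of the nested terms aggregation that harvest() builds via makeBody (defined earlier in the file, not shown in this excerpt). Below is a minimal sketch of issuing an equivalent request directly, in the same elasticsearch-py style harvest() uses; the index, doc type, and field names come from the example in the docstring, the localhost URL is an assumption, and the call presumes an ES 1.x/2.x cluster that still accepts doc_type and search_type="count":

    from elasticsearch import Elasticsearch

    # harvest.py points its own client at the DIG cluster; localhost is used here only for illustration.
    es = Elasticsearch("http://localhost:9200")

    # Nested terms aggregation, mirroring the curl body quoted in the docstring above.
    body = {
        "aggs": {
            "toplevelAgg": {
                "nested": {"path": "itemOffered"},
                "aggs": {
                    "termAgg": {
                        "terms": {"field": "itemOffered.eyeColor", "size": 100}
                    }
                }
            }
        }
    }

    # search_type="count" (as in harvest()) suppresses hits; only aggregations come back.
    result = es.search(index="dig-ht-latest", doc_type="offer", body=body, search_type="count")
    for bucket in result["aggregations"]["toplevelAgg"]["termAgg"]["buckets"]:
        print(bucket["key"], bucket["doc_count"])
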
/src/dig/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys, os
4 | import argparse
5 |
6 | from graph import htGraph, ImpossibleGraph, minimalSubgraph
7 | from query import Query
8 | from synonym import Thesaurus
9 | # from outline import Outline, iii
10 | from outline import *
11 | import configparser
12 |
13 | from pprint import pprint
14 |
15 | g = None
16 | q = None
17 | s = None
18 | m = None
19 | wg = None
20 | sg = None
21 | o = None
22 |
23 | def interpretConfig(configFile, verbose=False):
24 | try:
25 | cfg = configparser.ConfigParser()
26 | cfg.read(configFile)
27 | except:
28 | if verbose:
29 | print("Unable to read any configuration from {}".format(configFile), file=sys.stderr)
30 | kwargs = {}
31 | for sectionName in cfg.sections():
32 | section = cfg[sectionName]
33 | for key, value in section.items():
34 | kw = sectionName + '_' + key
35 | try:
36 | if key.endswith('count') or key.endswith('size') or key.endswith('length'):
37 | kwargs[kw] = section.getint(key)
38 | elif key.endswith('factor') or key.endswith('score'):
39 | kwargs[kw] = section.getfloat(key)
40 | elif key.endswith('enable'):
41 | kwargs[kw] = section.getboolean(key)
42 | else:
43 | kwargs[kw] = value
44 | except:
45 | kwargs[kw] = value
46 | return kwargs
47 |
48 |
49 | def main(argv=None):
50 | '''this is called if run from command line'''
51 | global g, q, s, m, wg, sg, o
52 | parser = argparse.ArgumentParser()
53 | parser.add_argument('terms', nargs='*', default=[], action="append")
54 | parser.add_argument('-v', '--verbose', required=False, help='verbose', action='store_true')
55 | parser.add_argument('-e', '--explain', required=False, help='include explanations in intermediate repn',
56 | choices=['text','structured','None'])
57 | # parser.add_argument('-o', '--options')
58 | parser.add_argument('-j', '--config', required=False, help='config', default=os.path.join(os.path.dirname(__file__), "config.ini"))
59 | args = parser.parse_args()
60 | # TODO nargs generates a list of lists
61 | terms = args.terms[0]
62 | cmdline = {"verbose": args.verbose,
63 | "explain": None if args.explain=='None' else args.explain}
64 | config = interpretConfig(args.config)
65 | g = htGraph(**cmdline, **config)
66 | s = Thesaurus(**cmdline, **config)
67 | q = Query(terms, g, s, **cmdline, **config)
68 | q.suggestCandidates()
69 | q.dump()
70 | # succeeds with roots = ['offer']
71 | # fails with roots = ['phone']
72 | roots = ['seller', 'phone', 'email', 'offer', 'adultservice', 'webpage']
73 | for root in roots:
74 | try:
75 | # m is steiner tree
76 | # wg is input nondirected graph
77 | # sg is output directed subgraph
78 | (m, wg, sg) = minimalSubgraph(g, root, q)
79 | o = Outline(g, sg, q, root, **cmdline, **config)
80 | except ImpossibleGraph as ig:
81 | if args.verbose:
82 | # print(ig, file=sys.stderr)
83 | print("It is not possible to generate a subgraph with root {}".format(root), file=sys.stderr)
84 | continue
85 | o.detail()
86 |
87 | # call main() if this is run as standalone
88 | if __name__ == "__main__":
89 | sys.exit(main())
90 |
--------------------------------------------------------------------------------
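
In main.py above, interpretConfig() flattens config.ini into keyword arguments named section_key and coerces values by key suffix (count/size/length become int, factor/score become float, enable becomes bool). The repository's actual config.ini is not part of this excerpt, so the fragment below is a hypothetical illustration whose section and key names were chosen only to line up with the parameters consumed by Query and Thesaurus:

    import configparser

    cfg = configparser.ConfigParser()
    # Hypothetical sections/keys, chosen to match the kwargs Query and Thesaurus accept.
    cfg["levenshtein"] = {"enable": "yes", "within_score": "1.0", "above_score": "0.0"}
    cfg["word2vec"] = {"enable": "no", "size": "10", "minimum_score": "0.5"}

    # interpretConfig() would flatten this into kwargs such as levenshtein_enable=True,
    # levenshtein_within_score=1.0, word2vec_size=10, word2vec_minimum_score=0.5.
    for sectionName in cfg.sections():
        section = cfg[sectionName]
        for key, value in section.items():
            print(sectionName + "_" + key, "=", value)
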
/src/dig/outline.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys
4 | from collections import defaultdict
5 | from networkx import shortest_path
6 | import json
7 |
8 | iii = None
9 |
10 | def pathFromRoot(graph, cand, node, root):
11 | nodes = shortest_path(graph, root, node)
12 | pathComponents = [root]
13 | waypoints = nodes[0:-1]
14 | for (f,t) in zip(waypoints, waypoints[1:]):
15 | pathComponents.append(graph.labelInGraph((f,t)) or "missing")
16 | # pathComponents.append(nodes[-1].upper())
17 | # terminus is a leaf node, named after class plus relation
18 | # we only want the relation
19 | pathComponents.append(node.split('.')[-1])
20 | path = ".".join(pathComponents)
21 | return path
22 |
23 | class Outline(object):
24 | def __init__(self, graph, subgraph, query, root, verbose=False, explain=False, **kwargs):
25 | self.graph = graph
26 | self.subgraph = subgraph
27 | self.query = query
28 | self.root = root
29 | self.verbose = verbose
30 | self.explain = explain
31 |
32 | def intermediate(self):
33 | global iii
34 | relationsMentioned = []
35 | classesMentioned = []
36 | must = []
37 | should = []
38 | i = defaultdict(list)
39 | i["root"] = self.root
40 | # to begin with, no terms are covered
41 | touches = defaultdict(list)
42 | for a in self.query.ngrams.values():
43 | for cand in a["candidates"]:
44 | if cand.referentType == 'node':
45 | node = cand.referent
46 | if self.graph.isLeaf(node):
47 | # Leaf node corresponds to an equality/fuzzy relation constraint
48 | m = {"path": pathFromRoot(self.graph, cand, node, self.root),
49 | "matchType": "direct" if cand.candidateType == "direct" else "inferred",
50 | # "operands": [cand.referent, cand.content],
51 | "className": cand.referent.split('.')[0],
52 | "relationName": cand.referent.split('.')[1],
53 | "value": cand.content}
54 | if self.explain:
55 | m["_explanation"] = cand.explain(self.explain)
56 | must.append(m)
57 | else:
58 | # Other node corresponds to mention of a class (e.g., the word 'seller' is mentioned)
59 | m = {"className": self.graph.labelInGraph(node)}
60 | if self.explain:
61 | m["_explanation"] = cand.explain(self.explain)
62 | classesMentioned.append(m)
63 | # Record (possibly partial) coverage of query terms
64 | for w in a["words"]:
65 | t = {"term": w,
66 | "foundIn": "node"}
67 | if self.explain:
68 | t["_explanation"] = cand.explain(self.explain)
69 | touches[w].append(t)
70 | elif cand.referentType == 'edge':
71 | edge = cand.referent
72 | # Edge match corresponds to mention of an edge
73 | # May or may not correspond to relation constraint on that edge
74 | # In future, this might mean we want result to include its class
75 | m = {"className": self.graph.labelInGraph(edge[0]),
76 | "relationName": self.graph.labelInGraph(edge)}
77 | if self.explain:
78 | m["_explanation"] = cand.explain(self.explain)
79 | relationsMentioned.append(m)
80 | # Record (possibly partial) coverage of query terms
81 | for w in a["words"]:
82 | t = {"term": w,
83 | "foundIn": "edge"}
84 | if self.explain:
85 | t["_explanation"] = cand.explain(self.explain)
86 | touches[w].append(t)
87 | # Any terms never covered are now free-text matches
88 | for term in self.query.terms:
89 | if not touches[term]:
90 | s = {"matchType": "free",
91 | "operands": [term]}
92 | if self.explain:
93 | s["_explanation"] = "{} uninterpretable".format(term)
94 | should.append(s)
95 |
96 | i["touches"] = touches
97 | i["relationsMentioned"] = relationsMentioned
98 | i["classesMentioned"] = classesMentioned
99 | i["must"] = must
100 | i["should"] = should
101 | iii = i
102 | return i
103 |
104 | def detail(self, file=sys.stdout):
105 | # print (root,g,q,s,m,wg,sg)
106 | print("", file=file)
107 | if self.verbose:
108 | print("\nRoot {}".format(self.root), file=file)
109 | print("\nDetail of outline {}".format(self), file=file)
110 | print("Input Graph: {}".format(self.graph), file=file)
111 | print("Input Keywords: {}".format(self.query.terms), file=file)
112 | print("Input Keyword Coloring: \n{}".format(self.query.dumpToString(indent=2)), file=file)
113 | print("Relevant Subgraph: {}".format(self.subgraph), file=file)
114 | print("Intermediate Repn:", file=file)
115 | print(json.dumps(self.intermediate(), sort_keys=True, indent=4), file=file)
116 |
--------------------------------------------------------------------------------
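
In outline.py above, intermediate() collects the query coloring into a plain dict (root, must, should, touches, relationsMentioned, classesMentioned) that detail() then serializes with json.dumps. The hand-written example below only illustrates that shape; the query, root, paths, and values are hypothetical and were not produced by running the code:

    import json

    # Illustrative shape only, for a hypothetical query "blue seller phone".
    example_intermediate = {
        "root": "offer",
        "must": [
            {"path": "offer.itemOffered.eyeColor",
             "matchType": "inferred",
             "className": "adultservice",
             "relationName": "eyeColor",
             "value": "blue"}
        ],
        "classesMentioned": [{"className": "seller"}],
        "relationsMentioned": [],
        "should": [{"matchType": "free", "operands": ["phone"]}],
        "touches": {"blue": [{"term": "blue", "foundIn": "node"}],
                    "seller": [{"term": "seller", "foundIn": "node"}],
                    "phone": []},
    }

    print(json.dumps(example_intermediate, sort_keys=True, indent=4))
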
/src/dig/prep.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | INDEXNAME=twitter
4 |
5 | curl -XDELETE "http://localhost:9200/${INDEXNAME}/"
6 |
7 | curl -s -XPUT "http://localhost:9200/${INDEXNAME}/" -d '{
8 | "mappings": {
9 | "tweet": {
10 | "properties": {
11 | "text": {
12 | "type": "string",
13 | "term_vector": "yes",
14 | "store" : true,
15 | "index_analyzer" : "fulltext_analyzer"
16 | },
17 | "fullname": {
18 | "type": "string",
19 | "term_vector": "no",
20 | "index_analyzer" : "fulltext_analyzer"
21 | },
22 | "eyecolor": {
23 | "type": "string",
24 | "term_vector": "no",
25 | "index": "not_analyzed"
26 | }
27 | }
28 | }
29 | },
30 | "settings" : {
31 | "index" : {
32 | "number_of_shards" : 1,
33 | "number_of_replicas" : 0
34 | },
35 | "analysis": {
36 | "analyzer": {
37 | "fulltext_analyzer": {
38 | "type": "custom",
39 | "tokenizer": "whitespace",
40 | "filter": [
41 | "lowercase",
42 | "type_as_payload"
43 | ]
44 | }
45 | }
46 | }
47 | }
48 | }'
49 |
50 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/1?pretty=true" -d '{
51 | "fullname" : "John Doe",
52 | "text" : "twitter test test test ",
53 | "eyecolor": "blue"
54 | }'
55 |
56 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/2?pretty=true" -d '{
57 | "fullname" : "Jane Doe",
58 | "text" : "Another twitter test ...",
59 | "eyecolor": "blue"
60 | }'
61 |
62 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/3?pretty=true" -d '{
63 | "fullname" : "Robot",
64 | "text" : "one two two three three three four four four four four four four five four five six seven eight nine",
65 | "eyecolor": "red"
66 | }'
--------------------------------------------------------------------------------
/src/dig/prep2.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | INDEXNAME=twitter2
4 |
5 | curl -XDELETE "http://localhost:9200/${INDEXNAME}/"
6 |
7 | curl -s -XPUT "http://localhost:9200/${INDEXNAME}/" -d '{
8 | "mappings": {
9 | "tweet": {
10 | "properties": {
11 | "text": {
12 | "type": "string",
13 | "term_vector": "yes",
14 | "store" : true,
15 | "index_analyzer" : "fulltext_analyzer"
16 | },
17 | "fullname": {
18 | "type": "string",
19 | "term_vector": "no",
20 | "index_analyzer" : "fulltext_analyzer"
21 | },
22 | "eyecolor": {
23 | "type": "string",
24 | "term_vector": "no",
25 | "index": "not_analyzed"
26 | },
27 | "children": {
28 | "type": "nested",
29 | "properties": {
30 | "name": {
31 | "type": "string",
32 | "index": "not_analyzed"
33 | },
34 | "school": {
35 | "type": "string",
36 | "index": "not_analyzed"
37 | }
38 | }
39 | }
40 | }
41 | }
42 | },
43 | "settings" : {
44 | "index" : {
45 | "number_of_shards" : 1,
46 | "number_of_replicas" : 0
47 | },
48 | "analysis": {
49 | "analyzer": {
50 | "fulltext_analyzer": {
51 | "type": "custom",
52 | "tokenizer": "whitespace",
53 | "filter": [
54 | "lowercase",
55 | "type_as_payload"
56 | ]
57 | }
58 | }
59 | }
60 | }
61 | }'
62 |
63 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/1?pretty=true" -d '{
64 | "fullname" : "John Doe",
65 | "text" : "twitter test test test ",
66 | "eyecolor": "blue",
67 | "children": [ {"name": "Alice", "school": "Aardvark"} ]
68 | }'
69 |
70 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/2?pretty=true" -d '{
71 | "fullname" : "Jane Doe",
72 | "text" : "Another twitter test ...",
73 | "eyecolor": "blue",
74 | "children": [ {"name": "Bob", "school": "Aardvark"},
75 | {"name": "Carole", "school": "Aardvark"},
76 | {"name": "Dan", "school": "Aardvark"},
77 | {"name": "Eve", "school": "Badger"} ]
78 | }'
79 |
80 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/3?pretty=true" -d '{
81 | "fullname" : "Robot3",
82 | "text" : "one two two three three three four four four four four four four five four five six seven eight nine",
83 | "eyecolor": "red",
84 | "children": [ {"name": "Ronald3", "school": "Factory"},
85 | {"name": "Rhonda3", "school": "Junkyard"}]
86 | }'
87 |
88 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/4?pretty=true" -d '{
89 | "fullname" : "Robot4",
90 | "text" : "one two two three three three four four four four four four four five four five six seven eight nine",
91 | "eyecolor": "red",
92 | "children": [ {"name": "Ronald4", "school": "Factory"},
93 | {"name": "Rhonda4", "school": "Junkyard"}]
94 | }'
95 |
96 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/5?pretty=true" -d '{
97 | "fullname" : "Robot5",
98 | "text" : "one two two three three three four four four four four four four five four five six seven eight nine",
99 | "eyecolor": "red",
100 | "children": [ {"name": "Ronald5", "school": "Factory"},
101 | {"name": "Rhonda5", "school": "Junkyard"}]
102 | }'
103 |
104 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/6?pretty=true" -d '{
105 | "fullname" : "Robot6",
106 | "text" : "one two two three three three four four four four four four four five four five six seven eight nine",
107 | "eyecolor": "red",
108 | "children": [ {"name": "Ronald6", "school": "Factory"},
109 | {"name": "Rhonda6", "school": "Junkyard"}]
110 | }'
111 |
112 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/7?pretty=true" -d '{
113 | "fullname" : "Robot7",
114 | "text" : "one two two three three three four four four four four four four five four five six seven eight nine",
115 | "eyecolor": "red",
116 | "children": [ {"name": "Ronald7", "school": "Factory"},
117 | {"name": "Rhonda7", "school": "Junkyard"}]
118 | }'
119 |
120 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/8?pretty=true" -d '{
121 | "fullname" : "Richard Bloggs",
122 | "text" : "one two",
123 | "eyecolor": "brown",
124 | "children": [ {"name": "Frank", "school": "Coyote"},
125 | {"name": "Glenda", "school": "Coyote"}]
126 | }'
--------------------------------------------------------------------------------
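
In prep2.sh above, the children field is mapped as nested, which is the same situation harvest.py handles with its toplevelAgg/termAgg pair. A minimal sketch of aggregating over that nested field once the documents above are indexed, assuming the local ES 1.x/2.x node the script targets and the elasticsearch-py client:

    from elasticsearch import Elasticsearch

    es = Elasticsearch("http://localhost:9200")
    body = {
        "aggs": {
            "toplevelAgg": {
                "nested": {"path": "children"},
                "aggs": {"termAgg": {"terms": {"field": "children.school", "size": 10}}}
            }
        }
    }
    result = es.search(index="twitter2", doc_type="tweet", body=body, search_type="count")
    # With the documents indexed by prep2.sh this should bucket roughly as:
    # Factory 5, Junkyard 5, Aardvark 4, Coyote 2, Badger 1
    for bucket in result["aggregations"]["toplevelAgg"]["termAgg"]["buckets"]:
        print(bucket["key"], bucket["doc_count"])
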
/src/dig/prep_ht.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | INDEXNAME=ht
4 |
5 | pushd $PROJ/dig-elasticsearch
6 | git pull
7 | mapping=$PROJ/dig-elasticsearch/types/webpage/esMapping-dig-ht-DT.json
8 | popd
9 |
10 | curl -k -XPUT "https://darpamemex:darpamemex@esc.memexproxy.com/${INDEXNAME}" -d @$mapping
11 |
12 |
13 | curl -XDELETE "http://localhost:9200/${INDEXNAME}/"
14 |
15 | curl -s -XPUT "http://localhost:9200/${INDEXNAME}/" -d @$mapping
16 |
17 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/1?pretty=true" -d '{
18 | "fullname" : "John Doe",
19 | "text" : "twitter test test test ",
20 | "eyecolor": "blue"
21 | }'
22 |
23 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/2?pretty=true" -d '{
24 | "fullname" : "Jane Doe",
25 | "text" : "Another twitter test ...",
26 | "eyecolor": "blue"
27 | }'
28 |
29 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/3?pretty=true" -d '{
30 | "fullname" : "Robot",
31 | "text" : "one two two three three three four four four four four four four five four five six seven eight nine",
32 | "eyecolor": "red"
33 | }'
--------------------------------------------------------------------------------
/src/dig/query.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys
4 | from itertools import count
5 | from synonym import Thesaurus, Synonym
6 | from collections import Counter
7 | try:
8 | from StringIO import StringIO
9 | except ImportError:
10 | from io import StringIO
11 | from util import canonList
12 | from copy import copy
13 |
14 | class Candidate(object):
15 | def __init__(self, referent=None, referentType=None, candidateType=None, synonym=None, distance=0):
16 | # referent is something in the graph: a node or edge, which for us is a string or tuple of strings
17 | self.referent = referent
18 | # referentType is 'node' or 'edge'
19 | self.referentType = referentType
20 | # candidateType is 'direct', 'levenshtein', 'hybridJaccard', 'synonym'
21 | # candidateType is 'direct', 'levenshtein', 'hybridJaccard', 'word2vec', 'wordnet'
22 | self.candidateType = candidateType
23 | self.synonym = synonym
24 | # distance = 0 for direct
25 | # presumably distance > 0 for non-direct
26 | self.distance = distance
27 |
28 | @property
29 | def indicator(self):
30 | try:
31 | return self.synonym and self.synonym.indicator
32 | except:
33 | return None
34 |
35 | @property
36 | def content(self):
37 | try:
38 | return self.synonym and self.synonym.content
39 | except:
40 | return self.synonym
41 |
42 | def __str__(self, *args, **kwargs):
43 | sig = "None"
44 | try:
45 | sig = (self.referentType or "")
46 | sig += " "
47 | sig += (self.candidateType or "")
48 | sig += " "
49 | sig += (self.referentsLabel() or "")
50 | sig += " "
51 | sig += (str(getattr(self,"synonym",None) or ""))
52 | except Exception as _:
53 | pass
54 | return "<" + str(type(self).__name__) + " " + sig + ">"
55 |
56 | def __repr__(self, *args, **kwargs):
57 | return self.__str__()
58 |
59 | def referentsLabel(self):
60 | return "/".join(canonList(self.referent))
61 |
62 | def summary(self):
63 | try:
64 | return self.candidateType
65 | except:
66 | return None
67 |
68 | def explain(self, style):
69 | if style == 'text':
70 | return self.textExplain()
71 | elif style == 'structured':
72 | return self.structuredExplain()
73 | elif not style:
74 | return None
75 | else:
76 | raise ValueError("Unknown explanation style {}".format(style))
77 |
78 | def textExplain(self):
79 | prefix = "Cand"
80 | try:
81 | if self.candidateType=='direct':
82 | return "{}: {} {}: Direct({})".format(prefix, self.referentType, self.referentsLabel(), self.indicator)
83 | elif self.candidateType=='levenshtein':
84 | return "{}: {} {}: Levenshtein({})={}".format(prefix, self.referentType, self.referent, self.synonym, self.distance)
85 | elif self.candidateType=='hybridJaccard':
86 | return "{}: {} {}: HybridJaccard({})".format(prefix, self.referentType, self.referent, self.synonym)
87 | elif self.candidateType=='wordnet':
88 | s = self.synonym
89 | return "{}: {} {}: Wordnet({},{})=>{}".format(prefix, self.referentType, self.referent, s.source, s.indicator, s.content)
90 | elif self.candidateType=='word2vec':
91 | s = self.synonym
92 | return "{}: {} {}: Word2vec({},{})=>{}".format(prefix, self.referentType, self.referent, s.source, s.indicator, s.content)
93 | except:
94 | pass
95 | return str(self)
96 |
97 | def structuredExplain(self):
98 | d = copy(self.__dict__)
99 | synonym = d.get('synonym')
100 | if synonym:
101 | d['synonym'] = synonym.__dict__
102 | return d
103 |
104 | def binding(self):
105 | # return "Binding of indicator {} is content {}".format(self.indicator, self.content)
106 | return (self.candidateType, self.indicator, self.content)
107 |
108 | class Query(object):
109 | def __init__(self, terms, graph, thesaurus=None,
110 | direct_enable=True,
111 | levenshtein_enable=True,
112 | levenshtein_above_score=0.0,
113 | levenshtein_within_score=1.0,
114 | hybridjaccard_enable=True,
115 | hybridjaccard_allowexact_enable=False,
116 | **kwargs):
117 | self.terms = terms
118 | self.graph = graph
119 | # self.thesaurus = thesaurus or Thesaurus()
120 | self.thesaurus = thesaurus
121 | self.direct_enable = direct_enable
122 | self.levenshtein_enable = levenshtein_enable
123 | self.levenshtein_above_score = levenshtein_above_score
124 | self.levenshtein_within_score = levenshtein_within_score
125 | self.hybridjaccard_enable = hybridjaccard_enable
126 | self.hybridjaccard_allowexact_enable = hybridjaccard_allowexact_enable
127 | self.initNgrams(terms)
128 |
129 | def __str__(self, *args, **kwargs):
130 | limit = 4
131 | sig = "None"
132 | try:
133 | sig = " ".join(self.terms[0:limit]) + "..."
134 | except Exception as _:
135 | pass
136 | return "<" + str(type(self).__name__) + " " + sig + ">"
137 |
138 | def __repr__(self, *args, **kwargs):
139 | return self.__str__(*args, **kwargs)
140 |
141 | def initNgrams(self, terms):
142 | self.ngrams = {}
143 | for term,idx in zip(terms, count(0,2)):
144 | # print("Term 1 {}".format(term))
145 | # print("Assign spot {} to unigram {}".format(idx,term))
146 | self.ngrams[term] = None
147 | self.ngrams[term] = {"term": term,
148 | "words": [term],
149 | "index": idx,
150 | "cardinality": 1}
151 | for t1,t2,idx in zip(terms, terms[1:], count(1,2)):
152 | term = t1 + "_" + t2
153 | # print("Assign spot {} to bigram {}".format(idx, term))
154 | self.ngrams[term] = {"term": term,
155 | "words": [t1, t2],
156 | "index": idx,
157 | "cardinality": 2}
158 |
159 | def suggestCandidates(self):
160 | # singletons only
161 | graph = self.graph
162 | ngrams = self.ngrams
163 | thesaurus = self.thesaurus
164 | # levenshtein config
165 | levenshteinWithin = self.levenshtein_within_score
166 | levenshteinAbove = self.levenshtein_above_score
167 | # hybrid jaccard config
168 | hybridJaccardAllowExact = self.hybridjaccard_allowexact_enable
169 |
170 | for q,d in ngrams.items():
171 | keyword = q
172 | d["candidates"] = []
173 |
174 | # SINGLETON
175 | if d["cardinality"] == 1:
176 | # singleton, direct node
177 | if self.direct_enable:
178 | for node in graph.nodes():
179 | if graph.nodeMatch(node, keyword):
180 | synonym = Synonym(source='direct', indicator=keyword, content=keyword, score=1.0)
181 | d["candidates"].append(Candidate(referent=node, referentType='node', candidateType='direct', synonym=synonym))
182 | # singleton, direct edge
183 | for edge in graph.edges():
184 | if graph.edgeMatch(edge, keyword):
185 | synonym = Synonym(source='direct', indicator=keyword, content=keyword, score=1.0)
186 | d["candidates"].append(Candidate(referent=edge, referentType='edge', candidateType='direct', synonym=synonym))
187 |
188 | # singleton, levenshtein node
189 | if self.levenshtein_enable:
190 | for node in graph.nodes():
191 | try:
192 | (closest, away) = graph.nodeEditWithin(node, keyword, levenshteinWithin, above=levenshteinAbove)
193 | synonym = Synonym(source='levenshtein', indicator=keyword, content=closest, score=away)
194 | d["candidates"].append(Candidate(referent=node, referentType='node', candidateType='levenshtein', distance=away, synonym=synonym))
195 | except TypeError:
196 | pass
197 | # singleton, levenshtein edge
198 | for edge in graph.edges():
199 | try:
200 | (closest,away) = graph.edgeEditWithin(edge, keyword, levenshteinWithin, above=levenshteinAbove)
201 | synonym = Synonym(source='levenshtein', indicator=keyword, content=closest, score=away)
202 | d["candidates"].append(Candidate(referent=edge, referentType='edge', candidateType='levenshtein', distance=away, synonym=synonym))
203 | except TypeError:
204 | pass
205 | # singleton, hybrid jaccard node
206 | if self.hybridjaccard_enable:
207 | for node in graph.nodes():
208 | best = graph.nodeNearMatch(node, keyword, allowExact=hybridJaccardAllowExact)
209 | if best:
210 | synonym = Synonym(source='hybridJaccard', indicator=keyword, content=best)
211 | d["candidates"].append(Candidate(referent=node, referentType='node', candidateType='hybridJaccard', synonym=synonym))
212 | # singleton, hybrid jaccard edge
213 | for edge in graph.edges():
214 | best = graph.edgeNearMatch(edge, keyword, allowExact=hybridJaccardAllowExact)
215 |                     if best:
216 |                         synonym = Synonym(source='hybridJaccard', indicator=keyword, content=best)
217 |                         d["candidates"].append(Candidate(referent=edge, referentType='edge', candidateType='hybridJaccard', synonym=synonym))
218 |
219 | # singleton, synonym
220 | if self.thesaurus:
221 | for synonym in thesaurus.generateSynonyms(keyword):
222 | content = synonym.content
223 | # singleton, synonym node
224 | for node in graph.nodes():
225 | if graph.nodeMatch(node, content):
226 | d["candidates"].append(Candidate(referent=node, referentType='node', candidateType=synonym.source, synonym=synonym))
227 | # singleton, synonym edge
228 | for edge in graph.edges():
229 | if graph.edgeMatch(edge, content):
230 | d["candidates"].append(Candidate(referent=edge, referentType='edge', candidateType=synonym.source, synonym=synonym))
231 |
232 | # MULTIWORD
233 | elif d["cardinality"] >= 2:
234 | if self.direct_enable:
235 | # multiword, direct
236 | for node in graph.nodes():
237 | if graph.nodeMatch(node, keyword):
238 | synonym = Synonym(source='direct', indicator=keyword, content=keyword, score=1.0)
239 | d["candidates"].append(Candidate(referent=node, referentType='node', candidateType='direct', synonym=synonym))
240 | for edge in graph.edges():
241 | if graph.edgeMatch(edge, keyword):
242 | synonym = Synonym(source='direct', indicator=keyword, content=keyword, score=1.0)
243 | d["candidates"].append(Candidate(referent=edge, referentType='edge', candidateType='direct', synonym=synonym))
244 | # TODO: multiword, levenshtein (or jaro_winkler, hj)
245 | # NIY
246 | # multiword, synonym
247 | for synonym in thesaurus.generateSynonyms(keyword):
248 | content = synonym.content
249 | for node in graph.nodes():
250 |                         if graph.nodeMatch(node, content):
251 | d["candidates"].append(Candidate(referent=node, referentType='node', candidateType=synonym.source, synonym=synonym))
252 | for edge in graph.edges():
253 |                         if graph.edgeMatch(edge, content):
254 | d["candidates"].append(Candidate(referent=edge, referentType='edge', candidateType=synonym.source, synonym=synonym))
255 |
256 | # def initNgrams0(self, terms):
257 | # self.ngrams = {}
258 | # for term,idx in zip(terms, count(0,2)):
259 | # # print("Term 1 {}".format(term))
260 | # # print("Assign spot {} to unigram {}".format(idx,term))
261 | # self.ngrams[term] = None
262 | # self.ngrams[term] = {"term": term,
263 | # "words": [term],
264 | # "index": idx,
265 | # "cardinality": 1}
266 | # for t1,t2,idx in zip(terms, terms[1:], count(1,2)):
267 | # term = t1 + "_" + t2
268 | # # print("Assign spot {} to bigram {}".format(idx, term))
269 | # self.ngrams[term] = {"term": term,
270 | # "words": [t1, t2],
271 | # "index": idx,
272 | # "cardinality": 2}
273 |
274 | # def dump0(self):
275 | # byIndex = [None] * (2*len(self.terms) - 1)
276 | # for d in self.ngrams.values():
277 | # byIndex[d['index']] = d
278 | # for d in byIndex:
279 | # try:
280 | # idx = d['index']
281 | # ngramType = "unigram" if idx%2 else "bigram"
282 | # q = d.get('term', '')
283 | # v = d.get('candidates', [])
284 | # # print("{}{}. {}: {}".format(" " if idx%2 else "", idx, q, "\n".join(v)))
285 | # # print("{}{}. {}: {}".format(" " if idx%2 else "", idx, q, "{} candidates".format(len(v))))
286 | # summaries = Counter([c.summary() for c in v])
287 | # # print("{}{}. {}: {} ({})".format(" " if idx%2 else "", idx, q, "{} candidates".format(len(v))," unknown"))
288 | # print("{}{}. {}: {} ({})".format(ngramType, idx, q, "{} candidates".format(len(v)), summaries))
289 | # except:
290 | # print(d)
291 |
292 | def dump(self, file=sys.stdout):
293 | byIndex = [None] * (2*len(self.terms) - 1)
294 | for d in self.ngrams.values():
295 | byIndex[d['index']] = d
296 | for d in byIndex:
297 | try:
298 | idx = d['index']
299 |                 # unigrams occupy even index slots, bigrams odd (see initNgrams)
300 |                 ngramType = "bigram" if idx%2 else "unigram"
301 | q = d.get('term', '')
302 | v = d.get('candidates', [])
303 | # print("{}{}. {}: {}".format(" " if idx%2 else "", idx, q, "\n".join(v)))
304 | # print("{}{}. {}: {}".format(" " if idx%2 else "", idx, q, "{} candidates".format(len(v))))
305 | # summaries = Counter([c.summary() for c in v])
306 | # print("{}{}. {}: {} ({})".format(" " if idx%2 else "", idx, q, "{} candidates".format(len(v))," unknown"))
307 | print("({}) {}. {}:".format(ngramType, idx, q), file=file)
308 | # print("Candidates:")
309 | if v:
310 | for c in v:
311 | # print(c.summary())
312 | # print(c)
313 |                         print(" " + c.explain('text'), file=file)
314 | else:
315 | print(" None", file=file)
316 | except:
317 | print(d, file=file)
318 |
319 | def dumpToString(self, indent=0):
320 | buffer = StringIO()
321 | self.dump(file=buffer)
322 | s = buffer.getvalue()
323 | buffer.close()
324 | prefix = " " * indent
325 | return (prefix + s.replace("\n", "\n" + prefix)
326 | if prefix
327 | else s)
328 |
--------------------------------------------------------------------------------
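
In query.py above, initNgrams() interleaves unigrams (even index slots 0, 2, 4, ...) and bigrams (odd slots 1, 3, 5, ...) so that dump() can print them in query order. The sketch below mirrors initNgrams() on a hypothetical three-term query, outside the class, just to show the resulting table:

    from itertools import count

    terms = ["blue", "eyes", "seller"]   # hypothetical query
    ngrams = {}
    for term, idx in zip(terms, count(0, 2)):
        ngrams[term] = {"term": term, "words": [term], "index": idx, "cardinality": 1}
    for t1, t2, idx in zip(terms, terms[1:], count(1, 2)):
        term = t1 + "_" + t2
        ngrams[term] = {"term": term, "words": [t1, t2], "index": idx, "cardinality": 2}

    for d in sorted(ngrams.values(), key=lambda d: d["index"]):
        kind = "bigram" if d["index"] % 2 else "unigram"
        print(d["index"], kind, d["term"])
    # 0 unigram blue
    # 1 bigram blue_eyes
    # 2 unigram eyes
    # 3 bigram eyes_seller
    # 4 unigram seller
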
/src/dig/synonym.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys, os
4 |
5 | from nltk.corpus import wordnet as wn
6 | import word2vec
7 | import urllib
8 | from urllib.parse import quote
9 | from builtins import setattr
10 |
11 | class Synonym(object):
12 |
13 | """Synonym records a link between two surface forms:
14 | a known word or collocation (the seed or indicator)
15 | e.g., 'blue', 'eye_color'
16 | and
17 | a word or collocation believed to be equivalent/or related (the target or content)
18 | e.g., 'sky'."""
19 |
20 | def __init__(self, *args, indicator=None, content=None, score=1.0, source=None, **kwargs):
21 | self.indicator = indicator
22 | self.content = content
23 | self.score = score
24 | self.source = source
25 | for (k, v) in kwargs.items():
26 | setattr(self, k, v)
27 |
28 | def __str__(self, *args, **kwargs):
29 | sig = "{}({})=>{}{}".format(self.detailSource(),
30 | getattr(self, "indicator", "*INDICATOR*"),
31 | getattr(self, "content", "*CONTENT*"),
32 | # "" if getattr(self, "score", 1.0)==1.0 else " " + getattr(self, "score", "unknown")
33 | "")
34 | return "<" + str(type(self).__name__) + " " + sig + ">"
35 |
36 | def __repr__(self, *args, **kwargs):
37 | return self.__str__(*args, **kwargs)
38 |
39 | def detailSource(self):
40 | try:
41 | return self.source + "." + self.rel
42 | except AttributeError:
43 | try:
44 | return self.source
45 | except:
46 | return "*SOURCE*"
47 |
48 | def explain(self):
49 | return str(self)
50 |
51 | # the GoogleNews-vectors data I downloaded wasn't happy on the Mac, tended to misindex words
52 | # e.g., model['dog'] was missing but model['og'] was found
53 | # model = word2vec.load('/opt/word2vec/data/GoogleNews-vectors-negative300.bin')
54 | # model = word2vec.load(os.path.join(WORD2VEC_DATA_DIR, WORD2VEC_DATA_FILE)
55 |
56 | class SynonymGenerator(object):
57 | pass
58 |
59 | WORD2VEC_DATA_DIR = '/opt/word2vec/data'
60 | WORD2VEC_DATA_FILE = "text8-phrases.bin"
61 | WORD2VEC_SIZE = 10
62 | WORD2VEC_MINIMUM_SCORE = 0.5
63 |
64 | class Word2VecSynonymGenerator(SynonymGenerator):
65 |
66 | def __init__(self,
67 | dataDir=WORD2VEC_DATA_DIR,
68 | dataFile=WORD2VEC_DATA_FILE,
69 | size=WORD2VEC_SIZE,
70 | minimumScore=WORD2VEC_MINIMUM_SCORE):
71 | super(Word2VecSynonymGenerator, self).__init__()
72 | # word2vec config
73 | self.dataDir = dataDir
74 | self.dataFile = dataFile
75 | self.size = size
76 | self.minimumScore = minimumScore
77 | if self.dataDir and self.dataFile:
78 | self.word2vecModel = word2vec.load(os.path.join(self.dataDir, self.dataFile))
79 |
80 | def generateSynonyms(self, indicator):
81 | """collocation indicator must be specified as word1_word2"""
82 | if isinstance(indicator, (list, tuple)):
83 | indicator = "_".join(indicator)
84 | size = self.size
85 | minimumScore = self.minimumScore
86 | try:
87 | model = self.word2vecModel
88 | (indexes, metrics) = model.cosine(indicator, size)
89 | array = model.generate_response(indexes, metrics)
90 | for (syn, similarityScore) in array:
91 | if similarityScore >= minimumScore:
92 | yield(Synonym(indicator=indicator, content=syn, score=similarityScore, source='word2vec'))
93 | except:
94 | pass
95 | pass
96 |
97 | WORDNET_PARTS_OF_SPEECH = ['n', 'v', 'a', 'r']
98 | WORDNET_LEMMA_MIN_COUNT = 1
99 | # POS self/factor up/factor down/factor
100 | WORDNET_NEIGHBORHOOD = (('n', (True, 1), (True, 0.5), (True, 0.5)),
101 | ('v', (True, 1), (True, 0.5), (True, 0.5)),
102 | ('a', (True, 1), (False, 0), (True, 0.5)),
103 | ('r', (True, 1), (False, 0), (True, 0.5)))
104 |
105 |
106 | # TODO: interrogate pertanyms and derivationally_related_forms, which are stored only on the resultant lemmas
107 | # TODO: holonyms (synechdoche), metonymy in general
108 |
109 | class WordnetSynonymGenerator(SynonymGenerator):
110 |
111 | def __init__(self,
112 | partsOfSpeech=WORDNET_PARTS_OF_SPEECH,
113 | lemmaMinCount=WORDNET_LEMMA_MIN_COUNT,
114 | neighborhood=WORDNET_NEIGHBORHOOD):
115 | super(WordnetSynonymGenerator, self).__init__()
116 | # wordnet config
117 | self.wn = wn
118 |         self.wordnetPartsOfSpeech = partsOfSpeech
119 |         self.wordnetLemmaMinCount = lemmaMinCount
120 |         self.wordnetNeighborhood = neighborhood
121 |
122 | def generateSynonyms(self, indicator):
123 | """lemmas with count=0 are generally quite rare, so drop them
124 |         may generate a lemma more than once, possibly with different parameters"""
125 | neighborhood = self.wordnetNeighborhood
126 | pos = self.wordnetPartsOfSpeech
127 | wn = self.wn
128 | # Avoid lemmas with counts lower than this
129 | # Many WN unusual lemmas have zero
130 | minCount = self.wordnetLemmaMinCount
131 | def generateSynsetSynonyms(synset, rel, factor):
132 | for lemma in synset.lemmas():
133 | count = lemma.count()
134 | if count > minCount:
135 | name = lemma.name()
136 | if name == indicator:
137 | continue
138 | yield(Synonym(indicator=indicator, content=name, lemma=lemma, synset=synset, pos=pos, factor=factor,
139 | rel=rel, count=count, score=count*factor, source='wordnet'))
140 |
141 | for pos, (here, hereFactor), (up, upFactor), (down, downFactor) in neighborhood:
142 | for synset in wn.synsets(indicator, pos=pos):
143 | if here:
144 | for g in generateSynsetSynonyms(synset, "self", hereFactor):
145 | yield(g)
146 | if up:
147 | for parent in synset.hypernyms():
148 | for g in generateSynsetSynonyms(parent, "hypernym", upFactor):
149 | yield(g)
150 | if down:
151 | for child in synset.hyponyms():
152 | for g in generateSynsetSynonyms(child, "hyponym", downFactor):
153 | yield(g)
154 |
155 |
156 | class SwoogleSynonymGenerator(SynonymGenerator):
157 |
158 |     def __init__(self, uriTemplate=None):
159 |         super(SwoogleSynonymGenerator, self).__init__()
160 |         # swoogle config
161 |         self.swoogle = True
162 |         self.swoogleUriTemplate = uriTemplate or '''http://swoogle.umbc.edu/StsService/GetStsSim?operation=api&phrase1="{}"&phrase2="{}"'''
163 |
164 | def generateSynonyms(self, indicator):
165 | """Incomplete"""
166 | score = 0
167 | url = self.swoogleUriTemplate.format(quote)
168 | try:
169 | request = urllib.request.Request(url)
170 | response = urllib.request.urlopen(request)
171 | score = str(response.read().decode('utf-8')).replace('\"','')
172 | score = float(score)
173 | except Exception as _:
174 | pass
175 | pass
176 |
177 | class EasyESASynonymGenerator(SynonymGenerator):
178 |     def __init__(self):
179 | super(EasyESASynonymGenerator, self).__init__()
180 |
181 | def generateSynonyms(self, indicator):
182 | pass
183 |
184 | class Thesaurus(object):
185 | def __init__(self,
186 | word2vec_enable=True,
187 | word2vec_data_dir=WORD2VEC_DATA_DIR,
188 | word2vec_data_file=WORD2VEC_DATA_FILE,
189 | word2vec_size=WORD2VEC_SIZE,
190 | word2vec_minimum_score=WORD2VEC_MINIMUM_SCORE,
191 | wordnet_enable=True,
192 | # wordnetPartsOfSpeech=WORDNET_PARTS_OF_SPEECH,
193 | wordnet_lemma_min_count=WORDNET_LEMMA_MIN_COUNT,
194 | # wordnet_neighborhood=WORDNET_NEIGHBORHOOD,
195 | wordnet_n_enable = True,
196 | wordnet_n_self_factor = 1.0,
197 | wordnet_n_hypernym_factor = 0.5,
198 | wordnet_n_hyponym_factor = 0.5,
199 | wordnet_v_enable = True,
200 | wordnet_v_self_factor = 1.0,
201 | wordnet_v_hypernym_factor = 0.5,
202 | wordnet_v_hyponym_factor = 0.5,
203 | wordnet_a_enable = True,
204 | wordnet_a_self_factor = 1.0,
205 | wordnet_a_hypernym_factor = 0,
206 | wordnet_a_hyponym_factor = 0.5,
207 | wordnet_r_enable = True,
208 | wordnet_r_self_factor = 1.0,
209 | wordnet_r_hypernym_factor = 0,
210 | wordnet_r_hyponym_factor = 0.5,
211 | swoogle_enable=False,
212 | swoogle_uri_template=None,
213 | easyesa_enable=False,
214 | **kwargs):
215 | synonymGenerators = {}
216 | if word2vec_enable:
217 | synonymGenerators['word2vec'] = Word2VecSynonymGenerator(dataDir=word2vec_data_dir,
218 | dataFile=word2vec_data_file,
219 | size=word2vec_size,
220 | minimumScore=word2vec_minimum_score)
221 | if wordnet_enable:
222 | partsOfSpeech = []
223 | neighborhood = []
224 | if wordnet_n_enable:
225 | partsOfSpeech.append('n')
226 | neighborhood.append( ('n',
227 | (wordnet_n_self_factor>0, wordnet_n_self_factor),
228 | (wordnet_n_hypernym_factor>0, wordnet_n_hypernym_factor),
229 | (wordnet_n_hyponym_factor>0, wordnet_n_hyponym_factor)) )
230 | if wordnet_v_enable:
231 | partsOfSpeech.append('v')
232 | neighborhood.append( ('v',
233 | (wordnet_v_self_factor>0, wordnet_v_self_factor),
234 | (wordnet_v_hypernym_factor>0, wordnet_v_hypernym_factor),
235 | (wordnet_v_hyponym_factor>0, wordnet_v_hyponym_factor)) )
236 | if wordnet_a_enable:
237 | partsOfSpeech.append('a')
238 | neighborhood.append( ('a',
239 | (wordnet_a_self_factor>0, wordnet_a_self_factor),
240 | (wordnet_a_hypernym_factor>0, wordnet_a_hypernym_factor),
241 | (wordnet_a_hyponym_factor>0, wordnet_a_hyponym_factor)) )
242 | if wordnet_r_enable:
243 | partsOfSpeech.append('r')
244 | neighborhood.append( ('r',
245 | (wordnet_r_self_factor>0, wordnet_r_self_factor),
246 | (wordnet_r_hypernym_factor>0, wordnet_r_hypernym_factor),
247 | (wordnet_r_hyponym_factor>0, wordnet_r_hyponym_factor)) )
248 | synonymGenerators['wordnet'] = WordnetSynonymGenerator(partsOfSpeech=partsOfSpeech,
249 | lemmaMinCount=wordnet_lemma_min_count,
250 | neighborhood=neighborhood)
251 | if swoogle_enable:
252 | synonymGenerators['swoogle'] = SwoogleSynonymGenerator(uriTemplate=swoogle_uri_template)
253 | if easyesa_enable:
254 | synonymGenerators['easyESA'] = EasyESASynonymGenerator()
255 | self.synonymGenerators = synonymGenerators
256 |
257 | def generateSynonyms(self, indicator):
258 | for (_, syngen) in self.synonymGenerators.items():
259 | for g in syngen.generateSynonyms(indicator):
260 | yield(g)
261 |
--------------------------------------------------------------------------------
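
In synonym.py above, Thesaurus.generateSynonyms() simply chains whichever generators are enabled. A minimal usage sketch, assuming it is run from src/dig with the word2vec and nltk packages installed and the NLTK WordNet corpus downloaded; word2vec_enable=False avoids loading the binary model file under /opt/word2vec/data:

    from synonym import Thesaurus

    # WordNet only; word2vec would otherwise try to load text8-phrases.bin.
    t = Thesaurus(word2vec_enable=False, wordnet_enable=True)
    for syn in t.generateSynonyms("telephone"):
        # e.g. wordnet telephone => phone (score = lemma count * neighborhood factor)
        print(syn.source, syn.indicator, "=>", syn.content, syn.score)
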
/src/dig/test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from networkx import *
4 |
5 | g=Graph()
6 | g.add_node('a')
7 | g.add_node('b')
8 | g.add_node('c')
9 | g.add_node('d')
10 | g.add_node('e')
11 | g.add_node('f')
12 | g.add_node('g')
13 | g.add_node('h')
14 | g.add_node('i')
15 | g.add_node('j')
16 | g.add_node('k')
17 |
18 | g.add_edge('a','b', weight=1)
19 | g.add_edge('a','c', weight=2)
20 | g.add_edge('b','d', weight=1)
21 | g.add_edge('c','d', weight=2)
22 | g.add_edge('c','e', weight=1)
23 | g.add_edge('d','f', weight=2)
24 | g.add_edge('f','g', weight=1)
25 | g.add_edge('d','h', weight=2)
26 | g.add_edge('h','i', weight=1)
27 | g.add_edge('h','j', weight=2)
28 | g.add_edge('h','k', weight=1)
29 | # cycle
30 | g.add_edge('b','a', weight=1)
31 |
--------------------------------------------------------------------------------
/src/dig/z-attic/wordSimilarity.py:
--------------------------------------------------------------------------------
1 | import urllib.request
2 | import sys
3 | import json
4 | import math
5 | from urllib.parse import quote
6 | from threading import Thread
7 |
8 | class WordSimilarity:
9 |
10 | scoreDictionary = {}
11 | scoreDictionary['esa'] = 0
12 | scoreDictionary['swoogle'] = 0
13 |
14 | # 1 - EasyESA client
15 |     # a score of 1 or -1 indicates a perfect match
16 |     # threshold values to consider: 0.07, 0.052 and 0.04
17 | def getEasyESAScore(word1,word2):
18 |
19 | WordSimilarity.scoreDictionary['esa'] = 0
20 | url = "http://vmdeb20.deri.ie:8890/esaservice?task=esa&term1="+quote(word1)+'&term2='+quote(word2)
21 | try:
22 | request = urllib.request.Request(url)
23 | response = urllib.request.urlopen(request)
24 | score = str(response.read().decode('utf-8')).replace('\"','')
25 | if float(score)> 0:
26 | print("ESA %s %s => %s" % (word1, word2, score))
27 | WordSimilarity.scoreDictionary['esa'] = float(score)
28 | except Exception as e:
29 | WordSimilarity.scoreDictionary['esa'] = 0
30 |
31 | # 2 - ws4j client
32 | def getWs4jScore(word1,word2):
33 | url = "http://ws4jdemo.appspot.com/ws4j?measure=wup&args="+quote(word1)+"%3A%3A"+quote(word2)
34 | request = urllib.request.Request(url)
35 | request.add_header('Accept', 'application/json')
36 | response = urllib.request.urlopen(request)
37 | responseStr = response.read().decode('utf-8')
38 | # fetch json from the response
39 | jsonStr = json.loads(responseStr)
40 | score = float(jsonStr['result'][0]['score'])
41 | return score
42 |
43 | # 3 - UMBC Semantic Similarity service
44 | #
45 |     # Documentation available at http://swoogle.umbc.edu/SimService/api.html
46 | def getSwoogleScore(word1,word2):
47 | WordSimilarity.scoreDictionary['swoogle'] = 0
48 | url = "http://swoogle.umbc.edu/StsService/GetStsSim?operation=api&phrase1="+quote(word1)+'&phrase2='+quote(word2)
49 | try:
50 | request = urllib.request.Request(url)
51 | response = urllib.request.urlopen(request)
52 | score = str(response.read().decode('utf-8')).replace('\"','')
53 | score = float(score)
54 | if score > 0:
55 | print("Swoogle %s / %s => %s" % (word1, word2, score))
56 | WordSimilarity.scoreDictionary['swoogle'] = score
57 | except Exception as e:
58 | WordSimilarity.scoreDictionary['swoogle'] = 0
59 |
60 |
61 |     # Combines the EasyESA and Swoogle scores computed below.
62 |     # Call method 2 (the ws4j client) if needed.
63 |     # a score of 1 or -1 indicates a perfect match
64 |     # threshold values to consider: 0.07, 0.052 and 0.04
65 | def isPredicateSimilar(word1,word2):
66 | #score = math.fabs(WordSimilarity.getEasyESAScore(word1,word2))
67 |
68 | esaThread = Thread(target=WordSimilarity.getEasyESAScore, args=(word1,word2,))
69 | swoogleThread = Thread(target=WordSimilarity.getSwoogleScore, args=(word1,word2,))
70 |
71 | esaThread.start()
72 | swoogleThread.start()
73 | esaThread.join()
74 | swoogleThread.join()
75 |
76 | ESAscore = WordSimilarity.scoreDictionary['esa']
77 | #WordSimilarity.getEasyESAScore(word1,word2)
78 | ESAScaledScore = 0
79 | if(ESAscore>0 and ESAscore<=0.04):
80 | ESAScaledScore = 1
81 |         elif(ESAscore>0.04 and ESAscore<=0.07):
82 | ESAScaledScore = 2
83 | elif(ESAscore>0.07):
84 | ESAScaledScore = 3
85 | else:
86 | ESAScaledScore = 0
87 |
88 | SwoogleScore = WordSimilarity.scoreDictionary['swoogle']
89 | # WordSimilarity.getSwoogleScore(word1,word2)
90 | SwoogleScaledScore = 0
91 | if(SwoogleScore>0 and SwoogleScore<0.6):
92 | SwoogleScaledScore = 1
93 | elif(SwoogleScore>=0.6 and SwoogleScore<0.7):
94 | SwoogleScaledScore = 2
95 | elif(SwoogleScore>=0.7):
96 | SwoogleScaledScore = 3
97 | else:
98 | SwoogleScaledScore = 0
99 |
100 | if(ESAScaledScore>SwoogleScaledScore):
101 | print("Using ESA")
102 | score = ESAScaledScore
103 | else:
104 | print("Using Swoogle")
105 | score = SwoogleScaledScore
106 |
107 | if(score>=2):
108 | return score
109 | else:
110 | return -1
111 |
--------------------------------------------------------------------------------
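
The attic wordSimilarity.py above queries two remote similarity services in parallel threads, maps each raw score onto a 0-3 scale, and isPredicateSimilar() accepts a pair only when the better scaled score is at least 2. A minimal usage sketch, assuming the module is on the import path and that the EasyESA and UMBC endpoints still respond (both are external services and may no longer be available):

    from wordSimilarity import WordSimilarity

    # Returns the scaled score (2 or 3) when the words are judged similar, -1 otherwise.
    score = WordSimilarity.isPredicateSimilar("film", "movie")
    print("similar" if score >= 2 else "not similar", score)
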
/src/graphSearch.py:
--------------------------------------------------------------------------------
1 | from ngramsEngine import ngramsEngine
2 | from ngramTree import *
3 | from pivotEntityRecognition import *
4 | from colorAssignment import ColorAssignment
5 | from sparqlClient import SparqlClient
6 | import inflection
7 | import urllib.request
8 | import sys
9 |
10 | class GraphSearch:
11 |
12 | def __init__(self):
13 |         pass
14 |
15 | # Method that prints the initial color assigned
16 | def printColors(treeObj,rootNode):
17 |
18 | # Reset the visited flag for the traversal
19 | treeObj.resetVisitedFlag(rootNode)
20 | listNgrams = []
21 | stack = []
22 | stack.append(rootNode)
23 |
24 | while(stack):
25 | currNode = stack.pop()
26 | if not currNode.isVisited:
27 | currNode.isVisited = True
28 | #print('---------')
29 | listNgrams.append(currNode.data)
30 | #print(currNode.data)
31 | #print(currNode.color)
32 | for childNodes in currNode.children:
33 | stack.append(childNodes)
34 | return listNgrams
35 |
36 |     # Print the Pivot entities recognised
37 | def printpre(resourceList):
38 | print('------------ Pivot Entity Recognition --------------')
39 | if(len(resourceList)==0):
40 | print('no pivot entity found')
41 | else:
42 | for res in resourceList:
43 | print('Resource name : '+res.uri)
44 | print("Label : "+res.label)
45 | print("Incoming Links : "+str(res.support))
46 | print("keyword : "+res.keyword)
47 | print("colors : "+str(res.colors))
48 | print('------------------------')
49 |
50 | # Print factnodes
51 | def printTriplets(tripleList):
52 | for triple in tripleList:
53 | print('----')
54 | obj = triple.object
55 | print(str(obj.score))
56 | print(str(obj.colors))
57 | print(str(obj.keyword))
58 | print(str(triple.subject.uri) + ' ' + str(triple.predicate.uri) + ' ' + str(triple.object.uri))
59 |
60 |
61 |
62 |
63 | # Gets the bigrams from the sentence and returns the bigrams that are to be covered
64 | def getBiGramList(sentence,resource):
65 |
66 | sentenceList = sentence.split(' ')
67 | resourceKeyword = resource.keyword.split(' ')
68 |
69 |         # remove the words already covered by the resource keyword
70 | for key in resourceKeyword:
71 | sentenceList.remove(key)
72 |
73 | biGramList = []
74 |
75 | # Form the bigrams
76 | if(len(sentenceList)!=0):
77 | for i in range(0,len(sentenceList)-1):
78 | biGramList.append(sentenceList[i]+' '+sentenceList[i+1])
79 |
80 | return biGramList
81 |
82 |
83 | # Ranks the results coverage first followed by the scores
84 | def rankResults(listFactNodes,length):
85 | # new list will contain lists of nodes with each list at index corresponding to the number of colors covered by the node
86 | newList = []
87 |
88 | # initialize the list
89 | for i in range(0,length):
90 | newList.append([])
91 |
92 | # insert the nodes at the appropriate index lists
93 | for node in listFactNodes:
94 | index = int(len(node.colors)-1)
95 | newList[index].append(node)
96 |
97 | # sort list on scores
98 | for list in newList:
99 | list.sort(key=lambda x: x.score, reverse=True)
100 |
101 | # flatten the sorted list
102 | returnList = []
103 | for i in range(len(newList)-1,-1,-1):
104 | for node in newList[i]:
105 | returnList.append(node)
106 |
107 | return returnList
108 |
109 |
110 |
111 | # Driver method
112 | def main():
113 |
114 | # Ask the user to input the query
115 | sentence = input("Enter the query : ")
116 |
117 | print()
118 | print()
119 | print('Phase 1 ... N GRAM Generation')
120 | # Generate the n-grams
121 | ngramsEngineObj = ngramsEngine()
122 | listNgrams,lookupList = ngramsEngineObj.generateNGrams(sentence)
123 |
124 | print('Generated N-grams')
125 |
126 |
127 | # Start building the n-gram tree by selecting the root node
128 | rootWord = listNgrams[0]
129 | rootNode = Node(rootWord)
130 |
131 |
132 | # Construct the tree with the root node
133 | treeObj = NgramTree(rootNode)
134 | treeObj.constructTree(listNgrams,lookupList)
135 |
136 | # Print tree
137 | #treeObj.printNode(rootNode)
138 | print('N-gram tree constructed')
139 |
140 | print()
141 | print('Phase 2 ... Color assignment')
142 |
143 | # Color assignment
144 | colorAssignmentObj = ColorAssignment()
145 | colorAssignmentObj.assignInitialColors(rootNode,lookupList)
146 |
147 |
148 | # Prints colours
149 | #print(printColors(treeObj,rootNode))
150 | print('Completed initial color assignment')
151 | #exit(3)
152 | print()
153 | print('Phase 3 ... PivotEntityRecognition')
154 | # Make use of the spotlight to get the pivot entities sorted on the number of incoming links
155 | spotlightObject = PivotEntityRecognition()
156 | resourceList = spotlightObject.getPivotElement(sentence)
157 |
158 |
159 | #print PRE
160 | printpre(resourceList)
161 | print('Got the pivot element')
162 | print()
163 |
164 |
165 | print('Phase 4 ... Search Phase')
166 | print()
167 |
168 | # get the initial fact nodes
169 | listFactNodes = []
170 |
171 | for resource in resourceList :
172 | # Get the bi-gram list
173 | biGramList = getBiGramList(sentence,resource)
174 | listFactNodes.extend(SparqlClient.getAllTripletsForPivotElement(resource,biGramList))
175 |
176 |
177 | for factNode in listFactNodes:
178 | if(factNode.isExplored == False and factNode.object.isUri):
179 | biGramList = getBiGramList(sentence,factNode.object)
180 | listFactNodes.extend(SparqlClient.getAllTripletsForPivotElement(factNode.object,biGramList))
181 |
182 | resultsList = rankResults(listFactNodes,len(sentence.split(' ')))
183 |
184 | printTriplets(resultsList)
185 |
186 | if __name__ == '__main__':
187 | main()
188 |
189 |
190 |
191 |
192 |
--------------------------------------------------------------------------------
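
In graphSearch.py above, rankResults() buckets fact nodes by how many colors (query words) they cover and sorts each bucket by score, so coverage always wins over raw score. A small sketch of that ordering with stand-in fact nodes; the real ones come from SparqlClient, which is not part of this excerpt, so only the attributes rankResults() inspects (colors, score) are modelled:

    from collections import namedtuple

    # Stand-in for the fact-node objects produced by SparqlClient.
    Fact = namedtuple("Fact", ["name", "colors", "score"])

    nodes = [Fact("low-score, wide coverage", {1, 2, 3}, 0.2),
             Fact("high-score, narrow coverage", {1}, 0.9),
             Fact("mid coverage", {1, 2}, 0.5)]

    def rank(listFactNodes, length):
        buckets = [[] for _ in range(length)]
        for node in listFactNodes:
            buckets[len(node.colors) - 1].append(node)
        for bucket in buckets:
            bucket.sort(key=lambda x: x.score, reverse=True)
        return [n for bucket in reversed(buckets) for n in bucket]

    for n in rank(nodes, 3):
        print(n.name)
    # low-score, wide coverage
    # mid coverage
    # high-score, narrow coverage
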
/src/ngramTree.py:
--------------------------------------------------------------------------------
1 | from ngramsEngine import ngramsEngine
2 |
3 | # This class represents Node of the tree
4 | # A Node has value/data and also has links to its children stored as a list
5 | class Node(object):
6 |
7 | def __init__(self, data):
8 | self.data = data # Data in the node
9 | self.color = [] # Colour assignment for the node
10 | self.children = [] # Represents the child nodes
11 |         self.isDuplicate = False # True if this node is the child of 2 different nodes
12 |         self.isVisited = False # This flag helps in traversal
13 |
14 |
15 | #Used to add child node to current node
16 | def add_child(self, obj):
17 | self.children.append(obj)
18 |
19 |
20 | # This represents a n-gram tree
21 | class NgramTree(object):
22 |
23 | def __init__(self,rootNode):
24 | self.rootNode = rootNode
25 |
26 | # Reset the visited flag to False
27 | def resetVisitedFlag(self,node):
28 | for n in node.children:
29 | self.resetVisitedFlag(n)
30 | node.isVisited = False
31 |
32 | # Post order traversal of the tree (DFS)
33 | def post_order(self,node):
34 | for n in node.children:
35 | self.post_order(n)
36 |
37 | if not node.isVisited:
38 | node.isVisited = True
39 | print(node.data)
40 |
41 |
42 | # BFS traversal
43 | def printNode(self,node):
44 | if node is None:
45 | return
46 | if not node.isVisited:
47 | node.isVisited = True
48 | print(node.data)
49 |
50 | for c in node.children:
51 | self.printNode(c)
52 |
53 |
54 |
55 | # This module builds the n-gram tree with the basic idea of BFS traversal
56 | # Input : List1, List2
57 | # List1 : ['a b c d', 'a b c', 'b c d', 'a b', 'b c', 'c d', 'a', 'b', 'c', 'd']
58 | # List2 : [['a', 'b', 'c', 'd'], ['a b', 'b c', 'c d'], ['a b c', 'b c d'], ['a b c d']]
59 | # Algorithm :
60 | # while(queue):
61 | # CurrentNode = queue.pop()
62 | # search for tokens that have a length = length(CurrentNode.value) - 1
63 | # ##[Get this from List2]
64 | # ##[These tokens will be the nodes at the next level in the tree]
65 | # check if the tokens are already in the tree
66 | # if not create new node with these tokens
67 | # add the token nodes as the children of CurrentNode
68 |
69 | def constructTree(self,listNgrams,lookupList):
70 |
71 | # This dictionary is used to track the nodes that are in the tree
72 | # key:Nodevalue value:Node
73 | treeDictionary = {}
74 |
75 | nodeQueue = [] # This list exhibits behaviour of a Queue
76 | nodeQueue.append(self.rootNode) # Add the root node to the Queue to begin search
77 | treeDictionary[self.rootNode.data] = self.rootNode # Add the root to the treeDictionary as it is seen
78 |
79 |
80 | while(nodeQueue):
81 | currentNode = nodeQueue.pop(0) # Pop the queue
82 | data = currentNode.data # Get the data of the node
83 |
84 | dataLen = len(data.split(' ')) # Get the length of the n-grams in the current token
85 |
86 | if(dataLen-2 >= 0): # Stop once the current level consists of individual tokens (unigrams)
87 | listChildren = lookupList[dataLen-2] # Get the tokens that are one word shorter than the current token from the lookup list
88 | 
89 | for child in listChildren:
90 | if child in data: # Check if the child is a substring of the token
91 | 
92 | if child not in treeDictionary: # If no node exists for 'child' yet, create one; otherwise reuse the existing node and mark it as a duplicate
93 | newNode = Node(child)
94 | nodeQueue.append(newNode)
95 | treeDictionary[child] = newNode
96 | else:
97 | newNode = treeDictionary[child]
98 | newNode.isDuplicate = True
99 |
100 | currentNode.add_child(newNode) # Add this child to the parent node
101 |
102 | # Reset the visited flag for the traversal
103 | self.resetVisitedFlag(self.rootNode)
104 |
105 | #self.printNode(self.rootNode)
106 | #self.post_order(self.rootNode)
107 |
108 |
109 | def main(query):
110 | ngramsEngineObj = ngramsEngine()
111 | listNgrams,lookupList = ngramsEngineObj.generateNGrams(query)
112 |
113 | rootWord = listNgrams[0]
114 | rootNode = Node(rootWord)
115 |
116 | treeObj = NgramTree(rootNode)
117 | treeObj.constructTree(listNgrams,lookupList)
118 |
119 |
120 |
121 | if __name__ == '__main__':
122 | main(input(" Enter the query : "))
--------------------------------------------------------------------------------
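A quick sketch (not part of the repository) of the tree that NgramTree.constructTree builds for the four-token query 'a b c d', assuming nltk is installed so that ngramsEngine imports cleanly:

    from ngramsEngine import ngramsEngine
    from ngramTree import Node, NgramTree

    listNgrams, lookupList = ngramsEngine().generateNGrams('a b c d')
    rootNode = Node(listNgrams[0])              # 'a b c d'
    tree = NgramTree(rootNode)
    tree.constructTree(listNgrams, lookupList)

    # the level below the root holds the 3-grams that are substrings of it
    print([c.data for c in rootNode.children])              # ['a b c', 'b c d']
    # shared nodes such as 'b c' are created once and flagged as duplicates
    print([c.data for c in rootNode.children[0].children])  # ['a b', 'b c']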
/src/ngramsEngine.py:
--------------------------------------------------------------------------------
1 | from nltk.util import ngrams
2 |
3 | # This class makes use of nltk library for generating n-grams given a query
4 | class ngramsEngine(object):
5 |
6 | def __init__(self):
7 | pass
8 |
9 | # Module to print n-grams
10 | def printNGrams(self,ngramsList):
11 | for token in ngramsList:
12 | print(token.strip())
13 |
14 |
15 | # Module that generates n-grams list
16 | # Input : query
17 | # Output : Two lists are returned.
18 | # 1st list : all the n-grams, arranged hierarchically (longest first)
19 | # 2nd list : a list of lists, with the n-grams grouped together by length
20 | # All n-grams of length 1 go in index 0 of List2
21 | # All n-grams of length 2 go in index 1 of List2
22 | # All n-grams of length 3 go in index 2 of List2
23 | # EX: i/p - a b c d
24 | # List1 : ['a b c d', 'a b c', 'b c d', 'a b', 'b c', 'c d', 'a', 'b', 'c', 'd']
25 | # List2 : [['a', 'b', 'c', 'd'], ['a b', 'b c', 'c d'], ['a b c', 'b c d'], ['a b c d']]
26 |
27 | def generateNGrams(self,query):
28 |
29 | # This stores the n-grams as generated by NLTK
30 | ngramsNLTKList = []
31 |
32 | # Build the initial n-gram list, from the longest n-gram (all tokens) down to unigrams
33 | for n in range(len(query.split()),0,-1):
34 | ngramsNLTKList.extend(ngrams(query.split(),n))
35 |
36 | # Actual n-gram list (List 1 as in the description)
37 | ngramList = []
38 |
39 | # A look up list (List 2 as in the description)
40 | lookupList = []
41 |
42 | # Join the individual lists to get the n-grams
43 | for ngram in ngramsNLTKList:
44 | ngramList.append((' '.join(ngram)).strip())
45 |
46 | # Determine the length of the lookupList required
47 | if(len(ngramsNLTKList)>0):
48 | maxLength = len(ngramsNLTKList[0])
49 | for i in range(maxLength):
50 | lookupList.append([])
51 |
52 | # Fill in the lookupList
53 | # All n-grams of length 1 go in index 0
54 | # All n-grams of length 2 go in index 1
55 | # All n-grams of length 3 go in index 2 ...
56 | for token in ngramsNLTKList:
57 | joinedToken = ' '.join(token).strip()
58 | # n-grams of length k go in lookupList[k-1]
59 | currentList = lookupList[len(token)-1]
60 | currentList.append(joinedToken)
61 |
62 | return ngramList,lookupList
63 |
64 |
65 | def main():
66 | ngramsEngineObject = ngramsEngine()
67 | query = input(" Enter the query : ")
68 |
69 | ngramsList,lookupList = ngramsEngineObject.generateNGrams(query.strip())
70 | ngramsEngineObject.printNGrams(ngramsList)
71 |
72 | if __name__ == '__main__':
73 | main()
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
--------------------------------------------------------------------------------
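For reference, the nltk primitive that ngramsEngine wraps (assuming nltk is installed; ngrams yields tuples of tokens, which generateNGrams joins back into strings):

    from nltk.util import ngrams

    tokens = 'a b c d'.split()
    print(list(ngrams(tokens, 2)))                     # [('a', 'b'), ('b', 'c'), ('c', 'd')]
    print([' '.join(g) for g in ngrams(tokens, 3)])    # ['a b c', 'b c d']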
/src/pivotEntityRecognition.py:
--------------------------------------------------------------------------------
1 | import urllib.request
2 | import sys
3 | import json
4 | from resourceGraph import Resource
5 | from colorAssignment import ColorAssignment
6 |
7 | # Spotlight service for pivot entity recognition
8 | class PivotEntityRecognition:
9 |
10 | def __init__(self):
11 | self.sentence = ''
12 |
13 | # This method updates the colours covered by a resource
14 | def updateColors(self,resourceList):
15 | for res in resourceList:
16 | tokens = res.keyword.split(' ')
17 | for token in tokens:
18 | if(token in ColorAssignment.colorDictionary):
19 | res.colors.append(int(ColorAssignment.colorDictionary[token]))
20 |
21 | return resourceList
22 |
23 |
24 | # Parses the final resource values into a Pivot Object
25 | def getPivotObject(self,resource):
26 |
27 | # Get the URI, Label and Support
28 | if('@uri' in resource):
29 | uri = resource['@uri']
30 | label = ''
31 | support = 0
32 |
33 | if('@label' in resource):
34 | label = resource['@label']
35 |
36 | if('@support' in resource):
37 | try:
38 | support = int(resource['@support'])
39 | except ValueError:
40 | support = 0
41 |
42 | pivotElement = Resource('<'+uri+'>',label,support,'') # keep the URI, bracketed so it can be used directly in SPARQL queries
43 | pivotElement.isUri = True
44 | return pivotElement
45 | else:
46 | return None
47 |
48 | # Main logic of parsing implemented here
49 | def parseJson(self,jsonStr):
50 |
51 | #print(jsonStr)
52 |
53 | # Return list of pivot objects
54 | resourceList = []
55 |
56 | if('annotation' not in jsonStr):
57 | return resourceList
58 |
59 | if('surfaceForm' not in jsonStr['annotation']):
60 | return resourceList
61 |
62 | pivotTerms = jsonStr['annotation']['surfaceForm']
63 |
64 | #print(pivotTerms)
65 |
66 | # This happens only when the return type has one entity key word
67 | if(type(pivotTerms) is dict):
68 | #If there is no pivot entity
69 | if('resource' not in pivotTerms):
70 | return resourceList
71 | # If there is only one entity identified for the keyword
72 | if(type(pivotTerms['resource']) is dict):
73 | #If there is only one pivot identified for the query
74 | pivotElement = self.getPivotObject(pivotTerms['resource'])
75 | if(pivotElement is not None):
76 | pivotElement.keyword = pivotTerms['@name']
77 | resourceList.append(pivotElement)
78 | else:
79 | for resource in pivotTerms['resource']:
80 | pivotElement = self.getPivotObject(resource)
81 | if(pivotElement is not None):
82 | pivotElement.keyword = pivotTerms['@name']
83 | resourceList.append(pivotElement)
84 |
85 | # This happens when the return type has multiple entity keywords
86 | elif(type(pivotTerms) is list):
87 |
88 | for resources in pivotTerms:
89 | # This happens only when the return type has one entity key word
90 | if(type(resources) is dict):
91 | #If there is no pivot entity
92 | if('resource' not in resources):
93 | continue
94 | # If there is only one entity identified for the keyword
95 | if(type(resources['resource']) is dict):
96 | #If there is only one pivot identified for the query
97 | pivotElement = self.getPivotObject(resources['resource'])
98 | if(pivotElement is not None):
99 | pivotElement.keyword = resources['@name']
100 | resourceList.append(pivotElement)
101 | else:
102 | for resource in resources['resource']:
103 | pivotElement = self.getPivotObject(resource)
104 | if(pivotElement is not None):
105 | pivotElement.keyword = resources['@name']
106 | resourceList.append(pivotElement)
107 |
108 | # Sort the resource list on the number of incoming links
109 | resourceList.sort(key=lambda x: x.support, reverse=True)
110 | # Update the colors represented by the resources
111 | resourceList = self.updateColors(resourceList)
112 |
113 | return resourceList
114 |
115 | # Queries DBPedia spotlight to get the values
116 | def requestSpotlight(self):
117 | #encode spaces
118 | sentence = self.sentence.replace(' ','%20')
119 |
120 | #restrict types to person, organisation and location
121 | urlTypes = 'types=DBpedia:Person,Schema:Person,DBpedia:Company,DBpedia:Organisation,Schema:Organization,DBpedia:AdministrativeRegion,DBpedia:PopulatedPlace,DBpedia:Place,Schema:Place'
122 | url = "http://spotlight.dbpedia.org/rest/candidates?types="+urlTypes+"&text="+sentence
123 |
124 | request = urllib.request.Request(url)
125 | request.add_header('Accept', 'application/json')
126 | response = urllib.request.urlopen(request)
127 | responseStr = str(response.read().decode('utf-8'))
128 |
129 | # fetch json from the response
130 | jsonStr = json.loads(responseStr)
131 |
132 | #Parse json
133 | return(self.parseJson(jsonStr))
134 |
135 | # Entry point of the class
136 | def getPivotElement(self,query):
137 |
138 | self.sentence = query
139 | #Make request
140 | return(self.requestSpotlight())
141 |
142 |
143 | if __name__ == '__main__':
144 | spotlightObj = PivotEntityRecognition()
145 | sentence = input(" Enter the keyword query : ")
146 | resourceList = spotlightObj.getPivotElement(sentence)
147 |
148 | if(len(resourceList)==0):
149 | print('no pivot entity found')
150 | else:
151 | for res in resourceList:
152 | print(res.uri+" "+res.label+" "+str(res.support)+" "+res.keyword)
153 |
154 |
--------------------------------------------------------------------------------
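For orientation, this is the response shape that parseJson above expects, reconstructed from the parsing logic itself rather than from the Spotlight documentation; the concrete values are made up:

    # '@...' keys are the fields the parser reads; both 'surfaceForm' and
    # 'resource' may also be lists when several keywords or candidates match.
    candidatesResponse = {
        'annotation': {
            'surfaceForm': {
                '@name': 'prince william',       # becomes Resource.keyword
                'resource': {
                    '@uri': 'http://dbpedia.org/resource/Prince_William,_Duke_of_Cambridge',
                    '@label': 'Prince William, Duke of Cambridge',
                    '@support': '1234'           # incoming-link count used for ranking
                }
            }
        }
    }

getPivotElement(query) is the entry point; it fetches JSON of this shape from the /rest/candidates endpoint and returns the parsed resource list, sorted by support and annotated with colours.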
/src/queries.txt:
--------------------------------------------------------------------------------
1 |
2 | longest river
3 |
4 |
5 | cars that are produced in Germany
6 | German cars --> picks
7 |
8 |
9 |
10 | Mother and father of Prince Harry and Prince William
11 | Prince Harry mother father --->
12 | Prince William parents -->
13 |
14 |
15 |
16 |
17 | ----
18 | 1.0
19 | [0, 1, 2]
20 | prince william parents
21 |
22 | ----
23 | 1.0
24 | [0, 1, 2]
25 | prince william parents
26 |
27 | ----
28 | 1.0
29 | [0, 1, 2]
30 | prince william parents
31 |
32 | ----
33 | 1.0
34 | [0, 1, 2]
35 | prince william parents
36 |
37 | ----
38 | 1.0
39 | [0, 1, 2]
40 | prince william parents
41 |
42 | ----
43 | 1.0
44 | [0, 1, 2]
45 | prince william parents
46 |
47 | ----
48 | 1.0
49 | [0, 1, 2]
50 | prince william parents
51 |
52 | ----
53 | 1.0
54 | [0, 1, 2]
55 | prince william parents
56 |
57 |
58 |
59 | ---------------
60 |
61 | Prince Harry parents --> (could not match parents)
62 | Prince Harry Mother -->
63 | Current label : Prince Harry of Wales
64 | Keywords yet to cover : ['Mother']
65 |
66 |
67 | latest U.S. state admitted
68 |
69 | number of languages spoken in Turkmenistan
70 | Turkmenistan languages
71 | ----
72 | 1.0
73 | [0, 1]
74 | Turkmenistan languages
75 |
76 | ----
77 | 1.0
78 | [0, 1]
79 | Turkmenistan languages
80 |
81 | ----
82 | 1.0
83 | [0, 1]
84 | Turkmenistan languages
85 |
86 | ----
87 | 0.9172847553
88 | [0, 1]
89 | Turkmenistan languages
90 |
91 | ----
92 | 0.9172847553
93 | [0, 1]
94 | Turkmenistan languages
95 |
96 | ----
97 | 0.7649522742
98 | [0, 1]
99 | Turkmenistan languages
100 | languages
101 | ----
102 | 0.7649522742
103 | [0, 1]
104 | Turkmenistan languages
105 | Inter-ethnic
106 |
107 | -------
108 | movies directed by Francis Ford Coppola
109 | Francis Ford Coppola directed movies - -> nothing
110 | Francis Ford Coppola movies - nothing on his dbpedia page (is a subject of many other movies)
111 |
112 | maiden name of Angela Merkel
113 | ----
114 | 1.0
115 | [0, 1, 3]
116 | Angela Merkel name
117 | Angela Merkel
118 | ----
119 | 1.0
120 | [0, 1, 3]
121 | Angela Merkel name
122 | Merkel, Angela
123 | ----
124 | 1.0
125 | [0, 1, 3]
126 | Angela Merkel name
127 | Angela Merkel
128 | ----
129 | 1.0
130 | [0, 1, 3]
131 | Angela Merkel name
132 | Merkel, Angela
133 |
134 | http://vmdeb20.deri.ie:8890/esaservice?task=esa&term1=maiden%20name&term2=birth%20name
135 | returns "0.0120668066"
136 |
137 |
138 |
139 | Australian nonprofit organizations
140 | could not detect pivot element
141 |
142 | Military conflicts in which Lawrence of Arabia participated
143 |
144 |
145 | number of inhabitants in Maribor
146 | inhabitants Maribor
147 | http://vmdeb20.deri.ie:8890/esaservice?task=esa&term1=inhabitants&term2=population "0.0345099577"
148 |
149 |
150 | games developed by GMT
151 |
152 | husband of Amanda Palmer
153 | Amanda Palmer husband
154 |
155 | Current label : Amanda Palmer
156 | Keywords yet to cover : ['husband']
157 | http://vmdeb20.deri.ie:8890/esaservice?task=esa&term1=spouse&term2=husband
158 | "0.0558511518"
159 |
160 | islands that belong to Japan
161 | Exploring ...
162 |
163 | Current label : Japan
164 | Keywords yet to cover : ['islands']
165 | Exploring ...
166 |
167 | Current label : Japan (band)
168 | Keywords yet to cover : ['islands']
169 | Exploring ...
170 |
171 | Current label : Islands (band)
172 | Keywords yet to cover : ['japan']
173 |
174 | ruling party in Lisbon
175 | ----
176 | 0.7246611098
177 | [0, 1, 2]
178 | lisbon ruling party
179 |
180 | ----
181 | 0.5502401986
182 | [0, 1, 2]
183 | lisbon ruling party
184 |
185 | ----
186 | 0.132051861
187 | [0, 1, 2]
188 | lisbon ruling party
189 | Helena Roseta
190 | ----
191 | 0.132051861
192 | [0, 1, 2]
193 | lisbon ruling party
194 | Michael B. Lewis
195 | ----
196 | 0.1086613063
197 | [0, 1, 2]
198 | lisbon ruling party
199 |
200 | ----
201 | 0.1058211493
202 | [0, 1, 2]
203 | lisbon ruling party
204 |
205 | ----
206 | 0.1058211493
207 | [0, 1, 2]
208 | lisbon ruling party
209 | Mayor
210 | ----
211 | 0.0982569384
212 | [0, 1, 2]
213 | lisbon ruling party
214 |
215 | ----
216 | 0.0982569384
217 | [0, 1, 2]
218 | lisbon ruling party
219 |
220 | ----
221 | 0.0982569384
222 | [0, 1, 2]
223 | lisbon ruling party
224 |
225 | ----
226 | 0.0982569384
227 | [0, 1, 2]
228 | lisbon ruling party
229 |
230 | ----
231 | 0.0913513348
232 | [0, 1, 2]
233 | lisbon ruling party
234 |
235 | ----
236 | 0.0887198372
237 | [0, 1, 2]
238 | lisbon ruling party
239 |
240 | ----
241 | 0.0799152153
242 | [0, 1, 2]
243 | lisbon ruling party
244 | Praça do Município
245 | ----
246 | 0.0794172227
247 | [0, 1, 2]
248 | lisbon ruling party
249 | 9
250 | ----
251 | 0.0794172227
252 | [0, 1, 2]
253 | lisbon ruling party
254 | W
255 | ----
256 | 0.0794172227
257 | [0, 1, 2]
258 | lisbon ruling party
259 | 8
260 | ----
261 | 0.0794172227
262 | [0, 1, 2]
263 | lisbon ruling party
264 | 18
265 | ----
266 | 0.0731704343
267 | [0, 1, 2]
268 | lisbon ruling party
269 | 38
270 | ----
271 | 0.0731704343
272 | [0, 1, 2]
273 | lisbon ruling party
274 | 42
275 | ----
276 | 0.0731704343
277 | [0, 1, 2]
278 | lisbon ruling party
279 | N
280 | ----
281 | 0.0731704343
282 | [0, 1, 2]
283 | lisbon ruling party
284 | 29
285 | ----
286 | 0.0727422443
287 | [0, 1, 2]
288 | lisbon ruling party
289 | Lisbon
290 | ----
291 | 0.0727422443
292 | [0, 1, 2]
293 | lisbon ruling party
294 |
295 | ----
296 | 0.0708160618
297 | [0, 1, 2]
298 | lisbon ruling party
299 |
300 | ----
301 | 0.8752961422
302 | [0, 2]
303 | lisbon party
304 |
305 | ----
306 | 0.6660475643
307 | [0, 2]
308 | lisbon party
309 |
310 | ----
311 | 0.1526511536
312 | [0, 2]
313 | lisbon party
314 | Helena Roseta
315 | ----
316 | 0.1526511536
317 | [0, 2]
318 | lisbon party
319 | Michael B. Lewis
320 | ----
321 | 0.1179090591
322 | [0, 2]
323 | lisbon party
324 |
325 | ----
326 | 0.1179090591
327 | [0, 2]
328 | lisbon party
329 | Mayor
330 | ----
331 | 0.105693768
332 | [0, 2]
333 | lisbon party
334 |
335 | ----
336 | 0.0989016354
337 | [0, 2]
338 | lisbon party
339 |
340 | ----
341 | 0.0974877736
342 | [0, 2]
343 | lisbon party
344 |
345 | ----
346 | 0.0974877736
347 | [0, 2]
348 | lisbon party
349 |
350 | ----
351 | 0.0974877736
352 | [0, 2]
353 | lisbon party
354 |
355 | ----
356 | 0.0974877736
357 | [0, 2]
358 | lisbon party
359 |
360 | ----
361 | 0.0859793517
362 | [0, 2]
363 | lisbon party
364 |
365 | ----
366 | 0.0791254163
367 | [0, 2]
368 | lisbon party
369 | Praça do Município
370 | ----
371 | 0.0778143857
372 | [0, 2]
373 | lisbon party
374 | 9
375 | ----
376 | 0.0778143857
377 | [0, 2]
378 | lisbon party
379 | W
380 | ----
381 | 0.0778143857
382 | [0, 2]
383 | lisbon party
384 | 8
385 | ----
386 | 0.0778143857
387 | [0, 2]
388 | lisbon party
389 | 18
390 | ----
391 | 0.0729892814
392 | [0, 2]
393 | lisbon party
394 | Lisbon
395 | ----
396 | 0.0729892814
397 | [0, 2]
398 | lisbon party
399 |
400 | ----
401 | 0.0722854064
402 | [0, 2]
403 | lisbon party
404 | 38
405 | ----
406 | 0.0722854064
407 | [0, 2]
408 | lisbon party
409 | 42
410 | ----
411 | 0.0722854064
412 | [0, 2]
413 | lisbon party
414 | N
415 | ----
416 | 0.0722854064
417 | [0, 2]
418 | lisbon party
419 | 29
420 |
421 |
422 | Apollo 14 astronauts
423 | does not exist in DBpedia
424 |
425 | cosmonauts
426 | German cities with more than 250000 inhabitants
427 | second highest mountain on Earth
428 | professional skateboarders from Sweden
429 | band leaders that play trumpet
430 | countries have more than ten caves
431 | mayor of Berlin ---> berlin mayor
432 |
433 | Formula 1 driver with the most races
434 | youngest player in the Premier League
435 | Methodist politicians
436 | People that were born in Vienna and died in Berlin
437 | number of times that Jane Fonda married
438 | companies in Munich
439 | professional surfers born in Australia
440 | countries connected by the Rhine
--------------------------------------------------------------------------------
/src/resourceGraph.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 |
3 | # Model class for resource elements
4 | class Resource:
5 | def __init__(self,uri,label,support,keyword):
6 | self.uri = uri # URI of the resource.
7 | self.label = label # Label of the resource
8 | self.support = int(support) # Importance: the number of incoming links to the resource in DBpedia
9 | self.keyword = keyword # Keyword represented by the resource
10 | self.colors = [] # Colors assigned
11 | self.score = 0
12 | self.isUri = False
13 |
14 | # Fact node model class.
15 | # A fact node represents an RDF triple.
16 | # In addition, we also maintain the keywords in the query that this fact node covers
17 | class FactNode:
18 | def __init__(self,subject,predicate,object):
19 | self.subject = subject # Subject of the fact node
20 | self.predicate = predicate # Predicate
21 | self.object = object # Object
22 | self.colors = [] # Colours
23 | self.children = [] # Child Nodes
24 | self.score = 0 # Score of the current fact node - this is a cumulative score
25 | self.isExplored = False # A boolean flag marking whether the current fact node has been explored during search
26 |
27 | # Used to add child node to current node
28 | def add_child(self, obj):
29 | self.children.append(obj)
30 |
31 | # Set colors of the fact node from the colors of subject , predicate and object resources
32 | # Eg.
33 | # Fact_node triple -> dbPedia:Bill_Gates dbPedia:spouse dbPedia:Melinda_Gates
34 | # dbPedia:Bill_Gates covers colors 2,3
35 | # dbPedia:spouse covers colours 1
36 | # dbPedia:Melinda_Gates covers 1,2,3
37 |
38 | # then the fact node covers 1,2,3
39 | def set_colors(self):
40 |
41 | for color in self.subject.colors:
42 | if(color not in self.colors):
43 | self.colors.append(color)
44 |
45 | for color in self.predicate.colors:
46 | if(color not in self.colors):
47 | self.colors.append(color)
48 |
49 | for color in self.object.colors:
50 | if(color not in self.colors):
51 | self.colors.append(color)
52 |
53 | # Resource Graph Model class
54 | # This graph has fact nodes as its nodes, which in turn contain Resources
55 | class ResourceGraph:
56 | def __init__(self,rootNode):
57 | self.rootNode = rootNode
58 |
59 |
60 |
--------------------------------------------------------------------------------
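A minimal check (not part of the repository) of FactNode.set_colors, using the Bill Gates example from the comment above; the URIs and colour numbers are purely illustrative:

    from resourceGraph import Resource, FactNode

    subj = Resource('<http://dbpedia.org/resource/Bill_Gates>', 'Bill Gates', 0, 'bill gates')
    subj.colors = [2, 3]
    pred = Resource('<http://dbpedia.org/ontology/spouse>', 'spouse', 0, 'spouse')
    pred.colors = [1]
    obj = Resource('<http://dbpedia.org/resource/Melinda_Gates>', 'Melinda Gates', 0, '')
    obj.colors = [1, 2, 3]

    fact = FactNode(subj, pred, obj)
    fact.set_colors()
    print(sorted(fact.colors))   # [1, 2, 3] - the union of the three colour sets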
/src/sparqlClient.py:
--------------------------------------------------------------------------------
1 | import json
2 | import inflection
3 | from SPARQLWrapper import SPARQLWrapper, JSON , XML
4 | from colorAssignment import ColorAssignment
5 | from wordSimilarity import WordSimilarity
6 | from collections import OrderedDict
7 | from resourceGraph import Resource
8 | from resourceGraph import FactNode
9 |
10 | # This represents a DBPedia triplet object
11 | class DBPediaTriplet:
12 | def __init__(self,subject,predicate,object):
13 | self.subject = subject
14 | self.object = object
15 | self.predicate = predicate
16 |
17 |
18 |
19 | # This represents the SPARQL querying engine
20 | class SparqlClient :
21 |
22 | def findAverageScorePhraseSentence(keyword,actualPredicateValue):
23 | score = 0
24 | count = 0
25 | for key1 in keyword.lower().split(' '):
26 | for key2 in actualPredicateValue.lower().split(' '):
27 | count+=1
28 | if(key1 == key2):
29 | score += 3.0
30 | else:
31 | similarityScore = WordSimilarity.isPredicateSimilar(key1,key2)
32 | if(similarityScore==-1):
33 | similarityScore = 0
34 | score += similarityScore
35 |
36 | if(count!=0):
37 | if(score==0):
38 | return -1
39 | else:
40 | return (score/count)
41 | else:
42 | return -1
43 |
44 | # This method is used to filter the predicates
45 | def filterPredicates(predicate,keywordList):
46 |
47 | # ignore list: predicate properties that we do not want to consider
48 | vocabDictionary = ['rdf-schema#comment','22-rdf-syntax-ns#type','abstract','owl#sameAs','subject']
49 |
50 | predicateList = []
51 |
52 | # from the predicate URI, just consider the property and ignore the vocabulary
53 | # http://dbpedia.org/property/name -----> consider 'name'
54 | predicateValue = predicate.split('/')[-1]
55 |
56 | # ignore if the predicate property is in vocab dictionary
57 | if(predicateValue in vocabDictionary):
58 | return predicateList
59 |
60 | # Boolean value indicating phrase sentence
61 | isPhraseSentence = False
62 |
63 | # Handles the camel case properties
64 | # camelCase properties are returned with words separated by _
65 | camelCaseValue = inflection.underscore(predicateValue)
66 | if '_' in camelCaseValue:
67 | isPhraseSentence = True
68 | else:
69 | isPhraseSentence = False
70 |
71 | predicateValues = camelCaseValue.split('_')
72 |
73 |
74 | # convert the underscore-separated value into a string separated by spaces
75 | actualPredicateValue = ''
76 | for value in predicateValues:
77 | actualPredicateValue = actualPredicateValue + ' ' + value
78 |
79 | actualPredicateValue = actualPredicateValue.strip()
80 |
81 |
82 | # iterate over each uncovered keyword and check if the predicate is semantically similar to the keyword
83 | for keyword in keywordList:
84 | # semantic similarity
85 | if(keyword.lower()==actualPredicateValue.lower()):
86 | score = 3.0
87 | #elif(isPhraseSentence):
88 | #score = SparqlClient.findAverageScorePhraseSentence(keyword,actualPredicateValue)
89 | #print('phrase'+str(score))
90 | else:
91 | score = WordSimilarity.isPredicateSimilar(keyword,actualPredicateValue)
92 | #print(' no phrase'+str(score))
93 |
94 |
95 |
96 | if(score!=-1):
97 | predicateObject = Resource('<'+predicate+'>',predicateValue,0,keyword)
98 |
99 | # bi-gram scenario
100 | individualKeyword = keyword.split(' ')
101 | for key in individualKeyword:
102 | predicateObject.colors.append(ColorAssignment.colorDictionary[key])
103 |
104 | predicateObject.score = score
105 | predicateObject.isUri = True
106 | predicateList.append(predicateObject)
107 |
108 | return predicateList
109 |
110 |
111 |
112 | # This method is used to get the list of keywords that is not covered by the current element
113 | def getUncoveredKeywords(colorList,biGramList):
114 | keywordList = []
115 |
116 | # Join the list to make it a single string
117 | pivotColors = ''.join(str(x) for x in colorList)
118 |
119 | # Suppose we want to explore uncovered bi-grams, include them in the list
120 | if(len(biGramList)>0):
121 | keywordList.extend(biGramList)
122 |
123 | # make use of the color dictionary to identify uncovered keywords
124 | for keyword,color in ColorAssignment.colorDictionary.items():
125 | if(str(color) not in pivotColors):
126 | keywordList.append(keyword)
127 |
128 | return keywordList
129 |
130 |
131 | def findObjectKeywordMatch(object):
132 |
133 | # get the object value
134 | objectVal = object.label
135 |
136 | # Join the list to make it a single string
137 | colors = ''.join(str(x) for x in object.colors)
138 |
139 | # make use of the color dictionary to identify uncovered keywords
140 | for keyword,color in ColorAssignment.colorDictionary.items():
141 | if(str(color) not in colors):
142 | if(keyword == objectVal):
143 | object.score = object.score + 3.0
144 | object.colors.append(color)
145 |
146 | return object
147 |
148 |
149 |
150 | # Returns the triples for the pivot element
151 | def getAllTripletsForPivotElement(resource,biGramList):
152 | print(' Exploring ... ')
153 | tripletList = []
154 | # Get the URI of the element
155 | pivotElement = resource.uri
156 | print(pivotElement)
157 | print('Current label : ' + resource.label)
158 |
159 | # Get a list of keywords that the current element does not cover
160 | keywordList = SparqlClient.getUncoveredKeywords(resource.colors,biGramList)
161 | print('Keywords yet to cover : ' + str(keywordList))
162 |
163 | # If the resource covers all keywords, stop exploring this node
164 | if(len(keywordList)==0):
165 | return tripletList
166 |
167 |
168 | sparql = SPARQLWrapper("http://dbpedia.org/sparql") # Assigns an endpoint
169 | sparql.setReturnFormat(JSON) # Sets the return format to be json
170 | # Queries the endpoint to retrieve all the triples that have the pivot element as subject
171 | sparql.setQuery("""
172 | PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
173 | SELECT ?p ?o
174 | WHERE { """ + pivotElement + """ ?p ?o
175 | }
176 | """)
177 |
178 | try:
179 | results = sparql.query().convert()
180 | except Exception as e:
181 | print(e)
182 | print(' DBPedia is down for maintenance') # Exception
183 | return tripletList
184 |
185 |
186 | # Find predicates that are semantically similar to uncovered keywords
187 | for result in results["results"]["bindings"]:
188 |
189 | # Considering only 'en' language
190 | if(result["o"]["type"]!= 'uri' ):
191 | if("xml:lang" in result['o'] and result["o"]["xml:lang"]!='en'):
192 | continue
193 |
194 |
195 | # Get the sematically similar predicates
196 | predicateList = SparqlClient.filterPredicates(result["p"]["value"],keywordList)
197 |
198 | if len(predicateList)!=0:
199 | for predicate in predicateList:
200 |
201 | isUri = False
202 | objectval = result["o"]["value"]
203 |
204 | # form the URI if object is of type URI
205 | if(result["o"]["type"]=='uri'):
206 | isUri = True
207 | objectval = '<'+objectval+'>'
208 |
209 | # merge the keywords covered by the subject and the predicate, avoiding duplicates
210 | coveredKeywords = []
211 | coveredKeywords.extend(resource.keyword.split(' '))
212 | for x in predicate.keyword.split(' '):
213 | if x not in coveredKeywords:
214 | coveredKeywords.append(x)
215 | 
216 | coveredKeywords = ' '.join(str(x) for x in coveredKeywords)
217 | 
218 | object = Resource(objectval,result["o"]["value"].split('/')[-1],0,coveredKeywords)
219 |
220 | # set the properties and form the fact node
221 | if(isUri):
222 | object.isUri = True
223 |
224 | object.score = resource.score + predicate.score
225 | for color in resource.colors:
226 | if(color not in object.colors):
227 | object.colors.append(color)
228 |
229 | for color in predicate.colors:
230 | if(color not in object.colors):
231 | object.colors.append(color)
232 |
233 | object = SparqlClient.findObjectKeywordMatch(object)
234 |
235 | factNodeObj = FactNode(resource,predicate,object)
236 | factNodeObj.score = object.score
237 | factNodeObj.set_colors()
238 | tripletList.append(factNodeObj)
239 | '''
240 | else:
241 |
242 | objectList = SparqlClient.filterPredicates(result["o"]["value"],keywordList)
243 |
244 | for objectResource in objectList:
245 |
246 | isUri = False
247 | predicateVal = '<'+result["p"]["value"]+'>'
248 |
249 | # remove duplicated keyword scenario
250 | set = []
251 | set.extend(resource.keyword.split(' '))
252 | for x in objectResource.keyword.split(' '):
253 | if x not in set:
254 | set.append(x)
255 |
256 | set = ' '.join(str(x) for x in set)
257 |
258 | predicate = Resource(predicateVal,result["p"]["value"].split('/')[-1],0,set)
259 |
260 | # set the properties and form the fact node
261 | predicate.isUri = True
262 |
263 | object.score = resource.score + object.score
264 | for color in resource.colors:
265 | if(color not in object.colors):
266 | object.colors.append(color)
267 |
268 | for color in predicate.colors:
269 | if(color not in object.colors):
270 | object.colors.append(color)
271 |
272 | object = SparqlClient.findObjectKeywordMatch(object)
273 |
274 | factNodeObj = FactNode(resource,predicate,object)
275 | factNodeObj.score = object.score
276 | factNodeObj.set_colors()
277 | tripletList.append(factNodeObj)
278 | '''
279 | # Return the collected fact nodes (ranking happens later, in rankResults)
280 | return tripletList
281 |
282 |
--------------------------------------------------------------------------------
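The predicate normalisation in filterPredicates above boils down to the following steps (a standalone sketch, assuming the inflection package is installed, as imported at the top of sparqlClient.py; the predicate URI is illustrative):

    import inflection

    predicate = 'http://dbpedia.org/ontology/birthPlace'   # illustrative predicate URI
    predicateValue = predicate.split('/')[-1]               # 'birthPlace'
    snake = inflection.underscore(predicateValue)           # 'birth_place'
    actualPredicateValue = ' '.join(snake.split('_'))       # 'birth place'
    print(actualPredicateValue)

The space-separated form is what gets compared against the uncovered query keywords, either by exact match or via WordSimilarity.isPredicateSimilar.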
/src/testSparqlEndPoint.py:
--------------------------------------------------------------------------------
1 | import urllib.request
2 | import sys
3 | import json
4 | import math
5 | from urllib.parse import quote
6 | from SPARQLWrapper import SPARQLWrapper, JSON , XML
7 |
8 | sparql = SPARQLWrapper("http://dbpedia.org/sparql") # Assigns an endpoint
9 | sparql.setReturnFormat(JSON) # Sets the return format to be json
10 | # Queries the endpoint to retrieve all the triples that have the pivot element as subject
11 | pivotElement = ''
12 |
13 |
14 | sparql.setQuery("""
15 | PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
16 | SELECT ?p ?label
17 | WHERE { """ + pivotElement + """ ?p ?label
18 | }
19 | """)
20 | '''
21 | sparql.setQuery("""
22 | PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
23 | SELECT ?label
24 | WHERE {
25 | rdfs:label ?label .
26 | }
27 | """)
28 | '''
29 |
30 | try:
31 | results = sparql.query().convert()
32 | except Exception as e:
33 | print(e)
34 | print(' DBPedia is down for maintenance')
35 | exit(3)
36 |
37 | # Find predicates that are semantically similar to uncovered keywords
38 | for result in results["results"]["bindings"]:
39 |
40 | # Considering only 'en' language
41 | print(result["label"]["value"])
42 |
43 |
44 | print('done')
45 |
--------------------------------------------------------------------------------
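testSparqlEndPoint.py leaves pivotElement empty; to actually run it, set it to a bracketed DBpedia IRI, for example (illustrative value only, taken from the Lisbon query explored in queries.txt):

    pivotElement = '<http://dbpedia.org/resource/Lisbon>'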
/src/wordSimilarity.py:
--------------------------------------------------------------------------------
1 | import urllib.request
2 | import sys
3 | import json
4 | import math
5 | from urllib.parse import quote
6 | from threading import Thread
7 |
8 | class WordSimilarity:
9 |
10 | scoreDictionary = {}
11 | scoreDictionary['esa'] = 0
12 | scoreDictionary['swoogle'] = 0
13 |
14 | # 1 - EasyESA client
15 | # a score of 1 or -1 indicates a perfect match
16 | # threshold values to consider: 0.07, 0.052 and 0.04
17 | def getEasyESAScore(word1,word2):
18 |
19 | WordSimilarity.scoreDictionary['esa'] = 0
20 | url = "http://vmdeb20.deri.ie:8890/esaservice?task=esa&term1="+quote(word1)+'&term2='+quote(word2)
21 | try:
22 | request = urllib.request.Request(url)
23 | response = urllib.request.urlopen(request)
24 | score = str(response.read().decode('utf-8')).replace('\"','')
25 | WordSimilarity.scoreDictionary['esa'] = float(score)
26 | except Exception as e:
27 | WordSimilarity.scoreDictionary['esa'] = 0
28 |
29 | # 2 - ws4j client
30 | def getWs4jScore(word1,word2):
31 | url = "http://ws4jdemo.appspot.com/ws4j?measure=wup&args="+quote(word1)+"%3A%3A"+quote(word2)
32 | request = urllib.request.Request(url)
33 | request.add_header('Accept', 'application/json')
34 | response = urllib.request.urlopen(request)
35 | responseStr = response.read().decode('utf-8')
36 | # fetch json from the response
37 | jsonStr = json.loads(responseStr)
38 | score = float(jsonStr['result'][0]['score'])
39 | return score
40 |
41 | # 3 - UMBC Semantic Similarity service
42 | #
43 | # Documentation available at http://swoogle.umbc.edu/SimService/api.html
44 | def getSwoogleScore(word1,word2):
45 | WordSimilarity.scoreDictionary['swoogle'] = 0
46 | url = "http://swoogle.umbc.edu/StsService/GetStsSim?operation=api&phrase1="+quote(word1)+'&phrase2='+quote(word2)
47 | try:
48 | request = urllib.request.Request(url)
49 | response = urllib.request.urlopen(request)
50 | score = str(response.read().decode('utf-8')).replace('\"','')
51 | score = float(score)
52 | WordSimilarity.scoreDictionary['swoogle'] = score
53 | except Exception as e:
54 | WordSimilarity.scoreDictionary['swoogle'] = 0
55 |
56 |
57 | # Combines the EasyESA and UMBC Swoogle scores (the two requests run in parallel threads below).
58 | # Call the ws4j client (method 2) if needed.
59 | # For ESA, a score of 1 or -1 indicates a perfect match;
60 | # threshold values to consider: 0.07, 0.052 and 0.04
61 | def isPredicateSimilar(word1,word2):
62 | #score = math.fabs(WordSimilarity.getEasyESAScore(word1,word2))
63 |
64 | esaThread = Thread(target=WordSimilarity.getEasyESAScore, args=(word1,word2,))
65 | swoogleThread = Thread(target=WordSimilarity.getSwoogleScore, args=(word1,word2,))
66 |
67 | esaThread.start()
68 | swoogleThread.start()
69 | esaThread.join()
70 | swoogleThread.join()
71 |
72 | ESAscore = WordSimilarity.scoreDictionary['esa']
73 | #WordSimilarity.getEasyESAScore(word1,word2)
74 | ESAScaledScore = 0
75 | if(ESAscore>0 and ESAscore<=0.04):
76 | ESAScaledScore = 1
77 | elif(ESAscore>0.04 and ESAscore<=0.07):
78 | ESAScaledScore = 2
79 | elif(ESAscore>0.07):
80 | ESAScaledScore = 3
81 | else:
82 | ESAScaledScore = 0
83 |
84 | SwoogleScore = WordSimilarity.scoreDictionary['swoogle']
85 | # WordSimilarity.getSwoogleScore(word1,word2)
86 | SwoogleScaledScore = 0
87 | if(SwoogleScore>0 and SwoogleScore<0.6):
88 | SwoogleScaledScore = 1
89 | elif(SwoogleScore>=0.6 and SwoogleScore<0.7):
90 | SwoogleScaledScore = 2
91 | elif(SwoogleScore>=0.7):
92 | SwoogleScaledScore = 3
93 | else:
94 | SwoogleScaledScore = 0
95 |
96 | if(ESAScaledScore>SwoogleScaledScore):
97 | score = ESAScaledScore
98 | else:
99 | score = SwoogleScaledScore
100 |
101 | if(score>=2):
102 | return score
103 | else:
104 | return -1
105 |
--------------------------------------------------------------------------------
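A minimal usage sketch of the similarity check; both the EasyESA and UMBC STS endpoints are external research services and may be slow or unavailable, in which case the individual scores silently fall back to 0:

    from wordSimilarity import WordSimilarity

    score = WordSimilarity.isPredicateSimilar('spouse', 'husband')
    print(score)   # scaled score (2 or 3) if the services judge the words similar, otherwise -1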
/src/ws4j.py:
--------------------------------------------------------------------------------
1 | import urllib.request
2 | import sys
3 | import json
4 | from urllib.parse import quote
5 |
6 | word1 = input(" Enter the word1 : ")
7 | word2 = input(" Enter the word2 : ")
8 |
9 |
10 | #url = "http://spotlight.dbpedia.org/rest/annotate?types=DBPedia:Person&text="+sentence+"&confidence=0.2&support=20"
11 | url = "http://ws4jdemo.appspot.com/ws4j?measure=wup&args="+word1+"%3A%3A"+word2
12 | #print(url)
13 | request = urllib.request.Request(url)
14 | request.add_header('Accept', 'application/json')
15 | response = urllib.request.urlopen(request)
16 |
17 | responseStr = response.read().decode('utf-8')
18 |
19 | # fetch json from the response
20 | jsonStr = json.loads(responseStr)
21 |
22 | print(jsonStr['result'][0]['score'])
23 |
--------------------------------------------------------------------------------