├── .gitignore
├── .project
├── .pydevproject
├── LICENSE
├── README.md
├── docs
│   ├── Design.pages
│   ├── SESSA.pdf
│   ├── TREO.pdf
│   ├── graph-keyword-search-poster-PPT.pptx
│   ├── graph-keyword-search-poster.key
│   └── graph-keyword-search-poster.pdf
└── src
    ├── colorAssignment.py
    ├── dig
    │   ├── README.txt
    │   ├── SteinerTree.py
    │   ├── config.ini
    │   ├── esMapping-dig-ht-DT.json
    │   ├── graph.py
    │   ├── harvest.py
    │   ├── main.py
    │   ├── outline.py
    │   ├── prep.sh
    │   ├── prep2.sh
    │   ├── prep_ht.sh
    │   ├── query.py
    │   ├── synonym.py
    │   ├── test.py
    │   ├── util.py
    │   └── z-attic
    │       └── wordSimilarity.py
    ├── graphSearch.py
    ├── ngramTree.py
    ├── ngramsEngine.py
    ├── pivotEntityRecognition.py
    ├── queries.txt
    ├── resourceGraph.py
    ├── saq-2015_training_set.xml
    ├── sparqlClient.py
    ├── testSparqlEndPoint.py
    ├── wordSimilarity.py
    └── ws4j.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 |
25 | # PyInstaller
26 | # Usually these files are written by a python script from a template
27 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
28 | *.manifest
29 | *.spec
30 |
31 | # Installer logs
32 | pip-log.txt
33 | pip-delete-this-directory.txt
34 |
35 | # Unit test / coverage reports
36 | htmlcov/
37 | .tox/
38 | .coverage
39 | .cache
40 | nosetests.xml
41 | coverage.xml
42 |
43 | # Translations
44 | *.mo
45 | *.pot
46 |
47 | # Django stuff:
48 | *.log
49 |
50 | # Sphinx documentation
51 | docs/_build/
52 |
53 | # PyBuilder
54 | target/
55 |
56 | # emacs
57 | *~
58 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <projectDescription>
3 |     <name>graph-keyword-search</name>
4 |     <comment></comment>
5 |     <projects>
6 |     </projects>
7 |     <buildSpec>
8 |         <buildCommand>
9 |             <name>org.python.pydev.PyDevBuilder</name>
10 |             <arguments>
11 |             </arguments>
12 |         </buildCommand>
13 |     </buildSpec>
14 |     <natures>
15 |         <nature>org.python.pydev.pythonNature</nature>
16 |     </natures>
17 | </projectDescription>
18 |
--------------------------------------------------------------------------------
/.pydevproject:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
2 | <?eclipse-pydev version="1.0"?><pydev_project>
3 | <pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
4 | <path>/${PROJECT_DIR_NAME}</path>
5 | <path>/${PROJECT_DIR_NAME}/src</path>
6 | <path>/${PROJECT_DIR_NAME}/src/dig</path>
7 | </pydev_pathproperty>
8 | <pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 3.0</pydev_property>
9 | <pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">gks python 3</pydev_property>
10 | <pydev_pathproperty name="org.python.pydev.PROJECT_EXTERNAL_SOURCE_PATH">
11 | <path>/opt/dig/venv/gks/lib/python3.5/site-packages/Levenshtein</path>
12 | <path>/opt/dig/venv/gks/lib/python3.5/site-packages/word2vec</path>
13 | <path>/opt/dig/venv/gks/lib/python3.5/site-packages/word2vec-0.8-py3.5.egg-info</path>
14 | <path>${PROJ}/hybrid-jaccard</path>
15 | <path>/opt/dig/venv/gks/lib/python3.5/site-packages/python_Levenshtein-0.12.0-py3.5.egg-info/PKG-INFO</path>
16 | <path>/opt/dig/venv/gks/lib/python3.5/site-packages/python_Levenshtein-0.12.0-py3.5.egg-info</path>
17 | <path>/opt/dig/venv/gks/lib/python3.5/site-packages/python_Levenshtein-0.12.0-py3.5-macosx-10.10-x86_64.egg</path>
18 | <path>/opt/dig/venv/gks/lib/python3.5/site-packages/python_Levenshtein-0.12.0-py3.5-macosx-10.10-x86_64.egg/Levenshtein</path>
19 | </pydev_pathproperty>
20 | <pydev_variables_property name="org.python.pydev.PROJECT_VARIABLE_SUBSTITUTION">
21 | <key>PROJ</key>
22 | <value>/Users/philpot/Documents/project</value>
23 | </pydev_variables_property>
24 | </pydev_project>
25 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
203 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # graph-keyword-search
2 |
3 | 1. Download and install the latest version of Python 3 - https://www.python.org/downloads/
4 | 2. Check the Python 3 installation by opening a shell (Terminal or command prompt) and typing python3 at the prompt.
5 | 3. Download and install nltk by running 'pip3 install nltk' at the shell
6 | 4. Download and install the inflection library by running 'pip3 install inflection' at the shell
7 | 5. Download and install the SPARQLWrapper library by running 'pip3 install SPARQLWrapper' at the shell
8 |
9 | Running the program :
10 |
11 | python3 graphSearch.py
12 |
13 |
--------------------------------------------------------------------------------
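Before running graphSearch.py, a quick sanity check that the dependencies listed above import cleanly (a minimal sketch; it assumes nothing beyond the libraries named in the README):

    import sys
    assert sys.version_info >= (3, 0), "Python 3 is required"

    import nltk            # from step 3
    import inflection      # from step 4
    import SPARQLWrapper   # from step 5

    print("nltk", nltk.__version__)
    print("all dependencies imported OK")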
/docs/Design.pages:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usc-isi-i2/graph-keyword-search/28cd46f561a2b41a12548a09818d0628d931d2cd/docs/Design.pages
--------------------------------------------------------------------------------
/docs/SESSA.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usc-isi-i2/graph-keyword-search/28cd46f561a2b41a12548a09818d0628d931d2cd/docs/SESSA.pdf
--------------------------------------------------------------------------------
/docs/TREO.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usc-isi-i2/graph-keyword-search/28cd46f561a2b41a12548a09818d0628d931d2cd/docs/TREO.pdf
--------------------------------------------------------------------------------
/docs/graph-keyword-search-poster-PPT.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usc-isi-i2/graph-keyword-search/28cd46f561a2b41a12548a09818d0628d931d2cd/docs/graph-keyword-search-poster-PPT.pptx
--------------------------------------------------------------------------------
/docs/graph-keyword-search-poster.key:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usc-isi-i2/graph-keyword-search/28cd46f561a2b41a12548a09818d0628d931d2cd/docs/graph-keyword-search-poster.key
--------------------------------------------------------------------------------
/docs/graph-keyword-search-poster.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usc-isi-i2/graph-keyword-search/28cd46f561a2b41a12548a09818d0628d931d2cd/docs/graph-keyword-search-poster.pdf
--------------------------------------------------------------------------------
/src/colorAssignment.py:
--------------------------------------------------------------------------------
1 | from ngramTree import *
2 |
3 | # This class is responsible for assignment of colors to the nodes in the ngram tree
4 | class ColorAssignment:
5 |
6 | colorDictionary = {} # This dictionary stores the individual tokens ['a', 'b', 'c', 'd'] and their color values
7 |
8 | # lookupList - [['a', 'b', 'c', 'd'], ['a b', 'b c', 'c d'], ['a b c', 'b c d'], ['a b c d']]
9 | def assignInitialColors(self,rootNode,lookupList):
10 |
11 | if rootNode and len(lookupList)>=1:
12 | oneGrams = lookupList[0] # Gets the one grams
13 |
14 | for index in range(len(oneGrams)):
15 | if(oneGrams[index] not in self.colorDictionary):
16 | self.colorDictionary[oneGrams[index]] = index # This assigns the color values to each token
17 |
18 |
19 | stack = []
20 | stack.append(rootNode) # Using stack for DFS
21 |
22 | while(stack):
23 | currNode = stack.pop()
24 |
25 | # Assign colors to this node based on the presence of tokens
26 | if not currNode.isVisited: # If a node repeats, do not initialize color again
27 |
28 | currNode.isVisited = True
29 |
30 | tokens = currNode.data.split(' ') # Check for individual tokens
31 |
32 | for token in tokens:
33 | if(token in self.colorDictionary):
34 | currNode.color.append(self.colorDictionary[token]) # Assign colors
35 |
36 | for childNodes in currNode.children: # Add children to the stack
37 | stack.append(childNodes)
38 |
39 |
40 | #return rootNode
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
--------------------------------------------------------------------------------
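A minimal usage sketch for ColorAssignment above. StubNode is a hypothetical stand-in for the real ngramTree node and mirrors only the attributes the code touches (data, children, color, isVisited):

    from colorAssignment import ColorAssignment

    class StubNode:
        """Hypothetical stand-in for an ngramTree node."""
        def __init__(self, data):
            self.data = data        # the ngram text, e.g. 'a b'
            self.children = []      # child nodes
            self.color = []         # filled in by assignInitialColors
            self.isVisited = False

    # tiny tree for the query 'a b': root 'a b' with 1-gram children 'a' and 'b'
    root = StubNode('a b')
    root.children = [StubNode('a'), StubNode('b')]

    lookupList = [['a', 'b'], ['a b']]      # 1-grams first, as assignInitialColors expects
    ColorAssignment().assignInitialColors(root, lookupList)

    print(root.color)               # [0, 1] -- root contains both colored tokens
    print(root.children[0].color)   # [0]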
/src/dig/README.txt:
--------------------------------------------------------------------------------
1 | Per-document queries such as _termvector and _mtermvectors seem to
2 | be concerned mostly with multiple occurrences of the (presumed)
3 | repeated word in a single analyzed field.
4 |
5 | POST http://localhost:9200/twitter/tweet/3/_termvector
6 | {
7 | }
8 |
9 | looks good
10 |
11 | Multi-term queries require that we specify all document IDs in the body
12 |
13 | POST http://localhost:9200/twitter/tweet/_mtermvectors
14 | {"ids" : ["1", "2"],
15 | "parameters": {
16 | "fields": ["text"],
17 | "term_statistics": "false"
18 | }
19 | }
20 |
21 | looks good
22 |
23 | Not as meaningful for our more nominal/enumerable values.
24 |
25 |
26 | https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-terms-aggregation.html
27 | Term aggregation seems reasonable for our needs:
28 |
29 | POST http://localhost:9200/twitter/tweet/_search?search_type=count
30 | {
31 | "query": {
32 | "match_all": {}
33 | },
34 | "aggs": {
35 | "eyecolors": {
36 | "terms": {
37 | "field": "eyecolor"
38 | }
39 | }
40 | }
41 | }
42 |
43 | This is the nested equivalent:
44 |
45 | POST http://localhost:9200/twitter2/tweet/_search?search_type=count
46 | {
47 | "query": {
48 | "match_all": {}
49 | },
50 | "aggs": {
51 | "schools": {
52 | "nested": {
53 | "path": "children"
54 | },
55 | "aggs": {
56 | "aggname": {
57 | "terms": {
58 | "field": "children.school"}
59 | }
60 | }
61 | }
62 | }
63 | }
64 |
65 | giving result such as
66 |
67 | {
68 | "took": 1,
69 | "timed_out": false,
70 | "_shards": {
71 | "total": 1,
72 | "successful": 1,
73 | "failed": 0
74 | },
75 | "hits": {
76 | "total": 3,
77 | "max_score": 0,
78 | "hits": []
79 | },
80 | "aggregations": {
81 | "schools": {
82 | "doc_count": 7,
83 | "aggname": {
84 | "doc_count_error_upper_bound": 0,
85 | "sum_other_doc_count": 0,
86 | "buckets": [
87 | {
88 | "key": "Aardvark",
89 | "doc_count": 4
90 | },
91 | {
92 | "key": "Badger",
93 | "doc_count": 1
94 | },
95 | {
96 | "key": "Factory",
97 | "doc_count": 1
98 | },
99 | {
100 | "key": "Junkyard",
101 | "doc_count": 1
102 | }
103 | ]
104 | }
105 | }
106 | }
107 | }
108 |
109 | For our domain, the query looks like:
110 |
111 | POST https://darpamemex:darpamemex@esc.memexproxy.com/dig-ht-latest/offer/_search?search_type=count
112 | {
113 | "query": {
114 | "match_all": {}
115 | },
116 | "aggs": {
117 | "itemOfferedAgg": {
118 | "nested": {
119 | "path": "itemOffered"
120 | },
121 | "aggs": {
122 | "termsSubAgg": {
123 | "terms": {
124 | "field": "itemOffered.hairColor",
125 | "size" : 20
126 | }
127 | }
128 | }
129 | }
130 | }
131 | }
132 |
133 | yielding
134 |
135 | {
136 | "took": 1492,
137 | "timed_out": false,
138 | "_shards": {
139 | "total": 20,
140 | "successful": 20,
141 | "failed": 0
142 | },
143 | "hits": {
144 | "total": 19134836,
145 | "max_score": 0,
146 | "hits": []
147 | },
148 | "aggregations": {
149 | "itemOfferedAgg": {
150 | "doc_count": 19134836,
151 | "termsSubAgg": {
152 | "doc_count_error_upper_bound": 0,
153 | "sum_other_doc_count": 345,
154 | "buckets": [
155 | {
156 | "key": "blond",
157 | "doc_count": 813715
158 | },
159 | {
160 | "key": "brown",
161 | "doc_count": 605642
162 | },
163 | {
164 | "key": "NONE",
165 | "doc_count": 295217
166 | },
167 | {
168 | "key": "black",
169 | "doc_count": 199892
170 | },
171 | {
172 | "key": "red",
173 | "doc_count": 142948
174 | },
175 | {
176 | "key": "blonde",
177 | "doc_count": 27069
178 | },
179 | {
180 | "key": "auburn",
181 | "doc_count": 14732
182 | },
183 | {
184 | "key": "gray",
185 | "doc_count": 6624
186 | },
187 | {
188 | "key": "brunette",
189 | "doc_count": 3396
190 | },
191 | {
192 | "key": "light brown",
193 | "doc_count": 1813
194 | },
195 | {
196 | "key": "dark brown",
197 | "doc_count": 1350
198 | },
199 | {
200 | "key": "other",
201 | "doc_count": 862
202 | },
203 | {
204 | "key": "chestnut",
205 | "doc_count": 735
206 | },
207 | {
208 | "key": "dirty brown",
209 | "doc_count": 345
210 | },
211 | {
212 | "key": "auburn red",
213 | "doc_count": 259
214 | },
215 | {
216 | "key": "auburnred",
217 | "doc_count": 142
218 | },
219 | {
220 | "key": "strawberry blonde",
221 | "doc_count": 142
222 | },
223 | {
224 | "key": "white",
225 | "doc_count": 29
226 | },
227 | {
228 | "key": "long",
229 | "doc_count": 23
230 | },
231 | {
232 | "key": "long brown",
233 | "doc_count": 18
234 | }
235 | ]
236 | }
237 | }
238 | }
239 | }
240 |
241 | For the counts to make sense, should the filter portion be specified to include only those documents that have the aggregated field?
242 |
243 | POST https://darpamemex:darpamemex@esc.memexproxy.com/dig-ht-latest/offer/_search?search_type=count
244 | {
245 | "query": {
246 | "filtered": {
247 | "query": {
248 | "match_all": {}
249 | },
250 | "filter": {
251 | "nested": {
252 | "path": "itemOffered",
253 | "filter": {
254 | "exists": {
255 | "field": "eyeColor"
256 | }
257 | }
258 | }
259 | }
260 | }
261 | },
262 |
263 | "aggs": {
264 | "itemOfferedAgg": {
265 | "nested": {
266 | "path": "itemOffered"
267 | },
268 | "aggs": {
269 | "termsSubAgg": {
270 | "terms": {
271 | "field": "itemOffered.eyeColor",
272 | "size" : 100
273 | }
274 | }
275 | }
276 | }
277 | }
278 | }
279 |
280 | TOPLEVEL objects in our ES are (with useful attributes):
281 | webpage (root WebPage)
282 | mainEntity* -> offer
283 | publisher.name [y]
284 | adultservice (root AdultService)
285 | eyeColor
286 | hairColor
287 | name
288 | personAge
289 | offers* -> offer
290 | offer (root Offer)
291 | availableAtOrFrom* -> place/address
292 | itemOffered* -> adultservice
293 | priceSpecification.price [x]
294 | priceSpecification.billingIncrement [x]
295 | priceSpecification.unitCode [x]
296 | priceSpecification.name [x]
297 | phone (root PhoneNumber)
298 | seller (root PersonOrOrganization)
299 | email (root EmailAddress)
300 |
301 | Thus non-toplevel objects include:
302 | address
303 | geo
304 | priceSpecification
305 | publisher
306 |
307 |
--------------------------------------------------------------------------------
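The same nested terms aggregation can be issued from Python; a minimal sketch with the elasticsearch client, using the endpoint, index, and aggregation body shown above (harvest.py below does essentially this, with certificate verification added):

    from elasticsearch import Elasticsearch
    from pprint import pprint

    es = Elasticsearch(['https://darpamemex:darpamemex@esc.memexproxy.com/'])

    body = {
        "query": {"match_all": {}},
        "aggs": {
            "itemOfferedAgg": {
                "nested": {"path": "itemOffered"},
                "aggs": {
                    "termsSubAgg": {
                        "terms": {"field": "itemOffered.hairColor", "size": 20}
                    }
                }
            }
        }
    }

    result = es.search(index="dig-ht-latest", doc_type="offer", body=body, search_type="count")
    pprint(result['aggregations']['itemOfferedAgg']['termsSubAgg']['buckets'])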
/src/dig/SteinerTree.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Sep 7 12:57:57 2013
4 |
5 | @author: xinghualu
6 | @modified: philpot
7 | """
8 |
9 | # This is a generalized implementation of the Kou algorithm for creating Steiner Trees.
10 | # It can be used with any networkx weighted graph.
11 |
12 | from heapq import heappush, heappop
13 | from networkx import Graph, bidirectional_dijkstra, has_path
14 |
15 | # import json
16 | # def hashableDict(d):
17 | # return json.dumps(d, sort_keys=True)
18 | #
19 | # def unhashableDict(hashableDict):
20 | # return json.loads(hashableDict)
21 |
22 | ## Extract a Steiner tree from a weighted graph, given a list of vertices of interest
23 | # @param G A Graph with weighted edges
24 | # @param voi A list of vertices of interest
25 | # @param generator A method to make a new Graph instance (in the case that you've extended Graph)
26 | # \returns a new graph if no errors, None otherwise
27 | def make_steiner_tree(G, voi, generator=None):
28 | mst = Graph()
29 | for v in voi:
30 | if not v in G:
31 | raise ValueError("make_steiner_tree(): Vertex {} not in original graph".format(v))
32 | if len(voi) == 0:
33 | return mst
34 | if len(voi) == 1:
35 | mst.add_node(voi[0])
36 | return mst
37 |
38 | # Initially, use (a version of) Kruskal's algorithm to extract a minimal spanning tree
39 | # from a weighted graph. This algorithm differs in that only a subset of vertices is
40 | # going to be present in the final subgraph (which is not truly an MST - we must use
41 | # Prim's algorithm later).
42 |
43 | # extract all shortest paths among the voi
44 | heapq = []
45 | paths = {}
46 |
47 | # load all the paths between the Steiner vertices. Store them in a heap queue
48 | # and reconstruct the MST of the complete graph using Kruskal's algorithm
49 | for i in range(len(voi) - 1):
50 | v1 = voi[i]
51 | for v2 in voi[i+1:]:
52 | result = bidirectional_dijkstra(G, v1, v2)
53 | if result == False:
54 | raise RuntimeError("The two vertices given (%s, %s) don't exist on the same connected graph" % (v1, v2))
55 | #print "The two vertices given (%s, %s) don't exist on the same connected graph" % (v1, v2)
56 | distance, vertList = result
57 | keys = [v1, v2]
58 | keys.sort()
59 | key = "%s:%s" % tuple(keys)
60 | paths[key] = (vertList)
61 | heappush(heapq, (distance, v1, v2))
62 |
63 | # construct the minimum spanning tree of the complete graph
64 | while heapq:
65 | w, v1, v2 = heappop(heapq)
66 | # if no path exists yet between v1 and v2, add this one
67 | if v1 not in mst or v2 not in mst or not has_path(mst, v1, v2):
68 | mst.add_edge(v1, v2, weight=w)
69 |
70 | # check that the spanning tree covers exactly the Steiner vertices
71 | sTree = set(mst.nodes())
72 | sSteiner = set(voi)
73 | if sTree ^ sSteiner:
74 | raise RuntimeError('Failed to construct MST spanning tree')
75 |
76 | # reconstruct subgraph of origGraph using the paths
77 | if generator is None:
78 | subgraph = Graph()
79 | else:
80 | subgraph = generator()
81 | for edge in mst.edges_iter(data=True):
82 | keys = [edge[0],edge[1]]
83 | keys.sort()
84 | key = "%s:%s" % tuple(keys)
85 | vList = paths[key]
86 | for i in range(len(vList) - 1):
87 | v1 = vList[i]
88 | v2 = vList[i+1]
89 | w = G[v1][v2]
90 | subgraph.add_edge(v1, v2, w)
91 | # get rid of possible loops - result will be a true MST
92 | subgraph = make_prim_mst(subgraph, generator)
93 |
94 | # remove intermediate nodes in paths that are not in list of voi
95 | return _trimTree(subgraph, voi)
96 |
97 |
98 | ## remove intermediate nodes in paths that are not in list of voi in given graph
99 | # @param graph A weighted Graph
100 | # @param voi
101 | #/return graph An updated version of the Graph
102 | def _trimTree(graph, voi):
103 | trimKeepTrack = []
104 | firstNode = voi[0]
105 | if len(graph.neighbors(firstNode)) < 2:
106 | trimKeepTrack.append(firstNode)
107 | firstNeighbor = graph.neighbors(firstNode)[0]
108 | trimKeepTrack.append(firstNeighbor)
109 | graph = _trim(firstNeighbor, graph, trimKeepTrack, voi)
110 | else:
111 | trimKeepTrack.append(firstNode)
112 | graph = _trim(firstNode, graph, trimKeepTrack, voi)
113 | return graph
114 |
115 | def _trim(node, graph, trimKeepTrack, voi):
116 | if len(list(graph.adj[node].keys())) > 1:
117 | for nodeNeighbor in list(graph.adj[node].keys()):
118 | if nodeNeighbor not in trimKeepTrack:
119 | trimKeepTrack.append(nodeNeighbor)
120 | graph = _trim(nodeNeighbor, graph, trimKeepTrack, voi)
121 | if len(list(graph.adj[node].keys())) < 2:
122 | if node not in voi:
123 | graph.remove_node(node)
124 | return graph
125 |
126 |
127 | """
128 | Prim's algorithm: constructs the minimum spanning tree (MST) from an instance of weighted Graph
129 | @param G An weighted Graph()
130 | @param generator A method to make a new Graph instance (in the case that you've extended Graph)
131 | \returns A MST version of G
132 | """
133 | ## generate the Prim's algorithm MST
134 | # @param G A weighted Graph
135 | # @param generator A method to make a new Graph instance (may be None)
136 | # /return mst Returns the created MST
137 | def make_prim_mst(G, generator=None):
138 | if generator is None:
139 | mst = Graph()
140 | else:
141 | mst = generator()
142 | # priorityQ is a list of list (the reverse of the edge tuple with the weight in the front)
143 | priorityQ = []
144 | firstNode = G.nodes()[0]
145 | mst.add_node(firstNode)
146 | for edge in G.edges_iter(firstNode, data=True):
147 | if len(edge) != 3 or edge[2] is None:
148 | raise ValueError("make_prim_mst accepts a weighted graph only (with numerical weights)")
149 | heappush(priorityQ, (edge[2]['weight'], edge))
150 | while len(mst.edges()) < (G.order()-1):
151 | _, minEdge = heappop(priorityQ)
152 | if len(minEdge) != 3 or minEdge[2] is None:
153 | raise ValueError("make_prim_mst accepts a weighted graph only (with numerical weights)")
154 | v1, v2, _ = minEdge
155 | if v1 not in mst:
156 | for edge in G.edges_iter(v1, data=True):
157 | if edge == minEdge:
158 | continue
159 | heappush(priorityQ, (edge[2]['weight'], edge))
160 | elif v2 not in mst:
161 | for edge in G.edges_iter(v2, data=True):
162 | if edge == minEdge:
163 | continue
164 | heappush(priorityQ, (edge[2]['weight'], edge))
165 | else:
166 | # non-crossing edge
167 | continue
168 | mst.add_edge(minEdge[0],minEdge[1],minEdge[2])
169 | return mst
170 |
--------------------------------------------------------------------------------
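A small usage sketch for make_steiner_tree, assuming the networkx 1.x API that the module's edges_iter()/nodes() calls require:

    from networkx import Graph
    from SteinerTree import make_steiner_tree

    # weighted graph: a cheap path a - b - c and an expensive detour a - d - c
    g = Graph()
    g.add_edge('a', 'b', weight=1)
    g.add_edge('b', 'c', weight=1)
    g.add_edge('a', 'd', weight=5)
    g.add_edge('d', 'c', weight=5)

    # Steiner tree connecting the vertices of interest 'a' and 'c'
    st = make_steiner_tree(g, ['a', 'c'])
    print(st.nodes())   # ['a', 'b', 'c'] in some order; 'b' is kept as an intermediate vertex
    print(st.edges())   # the a-b and b-c edges of the cheap path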
/src/dig/config.ini:
--------------------------------------------------------------------------------
1 | [direct]
2 | enable = true
3 |
4 | [wordnet]
5 | enable = true
6 | lemma_min_count = 1
7 | n_enable = true
8 | n_self_factor = 1.0
9 | n_hypernym_factor = 0.5
10 | n_hyponym_factor = 0.5
11 | v_enable = true
12 | v_self_factor = 1.0
13 | v_hypernym_factor = 0.5
14 | v_hyponym_factor = 0.5
15 | a_enable = true
16 | a_self_factor = 1.0
17 | a_hypernym_factor = 0
18 | a_hyponym_factor = 0.5
19 | r_enable = true
20 | r_self_factor = 1.0
21 | r_hypernym_factor = 0
22 | r_hyponym_factor = 0.5
23 |
24 | [word2vec]
25 | enable = true
26 | data_dir = /opt/word2vec/data
27 | data_file = text8-phrases.bin
28 | size = 10
29 | minimum_score = 0.5
30 |
31 | [swoogle]
32 | enable = false
33 | uri_template = http://swoogle.umbc.edu/StsService/GetStsSim?operation=api&phrase1={}&phrase2={}
34 |
35 | [easyesa]
36 | enable = false
37 |
38 | [levenshtein]
39 | enable = true
40 | above_score = 0.0
41 | within_score = 1.0
42 |
43 | [hybridjaccard]
44 | enable = true
45 | allowexact_enable = false
46 |
--------------------------------------------------------------------------------
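A minimal sketch of reading these settings with the standard-library configparser, assuming it is run from src/dig where config.ini lives (section and option names as above; how the rest of the package consumes them is not shown in this excerpt):

    import configparser

    config = configparser.ConfigParser()
    config.read('config.ini')

    if config.getboolean('wordnet', 'enable'):
        # per part-of-speech weighting of WordNet neighbors, e.g. noun hypernyms at 0.5
        n_hypernym = config.getfloat('wordnet', 'n_hypernym_factor')
        print('noun hypernym factor:', n_hypernym)

    if config.getboolean('levenshtein', 'enable'):
        above = config.getfloat('levenshtein', 'above_score')    # 0.0
        within = config.getfloat('levenshtein', 'within_score')  # 1.0
        print('accept edit distances in ({}, {}]'.format(above, within))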
/src/dig/graph.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from collections import defaultdict
4 | from collections import namedtuple
5 | import json
6 | from queue import Queue
7 | import re
8 | import sys, os
9 | from pprint import pprint
10 |
11 | from Levenshtein import distance
12 | # from StringMatcher import distance
13 | # from Levenshtein.StringMatcher import distance
14 | from networkx import Graph, DiGraph
15 |
16 | from SteinerTree import make_steiner_tree
17 | from hybridJaccard import HybridJaccard
18 |
19 | LEAF_VOCAB_CACHE = "/Users/philpot/Documents/project/graph-keyword-search/src/dig/data/cache"
20 |
21 | def loadLeafVocab(pathdesc, root=LEAF_VOCAB_CACHE):
22 | pathname = os.path.join(root, pathdesc + ".json")
23 | with open(pathname, 'r') as f:
24 | j = json.load(f)
25 | # dict of (value, count)
26 | byCount = sorted([(v,q) for (q,v) in j['histo'].items()], reverse=True)
27 | return [t[1] for t in byCount]
28 |
29 | def localPath(suffix):
30 | return os.path.join(os.path.dirname(__file__), suffix)
31 |
32 | # http://stackoverflow.com/a/9283563/2077242
33 | def camelCaseWords(label):
34 | label = re.sub(r'((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))', r' \1', label)
35 | return label
36 |
37 | class KGraph(DiGraph):
38 |
39 | def __init__(self, domainType=None):
40 | super(KGraph, self).__init__()
41 | self.installDomain(domainType=domainType)
42 |
43 | def __repr__(self):
44 | return "<{}: {} nodes, {} edges>".format(type(self).__name__,
45 | self.number_of_nodes(),
46 | self.number_of_edges())
47 |
48 | def installDomain(self, domainType=None):
49 | if domainType == 'ht':
50 | self.add_node('seller', nodeType='Class', className='PersonOrOrganization', indexRoot='seller')
51 |
52 | self.add_node('phone', nodeType='Class', className='PhoneNumber', indexRoot='phone')
53 | self.add_edge('seller', 'phone', edgeType='ObjectProperty', relationName='telephone')
54 |
55 | self.add_node('phone.name', nodeType='leaf', vocabDescriptor='seller_telephone_name')
56 | self.add_edge('phone', 'phone.name', edgeType='DataProperty', relationName='name')
57 |
58 | self.add_node('email', nodeType='Class', className='EmailAddress', indexRoot='email')
59 | self.add_edge('seller', 'email', edgeType='ObjectProperty', relationName='email')
60 | # for now this ES query doesn't work
61 | # self.add_node('email.name', nodeType='leaf', values=loadLeafVocab('seller_email_name'), vocabDescriptor='seller_email_name')
62 | # so use flat data instead
63 | self.add_node('email.name', nodeType='leaf', vocabDescriptor='email_name')
64 | self.add_edge('email', 'email.name', edgeType='DataProperty', relationName='name')
65 |
66 | self.add_node('offer', nodeType='Class', className='Offer', indexRoot='offer')
67 | self.add_edge('offer', 'seller', edgeType='ObjectProperty', relationName='seller')
68 | self.add_edge('seller', 'offer', edgeType='ObjectProperty', relationName='makesOffer')
69 |
70 | self.add_node('priceSpecification', nodeType='Class', className='PriceSpecification')
71 | self.add_node('priceSpecification.billingIncrement', nodeType='leaf', vocabDescriptor='offer_priceSpecification_billingIncrement')
72 | self.add_edge('priceSpecification', 'priceSpecification.billingIncrement', edgeType='DataProperty', relationName='billingIncrement')
73 | self.add_node('priceSpecification.price', nodeType='leaf', vocabDescriptor='offer_priceSpecification_price')
74 | self.add_edge('priceSpecification', 'priceSpecification.price', edgeType='DataProperty', relationName='price')
75 | self.add_node('priceSpecification.name', nodeType='leaf', vocabDescriptor='offer_priceSpecification_name')
76 | self.add_edge('priceSpecification', 'priceSpecification.name', edgeType='DataProperty', relationName='name')
77 | self.add_node('priceSpecification.unitCode', nodeType='leaf', vocabDescriptor='offer_priceSpecification_unitCode')
78 | self.add_edge('priceSpecification', 'priceSpecification.unitCode', edgeType='DataProperty', relationName='unitCode')
79 |
80 | self.add_edge('offer', 'priceSpecification', edgeType='ObjectProperty', relationName='priceSpecification')
81 |
82 | self.add_node('adultservice', nodeType='Class', className='AdultService', indexRoot='adultservice')
83 | self.add_node('adultservice.eyeColor', nodeType='leaf',
84 | vocabDescriptor='adultservice_eyeColor',
85 | matcherDescriptor=HybridJaccard(ref_path=localPath("data/config/hybridJaccard/eyeColor_reference_wiki.txt"),
86 | config_path=localPath("data/config/hybridJaccard/eyeColor_config.txt")))
87 | self.add_edge('adultservice', 'adultservice.eyeColor', edgeType='DataProperty', relationName='eyeColor')
88 |
89 | self.add_node('adultservice.hairColor', nodeType='leaf',
90 | vocabDescriptor='adultservice_hairColor',
91 | matcherDescriptor=HybridJaccard(ref_path=localPath("data/config/hybridJaccard/hairColor_reference_wiki.txt"),
92 | config_path=localPath("data/config/hybridJaccard/hairColor_config.txt")))
93 | self.add_edge('adultservice', 'adultservice.hairColor', edgeType='DataProperty', relationName='hairColor')
94 | self.add_node('adultservice.name', nodeType='leaf', vocabDescriptor='adultservice_name')
95 | self.add_edge('adultservice', 'adultservice.name', edgeType='DataProperty', relationName='name')
96 | self.add_node('adultservice.personAge', nodeType='leaf', vocabDescriptor='adultservice_personAge')
97 | self.add_edge('adultservice', 'adultservice.personAge', edgeType='DataProperty', relationName='personAge')
98 |
99 | self.add_edge('offer', 'adultservice', edgeType='ObjectProperty', relationName='itemOffered')
100 | self.add_edge('adultservice', 'offer', edgeType='ObjectProperty', relationName='offers')
101 |
102 | self.add_node('place', nodeType='Class', className='Place')
103 | self.add_node('postaladdress', nodeType='Class', className='PostalAddress')
104 |
105 | self.add_edge('offer', 'place', edgeType='ObjectProperty', relationName='availableAtOrFrom')
106 | self.add_edge('place', 'postaladdress', edgeType='ObjectProperty', relationName='address')
107 |
108 | self.add_node('postaladdress.addressLocality', nodeType='leaf', vocabDescriptor='offer_availableAtOrFrom_address_addressLocality')
109 | self.add_edge('postaladdress', 'postaladdress.addressLocality', edgeType='DataProperty', relationName='addressLocality')
110 | self.add_node('postaladdress.addressRegion', nodeType='leaf', vocabDescriptor='offer_availableAtOrFrom_address_addressRegion')
111 | self.add_edge('postaladdress', 'postaladdress.addressRegion', edgeType='DataProperty', relationName='addressRegion')
112 | self.add_node('postaladdress.addressCountry', nodeType='leaf', vocabDescriptor='offer_availableAtOrFrom_address_addressCountry')
113 | self.add_edge('postaladdress', 'postaladdress.addressCountry', edgeType='DataProperty', relationName='addressCountry')
114 |
115 | self.add_node('webpage', nodeType='Class', className='WebPage', indexRoot='webpage')
116 | self.add_edge('offer', 'webpage', edgeType='ObjectProperty', relationName='mainEntityOfPage')
117 | self.add_edge('webpage', 'offer', edgeType='ObjectProperty', relationName='mainEntity')
118 | self.add_node('publisher', nodeType='Class', className='Organization')
119 | self.add_edge('webpage', 'publisher', edgeType='ObjectProperty', relationName='publisher')
120 | self.add_node('publisher.name', nodeType='leaf', vocabDescriptor='webpage_publisher_name')
121 | self.add_edge('publisher', 'publisher.name', edgeType='DataProperty', relationName='name')
122 |
123 | def labelInGraph(self, nodeOrEdge):
124 | try:
125 | return self.node[nodeOrEdge]['className']
126 | except:
127 | try:
128 | return self.edge[nodeOrEdge[0]][nodeOrEdge[1]]['relationName']
129 | except:
130 | return None
131 |
132 | def populateValues(self, nodeOrEdge):
133 | try:
134 | node = nodeOrEdge
135 | nodeType = self.node[node]['nodeType']
136 | if nodeType == 'leaf':
137 | self.populateLeafNode(node)
138 | elif nodeType == 'Class':
139 | self.populateClassNode(node)
140 | except Exception as _:
141 | edge = nodeOrEdge
142 | (node1, node2) = edge
143 | edgeType = self.edge[node1][node2]['edgeType']
144 | if edgeType == 'ObjectProperty':
145 | self.populateRelationEdge(edge)
146 | elif edgeType == 'DataProperty':
147 | self.populateAttributeEdge(edge)
148 |
149 | # The problem is that "values" is too general. We can associate values with nodes and edges via a variety of semantics:
150 | # (1) instances from ES, presumably only for leaf nodes
151 | # (2) ontology labels, ontology descriptions, presumably only for edges and interior nodes
152 |
153 | def populateLeafNode(self, node):
154 | self.node[node]['values'] = loadLeafVocab(self.node[node]['vocabDescriptor'])
155 | self.node[node]['valueOrigin'] = 'leafVocab'
156 |
157 | # The next three probably should use the same methodology/same code
158 | def populateClassNode(self, node):
159 | self.node[node]['values'] = list(set([node, self.node[node]['className']]))
160 | self.node[node]['valueOrigin'] = 'ontology'
161 |
162 | def populateRelationEdge(self, edge):
163 | (node1, node2) = edge
164 | self.edge[node1][node2]['values'] = [camelCaseWords(self.edge[node1][node2]['relationName'])]
165 | self.edge[node1][node2]['valueOrigin'] = 'ontology'
166 |
167 | def populateAttributeEdge(self, edge):
168 | (node1, node2) = edge
169 | self.edge[node1][node2]['values'] = [camelCaseWords(self.edge[node1][node2]['relationName'])]
170 | self.edge[node1][node2]['valueOrigin'] = 'ontology'
171 |
172 | def isLeaf(self, nodeOrEdge):
173 | try:
174 | return self.node[nodeOrEdge]['valueOrigin'] == 'leafVocab'
175 | except:
176 | return False
177 |
178 | def populateAll(self):
179 | for node in self.nodes():
180 | self.populateValues(node)
181 | for edge in self.edges():
182 | self.populateValues(edge)
183 |
184 | def nodeMatch(self, node, label):
185 | """list generator"""
186 | return label.lower().replace('_', ' ') in (value.lower() for value in self.node[node]['values'])
187 |
188 | def edgeMatch(self, edge, label):
189 | """list generator"""
190 | return label.lower().replace('_', ' ') in (value.lower() for value in self.edge[edge[0]][edge[1]]['values'])
191 |
192 | def nodeEditWithin(self, node, label, within=1, above=None):
193 | """set above=0 to avoid matching node value exactly identical to label
194 | Does not find closest node values, just any values within interval"""
195 | l = label.lower().replace('_', ' ')
196 | for value in self.node[node]['values']:
197 | value = value.lower().replace('_', ' ')
198 | actual = distance(l, value)
199 | if (above==None or actual>above) and actual <= within:
200 | # if levenshtein is 0, return true value 0.0
201 | # return actual or 0.0
202 | return(value, actual)
203 |
204 | def edgeEditWithin(self, edge, label, within=1, above=None):
205 | """set above=0 to avoid matching edge value exactly identical to label"""
206 | l = label.lower().replace('_', ' ')
207 | for value in self.edge[edge[0]][edge[1]]['values']:
208 | value = value.lower().replace('_', ' ')
209 | actual = distance(l, value)
210 | if (not above or actual>above) and actual <= within:
211 | # if levenshtein is 0, return true value 0.0
212 | return actual or 0.0
213 |
214 | def nodeNearMatch(self, node, label, allowExact=False):
215 | """set allowExact to True to look up values directly here"""
216 | label = label.lower().replace('_', ' ')
217 | # print(self.node[node])
218 | try:
219 | hjMatcher = self.node[node]['matcherDescriptor']
220 | best = hjMatcher.findBestMatch(label)
221 | if best != "NONE":
222 | for value in self.node[node]['values']:
223 | value = value.lower().replace('_', ' ')
224 | if ((label != value) or allowExact) and (best==value):
225 | # HJ(label)== a value from node and
226 | # either we allow exact or see that label is not exactly the retrieved value
227 | # print(best)
228 | return best
229 | except KeyError:
230 | pass
231 |
232 | def edgeNearMatch(self, edge, label, allowExact=False):
233 | """set allowExact to True to look up values directly here"""
234 | label = label.lower().replace('_', ' ')
235 | try:
236 | hjMatcher = self.edge[edge[0]][edge[1]]['matcherDescriptor']
237 | best = hjMatcher.findBestMatch(label)
238 | if best != "NONE":
239 | for value in self.edge[edge[0]][edge[1]]['values']:
240 | value = value.lower().replace('_', ' ')
241 | if ((label != value) or allowExact) and (best==value):
242 | # HJ(label)== a value from edge and
243 | # either we allow exact or see that label is not exactly the retrieved value
244 | return best
245 | except KeyError:
246 | pass
247 |
248 | def generateSubgraph(self, node):
249 | seen = set()
250 | def visitNode(n1):
251 | if n1 in seen:
252 | pass
253 | else:
254 | yield(("node",n1))
255 | seen.add(n1)
256 | for n2 in self.edge[n1]:
257 | yield from visitEdge((n1,n2))
258 | def visitEdge(e):
259 | (_,n2) = e
260 | if e in seen:
261 | pass
262 | else:
263 | yield(("edge",e))
264 | seen.add(e)
265 | yield from visitNode(n2)
266 | return visitNode(node)
267 |
268 | """SPECS=[ {"docType": "adultservice", "fieldName": "eyeColor", "size": 10},
269 | {"docType": "adultservice", "fieldName": "hairColor", "size": 10},
270 | {"docType": "adultservice", "fieldName": "name", "size": 200},
271 | {"docType": "adultservice", "fieldName": "personAge", "size": 20},
272 |
273 | {"docType": "phone", "fieldName": "name", "size": 200},
274 |
275 | {"docType": "email", "fieldName": "name", "size": 200},
276 |
277 | {"docType": "webpage", "innerPath": "publisher", "fieldName": "name", "size": 200},
278 | # Ignore webpage.description, webpage.dateCreated
279 |
280 | # Ignore offer.identifier
281 | {"docType": "offer", "innerPath": "priceSpecification", "fieldName": "billingIncrement", "size": 10},
282 | {"docType": "offer", "innerPath": "priceSpecification", "fieldName": "price", "size": 200},
283 | {"docType": "offer", "innerPath": "priceSpecification", "fieldName": "name", "size": 200},
284 | {"docType": "offer", "innerPath": "priceSpecification", "fieldName": "unitCode", "size": 10},
285 | {"docType": "offer", "innerPath": "availableAtOrFrom.address", "fieldName": "addressLocality", "size": 200},
286 | {"docType": "offer", "innerPath": "availableAtOrFrom.address", "fieldName": "addressRegion", "size": 200},
287 | {"docType": "offer", "innerPath": "availableAtOrFrom.address", "fieldName": "addressCountry", "size": 200},
288 | # Ignore offer.availableAtOrFrom.name
289 | # Ignore offer.availableAtOrFrom.geo.lat, offer.availableAtOrFrom.geo.lon
290 | ]"""
291 |
292 | wg = None
293 |
294 | nodeDesig = namedtuple('nodeDesig', 'nodeType, nodeRefs')
295 |
296 | def truenodeDesig(node):
297 | """Render a kgraph node as a wgraph node"""
298 | return nodeDesig(nodeType='truenode', nodeRefs=(node,))
299 |
300 | def edgenodeDesig(edge):
301 | """Render a kgraph edge as a wgraph node"""
302 | return nodeDesig(nodeType='edgenode', nodeRefs=edge)
303 |
304 | class ImpossibleGraph(Exception):
305 | def __init__(self, message):
306 | # Call the base class constructor with the parameters it needs
307 | super(ImpossibleGraph, self).__init__(message)
308 |
309 | # PythonDecorators/entry_exit_class.py
310 |
311 | class entry_exit(object):
312 |
313 | def __init__(self, f):
314 | self.f = f
315 |
316 | def __call__(self, *args):
317 | print("Entering", self.f.__name__)
318 | r = self.f(*args)
319 | print("Exited", self.f.__name__)
320 | return(r)
321 |
322 | def minimalSubgraph(kgraph, root, query, verbose=False):
323 | # transform into weighted nondirected graph
324 | # all nodes become nodes ("truenode")
325 | # all edges also become nodes ("edgenode")
326 | # induce edge with weight 1 for each node/edge and edge/node
327 | # except: traverse starting at root, dropping any backlinks [?]
328 |
329 | # required contains nodes/edges from original kgraph
330 | required = defaultdict(list)
331 | # To start with, we don't know if root has any cands
332 | required[truenodeDesig(root)]=[]
333 | for a in query.ngrams.values():
334 | for cand in a["candidates"]:
335 | if cand.referentType == 'node':
336 | #required.add(truenodeDesig(cand.referent))
337 | required[truenodeDesig(cand.referent)].append(cand)
338 | elif cand.referentType == 'edge':
339 | #required.add(edgenodeDesig(cand.referent))
340 | required[edgenodeDesig(cand.referent)].append(cand)
341 | if verbose:
342 | print("Steiner tree must contain:")
343 | for n, c in required.items():
344 | print(" ", n.nodeRefs[0], c)
345 |
346 | # seen contains nodes/edges from original kgraph
347 | seen = set()
348 |
349 | # q contains nodes/edges from original kgraph
350 | q = Queue(maxsize=kgraph.number_of_nodes() + 3*kgraph.number_of_edges())
351 | q.put(root)
352 |
353 | # wg contains wgnodes, wgedges
354 | global wg
355 | wg = Graph()
356 |
357 | while not q.empty():
358 | # print("Queue size: {}; wg size {}".format(q.qsize(), len(wg)), file=sys.stderr)
359 | obj = q.get()
360 | # print("Dequeue {}".format(obj), file=sys.stderr)
361 | if not obj in seen:
362 | if isinstance(obj, (str)):
363 | # unseen kgraph node
364 | seen.add(obj)
365 | node = obj
366 | # print("wg: add true node {}".format(node), file=sys.stderr)
367 | wg.add_node(truenodeDesig(node))
368 | for node2 in kgraph.edge[node]:
369 | # print("Enqueue edge {} {}".format(node, node2), file=sys.stderr)
370 | q.put((node,node2))
371 | elif isinstance(obj, (list, tuple)) and len(obj)==2:
372 | # unseen kgraph edge
373 | seen.add(obj)
374 | # edge = obj
375 | # create a node representing original edge
376 | (node1, node2) = obj
377 | truenode1 = truenodeDesig(node1)
378 | truenode2 = truenodeDesig(node2)
379 | edge = obj
380 | # print("wg: add edge node {}".format(edge), file=sys.stderr)
381 | edgenode = edgenodeDesig(edge)
382 | wg.add_node(edgenode)
383 | wg.add_edge(truenode1, edgenode, weight=1)
384 | wg.add_edge(edgenode, truenode2, weight=1)
385 | # print("Enqueue node {}".format(node2), file=sys.stderr)
386 | q.put(node2)
387 | else:
388 | print("Unexpected {}".format(obj), file=sys.stderr)
389 | else:
390 | # print("Obj {} already seen".format(obj), file=sys.stderr)
391 | pass
392 |
393 | # print("Weighted non-directed graph")
394 | # pprint(wg.nodes())
395 | # return (None, wg)
396 | # generate minimal steiner tree
397 | try:
398 | requiredNodes = list(required.keys())
399 | st = make_steiner_tree(wg, requiredNodes)
400 | # convert back to directed graph
401 | neededTruenodes = [nd.nodeRefs[0] for nd in st.nodes() if nd.nodeType=='truenode']
402 | subg = kgraph.subgraph(neededTruenodes)
403 | return (st, wg, subg)
404 | except ValueError as ve:
405 | if "not in original graph" in str(ve):
406 | raise ImpossibleGraph("Cannot generate subgraph of {} containing {}".format("weightedGraph", requiredNodes))
407 | else:
408 | raise(ve)
409 |
410 | g = None
411 |
412 | def htGraph(**kwargs):
413 | global g
414 | g = KGraph(domainType='ht')
415 | g.populateAll()
416 | return g
417 |
--------------------------------------------------------------------------------
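A brief usage sketch for the domain graph above. It assumes the cached leaf vocabularies under LEAF_VOCAB_CACHE and the hybridJaccard config files referenced above are available locally, since populateAll() loads them:

    from graph import htGraph

    kg = htGraph()   # builds the 'ht' KGraph and populates node/edge values

    # ontology label match: 'hair_color' normalizes to the camelCase-split 'hair Color'
    print(kg.edgeMatch(('adultservice', 'adultservice.hairColor'), 'hair_color'))   # expected True

    # leaf-vocabulary match within a small edit distance
    print(kg.nodeEditWithin('adultservice.eyeColor', 'bleu', within=2))   # e.g. ('blue', 2) if 'blue' was harvested

    # hybrid-Jaccard near match maps a misspelling onto a harvested color (or None)
    print(kg.nodeNearMatch('adultservice.hairColor', 'blonde hair'))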
/src/dig/harvest.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import sys
4 | if sys.version_info < (3, 0):
5 | raise RuntimeError("must use python 3.0 or greater")
6 |
7 | import sys
8 | from elasticsearch import Elasticsearch
9 | from pprint import pprint
10 | import json
11 | from collections import OrderedDict
12 | import os
13 |
14 | CA_CERTS_PATH='/Users/philpot/aws/credentials/certs.pem'
15 |
16 | es = Elasticsearch(
17 | [
18 | 'https://darpamemex:darpamemex@esc.memexproxy.com/' # dig-ht-latest/offer
19 | # 'http://user:secret@localhost:9200/',
20 | ],
21 | # make sure we verify SSL certificates (off by default)
22 | verify_certs=True,
23 | # provide a path to CA certs on disk
24 | ca_certs=CA_CERTS_PATH
25 | )
26 |
27 | def makeBodyNested(fieldName="name", innerPath="itemOffered", size=10):
28 | return {
29 | "query": {
30 | "match_all": {}
31 | },
32 | "aggs": {
33 | "toplevelAgg": {
34 | "nested": {
35 | "path": innerPath
36 | },
37 | "aggs": {
38 | "termAgg": {
39 | "terms": {
40 | "field": "{}.{}".format(innerPath, fieldName),
41 | "size" : size
42 | }
43 | }
44 | }
45 | }
46 | }
47 | }
48 |
49 | def makeBodyDirect(fieldName="name", size=10):
50 | return {
51 | "query": {
52 | "match_all": {}
53 | },
54 | "aggs": {
55 | "termAgg": {
56 | "terms": {
57 | "field": fieldName,
58 | "size": size
59 | }
60 | }
61 | }
62 | }
63 |
64 | def makeBody(fieldName="name", innerPath="", size=10):
65 | if innerPath:
66 | return makeBodyNested(fieldName=fieldName,
67 | innerPath=innerPath,
68 | size=size)
69 | else:
70 | return makeBodyDirect(fieldName=fieldName,
71 | size=size)
72 |
73 | """
74 | {'_shards': {'failed': 0, 'successful': 20, 'total': 20},
75 | 'aggregations': {'toplevelAgg': {'doc_count': 19134836,
76 | 'termAgg': {'buckets': [{'doc_count': 18104,
77 | 'key': 'jessica'},
78 | {'doc_count': 15956,
79 | 'key': 'ashley'},
80 | {'doc_count': 12748,
81 | 'key': 'amber'},
82 | {'doc_count': 12037,
83 | 'key': 'tiffany'},
84 | {'doc_count': 11808,
85 | 'key': 'bella'},
86 | {'doc_count': 11628,
87 | 'key': 'mya'},
88 | {'doc_count': 11514,
89 | 'key': 'candy'},
90 | {'doc_count': 10963,
91 | 'key': 'nikki'},
92 | {'doc_count': 10932,
93 | 'key': 'diamond'},
94 | {'doc_count': 10808,
95 | 'key': 'lexi'}],
96 | 'doc_count_error_upper_bound': 2728,
97 | 'sum_other_doc_count': 1322532}}},
98 | 'hits': {'hits': [], 'max_score': 0.0, 'total': 19134836},
99 | 'timed_out': False,
100 | 'took': 1422}
101 | """
102 |
103 | def harvest(index="dig-ht-latest", docType="webpage", fieldName="addressCountry", innerPath="", size=10):
104 | nested = True if innerPath else False
105 | body=makeBody(fieldName=fieldName, innerPath=innerPath, size=size)
106 | result = es.search(index=index,
107 | doc_type=docType,
108 | body=body,
109 | search_type="count")
110 | agg = result['aggregations']['toplevelAgg']['termAgg'] if nested else result['aggregations']['termAgg']
111 | report = {"docType": docType,
112 | "fieldName": fieldName,
113 | "innerPath": innerPath,
114 | "size": size,
115 | # use 'result' later to get hitsTotal, sum_other_doc_count if needed
116 | "result": result,
117 | # collections.OrderedDict is serialized to JSON in the order keys were added
118 | # so preserves decreasing value order
119 | "histo": OrderedDict()
120 | }
121 | for bucket in agg['buckets']:
122 | report["histo"][bucket["key"]] = bucket["doc_count"]
123 | return report
124 |
125 | # def outputPathname(docType="webpage", innerPath="mainEntity.availableAtOrFrom.address", fieldName="addressCountry", root="/tmp", **kwargs):
126 | # return os.path.join(root, "{}_{}_{}.json".format(docType, innerPath.replace('.', '_').replace('__','_'), fieldName))
127 |
128 | OUTPUT_ROOT = "/Users/philpot/Documents/project/graph-keyword-search/src/dig/data/cache"
129 |
130 | def outputPathname(docType="webpage", innerPath="", fieldName="addressCountry", root=OUTPUT_ROOT, **kwargs):
131 | return os.path.join(root, "{}_{}_{}.json".format(docType, innerPath.replace('.', '_').replace('__','_'), fieldName))
132 |
133 | WORKING=[ # works
134 | {"docType": "offer", "innerPath": "itemOffered", "fieldName": "name", "size": 200},
135 | # works
136 | {"docType": "webpage", "innerPath": "publisher", "fieldName": "name", "size": 200},
137 | {"docType": "offer", "innerPath": "seller", "fieldName": "name", "size": 200},
138 | {"docType": "offer", "innerPath": "itemOffered", "fieldName": "personAge", "size": 20},
139 |
140 | {"docType": "offer", "innerPath": "mainEntityOfPage.publisher", "fieldName": "name", "size": 20},
141 | {"docType": "seller", "innerPath": "makesOffer.mainEntityOfPage.publisher", "fieldName": "name", "size": 20},
142 | {"docType": "phone", "innerPath": "owner.makesOffer.mainEntityOfPage.publisher", "fieldName": "name", "size": 20},
143 | {"docType": "offer", "innerPath": "priceSpecification", "fieldName": "billingIncrement", "size": 10},
144 | {"docType": "offer", "innerPath": "priceSpecification", "fieldName": "price", "size": 10},
145 | {"docType": "offer", "innerPath": "priceSpecification", "fieldName": "name", "size": 10},
146 | {"docType": "offer", "innerPath": "priceSpecification", "fieldName": "unitCode", "size": 10},
147 | {"docType": "offer", "innerPath": "priceSpecification", "fieldName": "billingIncrement", "size": 10},
148 |
149 | {"docType": "offer", "innerPath": "availableAtOrFrom", "fieldName": "name", "size": 10},
150 | {"docType": "offer", "innerPath": "availableAtOrFrom.geo", "fieldName": "lat", "size": 10},
151 | {"docType": "offer", "innerPath": "itemOffered", "fieldName": "hairColor", "size": 20},
152 | {"docType": "offer", "innerPath": "itemOffered", "fieldName": "eyeColor", "size": 20},
153 | {"docType": "offer", "innerPath": "itemOffered", "fieldName": "name", "size": 20},
154 | {"docType": "offer", "innerPath": "availableAtOrFrom.geo", "fieldName": "lat", "size": 10},
155 | {"docType": "seller", "innerPath": "telephone", "fieldName": "name", "size": 10},
156 | {"docType": "seller", "innerPath": "telephone", "fieldName": "a", "size": 10}
157 | ]
158 |
159 |
160 | SPECS=[ # {"docType": "webpage", "innerPath": "mainEntity.availableAtOrFrom.address", "fieldName": "addressCountry", "size": 200},
161 | # {"docType": "webpage", "innerPath": "mainEntity.availableAtOrFrom.address", "fieldName": "addressRegion", "size": 200},
162 | # {"docType": "webpage", "innerPath": "mainEntity.availableAtOrFrom.address", "fieldName": "addressLocality", "size": 200},
163 |
164 | ###{"docType": "seller", "innerPath": "email", "fieldName": "name", "size": 10},
165 | ###{"docType": "seller", "innerPath": "email", "fieldName": "a", "size": 10},
166 | ###{"docType": "offer", "innerPath": "seller.telephone", "fieldName": "a", "size": 10},
167 | # WORKS
168 | ###{"docType": "seller", "innerPath": "telephone", "fieldName": "name", "size": 10},
169 | # DOES NOT WORK in pyelasticsearch, only in sense/curl
170 | {"docType": "offer", "innerPath": "seller.telephone", "fieldName": "name", "size": 10},
171 | # WORKS
172 | {"docType": "webpage", "innerPath": "mainEntity.seller.telephone", "fieldName": "name", "size": 10}
173 |
174 | # Doesn't work
175 | # {"docType": "offer", "innerPath": "seller.telephone", "fieldName": "name", "size": 200},
176 | # ???
177 | # {"docType": "offer", "innerPath": "seller", "fieldName": "a", "size": 200},
178 | # {"docType": "offer", "innerPath": "itemOffered", "fieldName": "a", "size": 200},
179 |
180 | # {"docType": "seller", "innerPath": "telephone", "fieldName": "name", "size": 200}
181 | # bad syntax
182 | # {"docType": "address", "innerPath": "", "fieldName": "addressCountry", "size": 200}
183 | # doesn't work
184 | # probably the issue w.r.t. nested Pedro suggested
185 | # but sibling fields do work
186 | ]
187 |
188 | # SPECS=[ {"docType": "offer", "innerPath": "itemOffered", "fieldName": "name", "size": 10} ]
189 |
190 | SPECS=[ {"docType": "adultservice", "fieldName": "eyeColor", "size": 10} ]
191 |
192 | SPECS=[ {"docType": "adultservice", "fieldName": "eyeColor", "size": 10},
193 | {"docType": "adultservice", "fieldName": "hairColor", "size": 10},
194 | {"docType": "adultservice", "fieldName": "name", "size": 200},
195 | {"docType": "adultservice", "fieldName": "personAge", "size": 20},
196 |
197 |         # These are valid, but have flat distributions, so they are not useful for suggestion
198 | # {"docType": "phone", "fieldName": "name", "size": 200},
199 | # {"docType": "email", "fieldName": "name", "size": 200},
200 | # Instead seller-centric distribution
201 | {"docType": "seller", "innerPath": "telephone", "fieldName": "name", "size": 200},
202 | {"docType": "seller", "innerPath": "email", "fieldName": "name", "size": 200},
203 |
204 | {"docType": "webpage", "innerPath": "publisher", "fieldName": "name", "size": 200},
205 | # Ignore webpage.description, webpage.dateCreated
206 |
207 | # Ignore offer.identifier
208 | {"docType": "offer", "innerPath": "priceSpecification", "fieldName": "billingIncrement", "size": 10},
209 | {"docType": "offer", "innerPath": "priceSpecification", "fieldName": "price", "size": 200},
210 | {"docType": "offer", "innerPath": "priceSpecification", "fieldName": "name", "size": 200},
211 | {"docType": "offer", "innerPath": "priceSpecification", "fieldName": "unitCode", "size": 10},
212 | {"docType": "offer", "innerPath": "availableAtOrFrom.address", "fieldName": "addressLocality", "size": 200},
213 | {"docType": "offer", "innerPath": "availableAtOrFrom.address", "fieldName": "addressRegion", "size": 200},
214 | {"docType": "offer", "innerPath": "availableAtOrFrom.address", "fieldName": "addressCountry", "size": 200},
215 | # Ignore offer.availableAtOrFrom.name
216 | # Ignore offer.availableAtOrFrom.geo.lat, offer.availableAtOrFrom.geo.lon
217 |
218 | ]
219 |
220 | def harvestToFile(spec):
221 | outPath = None
222 | try:
223 | outPath = outputPathname(**spec)
224 | except:
225 | pass
226 | try:
227 | h = harvest(**spec)
228 | print("Harvest to {}".format(outPath), file=sys.stderr)
229 | with open(outPath, 'w') as f:
230 | # Don't use sort_keys here
231 | # We are counting on the behavior where collections.OrderedDict is
232 | # serialized in the order keys were added. If we add things in
233 | # order of decreasing counts, the order will stick, unless we use sort_keys.
234 | json.dump(h, f, indent=4)
235 | except Exception as e:
236 |         print("Error [{}] during processing of {}".format(e, outPath), file=sys.stderr)
237 |
238 | def generateAll ():
239 | for spec in SPECS:
240 | print()
241 | print(spec)
242 | # harvestToFile(spec)
243 | try:
244 | h = harvest(**spec)
245 | # pprint(h)
246 | l = -1
247 | try:
248 | try:
249 | # nested
250 | b = h["result"]["aggregations"]["toplevelAgg"]["termAgg"]["buckets"]
251 | except:
252 | # direct
253 | b = h["result"]["aggregations"]["termAgg"]["buckets"]
254 | l = len(b)
255 | if l>0:
256 | print("Success %d for %s" % (l, spec), file=sys.stderr)
257 | q = 5
258 | for i,v in zip(range(q+1), b[0:q]):
259 | print("value %d is %s" % (i, v))
260 | elif l==0:
261 | print("No data for %s" % (spec), file=sys.stderr)
262 | else:
263 | pass
264 | except Exception as e:
265 | print("Nothing happened for %s" % (spec), file=sys.stderr)
266 | print(e, file=sys.stderr)
267 | except Exception as e:
268 | print("Failed during %s" % (spec), file=sys.stderr)
269 | print(e, file=sys.stderr)
270 |
271 | """
272 |
273 | POST https://darpamemex:darpamemex@esc.memexproxy.com/dig-ht-latest/offer/_search?search_type=count
274 | {
275 | "query": {
276 | "filtered": {
277 | "query": {
278 | "match_all": {}
279 | },
280 | "filter": {
281 | "nested": {
282 | "path": "itemOffered",
283 | "filter": {
284 | "exists": {
285 | "field": "eyeColor"
286 | }
287 | }
288 | }
289 | }
290 | }
291 | },
292 |
293 | "aggs": {
294 | "toplevelAgg": {
295 | "nested": {
296 | "path": "itemOffered"
297 | },
298 | "aggs": {
299 | "termAgg": {
300 | "terms": {
301 | "field": "itemOffered.eyeColor",
302 | "size" : 100
303 | }
304 | }
305 | }
306 | }
307 | }
308 | }
309 | """
310 |
--------------------------------------------------------------------------------
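
The quoted POST body at the end of the harvest module above is the raw form of the nested terms aggregation that harvest() builds via makeBody (defined earlier in the file, not shown in this excerpt). Below is a minimal sketch of issuing an equivalent request directly, in the same elasticsearch-py style harvest() uses; the index, doc type, and field names come from the example in the docstring, the localhost URL is an assumption, and the call presumes an ES 1.x/2.x cluster that still accepts doc_type and search_type="count":

    from elasticsearch import Elasticsearch

    # harvest.py points its own client at the DIG cluster; localhost is used here only for illustration.
    es = Elasticsearch("http://localhost:9200")

    # Nested terms aggregation, mirroring the curl body quoted in the docstring above.
    body = {
        "aggs": {
            "toplevelAgg": {
                "nested": {"path": "itemOffered"},
                "aggs": {
                    "termAgg": {
                        "terms": {"field": "itemOffered.eyeColor", "size": 100}
                    }
                }
            }
        }
    }

    # search_type="count" (as in harvest()) suppresses hits; only aggregations come back.
    result = es.search(index="dig-ht-latest", doc_type="offer", body=body, search_type="count")
    for bucket in result["aggregations"]["toplevelAgg"]["termAgg"]["buckets"]:
        print(bucket["key"], bucket["doc_count"])
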
/src/dig/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys, os
4 | import argparse
5 |
6 | from graph import htGraph, ImpossibleGraph, minimalSubgraph
7 | from query import Query
8 | from synonym import Thesaurus
9 | # from outline import Outline, iii
10 | from outline import *
11 | import configparser
12 |
13 | from pprint import pprint
14 |
15 | g = None
16 | q = None
17 | s = None
18 | m = None
19 | wg = None
20 | sg = None
21 | o = None
22 |
23 | def interpretConfig(configFile, verbose=False):
24 | try:
25 | cfg = configparser.ConfigParser()
26 | cfg.read(configFile)
27 | except:
28 | if verbose:
29 | print("Unable to read any configuration from {}".format(configFile), file=sys.stderr)
30 | kwargs = {}
31 | for sectionName in cfg.sections():
32 | section = cfg[sectionName]
33 | for key, value in section.items():
34 | kw = sectionName + '_' + key
35 | try:
36 | if key.endswith('count') or key.endswith('size') or key.endswith('length'):
37 | kwargs[kw] = section.getint(key)
38 | elif key.endswith('factor') or key.endswith('score'):
39 | kwargs[kw] = section.getfloat(key)
40 | elif key.endswith('enable'):
41 | kwargs[kw] = section.getboolean(key)
42 | else:
43 | kwargs[kw] = value
44 | except:
45 | kwargs[kw] = value
46 | return kwargs
47 |
48 |
49 | def main(argv=None):
50 | '''this is called if run from command line'''
51 | global g, q, s, m, wg, sg, o
52 | parser = argparse.ArgumentParser()
53 | parser.add_argument('terms', nargs='*', default=[], action="append")
54 | parser.add_argument('-v', '--verbose', required=False, help='verbose', action='store_true')
55 | parser.add_argument('-e', '--explain', required=False, help='include explanations in intermediate repn',
56 | choices=['text','structured','None'])
57 | # parser.add_argument('-o', '--options')
58 | parser.add_argument('-j', '--config', required=False, help='config', default=os.path.join(os.path.dirname(__file__), "config.ini"))
59 | args = parser.parse_args()
60 | # TODO nargs generates a list of lists
61 | terms = args.terms[0]
62 | cmdline = {"verbose": args.verbose,
63 | "explain": None if args.explain=='None' else args.explain}
64 | config = interpretConfig(args.config)
65 | g = htGraph(**cmdline, **config)
66 | s = Thesaurus(**cmdline, **config)
67 | q = Query(terms, g, s, **cmdline, **config)
68 | q.suggestCandidates()
69 | q.dump()
70 | # succeeds with roots = ['offer']
71 | # fails with roots = ['phone']
72 | roots = ['seller', 'phone', 'email', 'offer', 'adultservice', 'webpage']
73 | for root in roots:
74 | try:
75 | # m is steiner tree
76 | # wg is input nondirected graph
77 | # sg is output directed subgraph
78 | (m, wg, sg) = minimalSubgraph(g, root, q)
79 | o = Outline(g, sg, q, root, **cmdline, **config)
80 | except ImpossibleGraph as ig:
81 | if args.verbose:
82 | # print(ig, file=sys.stderr)
83 | print("It is not possible to generate a subgraph with root {}".format(root), file=sys.stderr)
84 | continue
85 | o.detail()
86 |
87 | # call main() if this is run as standalone
88 | if __name__ == "__main__":
89 | sys.exit(main())
90 |
--------------------------------------------------------------------------------
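
In main.py above, interpretConfig() flattens config.ini into keyword arguments named section_key and coerces values by key suffix (count/size/length become int, factor/score become float, enable becomes bool). The repository's actual config.ini is not part of this excerpt, so the fragment below is a hypothetical illustration whose section and key names were chosen only to line up with the parameters consumed by Query and Thesaurus:

    import configparser

    cfg = configparser.ConfigParser()
    # Hypothetical sections/keys, chosen to match the kwargs Query and Thesaurus accept.
    cfg["levenshtein"] = {"enable": "yes", "within_score": "1.0", "above_score": "0.0"}
    cfg["word2vec"] = {"enable": "no", "size": "10", "minimum_score": "0.5"}

    # interpretConfig() would flatten this into kwargs such as levenshtein_enable=True,
    # levenshtein_within_score=1.0, word2vec_size=10, word2vec_minimum_score=0.5.
    for sectionName in cfg.sections():
        section = cfg[sectionName]
        for key, value in section.items():
            print(sectionName + "_" + key, "=", value)
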
/src/dig/outline.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys
4 | from collections import defaultdict
5 | from networkx import shortest_path
6 | import json
7 |
8 | iii = None
9 |
10 | def pathFromRoot(graph, cand, node, root):
11 | nodes = shortest_path(graph, root, node)
12 | pathComponents = [root]
13 | waypoints = nodes[0:-1]
14 | for (f,t) in zip(waypoints, waypoints[1:]):
15 | pathComponents.append(graph.labelInGraph((f,t)) or "missing")
16 | # pathComponents.append(nodes[-1].upper())
17 | # terminus is a leaf node, named after class plus relation
18 | # we only want the relation
19 | pathComponents.append(node.split('.')[-1])
20 | path = ".".join(pathComponents)
21 | return path
22 |
23 | class Outline(object):
24 | def __init__(self, graph, subgraph, query, root, verbose=False, explain=False, **kwargs):
25 | self.graph = graph
26 | self.subgraph = subgraph
27 | self.query = query
28 | self.root = root
29 | self.verbose = verbose
30 | self.explain = explain
31 |
32 | def intermediate(self):
33 | global iii
34 | relationsMentioned = []
35 | classesMentioned = []
36 | must = []
37 | should = []
38 | i = defaultdict(list)
39 | i["root"] = self.root
40 | # to begin with, no terms are covered
41 | touches = defaultdict(list)
42 | for a in self.query.ngrams.values():
43 | for cand in a["candidates"]:
44 | if cand.referentType == 'node':
45 | node = cand.referent
46 | if self.graph.isLeaf(node):
47 | # Leaf node corresponds to an equality/fuzzy relation constraint
48 | m = {"path": pathFromRoot(self.graph, cand, node, self.root),
49 | "matchType": "direct" if cand.candidateType == "direct" else "inferred",
50 | # "operands": [cand.referent, cand.content],
51 | "className": cand.referent.split('.')[0],
52 | "relationName": cand.referent.split('.')[1],
53 | "value": cand.content}
54 | if self.explain:
55 | m["_explanation"] = cand.explain(self.explain)
56 | must.append(m)
57 | else:
58 | # Other node corresponds to mention of a class (e.g., the word 'seller' is mentioned)
59 | m = {"className": self.graph.labelInGraph(node)}
60 | if self.explain:
61 | m["_explanation"] = cand.explain(self.explain)
62 | classesMentioned.append(m)
63 | # Record (possibly partial) coverage of query terms
64 | for w in a["words"]:
65 | t = {"term": w,
66 | "foundIn": "node"}
67 | if self.explain:
68 | t["_explanation"] = cand.explain(self.explain)
69 | touches[w].append(t)
70 | elif cand.referentType == 'edge':
71 | edge = cand.referent
72 | # Edge match corresponds to mention of an edge
73 | # May or may not correspond to relation constraint on that edge
74 | # In future, this might mean we want result to include its class
75 | m = {"className": self.graph.labelInGraph(edge[0]),
76 | "relationName": self.graph.labelInGraph(edge)}
77 | if self.explain:
78 | m["_explanation"] = cand.explain(self.explain)
79 | relationsMentioned.append(m)
80 | # Record (possibly partial) coverage of query terms
81 | for w in a["words"]:
82 | t = {"term": w,
83 | "foundIn": "edge"}
84 | if self.explain:
85 | t["_explanation"] = cand.explain(self.explain)
86 | touches[w].append(t)
87 | # Any terms never covered are now free-text matches
88 | for term in self.query.terms:
89 | if not touches[term]:
90 | s = {"matchType": "free",
91 | "operands": [term]}
92 | if self.explain:
93 | s["_explanation"] = "{} uninterpretable".format(term)
94 | should.append(s)
95 |
96 | i["touches"] = touches
97 | i["relationsMentioned"] = relationsMentioned
98 | i["classesMentioned"] = classesMentioned
99 | i["must"] = must
100 | i["should"] = should
101 | iii = i
102 | return i
103 |
104 | def detail(self, file=sys.stdout):
105 | # print (root,g,q,s,m,wg,sg)
106 | print("", file=file)
107 | if self.verbose:
108 | print("\nRoot {}".format(self.root), file=file)
109 | print("\nDetail of outline {}".format(self), file=file)
110 | print("Input Graph: {}".format(self.graph), file=file)
111 | print("Input Keywords: {}".format(self.query.terms), file=file)
112 | print("Input Keyword Coloring: \n{}".format(self.query.dumpToString(indent=2)), file=file)
113 | print("Relevant Subgraph: {}".format(self.subgraph), file=file)
114 | print("Intermediate Repn:", file=file)
115 | print(json.dumps(self.intermediate(), sort_keys=True, indent=4), file=file)
116 |
--------------------------------------------------------------------------------
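
In outline.py above, intermediate() collects the query coloring into a plain dict (root, must, should, touches, relationsMentioned, classesMentioned) that detail() then serializes with json.dumps. The hand-written example below only illustrates that shape; the query, root, paths, and values are hypothetical and were not produced by running the code:

    import json

    # Illustrative shape only, for a hypothetical query "blue seller phone".
    example_intermediate = {
        "root": "offer",
        "must": [
            {"path": "offer.itemOffered.eyeColor",
             "matchType": "inferred",
             "className": "adultservice",
             "relationName": "eyeColor",
             "value": "blue"}
        ],
        "classesMentioned": [{"className": "seller"}],
        "relationsMentioned": [],
        "should": [{"matchType": "free", "operands": ["phone"]}],
        "touches": {"blue": [{"term": "blue", "foundIn": "node"}],
                    "seller": [{"term": "seller", "foundIn": "node"}],
                    "phone": []},
    }

    print(json.dumps(example_intermediate, sort_keys=True, indent=4))
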
/src/dig/prep.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | INDEXNAME=twitter
4 |
5 | curl -XDELETE "http://localhost:9200/${INDEXNAME}/"
6 |
7 | curl -s -XPUT "http://localhost:9200/${INDEXNAME}/" -d '{
8 | "mappings": {
9 | "tweet": {
10 | "properties": {
11 | "text": {
12 | "type": "string",
13 | "term_vector": "yes",
14 | "store" : true,
15 | "index_analyzer" : "fulltext_analyzer"
16 | },
17 | "fullname": {
18 | "type": "string",
19 | "term_vector": "no",
20 | "index_analyzer" : "fulltext_analyzer"
21 | },
22 | "eyecolor": {
23 | "type": "string",
24 | "term_vector": "no",
25 | "index": "not_analyzed"
26 | }
27 | }
28 | }
29 | },
30 | "settings" : {
31 | "index" : {
32 | "number_of_shards" : 1,
33 | "number_of_replicas" : 0
34 | },
35 | "analysis": {
36 | "analyzer": {
37 | "fulltext_analyzer": {
38 | "type": "custom",
39 | "tokenizer": "whitespace",
40 | "filter": [
41 | "lowercase",
42 | "type_as_payload"
43 | ]
44 | }
45 | }
46 | }
47 | }
48 | }'
49 |
50 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/1?pretty=true" -d '{
51 | "fullname" : "John Doe",
52 | "text" : "twitter test test test ",
53 | "eyecolor": "blue"
54 | }'
55 |
56 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/2?pretty=true" -d '{
57 | "fullname" : "Jane Doe",
58 | "text" : "Another twitter test ...",
59 | "eyecolor": "blue"
60 | }'
61 |
62 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/3?pretty=true" -d '{
63 | "fullname" : "Robot",
64 | "text" : "one two two three three three four four four four four four four five four five six seven eight nine",
65 | "eyecolor": "red"
66 | }'
--------------------------------------------------------------------------------
/src/dig/prep2.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | INDEXNAME=twitter2
4 |
5 | curl -XDELETE "http://localhost:9200/${INDEXNAME}/"
6 |
7 | curl -s -XPUT "http://localhost:9200/${INDEXNAME}/" -d '{
8 | "mappings": {
9 | "tweet": {
10 | "properties": {
11 | "text": {
12 | "type": "string",
13 | "term_vector": "yes",
14 | "store" : true,
15 | "index_analyzer" : "fulltext_analyzer"
16 | },
17 | "fullname": {
18 | "type": "string",
19 | "term_vector": "no",
20 | "index_analyzer" : "fulltext_analyzer"
21 | },
22 | "eyecolor": {
23 | "type": "string",
24 | "term_vector": "no",
25 | "index": "not_analyzed"
26 | },
27 | "children": {
28 | "type": "nested",
29 | "properties": {
30 | "name": {
31 | "type": "string",
32 | "index": "not_analyzed"
33 | },
34 | "school": {
35 | "type": "string",
36 | "index": "not_analyzed"
37 | }
38 | }
39 | }
40 | }
41 | }
42 | },
43 | "settings" : {
44 | "index" : {
45 | "number_of_shards" : 1,
46 | "number_of_replicas" : 0
47 | },
48 | "analysis": {
49 | "analyzer": {
50 | "fulltext_analyzer": {
51 | "type": "custom",
52 | "tokenizer": "whitespace",
53 | "filter": [
54 | "lowercase",
55 | "type_as_payload"
56 | ]
57 | }
58 | }
59 | }
60 | }
61 | }'
62 |
63 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/1?pretty=true" -d '{
64 | "fullname" : "John Doe",
65 | "text" : "twitter test test test ",
66 | "eyecolor": "blue",
67 | "children": [ {"name": "Alice", "school": "Aardvark"} ]
68 | }'
69 |
70 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/2?pretty=true" -d '{
71 | "fullname" : "Jane Doe",
72 | "text" : "Another twitter test ...",
73 | "eyecolor": "blue",
74 | "children": [ {"name": "Bob", "school": "Aardvark"},
75 | {"name": "Carole", "school": "Aardvark"},
76 | {"name": "Dan", "school": "Aardvark"},
77 | {"name": "Eve", "school": "Badger"} ]
78 | }'
79 |
80 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/3?pretty=true" -d '{
81 | "fullname" : "Robot3",
82 | "text" : "one two two three three three four four four four four four four five four five six seven eight nine",
83 | "eyecolor": "red",
84 | "children": [ {"name": "Ronald3", "school": "Factory"},
85 | {"name": "Rhonda3", "school": "Junkyard"}]
86 | }'
87 |
88 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/4?pretty=true" -d '{
89 | "fullname" : "Robot4",
90 | "text" : "one two two three three three four four four four four four four five four five six seven eight nine",
91 | "eyecolor": "red",
92 | "children": [ {"name": "Ronald4", "school": "Factory"},
93 | {"name": "Rhonda4", "school": "Junkyard"}]
94 | }'
95 |
96 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/5?pretty=true" -d '{
97 | "fullname" : "Robot5",
98 | "text" : "one two two three three three four four four four four four four five four five six seven eight nine",
99 | "eyecolor": "red",
100 | "children": [ {"name": "Ronald5", "school": "Factory"},
101 | {"name": "Rhonda5", "school": "Junkyard"}]
102 | }'
103 |
104 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/6?pretty=true" -d '{
105 | "fullname" : "Robot6",
106 | "text" : "one two two three three three four four four four four four four five four five six seven eight nine",
107 | "eyecolor": "red",
108 | "children": [ {"name": "Ronald6", "school": "Factory"},
109 | {"name": "Rhonda6", "school": "Junkyard"}]
110 | }'
111 |
112 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/7?pretty=true" -d '{
113 | "fullname" : "Robot7",
114 | "text" : "one two two three three three four four four four four four four five four five six seven eight nine",
115 | "eyecolor": "red",
116 | "children": [ {"name": "Ronald7", "school": "Factory"},
117 | {"name": "Rhonda7", "school": "Junkyard"}]
118 | }'
119 |
120 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/8?pretty=true" -d '{
121 | "fullname" : "Richard Bloggs",
122 | "text" : "one two",
123 | "eyecolor": "brown",
124 | "children": [ {"name": "Frank", "school": "Coyote"},
125 | {"name": "Glenda", "school": "Coyote"}]
126 | }'
--------------------------------------------------------------------------------
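
In prep2.sh above, the children field is mapped as nested, which is the same situation harvest.py handles with its toplevelAgg/termAgg pair. A minimal sketch of aggregating over that nested field once the documents above are indexed, assuming the local ES 1.x/2.x node the script targets and the elasticsearch-py client:

    from elasticsearch import Elasticsearch

    es = Elasticsearch("http://localhost:9200")
    body = {
        "aggs": {
            "toplevelAgg": {
                "nested": {"path": "children"},
                "aggs": {"termAgg": {"terms": {"field": "children.school", "size": 10}}}
            }
        }
    }
    result = es.search(index="twitter2", doc_type="tweet", body=body, search_type="count")
    # With the documents indexed by prep2.sh this should bucket roughly as:
    # Factory 5, Junkyard 5, Aardvark 4, Coyote 2, Badger 1
    for bucket in result["aggregations"]["toplevelAgg"]["termAgg"]["buckets"]:
        print(bucket["key"], bucket["doc_count"])
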
/src/dig/prep_ht.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | INDEXNAME=ht
4 |
5 | pushd $PROJ/dig-elasticsearch
6 | git pull
7 | mapping=$PROJ/dig-elasticsearch/types/webpage/esMapping-dig-ht-DT.json
8 | popd
9 |
10 | curl -k -XPUT "https://darpamemex:darpamemex@esc.memexproxy.com/${INDEXNAME}" -d @$mapping
11 |
12 |
13 | curl -XDELETE "http://localhost:9200/${INDEXNAME}/"
14 |
15 | curl -s -XPUT "http://localhost:9200/${INDEXNAME}/" -d @$mapping
16 |
17 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/1?pretty=true" -d '{
18 | "fullname" : "John Doe",
19 | "text" : "twitter test test test ",
20 | "eyecolor": "blue"
21 | }'
22 |
23 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/2?pretty=true" -d '{
24 | "fullname" : "Jane Doe",
25 | "text" : "Another twitter test ...",
26 | "eyecolor": "blue"
27 | }'
28 |
29 | curl -XPUT "http://localhost:9200/${INDEXNAME}/tweet/3?pretty=true" -d '{
30 | "fullname" : "Robot",
31 | "text" : "one two two three three three four four four four four four four five four five six seven eight nine",
32 | "eyecolor": "red"
33 | }'
--------------------------------------------------------------------------------
/src/dig/query.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys
4 | from itertools import count
5 | from synonym import Thesaurus, Synonym
6 | from collections import Counter
7 | try:
8 | from StringIO import StringIO
9 | except ImportError:
10 | from io import StringIO
11 | from util import canonList
12 | from copy import copy
13 |
14 | class Candidate(object):
15 | def __init__(self, referent=None, referentType=None, candidateType=None, synonym=None, distance=0):
16 | # referent is something in the graph: a node or edge, which for us is a string or tuple of strings
17 | self.referent = referent
18 | # referentType is 'node' or 'edge'
19 | self.referentType = referentType
20 | # candidateType is 'direct', 'levenshtein', 'hybridJaccard', 'synonym'
21 | # candidateType is 'direct', 'levenshtein', 'hybridJaccard', 'word2vec', 'wordnet'
22 | self.candidateType = candidateType
23 | self.synonym = synonym
24 | # distance = 0 for direct
25 | # presumably distance > 0 for non-direct
26 | self.distance = distance
27 |
28 | @property
29 | def indicator(self):
30 | try:
31 | return self.synonym and self.synonym.indicator
32 | except:
33 | return None
34 |
35 | @property
36 | def content(self):
37 | try:
38 | return self.synonym and self.synonym.content
39 | except:
40 | return self.synonym
41 |
42 | def __str__(self, *args, **kwargs):
43 | sig = "None"
44 | try:
45 | sig = (self.referentType or "")
46 | sig += " "
47 | sig += (self.candidateType or "")
48 | sig += " "
49 | sig += (self.referentsLabel() or "")
50 | sig += " "
51 | sig += (str(getattr(self,"synonym",None) or ""))
52 | except Exception as _:
53 | pass
54 | return "<" + str(type(self).__name__) + " " + sig + ">"
55 |
56 | def __repr__(self, *args, **kwargs):
57 | return self.__str__()
58 |
59 | def referentsLabel(self):
60 | return "/".join(canonList(self.referent))
61 |
62 | def summary(self):
63 | try:
64 | return self.candidateType
65 | except:
66 | return None
67 |
68 | def explain(self, style):
69 | if style == 'text':
70 | return self.textExplain()
71 | elif style == 'structured':
72 | return self.structuredExplain()
73 | elif not style:
74 | return None
75 | else:
76 | raise ValueError("Unknown explanation style {}".format(style))
77 |
78 | def textExplain(self):
79 | prefix = "Cand"
80 | try:
81 | if self.candidateType=='direct':
82 | return "{}: {} {}: Direct({})".format(prefix, self.referentType, self.referentsLabel(), self.indicator)
83 | elif self.candidateType=='levenshtein':
84 | return "{}: {} {}: Levenshtein({})={}".format(prefix, self.referentType, self.referent, self.synonym, self.distance)
85 | elif self.candidateType=='hybridJaccard':
86 | return "{}: {} {}: HybridJaccard({})".format(prefix, self.referentType, self.referent, self.synonym)
87 | elif self.candidateType=='wordnet':
88 | s = self.synonym
89 | return "{}: {} {}: Wordnet({},{})=>{}".format(prefix, self.referentType, self.referent, s.source, s.indicator, s.content)
90 | elif self.candidateType=='word2vec':
91 | s = self.synonym
92 | return "{}: {} {}: Word2vec({},{})=>{}".format(prefix, self.referentType, self.referent, s.source, s.indicator, s.content)
93 | except:
94 | pass
95 | return str(self)
96 |
97 | def structuredExplain(self):
98 | d = copy(self.__dict__)
99 | synonym = d.get('synonym')
100 | if synonym:
101 | d['synonym'] = synonym.__dict__
102 | return d
103 |
104 | def binding(self):
105 | # return "Binding of indicator {} is content {}".format(self.indicator, self.content)
106 | return (self.candidateType, self.indicator, self.content)
107 |
108 | class Query(object):
109 | def __init__(self, terms, graph, thesaurus=None,
110 | direct_enable=True,
111 | levenshtein_enable=True,
112 | levenshtein_above_score=0.0,
113 | levenshtein_within_score=1.0,
114 | hybridjaccard_enable=True,
115 | hybridjaccard_allowexact_enable=False,
116 | **kwargs):
117 | self.terms = terms
118 | self.graph = graph
119 | # self.thesaurus = thesaurus or Thesaurus()
120 | self.thesaurus = thesaurus
121 | self.direct_enable = direct_enable
122 | self.levenshtein_enable = levenshtein_enable
123 | self.levenshtein_above_score = levenshtein_above_score
124 | self.levenshtein_within_score = levenshtein_within_score
125 | self.hybridjaccard_enable = hybridjaccard_enable
126 | self.hybridjaccard_allowexact_enable = hybridjaccard_allowexact_enable
127 | self.initNgrams(terms)
128 |
129 | def __str__(self, *args, **kwargs):
130 | limit = 4
131 | sig = "None"
132 | try:
133 | sig = " ".join(self.terms[0:limit]) + "..."
134 | except Exception as _:
135 | pass
136 | return "<" + str(type(self).__name__) + " " + sig + ">"
137 |
138 | def __repr__(self, *args, **kwargs):
139 | return self.__str__(*args, **kwargs)
140 |
141 | def initNgrams(self, terms):
142 | self.ngrams = {}
143 | for term,idx in zip(terms, count(0,2)):
144 | # print("Term 1 {}".format(term))
145 | # print("Assign spot {} to unigram {}".format(idx,term))
146 | self.ngrams[term] = None
147 | self.ngrams[term] = {"term": term,
148 | "words": [term],
149 | "index": idx,
150 | "cardinality": 1}
151 | for t1,t2,idx in zip(terms, terms[1:], count(1,2)):
152 | term = t1 + "_" + t2
153 | # print("Assign spot {} to bigram {}".format(idx, term))
154 | self.ngrams[term] = {"term": term,
155 | "words": [t1, t2],
156 | "index": idx,
157 | "cardinality": 2}
158 |
159 | def suggestCandidates(self):
160 | # singletons only
161 | graph = self.graph
162 | ngrams = self.ngrams
163 | thesaurus = self.thesaurus
164 | # levenshtein config
165 | levenshteinWithin = self.levenshtein_within_score
166 | levenshteinAbove = self.levenshtein_above_score
167 | # hybrid jaccard config
168 | hybridJaccardAllowExact = self.hybridjaccard_allowexact_enable
169 |
170 | for q,d in ngrams.items():
171 | keyword = q
172 | d["candidates"] = []
173 |
174 | # SINGLETON
175 | if d["cardinality"] == 1:
176 | # singleton, direct node
177 | if self.direct_enable:
178 | for node in graph.nodes():
179 | if graph.nodeMatch(node, keyword):
180 | synonym = Synonym(source='direct', indicator=keyword, content=keyword, score=1.0)
181 | d["candidates"].append(Candidate(referent=node, referentType='node', candidateType='direct', synonym=synonym))
182 | # singleton, direct edge
183 | for edge in graph.edges():
184 | if graph.edgeMatch(edge, keyword):
185 | synonym = Synonym(source='direct', indicator=keyword, content=keyword, score=1.0)
186 | d["candidates"].append(Candidate(referent=edge, referentType='edge', candidateType='direct', synonym=synonym))
187 |
188 | # singleton, levenshtein node
189 | if self.levenshtein_enable:
190 | for node in graph.nodes():
191 | try:
192 | (closest, away) = graph.nodeEditWithin(node, keyword, levenshteinWithin, above=levenshteinAbove)
193 | synonym = Synonym(source='levenshtein', indicator=keyword, content=closest, score=away)
194 | d["candidates"].append(Candidate(referent=node, referentType='node', candidateType='levenshtein', distance=away, synonym=synonym))
195 | except TypeError:
196 | pass
197 | # singleton, levenshtein edge
198 | for edge in graph.edges():
199 | try:
200 | (closest,away) = graph.edgeEditWithin(edge, keyword, levenshteinWithin, above=levenshteinAbove)
201 | synonym = Synonym(source='levenshtein', indicator=keyword, content=closest, score=away)
202 | d["candidates"].append(Candidate(referent=edge, referentType='edge', candidateType='levenshtein', distance=away, synonym=synonym))
203 | except TypeError:
204 | pass
205 | # singleton, hybrid jaccard node
206 | if self.hybridjaccard_enable:
207 | for node in graph.nodes():
208 | best = graph.nodeNearMatch(node, keyword, allowExact=hybridJaccardAllowExact)
209 | if best:
210 | synonym = Synonym(source='hybridJaccard', indicator=keyword, content=best)
211 | d["candidates"].append(Candidate(referent=node, referentType='node', candidateType='hybridJaccard', synonym=synonym))
212 | # singleton, hybrid jaccard edge
213 | for edge in graph.edges():
214 | best = graph.edgeNearMatch(edge, keyword, allowExact=hybridJaccardAllowExact)
215 |                     if best:
216 |                         synonym = Synonym(source='hybridJaccard', indicator=keyword, content=best)
217 |                         d["candidates"].append(Candidate(referent=edge, referentType='edge', candidateType='hybridJaccard', synonym=synonym))
218 |
219 | # singleton, synonym
220 | if self.thesaurus:
221 | for synonym in thesaurus.generateSynonyms(keyword):
222 | content = synonym.content
223 | # singleton, synonym node
224 | for node in graph.nodes():
225 | if graph.nodeMatch(node, content):
226 | d["candidates"].append(Candidate(referent=node, referentType='node', candidateType=synonym.source, synonym=synonym))
227 | # singleton, synonym edge
228 | for edge in graph.edges():
229 | if graph.edgeMatch(edge, content):
230 | d["candidates"].append(Candidate(referent=edge, referentType='edge', candidateType=synonym.source, synonym=synonym))
231 |
232 | # MULTIWORD
233 | elif d["cardinality"] >= 2:
234 | if self.direct_enable:
235 | # multiword, direct
236 | for node in graph.nodes():
237 | if graph.nodeMatch(node, keyword):
238 | synonym = Synonym(source='direct', indicator=keyword, content=keyword, score=1.0)
239 | d["candidates"].append(Candidate(referent=node, referentType='node', candidateType='direct', synonym=synonym))
240 | for edge in graph.edges():
241 | if graph.edgeMatch(edge, keyword):
242 | synonym = Synonym(source='direct', indicator=keyword, content=keyword, score=1.0)
243 | d["candidates"].append(Candidate(referent=edge, referentType='edge', candidateType='direct', synonym=synonym))
244 | # TODO: multiword, levenshtein (or jaro_winkler, hj)
245 | # NIY
246 | # multiword, synonym
247 | for synonym in thesaurus.generateSynonyms(keyword):
248 | content = synonym.content
249 | for node in graph.nodes():
250 |                         if graph.nodeMatch(node, content):
251 | d["candidates"].append(Candidate(referent=node, referentType='node', candidateType=synonym.source, synonym=synonym))
252 | for edge in graph.edges():
253 |                         if graph.edgeMatch(edge, content):
254 | d["candidates"].append(Candidate(referent=edge, referentType='edge', candidateType=synonym.source, synonym=synonym))
255 |
256 | # def initNgrams0(self, terms):
257 | # self.ngrams = {}
258 | # for term,idx in zip(terms, count(0,2)):
259 | # # print("Term 1 {}".format(term))
260 | # # print("Assign spot {} to unigram {}".format(idx,term))
261 | # self.ngrams[term] = None
262 | # self.ngrams[term] = {"term": term,
263 | # "words": [term],
264 | # "index": idx,
265 | # "cardinality": 1}
266 | # for t1,t2,idx in zip(terms, terms[1:], count(1,2)):
267 | # term = t1 + "_" + t2
268 | # # print("Assign spot {} to bigram {}".format(idx, term))
269 | # self.ngrams[term] = {"term": term,
270 | # "words": [t1, t2],
271 | # "index": idx,
272 | # "cardinality": 2}
273 |
274 | # def dump0(self):
275 | # byIndex = [None] * (2*len(self.terms) - 1)
276 | # for d in self.ngrams.values():
277 | # byIndex[d['index']] = d
278 | # for d in byIndex:
279 | # try:
280 | # idx = d['index']
281 | # ngramType = "unigram" if idx%2 else "bigram"
282 | # q = d.get('term', '')
283 | # v = d.get('candidates', [])
284 | # # print("{}{}. {}: {}".format(" " if idx%2 else "", idx, q, "\n".join(v)))
285 | # # print("{}{}. {}: {}".format(" " if idx%2 else "", idx, q, "{} candidates".format(len(v))))
286 | # summaries = Counter([c.summary() for c in v])
287 | # # print("{}{}. {}: {} ({})".format(" " if idx%2 else "", idx, q, "{} candidates".format(len(v))," unknown"))
288 | # print("{}{}. {}: {} ({})".format(ngramType, idx, q, "{} candidates".format(len(v)), summaries))
289 | # except:
290 | # print(d)
291 |
292 | def dump(self, file=sys.stdout):
293 | byIndex = [None] * (2*len(self.terms) - 1)
294 | for d in self.ngrams.values():
295 | byIndex[d['index']] = d
296 | for d in byIndex:
297 | try:
298 | idx = d['index']
299 |                 # unigrams occupy even index slots, bigrams odd (see initNgrams)
300 |                 ngramType = "bigram" if idx%2 else "unigram"
301 | q = d.get('term', '')
302 | v = d.get('candidates', [])
303 | # print("{}{}. {}: {}".format(" " if idx%2 else "", idx, q, "\n".join(v)))
304 | # print("{}{}. {}: {}".format(" " if idx%2 else "", idx, q, "{} candidates".format(len(v))))
305 | # summaries = Counter([c.summary() for c in v])
306 | # print("{}{}. {}: {} ({})".format(" " if idx%2 else "", idx, q, "{} candidates".format(len(v))," unknown"))
307 | print("({}) {}. {}:".format(ngramType, idx, q), file=file)
308 | # print("Candidates:")
309 | if v:
310 | for c in v:
311 | # print(c.summary())
312 | # print(c)
313 |                         print(" " + c.explain('text'), file=file)
314 | else:
315 | print(" None", file=file)
316 | except:
317 | print(d, file=file)
318 |
319 | def dumpToString(self, indent=0):
320 | buffer = StringIO()
321 | self.dump(file=buffer)
322 | s = buffer.getvalue()
323 | buffer.close()
324 | prefix = " " * indent
325 | return (prefix + s.replace("\n", "\n" + prefix)
326 | if prefix
327 | else s)
328 |
--------------------------------------------------------------------------------
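
In query.py above, initNgrams() interleaves unigrams (even index slots 0, 2, 4, ...) and bigrams (odd slots 1, 3, 5, ...) so that dump() can print them in query order. The sketch below mirrors initNgrams() on a hypothetical three-term query, outside the class, just to show the resulting table:

    from itertools import count

    terms = ["blue", "eyes", "seller"]   # hypothetical query
    ngrams = {}
    for term, idx in zip(terms, count(0, 2)):
        ngrams[term] = {"term": term, "words": [term], "index": idx, "cardinality": 1}
    for t1, t2, idx in zip(terms, terms[1:], count(1, 2)):
        term = t1 + "_" + t2
        ngrams[term] = {"term": term, "words": [t1, t2], "index": idx, "cardinality": 2}

    for d in sorted(ngrams.values(), key=lambda d: d["index"]):
        kind = "bigram" if d["index"] % 2 else "unigram"
        print(d["index"], kind, d["term"])
    # 0 unigram blue
    # 1 bigram blue_eyes
    # 2 unigram eyes
    # 3 bigram eyes_seller
    # 4 unigram seller
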
/src/dig/synonym.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys, os
4 |
5 | from nltk.corpus import wordnet as wn
6 | import word2vec
7 | import urllib
8 | from urllib.parse import quote
9 | from builtins import setattr
10 |
11 | class Synonym(object):
12 |
13 | """Synonym records a link between two surface forms:
14 | a known word or collocation (the seed or indicator)
15 | e.g., 'blue', 'eye_color'
16 | and
17 | a word or collocation believed to be equivalent/or related (the target or content)
18 | e.g., 'sky'."""
19 |
20 | def __init__(self, *args, indicator=None, content=None, score=1.0, source=None, **kwargs):
21 | self.indicator = indicator
22 | self.content = content
23 | self.score = score
24 | self.source = source
25 | for (k, v) in kwargs.items():
26 | setattr(self, k, v)
27 |
28 | def __str__(self, *args, **kwargs):
29 | sig = "{}({})=>{}{}".format(self.detailSource(),
30 | getattr(self, "indicator", "*INDICATOR*"),
31 | getattr(self, "content", "*CONTENT*"),
32 | # "" if getattr(self, "score", 1.0)==1.0 else " " + getattr(self, "score", "unknown")
33 | "")
34 | return "<" + str(type(self).__name__) + " " + sig + ">"
35 |
36 | def __repr__(self, *args, **kwargs):
37 | return self.__str__(*args, **kwargs)
38 |
39 | def detailSource(self):
40 | try:
41 | return self.source + "." + self.rel
42 | except AttributeError:
43 | try:
44 | return self.source
45 | except:
46 | return "*SOURCE*"
47 |
48 | def explain(self):
49 | return str(self)
50 |
51 | # the GoogleNews-vectors data I downloaded wasn't happy on the Mac, tended to misindex words
52 | # e.g., model['dog'] was missing but model['og'] was found
53 | # model = word2vec.load('/opt/word2vec/data/GoogleNews-vectors-negative300.bin')
54 | # model = word2vec.load(os.path.join(WORD2VEC_DATA_DIR, WORD2VEC_DATA_FILE)
55 |
56 | class SynonymGenerator(object):
57 | pass
58 |
59 | WORD2VEC_DATA_DIR = '/opt/word2vec/data'
60 | WORD2VEC_DATA_FILE = "text8-phrases.bin"
61 | WORD2VEC_SIZE = 10
62 | WORD2VEC_MINIMUM_SCORE = 0.5
63 |
64 | class Word2VecSynonymGenerator(SynonymGenerator):
65 |
66 | def __init__(self,
67 | dataDir=WORD2VEC_DATA_DIR,
68 | dataFile=WORD2VEC_DATA_FILE,
69 | size=WORD2VEC_SIZE,
70 | minimumScore=WORD2VEC_MINIMUM_SCORE):
71 | super(Word2VecSynonymGenerator, self).__init__()
72 | # word2vec config
73 | self.dataDir = dataDir
74 | self.dataFile = dataFile
75 | self.size = size
76 | self.minimumScore = minimumScore
77 | if self.dataDir and self.dataFile:
78 | self.word2vecModel = word2vec.load(os.path.join(self.dataDir, self.dataFile))
79 |
80 | def generateSynonyms(self, indicator):
81 | """collocation indicator must be specified as word1_word2"""
82 | if isinstance(indicator, (list, tuple)):
83 | indicator = "_".join(indicator)
84 | size = self.size
85 | minimumScore = self.minimumScore
86 | try:
87 | model = self.word2vecModel
88 | (indexes, metrics) = model.cosine(indicator, size)
89 | array = model.generate_response(indexes, metrics)
90 | for (syn, similarityScore) in array:
91 | if similarityScore >= minimumScore:
92 | yield(Synonym(indicator=indicator, content=syn, score=similarityScore, source='word2vec'))
93 | except:
94 | pass
95 | pass
96 |
97 | WORDNET_PARTS_OF_SPEECH = ['n', 'v', 'a', 'r']
98 | WORDNET_LEMMA_MIN_COUNT = 1
99 | # POS self/factor up/factor down/factor
100 | WORDNET_NEIGHBORHOOD = (('n', (True, 1), (True, 0.5), (True, 0.5)),
101 | ('v', (True, 1), (True, 0.5), (True, 0.5)),
102 | ('a', (True, 1), (False, 0), (True, 0.5)),
103 | ('r', (True, 1), (False, 0), (True, 0.5)))
104 |
105 |
106 | # TODO: interrogate pertanyms and derivationally_related_forms, which are stored only on the resultant lemmas
107 | # TODO: holonyms (synechdoche), metonymy in general
108 |
109 | class WordnetSynonymGenerator(SynonymGenerator):
110 |
111 | def __init__(self,
112 | partsOfSpeech=WORDNET_PARTS_OF_SPEECH,
113 | lemmaMinCount=WORDNET_LEMMA_MIN_COUNT,
114 | neighborhood=WORDNET_NEIGHBORHOOD):
115 | super(WordnetSynonymGenerator, self).__init__()
116 | # wordnet config
117 | self.wn = wn
118 |         self.wordnetPartsOfSpeech = partsOfSpeech
119 |         self.wordnetLemmaMinCount = lemmaMinCount
120 |         self.wordnetNeighborhood = neighborhood
121 |
122 | def generateSynonyms(self, indicator):
123 | """lemmas with count=0 are generally quite rare, so drop them
124 |         may generate a lemma more than once, possibly with different parameters"""
125 | neighborhood = self.wordnetNeighborhood
126 | pos = self.wordnetPartsOfSpeech
127 | wn = self.wn
128 | # Avoid lemmas with counts lower than this
129 | # Many WN unusual lemmas have zero
130 | minCount = self.wordnetLemmaMinCount
131 | def generateSynsetSynonyms(synset, rel, factor):
132 | for lemma in synset.lemmas():
133 | count = lemma.count()
134 | if count > minCount:
135 | name = lemma.name()
136 | if name == indicator:
137 | continue
138 | yield(Synonym(indicator=indicator, content=name, lemma=lemma, synset=synset, pos=pos, factor=factor,
139 | rel=rel, count=count, score=count*factor, source='wordnet'))
140 |
141 | for pos, (here, hereFactor), (up, upFactor), (down, downFactor) in neighborhood:
142 | for synset in wn.synsets(indicator, pos=pos):
143 | if here:
144 | for g in generateSynsetSynonyms(synset, "self", hereFactor):
145 | yield(g)
146 | if up:
147 | for parent in synset.hypernyms():
148 | for g in generateSynsetSynonyms(parent, "hypernym", upFactor):
149 | yield(g)
150 | if down:
151 | for child in synset.hyponyms():
152 | for g in generateSynsetSynonyms(child, "hyponym", downFactor):
153 | yield(g)
154 |
155 |
156 | class SwoogleSynonymGenerator(SynonymGenerator):
157 |
158 |     def __init__(self, uriTemplate=None):
159 |         super(SwoogleSynonymGenerator, self).__init__()
160 |         # swoogle config
161 |         self.swoogle = True
162 |         self.swoogleUriTemplate = uriTemplate or '''http://swoogle.umbc.edu/StsService/GetStsSim?operation=api&phrase1="{}"&phrase2="{}"'''
163 |
164 | def generateSynonyms(self, indicator):
165 | """Incomplete"""
166 | score = 0
167 | url = self.swoogleUriTemplate.format(quote)
168 | try:
169 | request = urllib.request.Request(url)
170 | response = urllib.request.urlopen(request)
171 | score = str(response.read().decode('utf-8')).replace('\"','')
172 | score = float(score)
173 | except Exception as _:
174 | pass
175 | pass
176 |
177 | class EasyESASynonymGenerator(SynonymGenerator):
178 |     def __init__(self):
179 | super(EasyESASynonymGenerator, self).__init__()
180 |
181 | def generateSynonyms(self, indicator):
182 | pass
183 |
184 | class Thesaurus(object):
185 | def __init__(self,
186 | word2vec_enable=True,
187 | word2vec_data_dir=WORD2VEC_DATA_DIR,
188 | word2vec_data_file=WORD2VEC_DATA_FILE,
189 | word2vec_size=WORD2VEC_SIZE,
190 | word2vec_minimum_score=WORD2VEC_MINIMUM_SCORE,
191 | wordnet_enable=True,
192 | # wordnetPartsOfSpeech=WORDNET_PARTS_OF_SPEECH,
193 | wordnet_lemma_min_count=WORDNET_LEMMA_MIN_COUNT,
194 | # wordnet_neighborhood=WORDNET_NEIGHBORHOOD,
195 | wordnet_n_enable = True,
196 | wordnet_n_self_factor = 1.0,
197 | wordnet_n_hypernym_factor = 0.5,
198 | wordnet_n_hyponym_factor = 0.5,
199 | wordnet_v_enable = True,
200 | wordnet_v_self_factor = 1.0,
201 | wordnet_v_hypernym_factor = 0.5,
202 | wordnet_v_hyponym_factor = 0.5,
203 | wordnet_a_enable = True,
204 | wordnet_a_self_factor = 1.0,
205 | wordnet_a_hypernym_factor = 0,
206 | wordnet_a_hyponym_factor = 0.5,
207 | wordnet_r_enable = True,
208 | wordnet_r_self_factor = 1.0,
209 | wordnet_r_hypernym_factor = 0,
210 | wordnet_r_hyponym_factor = 0.5,
211 | swoogle_enable=False,
212 | swoogle_uri_template=None,
213 | easyesa_enable=False,
214 | **kwargs):
215 | synonymGenerators = {}
216 | if word2vec_enable:
217 | synonymGenerators['word2vec'] = Word2VecSynonymGenerator(dataDir=word2vec_data_dir,
218 | dataFile=word2vec_data_file,
219 | size=word2vec_size,
220 | minimumScore=word2vec_minimum_score)
221 | if wordnet_enable:
222 | partsOfSpeech = []
223 | neighborhood = []
224 | if wordnet_n_enable:
225 | partsOfSpeech.append('n')
226 | neighborhood.append( ('n',
227 | (wordnet_n_self_factor>0, wordnet_n_self_factor),
228 | (wordnet_n_hypernym_factor>0, wordnet_n_hypernym_factor),
229 | (wordnet_n_hyponym_factor>0, wordnet_n_hyponym_factor)) )
230 | if wordnet_v_enable:
231 | partsOfSpeech.append('v')
232 | neighborhood.append( ('v',
233 | (wordnet_v_self_factor>0, wordnet_v_self_factor),
234 | (wordnet_v_hypernym_factor>0, wordnet_v_hypernym_factor),
235 | (wordnet_v_hyponym_factor>0, wordnet_v_hyponym_factor)) )
236 | if wordnet_a_enable:
237 | partsOfSpeech.append('a')
238 | neighborhood.append( ('a',
239 | (wordnet_a_self_factor>0, wordnet_a_self_factor),
240 | (wordnet_a_hypernym_factor>0, wordnet_a_hypernym_factor),
241 | (wordnet_a_hyponym_factor>0, wordnet_a_hyponym_factor)) )
242 | if wordnet_r_enable:
243 | partsOfSpeech.append('r')
244 | neighborhood.append( ('r',
245 | (wordnet_r_self_factor>0, wordnet_r_self_factor),
246 | (wordnet_r_hypernym_factor>0, wordnet_r_hypernym_factor),
247 | (wordnet_r_hyponym_factor>0, wordnet_r_hyponym_factor)) )
248 | synonymGenerators['wordnet'] = WordnetSynonymGenerator(partsOfSpeech=partsOfSpeech,
249 | lemmaMinCount=wordnet_lemma_min_count,
250 | neighborhood=neighborhood)
251 | if swoogle_enable:
252 | synonymGenerators['swoogle'] = SwoogleSynonymGenerator(uriTemplate=swoogle_uri_template)
253 | if easyesa_enable:
254 | synonymGenerators['easyESA'] = EasyESASynonymGenerator()
255 | self.synonymGenerators = synonymGenerators
256 |
257 | def generateSynonyms(self, indicator):
258 | for (_, syngen) in self.synonymGenerators.items():
259 | for g in syngen.generateSynonyms(indicator):
260 | yield(g)
261 |
--------------------------------------------------------------------------------
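
In synonym.py above, Thesaurus.generateSynonyms() simply chains whichever generators are enabled. A minimal usage sketch, assuming it is run from src/dig with the word2vec and nltk packages installed and the NLTK WordNet corpus downloaded; word2vec_enable=False avoids loading the binary model file under /opt/word2vec/data:

    from synonym import Thesaurus

    # WordNet only; word2vec would otherwise try to load text8-phrases.bin.
    t = Thesaurus(word2vec_enable=False, wordnet_enable=True)
    for syn in t.generateSynonyms("telephone"):
        # e.g. wordnet telephone => phone (score = lemma count * neighborhood factor)
        print(syn.source, syn.indicator, "=>", syn.content, syn.score)
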
/src/dig/test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from networkx import *
4 |
5 | g=Graph()
6 | g.add_node('a')
7 | g.add_node('b')
8 | g.add_node('c')
9 | g.add_node('d')
10 | g.add_node('e')
11 | g.add_node('f')
12 | g.add_node('g')
13 | g.add_node('h')
14 | g.add_node('i')
15 | g.add_node('j')
16 | g.add_node('k')
17 |
18 | g.add_edge('a','b', weight=1)
19 | g.add_edge('a','c', weight=2)
20 | g.add_edge('b','d', weight=1)
21 | g.add_edge('c','d', weight=2)
22 | g.add_edge('c','e', weight=1)
23 | g.add_edge('d','f', weight=2)
24 | g.add_edge('f','g', weight=1)
25 | g.add_edge('d','h', weight=2)
26 | g.add_edge('h','i', weight=1)
27 | g.add_edge('h','j', weight=2)
28 | g.add_edge('h','k', weight=1)
29 | # cycle
30 | g.add_edge('b','a', weight=1)
31 |
--------------------------------------------------------------------------------
/src/dig/z-attic/wordSimilarity.py:
--------------------------------------------------------------------------------
1 | import urllib.request
2 | import sys
3 | import json
4 | import math
5 | from urllib.parse import quote
6 | from threading import Thread
7 |
8 | class WordSimilarity:
9 |
10 | scoreDictionary = {}
11 | scoreDictionary['esa'] = 0
12 | scoreDictionary['swoogle'] = 0
13 |
14 | # 1 - EasyESA client
15 |     # a score of 1 or -1 indicates a perfect match
16 |     # threshold values to consider: 0.07, 0.052 and 0.04
17 | def getEasyESAScore(word1,word2):
18 |
19 | WordSimilarity.scoreDictionary['esa'] = 0
20 | url = "http://vmdeb20.deri.ie:8890/esaservice?task=esa&term1="+quote(word1)+'&term2='+quote(word2)
21 | try:
22 | request = urllib.request.Request(url)
23 | response = urllib.request.urlopen(request)
24 | score = str(response.read().decode('utf-8')).replace('\"','')
25 | if float(score)> 0:
26 | print("ESA %s %s => %s" % (word1, word2, score))
27 | WordSimilarity.scoreDictionary['esa'] = float(score)
28 | except Exception as e:
29 | WordSimilarity.scoreDictionary['esa'] = 0
30 |
31 | # 2 - ws4j client
32 | def getWs4jScore(word1,word2):
33 | url = "http://ws4jdemo.appspot.com/ws4j?measure=wup&args="+quote(word1)+"%3A%3A"+quote(word2)
34 | request = urllib.request.Request(url)
35 | request.add_header('Accept', 'application/json')
36 | response = urllib.request.urlopen(request)
37 | responseStr = response.read().decode('utf-8')
38 | # fetch json from the response
39 | jsonStr = json.loads(responseStr)
40 | score = float(jsonStr['result'][0]['score'])
41 | return score
42 |
43 | # 3 - UMBC Semantic Similarity service
44 | #
45 |     # Documentation available at http://swoogle.umbc.edu/SimService/api.html
46 | def getSwoogleScore(word1,word2):
47 | WordSimilarity.scoreDictionary['swoogle'] = 0
48 | url = "http://swoogle.umbc.edu/StsService/GetStsSim?operation=api&phrase1="+quote(word1)+'&phrase2='+quote(word2)
49 | try:
50 | request = urllib.request.Request(url)
51 | response = urllib.request.urlopen(request)
52 | score = str(response.read().decode('utf-8')).replace('\"','')
53 | score = float(score)
54 | if score > 0:
55 | print("Swoogle %s / %s => %s" % (word1, word2, score))
56 | WordSimilarity.scoreDictionary['swoogle'] = score
57 | except Exception as e:
58 | WordSimilarity.scoreDictionary['swoogle'] = 0
59 |
60 |
61 |     # Combines the EasyESA and Swoogle scores computed below.
62 |     # Call method 2 (the ws4j client) if needed.
63 |     # a score of 1 or -1 indicates a perfect match
64 |     # threshold values to consider: 0.07, 0.052 and 0.04
65 | def isPredicateSimilar(word1,word2):
66 | #score = math.fabs(WordSimilarity.getEasyESAScore(word1,word2))
67 |
68 | esaThread = Thread(target=WordSimilarity.getEasyESAScore, args=(word1,word2,))
69 | swoogleThread = Thread(target=WordSimilarity.getSwoogleScore, args=(word1,word2,))
70 |
71 | esaThread.start()
72 | swoogleThread.start()
73 | esaThread.join()
74 | swoogleThread.join()
75 |
76 | ESAscore = WordSimilarity.scoreDictionary['esa']
77 | #WordSimilarity.getEasyESAScore(word1,word2)
78 | ESAScaledScore = 0
79 | if(ESAscore>0 and ESAscore<=0.04):
80 | ESAScaledScore = 1
81 |         elif(ESAscore>0.04 and ESAscore<=0.07):
82 | ESAScaledScore = 2
83 | elif(ESAscore>0.07):
84 | ESAScaledScore = 3
85 | else:
86 | ESAScaledScore = 0
87 |
88 | SwoogleScore = WordSimilarity.scoreDictionary['swoogle']
89 | # WordSimilarity.getSwoogleScore(word1,word2)
90 | SwoogleScaledScore = 0
91 | if(SwoogleScore>0 and SwoogleScore<0.6):
92 | SwoogleScaledScore = 1
93 | elif(SwoogleScore>=0.6 and SwoogleScore<0.7):
94 | SwoogleScaledScore = 2
95 | elif(SwoogleScore>=0.7):
96 | SwoogleScaledScore = 3
97 | else:
98 | SwoogleScaledScore = 0
99 |
100 | if(ESAScaledScore>SwoogleScaledScore):
101 | print("Using ESA")
102 | score = ESAScaledScore
103 | else:
104 | print("Using Swoogle")
105 | score = SwoogleScaledScore
106 |
107 | if(score>=2):
108 | return score
109 | else:
110 | return -1
111 |
--------------------------------------------------------------------------------
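
The attic wordSimilarity.py above queries two remote similarity services in parallel threads, maps each raw score onto a 0-3 scale, and isPredicateSimilar() accepts a pair only when the better scaled score is at least 2. A minimal usage sketch, assuming the module is on the import path and that the EasyESA and UMBC endpoints still respond (both are external services and may no longer be available):

    from wordSimilarity import WordSimilarity

    # Returns the scaled score (2 or 3) when the words are judged similar, -1 otherwise.
    score = WordSimilarity.isPredicateSimilar("film", "movie")
    print("similar" if score >= 2 else "not similar", score)
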
/src/graphSearch.py:
--------------------------------------------------------------------------------
1 | from ngramsEngine import ngramsEngine
2 | from ngramTree import *
3 | from pivotEntityRecognition import *
4 | from colorAssignment import ColorAssignment
5 | from sparqlClient import SparqlClient
6 | import inflection
7 | import urllib.request
8 | import sys
9 |
10 | class GraphSearch:
11 |
12 | def __init__(self):
13 |         pass
14 |
15 | # Method that prints the initial color assigned
16 | def printColors(treeObj,rootNode):
17 |
18 | # Reset the visited flag for the traversal
19 | treeObj.resetVisitedFlag(rootNode)
20 | listNgrams = []
21 | stack = []
22 | stack.append(rootNode)
23 |
24 | while(stack):
25 | currNode = stack.pop()
26 | if not currNode.isVisited:
27 | currNode.isVisited = True
28 | #print('---------')
29 | listNgrams.append(currNode.data)
30 | #print(currNode.data)
31 | #print(currNode.color)
32 | for childNodes in currNode.children:
33 | stack.append(childNodes)
34 | return listNgrams
35 |
36 |     # Print the Pivot entities recognised
37 | def printpre(resourceList):
38 | print('------------ Pivot Entity Recognition --------------')
39 | if(len(resourceList)==0):
40 | print('no pivot entity found')
41 | else:
42 | for res in resourceList:
43 | print('Resource name : '+res.uri)
44 | print("Label : "+res.label)
45 | print("Incoming Links : "+str(res.support))
46 | print("keyword : "+res.keyword)
47 | print("colors : "+str(res.colors))
48 | print('------------------------')
49 |
50 | # Print factnodes
51 | def printTriplets(tripleList):
52 | for triple in tripleList:
53 | print('----')
54 | obj = triple.object
55 | print(str(obj.score))
56 | print(str(obj.colors))
57 | print(str(obj.keyword))
58 | print(str(triple.subject.uri) + ' ' + str(triple.predicate.uri) + ' ' + str(triple.object.uri))
59 |
60 |
61 |
62 |
63 | # Gets the bigrams from the sentence and returns the bigrams that are to be covered
64 | def getBiGramList(sentence,resource):
65 |
66 | sentenceList = sentence.split(' ')
67 | resourceKeyword = resource.keyword.split(' ')
68 |
69 |         # remove the words already covered by the resource keyword
70 | for key in resourceKeyword:
71 | sentenceList.remove(key)
72 |
73 | biGramList = []
74 |
75 | # Form the bigrams
76 | if(len(sentenceList)!=0):
77 | for i in range(0,len(sentenceList)-1):
78 | biGramList.append(sentenceList[i]+' '+sentenceList[i+1])
79 |
80 | return biGramList
81 |
82 |
83 | # Ranks the results coverage first followed by the scores
84 | def rankResults(listFactNodes,length):
85 | # new list will contain lists of nodes with each list at index corresponding to the number of colors covered by the node
86 | newList = []
87 |
88 | # initialize the list
89 | for i in range(0,length):
90 | newList.append([])
91 |
92 | # insert the nodes at the appropriate index lists
93 | for node in listFactNodes:
94 | index = int(len(node.colors)-1)
95 | newList[index].append(node)
96 |
97 | # sort list on scores
98 | for list in newList:
99 | list.sort(key=lambda x: x.score, reverse=True)
100 |
101 | # flatten the sorted list
102 | returnList = []
103 | for i in range(len(newList)-1,-1,-1):
104 | for node in newList[i]:
105 | returnList.append(node)
106 |
107 | return returnList
108 |
109 |
110 |
111 | # Driver method
112 | def main():
113 |
114 | # Ask the user to input the query
115 | sentence = input("Enter the query : ")
116 |
117 | print()
118 | print()
119 | print('Phase 1 ... N GRAM Generation')
120 | # Generate the n-grams
121 | ngramsEngineObj = ngramsEngine()
122 | listNgrams,lookupList = ngramsEngineObj.generateNGrams(sentence)
123 |
124 | print('Generated N-grams')
125 |
126 |
127 | # Start building the n-gram tree by selecting the root node
128 | rootWord = listNgrams[0]
129 | rootNode = Node(rootWord)
130 |
131 |
132 | # Construct the tree with the root node
133 | treeObj = NgramTree(rootNode)
134 | treeObj.constructTree(listNgrams,lookupList)
135 |
136 | # Print tree
137 | #treeObj.printNode(rootNode)
138 | print('N-gram tree constructed')
139 |
140 | print()
141 | print('Phase 2 ... Color assignment')
142 |
143 | # Color assignment
144 | colorAssignmentObj = ColorAssignment()
145 | colorAssignmentObj.assignInitialColors(rootNode,lookupList)
146 |
147 |
148 | # Prints colours
149 | #print(printColors(treeObj,rootNode))
150 | print('Completed initial color assignment')
151 | #exit(3)
152 | print()
153 | print('Phase 3 ... PivotEntityRecognition')
154 | # Make use of the spotlight to get the pivot entities sorted on the number of incoming links
155 | spotlightObject = PivotEntityRecognition()
156 | resourceList = spotlightObject.getPivotElement(sentence)
157 |
158 |
159 | #print PRE
160 | printpre(resourceList)
161 | print('Got the pivot element')
162 | print()
163 |
164 |
165 | print('Phase 4 ... Search Phase')
166 | print()
167 |
168 | # get the initial fact nodes
169 | listFactNodes = []
170 |
171 | for resource in resourceList :
172 | # Get the bi-gram list
173 | biGramList = getBiGramList(sentence,resource)
174 | listFactNodes.extend(SparqlClient.getAllTripletsForPivotElement(resource,biGramList))
175 |
176 |
177 | for factNode in listFactNodes:
178 | if(factNode.isExplored == False and factNode.object.isUri):
179 | biGramList = getBiGramList(sentence,factNode.object)
180 | listFactNodes.extend(SparqlClient.getAllTripletsForPivotElement(factNode.object,biGramList))
181 |
182 | resultsList = rankResults(listFactNodes,len(sentence.split(' ')))
183 |
184 | printTriplets(resultsList)
185 |
186 | if __name__ == '__main__':
187 | main()
188 |
189 |
190 |
191 |
192 |
--------------------------------------------------------------------------------
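
In graphSearch.py above, rankResults() buckets fact nodes by how many colors (query words) they cover and sorts each bucket by score, so coverage always wins over raw score. A small sketch of that ordering with stand-in fact nodes; the real ones come from SparqlClient, which is not part of this excerpt, so only the attributes rankResults() inspects (colors, score) are modelled:

    from collections import namedtuple

    # Stand-in for the fact-node objects produced by SparqlClient.
    Fact = namedtuple("Fact", ["name", "colors", "score"])

    nodes = [Fact("low-score, wide coverage", {1, 2, 3}, 0.2),
             Fact("high-score, narrow coverage", {1}, 0.9),
             Fact("mid coverage", {1, 2}, 0.5)]

    def rank(listFactNodes, length):
        buckets = [[] for _ in range(length)]
        for node in listFactNodes:
            buckets[len(node.colors) - 1].append(node)
        for bucket in buckets:
            bucket.sort(key=lambda x: x.score, reverse=True)
        return [n for bucket in reversed(buckets) for n in bucket]

    for n in rank(nodes, 3):
        print(n.name)
    # low-score, wide coverage
    # mid coverage
    # high-score, narrow coverage
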
/src/ngramTree.py:
--------------------------------------------------------------------------------
1 | from ngramsEngine import ngramsEngine
2 |
3 | # This class represents Node of the tree
4 | # A Node has value/data and also has links to its children stored as a list
5 | class Node(object):
6 |
7 | def __init__(self, data):
8 | self.data = data # Data in the node
9 | self.color = [] # Colour assignment for the node
10 | self.children = [] # Represents the child nodes
11 |         self.isDuplicate = False # True if this node is the child of 2 different nodes
12 |         self.isVisited = False # This flag helps in traversal
13 |
14 |
15 | #Used to add child node to current node
16 | def add_child(self, obj):
17 | self.children.append(obj)
18 |
19 |
20 | # This represents a n-gram tree
21 | class NgramTree(object):
22 |
23 | def __init__(self,rootNode):
24 | self.rootNode = rootNode
25 |
26 | # Reset the visited flag to False
27 | def resetVisitedFlag(self,node):
28 | for n in node.children:
29 | self.resetVisitedFlag(n)
30 | node.isVisited = False
31 |
32 | # Post order traversal of the tree (DFS)
33 | def post_order(self,node):
34 | for n in node.children:
35 | self.post_order(n)
36 |
37 | if not node.isVisited:
38 | node.isVisited = True
39 | print(node.data)
40 |
41 |
42 | # BFS traversal
43 | def printNode(self,node):
44 | if node is None:
45 | return
46 | if not node.isVisited:
47 | node.isVisited = True
48 | print(node.data)
49 |
50 | for c in node.children:
51 | self.printNode(c)
52 |
53 |
54 |
55 | # This module builds the n-gram tree with the basic idea of BFS traversal
56 | # Input : List1, List2
57 | # List1 : ['a b c d', 'a b c', 'b c d', 'a b', 'b c', 'c d', 'a', 'b', 'c', 'd']
58 | # List2 : [['a', 'b', 'c', 'd'], ['a b', 'b c', 'c d'], ['a b c', 'b c d'], ['a b c d']]
59 | # Algorithm :
60 | # while(queue):
61 | # CurrentNode = queue.pop()
62 | # search for tokens that have a length = length(CurrentNode.value) - 1
63 | # ##[Get this from List2]
64 | # ##[These tokens will be the nodes at the next level in the tree]
65 | # check if the tokens are already in the tree
66 | # if not create new node with these tokens
67 | # add the token nodes as the children of CurrentNode
68 |
69 | def constructTree(self,listNgrams,lookupList):
70 |
71 | # This dictionary is used to track the nodes that are in the tree
72 | # key:Nodevalue value:Node
73 | treeDictionary = {}
74 |
75 | nodeQueue = [] # This list exhibits behaviour of a Queue
76 | nodeQueue.append(self.rootNode) # Add the root node to the Queue to begin search
77 | treeDictionary[self.rootNode.data] = self.rootNode # Add the root to the treeDictionary as it is seen
78 |
79 |
80 | while(nodeQueue):
81 | currentNode = nodeQueue.pop(0) # Pop the queue
82 | data = currentNode.data # Get the data of the node
83 |
84 | dataLen = len(data.split(' ')) # Get the length of the n-grams in the current token
85 |
86 | if(dataLen-2 >= 0): # Stop once the current level consists of individual tokens (unigrams)
87 | listChildren = lookupList[dataLen-2] # Get the tokens that are one word shorter than the current token from the lookup list
88 | 
89 | for child in listChildren:
90 | if child in data: # Check if the child is a substring of the token
91 | 
92 | if child not in treeDictionary: # If no node exists for 'child' yet, create one; otherwise reuse the existing node and mark it as a duplicate
93 | newNode = Node(child)
94 | nodeQueue.append(newNode)
95 | treeDictionary[child] = newNode
96 | else:
97 | newNode = treeDictionary[child]
98 | newNode.isDuplicate = True
99 |
100 | currentNode.add_child(newNode) # Add this child to the parent node
101 |
102 | # Reset the visited flag for the traversal
103 | self.resetVisitedFlag(self.rootNode)
104 |
105 | #self.printNode(self.rootNode)
106 | #self.post_order(self.rootNode)
107 |
108 |
109 | def main(query):
110 | ngramsEngineObj = ngramsEngine()
111 | listNgrams,lookupList = ngramsEngineObj.generateNGrams(query)
112 |
113 | rootWord = listNgrams[0]
114 | rootNode = Node(rootWord)
115 |
116 | treeObj = NgramTree(rootNode)
117 | treeObj.constructTree(listNgrams,lookupList)
118 |
119 |
120 |
121 | if __name__ == '__main__':
122 | main(input(" Enter the query : "))
--------------------------------------------------------------------------------
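A quick sketch (not part of the repository) of the tree that NgramTree.constructTree builds for the four-token query 'a b c d', assuming nltk is installed so that ngramsEngine imports cleanly:

    from ngramsEngine import ngramsEngine
    from ngramTree import Node, NgramTree

    listNgrams, lookupList = ngramsEngine().generateNGrams('a b c d')
    rootNode = Node(listNgrams[0])              # 'a b c d'
    tree = NgramTree(rootNode)
    tree.constructTree(listNgrams, lookupList)

    # the level below the root holds the 3-grams that are substrings of it
    print([c.data for c in rootNode.children])              # ['a b c', 'b c d']
    # shared nodes such as 'b c' are created once and flagged as duplicates
    print([c.data for c in rootNode.children[0].children])  # ['a b', 'b c']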
/src/ngramsEngine.py:
--------------------------------------------------------------------------------
1 | from nltk.util import ngrams
2 |
3 | # This class makes use of nltk library for generating n-grams given a query
4 | class ngramsEngine(object):
5 |
6 | def __init__(self):
7 | pass
8 |
9 | # Module to print n-grams
10 | def printNGrams(self,ngramsList):
11 | for token in ngramsList:
12 | print(token.strip())
13 |
14 |
15 | # Module that generates n-grams list
16 | # Input : query
17 | # Output : Two lists are returned.
18 | # 1st list : all the n-grams, arranged hierarchically (longest first)
19 | # 2nd list : a list of lists, with the n-grams grouped together by length
20 | # All n-grams of length 1 go in index 0 of List2
21 | # All n-grams of length 2 go in index 1 of List2
22 | # All n-grams of length 3 go in index 2 of List2
23 | # EX: i/p - a b c d
24 | # List1 : ['a b c d', 'a b c', 'b c d', 'a b', 'b c', 'c d', 'a', 'b', 'c', 'd']
25 | # List2 : [['a', 'b', 'c', 'd'], ['a b', 'b c', 'c d'], ['a b c', 'b c d'], ['a b c d']]
26 |
27 | def generateNGrams(self,query):
28 |
29 | # This stores the n-grams as generated by NLTK
30 | ngramsNLTKList = []
31 |
32 | # Build the initial n-gram list, from the longest n-gram (all tokens) down to unigrams
33 | for n in range(len(query.split()),0,-1):
34 | ngramsNLTKList.extend(ngrams(query.split(),n))
35 |
36 | # Actual n-gram list (List 1 as in the description)
37 | ngramList = []
38 |
39 | # A look up list (List 2 as in the description)
40 | lookupList = []
41 |
42 | # Join the individual lists to get the n-grams
43 | for ngram in ngramsNLTKList:
44 | ngramList.append((' '.join(ngram)).strip())
45 |
46 | # Determine the length of the lookupList required
47 | if(len(ngramsNLTKList)>0):
48 | maxLength = len(ngramsNLTKList[0])
49 | for i in range(maxLength):
50 | lookupList.append([])
51 |
52 | # Fill in the lookupList
53 | # All n-grams of length 1 go in index 0
54 | # All n-grams of length 2 go in index 1
55 | # All n-grams of length 3 go in index 2 ...
56 | for token in ngramsNLTKList:
57 | joinedToken = ' '.join(token).strip()
58 | # n-grams of length k go in lookupList[k-1]
59 | currentList = lookupList[len(token)-1]
60 | currentList.append(joinedToken)
61 |
62 | return ngramList,lookupList
63 |
64 |
65 | def main():
66 | ngramsEngineObject = ngramsEngine()
67 | query = input(" Enter the query : ")
68 |
69 | ngramsList,lookupList = ngramsEngineObject.generateNGrams(query.strip())
70 | ngramsEngineObject.printNGrams(ngramsList)
71 |
72 | if __name__ == '__main__':
73 | main()
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
--------------------------------------------------------------------------------
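For reference, the nltk primitive that ngramsEngine wraps (assuming nltk is installed; ngrams yields tuples of tokens, which generateNGrams joins back into strings):

    from nltk.util import ngrams

    tokens = 'a b c d'.split()
    print(list(ngrams(tokens, 2)))                     # [('a', 'b'), ('b', 'c'), ('c', 'd')]
    print([' '.join(g) for g in ngrams(tokens, 3)])    # ['a b c', 'b c d']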
/src/pivotEntityRecognition.py:
--------------------------------------------------------------------------------
1 | import urllib.request
2 | import sys
3 | import json
4 | from resourceGraph import Resource
5 | from colorAssignment import ColorAssignment
6 |
7 | # Spotlight service for pivot entity recognition
8 | class PivotEntityRecognition:
9 |
10 | def __init__(self):
11 | self.sentence = ''
12 |
13 | # This method updates the colours covered by a resource
14 | def updateColors(self,resourceList):
15 | for res in resourceList:
16 | tokens = res.keyword.split(' ')
17 | for token in tokens:
18 | if(token in ColorAssignment.colorDictionary):
19 | res.colors.append(int(ColorAssignment.colorDictionary[token]))
20 |
21 | return resourceList
22 |
23 |
24 | # Parses the final resource values into a Pivot Object
25 | def getPivotObject(self,resource):
26 |
27 | # Get the URI, Label and Support
28 | if('@uri' in resource):
29 | uri = resource['@uri']
30 | label = ''
31 | support = 0
32 |
33 | if('@label' in resource):
34 | label = resource['@label']
35 |
36 | if('@support' in resource):
37 | try:
38 | support = int(resource['@support'])
39 | except ValueError:
40 | support = 0
41 |
42 | pivotElement = Resource('<'+uri+'>',label,support,'') # keep the URI, bracketed so it can be used directly in SPARQL queries
43 | pivotElement.isUri = True
44 | return pivotElement
45 | else:
46 | return None
47 |
48 | # Main logic of parsing implemented here
49 | def parseJson(self,jsonStr):
50 |
51 | #print(jsonStr)
52 |
53 | # Return list of pivot objects
54 | resourceList = []
55 |
56 | if('annotation' not in jsonStr):
57 | return resourceList
58 |
59 | if('surfaceForm' not in jsonStr['annotation']):
60 | return resourceList
61 |
62 | pivotTerms = jsonStr['annotation']['surfaceForm']
63 |
64 | #print(pivotTerms)
65 |
66 | # This happens only when the return type has one entity key word
67 | if(type(pivotTerms) is dict):
68 | #If there is no pivot entity
69 | if('resource' not in pivotTerms):
70 | return resourceList
71 | # If there is only one entity identified for the keyword
72 | if(type(pivotTerms['resource']) is dict):
73 | #If there is only one pivot identified for the query
74 | pivotElement = self.getPivotObject(pivotTerms['resource'])
75 | if(pivotElement is not None):
76 | pivotElement.keyword = pivotTerms['@name']
77 | resourceList.append(pivotElement)
78 | else:
79 | for resource in pivotTerms['resource']:
80 | pivotElement = self.getPivotObject(resource)
81 | if(pivotElement is not None):
82 | pivotElement.keyword = pivotTerms['@name']
83 | resourceList.append(pivotElement)
84 |
85 | # This happens when the return type has multiple entity keywords
86 | elif(type(pivotTerms) is list):
87 |
88 | for resources in pivotTerms:
89 | # This happens only when the return type has one entity key word
90 | if(type(resources) is dict):
91 | #If there is no pivot entity
92 | if('resource' not in resources):
93 | continue
94 | # If there is only one entity identified for the keyword
95 | if(type(resources['resource']) is dict):
96 | #If there is only one pivot identified for the query
97 | pivotElement = self.getPivotObject(resources['resource'])
98 | if(pivotElement is not None):
99 | pivotElement.keyword = resources['@name']
100 | resourceList.append(pivotElement)
101 | else:
102 | for resource in resources['resource']:
103 | pivotElement = self.getPivotObject(resource)
104 | if(pivotElement is not None):
105 | pivotElement.keyword = resources['@name']
106 | resourceList.append(pivotElement)
107 |
108 | # Sort the resource list on the number of incoming links
109 | resourceList.sort(key=lambda x: x.support, reverse=True)
110 | # Update the colors represented by the resources
111 | resourceList = self.updateColors(resourceList)
112 |
113 | return resourceList
114 |
115 | # Queries DBPedia spotlight to get the values
116 | def requestSpotlight(self):
117 | #encode spaces
118 | sentence = self.sentence.replace(' ','%20')
119 |
120 | #restrict types to person, organisation and location
121 | urlTypes = 'types=DBpedia:Person,Schema:Person,DBpedia:Company,DBpedia:Organisation,Schema:Organization,DBpedia:AdministrativeRegion,DBpedia:PopulatedPlace,DBpedia:Place,Schema:Place'
122 | url = "http://spotlight.dbpedia.org/rest/candidates?types="+urlTypes+"&text="+sentence
123 |
124 | request = urllib.request.Request(url)
125 | request.add_header('Accept', 'application/json')
126 | response = urllib.request.urlopen(request)
127 | responseStr = str(response.read().decode('utf-8'))
128 |
129 | # fetch json from the response
130 | jsonStr = json.loads(responseStr)
131 |
132 | #Parse json
133 | return(self.parseJson(jsonStr))
134 |
135 | # Entry point of the class
136 | def getPivotElement(self,query):
137 |
138 | self.sentence = query
139 | #Make request
140 | return(self.requestSpotlight())
141 |
142 |
143 | if __name__ == '__main__':
144 | spotlightObj = PivotEntityRecognition()
145 | sentence = input(" Enter the keyword query : ")
146 | resourceList = spotlightObj.getPivotElement(sentence)
147 |
148 | if(len(resourceList)==0):
149 | print('no pivot entity found')
150 | else:
151 | for res in resourceList:
152 | print(res.uri+" "+res.label+" "+str(res.support)+" "+res.keyword)
153 |
154 |
--------------------------------------------------------------------------------
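For orientation, this is the response shape that parseJson above expects, reconstructed from the parsing logic itself rather than from the Spotlight documentation; the concrete values are made up:

    # '@...' keys are the fields the parser reads; both 'surfaceForm' and
    # 'resource' may also be lists when several keywords or candidates match.
    candidatesResponse = {
        'annotation': {
            'surfaceForm': {
                '@name': 'prince william',       # becomes Resource.keyword
                'resource': {
                    '@uri': 'http://dbpedia.org/resource/Prince_William,_Duke_of_Cambridge',
                    '@label': 'Prince William, Duke of Cambridge',
                    '@support': '1234'           # incoming-link count used for ranking
                }
            }
        }
    }

getPivotElement(query) is the entry point; it fetches JSON of this shape from the /rest/candidates endpoint and returns the parsed resource list, sorted by support and annotated with colours.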
/src/queries.txt:
--------------------------------------------------------------------------------
1 |
2 | longest river
3 |
4 |
5 | cars that are produced in Germany
6 | German cars --> picks
7 |
8 |
9 |
10 | Mother and father of Prince Harry and Prince William
11 | Prince Harry mother father --->
12 | Prince William parents -->
13 |
14 |
15 |
16 |
17 | ----
18 | 1.0
19 | [0, 1, 2]
20 | prince william parents
21 |
22 | ----
23 | 1.0
24 | [0, 1, 2]
25 | prince william parents
26 |
27 | ----
28 | 1.0
29 | [0, 1, 2]
30 | prince william parents
31 |
32 | ----
33 | 1.0
34 | [0, 1, 2]
35 | prince william parents
36 |
37 | ----
38 | 1.0
39 | [0, 1, 2]
40 | prince william parents
41 |
42 | ----
43 | 1.0
44 | [0, 1, 2]
45 | prince william parents
46 |
47 | ----
48 | 1.0
49 | [0, 1, 2]
50 | prince william parents
51 |
52 | ----
53 | 1.0
54 | [0, 1, 2]
55 | prince william parents
56 |
57 |
58 |
59 | ---------------
60 |
61 | Prince Harry parents --> (could not match parents)
62 | Prince Harry Mother -->
63 | Current label : Prince Harry of Wales
64 | Keywords yet to cover : ['Mother']
65 |
66 |
67 | latest U.S. state admitted
68 |
69 | number of languages spoken in Turkmenistan
70 | Turkmenistan languages
71 | ----
72 | 1.0
73 | [0, 1]
74 | Turkmenistan languages
75 |
76 | ----
77 | 1.0
78 | [0, 1]
79 | Turkmenistan languages
80 |
81 | ----
82 | 1.0
83 | [0, 1]
84 | Turkmenistan languages
85 |
86 | ----
87 | 0.9172847553
88 | [0, 1]
89 | Turkmenistan languages
90 |
91 | ----
92 | 0.9172847553
93 | [0, 1]
94 | Turkmenistan languages
95 |
96 | ----
97 | 0.7649522742
98 | [0, 1]
99 | Turkmenistan languages
100 | languages
101 | ----
102 | 0.7649522742
103 | [0, 1]
104 | Turkmenistan languages
105 | Inter-ethnic
106 |
107 | -------
108 | movies directed by Francis Ford Coppola
109 | Francis Ford Coppola directed movies - -> nothing
110 | Francis Ford Coppola movies - nothing on his dbpedia page (is a subject of many other movies)
111 |
112 | maiden name of Angela Merkel
113 | ----
114 | 1.0
115 | [0, 1, 3]
116 | Angela Merkel name
117 | Angela Merkel
118 | ----
119 | 1.0
120 | [0, 1, 3]
121 | Angela Merkel name
122 | Merkel, Angela
123 | ----
124 | 1.0
125 | [0, 1, 3]
126 | Angela Merkel name
127 | Angela Merkel
128 | ----
129 | 1.0
130 | [0, 1, 3]
131 | Angela Merkel name
132 | Merkel, Angela
133 |
134 | http://vmdeb20.deri.ie:8890/esaservice?task=esa&term1=maiden%20name&term2=birth%20name
135 | returns "0.0120668066"
136 |
137 |
138 |
139 | Australian nonprofit organizations
140 | could not detect pivot element
141 |
142 | Military conflicts in which Lawrence of Arabia participated
143 |
144 |
145 | number of inhabitants in Maribor
146 | inhabitants Maribor
147 | http://vmdeb20.deri.ie:8890/esaservice?task=esa&term1=inhabitants&term2=population "0.0345099577"
148 |
149 |
150 | games developed by GMT
151 |
152 | husband of Amanda Palmer
153 | Amanda Palmer husband
154 |
155 | Current label : Amanda Palmer
156 | Keywords yet to cover : ['husband']
157 | http://vmdeb20.deri.ie:8890/esaservice?task=esa&term1=spouse&term2=husband
158 | "0.0558511518"
159 |
160 | islands that belong to Japan
161 | Exploring ...
162 |
163 | Current label : Japan
164 | Keywords yet to cover : ['islands']
165 | Exploring ...
166 |
167 | Current label : Japan (band)
168 | Keywords yet to cover : ['islands']
169 | Exploring ...
170 |
171 | Current label : Islands (band)
172 | Keywords yet to cover : ['japan']
173 |
174 | ruling party in Lisbon
175 | ----
176 | 0.7246611098
177 | [0, 1, 2]
178 | lisbon ruling party
179 |
180 | ----
181 | 0.5502401986
182 | [0, 1, 2]
183 | lisbon ruling party
184 |
185 | ----
186 | 0.132051861
187 | [0, 1, 2]
188 | lisbon ruling party
189 | Helena Roseta
190 | ----
191 | 0.132051861
192 | [0, 1, 2]
193 | lisbon ruling party
194 | Michael B. Lewis
195 | ----
196 | 0.1086613063
197 | [0, 1, 2]
198 | lisbon ruling party
199 |
200 | ----
201 | 0.1058211493
202 | [0, 1, 2]
203 | lisbon ruling party
204 |
205 | ----
206 | 0.1058211493
207 | [0, 1, 2]
208 | lisbon ruling party
209 | Mayor
210 | ----
211 | 0.0982569384
212 | [0, 1, 2]
213 | lisbon ruling party
214 |
215 | ----
216 | 0.0982569384
217 | [0, 1, 2]
218 | lisbon ruling party
219 |
220 | ----
221 | 0.0982569384
222 | [0, 1, 2]
223 | lisbon ruling party
224 |
225 | ----
226 | 0.0982569384
227 | [0, 1, 2]
228 | lisbon ruling party
229 |
230 | ----
231 | 0.0913513348
232 | [0, 1, 2]
233 | lisbon ruling party
234 |
235 | ----
236 | 0.0887198372
237 | [0, 1, 2]
238 | lisbon ruling party
239 |
240 | ----
241 | 0.0799152153
242 | [0, 1, 2]
243 | lisbon ruling party
244 | Praça do Município
245 | ----
246 | 0.0794172227
247 | [0, 1, 2]
248 | lisbon ruling party
249 | 9
250 | ----
251 | 0.0794172227
252 | [0, 1, 2]
253 | lisbon ruling party
254 | W
255 | ----
256 | 0.0794172227
257 | [0, 1, 2]
258 | lisbon ruling party
259 | 8
260 | ----
261 | 0.0794172227
262 | [0, 1, 2]
263 | lisbon ruling party
264 | 18
265 | ----
266 | 0.0731704343
267 | [0, 1, 2]
268 | lisbon ruling party
269 | 38
270 | ----
271 | 0.0731704343
272 | [0, 1, 2]
273 | lisbon ruling party
274 | 42
275 | ----
276 | 0.0731704343
277 | [0, 1, 2]
278 | lisbon ruling party
279 | N
280 | ----
281 | 0.0731704343
282 | [0, 1, 2]
283 | lisbon ruling party
284 | 29
285 | ----
286 | 0.0727422443
287 | [0, 1, 2]
288 | lisbon ruling party
289 | Lisbon
290 | ----
291 | 0.0727422443
292 | [0, 1, 2]
293 | lisbon ruling party
294 |
295 | ----
296 | 0.0708160618
297 | [0, 1, 2]
298 | lisbon ruling party
299 |
300 | ----
301 | 0.8752961422
302 | [0, 2]
303 | lisbon party
304 |
305 | ----
306 | 0.6660475643
307 | [0, 2]
308 | lisbon party
309 |
310 | ----
311 | 0.1526511536
312 | [0, 2]
313 | lisbon party
314 | Helena Roseta
315 | ----
316 | 0.1526511536
317 | [0, 2]
318 | lisbon party
319 | Michael B. Lewis
320 | ----
321 | 0.1179090591
322 | [0, 2]
323 | lisbon party
324 |
325 | ----
326 | 0.1179090591
327 | [0, 2]
328 | lisbon party
329 | Mayor
330 | ----
331 | 0.105693768
332 | [0, 2]
333 | lisbon party
334 |
335 | ----
336 | 0.0989016354
337 | [0, 2]
338 | lisbon party
339 |
340 | ----
341 | 0.0974877736
342 | [0, 2]
343 | lisbon party
344 |
345 | ----
346 | 0.0974877736
347 | [0, 2]
348 | lisbon party
349 |
350 | ----
351 | 0.0974877736
352 | [0, 2]
353 | lisbon party
354 |
355 | ----
356 | 0.0974877736
357 | [0, 2]
358 | lisbon party
359 |
360 | ----
361 | 0.0859793517
362 | [0, 2]
363 | lisbon party
364 |
365 | ----
366 | 0.0791254163
367 | [0, 2]
368 | lisbon party
369 | Praça do Município
370 | ----
371 | 0.0778143857
372 | [0, 2]
373 | lisbon party
374 | 9
375 | ----
376 | 0.0778143857
377 | [0, 2]
378 | lisbon party
379 | W
380 | ----
381 | 0.0778143857
382 | [0, 2]
383 | lisbon party
384 | 8
385 | ----
386 | 0.0778143857
387 | [0, 2]
388 | lisbon party
389 | 18
390 | ----
391 | 0.0729892814
392 | [0, 2]
393 | lisbon party
394 | Lisbon
395 | ----
396 | 0.0729892814
397 | [0, 2]
398 | lisbon party
399 |
400 | ----
401 | 0.0722854064
402 | [0, 2]
403 | lisbon party
404 | 38
405 | ----
406 | 0.0722854064
407 | [0, 2]
408 | lisbon party
409 | 42
410 | ----
411 | 0.0722854064
412 | [0, 2]
413 | lisbon party
414 | N
415 | ----
416 | 0.0722854064
417 | [0, 2]
418 | lisbon party
419 | 29
420 |
421 |
422 | Apollo 14 astronauts
423 | does not exist in DBpedia
424 |
425 | cosmonauts
426 | German cities with more than 250000 inhabitants
427 | second highest mountain on Earth
428 | professional skateboarders from Sweden
429 | band leaders that play trumpet
430 | countries have more than ten caves
431 | mayor of Berlin ---> berlin mayor
432 |
433 | Formula 1 driver with the most races
434 | youngest player in the Premier League
435 | Methodist politicians
436 | People that were born in Vienna and died in Berlin
437 | number of times that Jane Fonda married
438 | companies in Munich
439 | professional surfers born in Australia
440 | countries connected by the Rhine
--------------------------------------------------------------------------------
/src/resourceGraph.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 |
3 | # Model class for resource elements
4 | class Resource:
5 | def __init__(self,uri,label,support,keyword):
6 | self.uri = uri # URI of the resource.
7 | self.label = label # Label of the resource
8 | self.support = int(support) # Importance: the number of incoming links to the resource in DBpedia
9 | self.keyword = keyword # Keyword represented by the resource
10 | self.colors = [] # Colors assigned
11 | self.score = 0
12 | self.isUri = False
13 |
14 | # Fact node model class.
15 | # A fact node represents an RDF triple.
16 | # In addition, we also maintain the keywords in the query that this fact node covers
17 | class FactNode:
18 | def __init__(self,subject,predicate,object):
19 | self.subject = subject # Subject of the fact node
20 | self.predicate = predicate # Predicate
21 | self.object = object # Object
22 | self.colors = [] # Colours
23 | self.children = [] # Child Nodes
24 | self.score = 0 # Score of the current fact node - this is a cumulative score
25 | self.isExplored = False # A boolean flag marking whether the current fact node has been explored during search
26 |
27 | # Used to add child node to current node
28 | def add_child(self, obj):
29 | self.children.append(obj)
30 |
31 | # Set colors of the fact node from the colors of subject , predicate and object resources
32 | # Eg.
33 | # Fact_node triple -> dbPedia:Bill_Gates dbPedia:spouse dbPedia:Melinda_Gates
34 | # dbPedia:Bill_Gates covers colors 2,3
35 | # dbPedia:spouse covers colours 1
36 | # dbPedia:Melinda_Gates covers 1,2,3
37 |
38 | # then the fact node covers 1,2,3
39 | def set_colors(self):
40 |
41 | for color in self.subject.colors:
42 | if(color not in self.colors):
43 | self.colors.append(color)
44 |
45 | for color in self.predicate.colors:
46 | if(color not in self.colors):
47 | self.colors.append(color)
48 |
49 | for color in self.object.colors:
50 | if(color not in self.colors):
51 | self.colors.append(color)
52 |
53 | # Resource Graph Model class
54 | # This graph has fact nodes as its nodes, which in turn contain Resources
55 | class ResourceGraph:
56 | def __init__(self,rootNode):
57 | self.rootNode = rootNode
58 |
59 |
60 |
--------------------------------------------------------------------------------
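A minimal check (not part of the repository) of FactNode.set_colors, using the Bill Gates example from the comment above; the URIs and colour numbers are purely illustrative:

    from resourceGraph import Resource, FactNode

    subj = Resource('<http://dbpedia.org/resource/Bill_Gates>', 'Bill Gates', 0, 'bill gates')
    subj.colors = [2, 3]
    pred = Resource('<http://dbpedia.org/ontology/spouse>', 'spouse', 0, 'spouse')
    pred.colors = [1]
    obj = Resource('<http://dbpedia.org/resource/Melinda_Gates>', 'Melinda Gates', 0, '')
    obj.colors = [1, 2, 3]

    fact = FactNode(subj, pred, obj)
    fact.set_colors()
    print(sorted(fact.colors))   # [1, 2, 3] - the union of the three colour sets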
/src/sparqlClient.py:
--------------------------------------------------------------------------------
1 | import json
2 | import inflection
3 | from SPARQLWrapper import SPARQLWrapper, JSON , XML
4 | from colorAssignment import ColorAssignment
5 | from wordSimilarity import WordSimilarity
6 | from collections import OrderedDict
7 | from resourceGraph import Resource
8 | from resourceGraph import FactNode
9 |
10 | # This represents a DBPedia triplet object
11 | class DBPediaTriplet:
12 | def __init__(self,subject,predicate,object):
13 | self.subject = subject
14 | self.object = object
15 | self.predicate = predicate
16 |
17 |
18 |
19 | # This represents the SPARQL querying engine
20 | class SparqlClient :
21 |
22 | def findAverageScorePhraseSentence(keyword,actualPredicateValue):
23 | score = 0
24 | count = 0
25 | for key1 in keyword.lower().split(' '):
26 | for key2 in actualPredicateValue.lower().split(' '):
27 | count+=1
28 | if(key1 == key2):
29 | score += 3.0
30 | else:
31 | similarityScore = WordSimilarity.isPredicateSimilar(key1,key2)
32 | if(similarityScore==-1):
33 | similarityScore = 0
34 | score += similarityScore
35 |
36 | if(count!=0):
37 | if(score==0):
38 | return -1
39 | else:
40 | return (score/count)
41 | else:
42 | return -1
43 |
44 | # This method is used to filter the predicates
45 | def filterPredicates(predicate,keywordList):
46 |
47 | # ignore list: predicate properties that we do not want to consider
48 | vocabDictionary = ['rdf-schema#comment','22-rdf-syntax-ns#type','abstract','owl#sameAs','subject']
49 |
50 | predicateList = []
51 |
52 | # from the predicate URI, just consider the property and ignore the vocabulary
53 | # http://dbpedia.org/property/name -----> consider 'name'
54 | predicateValue = predicate.split('/')[-1]
55 |
56 | # ignore if the predicate property is in vocab dictionary
57 | if(predicateValue in vocabDictionary):
58 | return predicateList
59 |
60 | # Boolean value indicating phrase sentence
61 | isPhraseSentence = False
62 |
63 | # Handles the camel case properties
64 | # camelCase properties are returned with words separated by _
65 | camelCaseValue = inflection.underscore(predicateValue)
66 | if '_' in camelCaseValue:
67 | isPhraseSentence = True
68 | else:
69 | isPhraseSentence = False
70 |
71 | predicateValues = camelCaseValue.split('_')
72 |
73 |
74 | # convert the underscore-separated value into a string separated by spaces
75 | actualPredicateValue = ''
76 | for value in predicateValues:
77 | actualPredicateValue = actualPredicateValue + ' ' + value
78 |
79 | actualPredicateValue = actualPredicateValue.strip()
80 |
81 |
82 | # iterate over each uncovered keyword and check if the predicate is semantically similar to the keyword
83 | for keyword in keywordList:
84 | # semantic similarity
85 | if(keyword.lower()==actualPredicateValue.lower()):
86 | score = 3.0
87 | #elif(isPhraseSentence):
88 | #score = SparqlClient.findAverageScorePhraseSentence(keyword,actualPredicateValue)
89 | #print('phrase'+str(score))
90 | else:
91 | score = WordSimilarity.isPredicateSimilar(keyword,actualPredicateValue)
92 | #print(' no phrase'+str(score))
93 |
94 |
95 |
96 | if(score!=-1):
97 | predicateObject = Resource('<'+predicate+'>',predicateValue,0,keyword)
98 |
99 | # bi-gram scenario
100 | individualKeyword = keyword.split(' ')
101 | for key in individualKeyword:
102 | predicateObject.colors.append(ColorAssignment.colorDictionary[key])
103 |
104 | predicateObject.score = score
105 | predicateObject.isUri = True
106 | predicateList.append(predicateObject)
107 |
108 | return predicateList
109 |
110 |
111 |
112 | # This method is used to get the list of keywords that is not covered by the current element
113 | def getUncoveredKeywords(colorList,biGramList):
114 | keywordList = []
115 |
116 | # Join the list to make it a single string
117 | pivotColors = ''.join(str(x) for x in colorList)
118 |
119 | # Suppose we want to explore uncovered bi-grams, include them in the list
120 | if(len(biGramList)>0):
121 | keywordList.extend(biGramList)
122 |
123 | # make use of the color dictionary to identify uncovered keywords
124 | for keyword,color in ColorAssignment.colorDictionary.items():
125 | if(str(color) not in pivotColors):
126 | keywordList.append(keyword)
127 |
128 | return keywordList
129 |
130 |
131 | def findObjectKeywordMatch(object):
132 |
133 | # get the object value
134 | objectVal = object.label
135 |
136 | # Join the list to make it a single string
137 | colors = ''.join(str(x) for x in object.colors)
138 |
139 | # make use of the color dictionary to identify uncovered keywords
140 | for keyword,color in ColorAssignment.colorDictionary.items():
141 | if(str(color) not in colors):
142 | if(keyword == objectVal):
143 | object.score = object.score + 3.0
144 | object.colors.append(color)
145 |
146 | return object
147 |
148 |
149 |
150 | # Returns the triples for the pivot element
151 | def getAllTripletsForPivotElement(resource,biGramList):
152 | print(' Exploring ... ')
153 | tripletList = []
154 | # Get the URI of the element
155 | pivotElement = resource.uri
156 | print(pivotElement)
157 | print('Current label : ' + resource.label)
158 |
159 | # Get a list of keywords that the current element does not cover
160 | keywordList = SparqlClient.getUncoveredKeywords(resource.colors,biGramList)
161 | print('Keywords yet to cover : ' + str(keywordList))
162 |
163 | # If the resource covers all keywords, stop exploring this node
164 | if(len(keywordList)==0):
165 | return tripletList
166 |
167 |
168 | sparql = SPARQLWrapper("http://dbpedia.org/sparql") # Assigns an endpoint
169 | sparql.setReturnFormat(JSON) # Sets the return format to be json
170 | # Queries the endpoint to retrieve all the triples that have the pivot element as subject
171 | sparql.setQuery("""
172 | PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
173 | SELECT ?p ?o
174 | WHERE { """ + pivotElement + """ ?p ?o
175 | }
176 | """)
177 |
178 | try:
179 | results = sparql.query().convert()
180 | except Exception as e:
181 | print(e)
182 | print(' DBPedia is down for maintenance') # Exception
183 | return tripletList
184 |
185 |
186 | # Find predicates that are semantically similar to uncovered keywords
187 | for result in results["results"]["bindings"]:
188 |
189 | # Considering only 'en' language
190 | if(result["o"]["type"]!= 'uri' ):
191 | if("xml:lang" in result['o'] and result["o"]["xml:lang"]!='en'):
192 | continue
193 |
194 |
195 | # Get the sematically similar predicates
196 | predicateList = SparqlClient.filterPredicates(result["p"]["value"],keywordList)
197 |
198 | if len(predicateList)!=0:
199 | for predicate in predicateList:
200 |
201 | isUri = False
202 | objectval = result["o"]["value"]
203 |
204 | # form the URI if object is of type URI
205 | if(result["o"]["type"]=='uri'):
206 | isUri = True
207 | objectval = '<'+objectval+'>'
208 |
209 | # merge the keywords covered by the subject and the predicate, avoiding duplicates
210 | coveredKeywords = []
211 | coveredKeywords.extend(resource.keyword.split(' '))
212 | for x in predicate.keyword.split(' '):
213 | if x not in coveredKeywords:
214 | coveredKeywords.append(x)
215 | 
216 | coveredKeywords = ' '.join(str(x) for x in coveredKeywords)
217 | 
218 | object = Resource(objectval,result["o"]["value"].split('/')[-1],0,coveredKeywords)
219 |
220 | # set the properties and form the fact node
221 | if(isUri):
222 | object.isUri = True
223 |
224 | object.score = resource.score + predicate.score
225 | for color in resource.colors:
226 | if(color not in object.colors):
227 | object.colors.append(color)
228 |
229 | for color in predicate.colors:
230 | if(color not in object.colors):
231 | object.colors.append(color)
232 |
233 | object = SparqlClient.findObjectKeywordMatch(object)
234 |
235 | factNodeObj = FactNode(resource,predicate,object)
236 | factNodeObj.score = object.score
237 | factNodeObj.set_colors()
238 | tripletList.append(factNodeObj)
239 | '''
240 | else:
241 |
242 | objectList = SparqlClient.filterPredicates(result["o"]["value"],keywordList)
243 |
244 | for objectResource in objectList:
245 |
246 | isUri = False
247 | predicateVal = '<'+result["p"]["value"]+'>'
248 |
249 | # remove duplicated keyword scenario
250 | set = []
251 | set.extend(resource.keyword.split(' '))
252 | for x in objectResource.keyword.split(' '):
253 | if x not in set:
254 | set.append(x)
255 |
256 | set = ' '.join(str(x) for x in set)
257 |
258 | predicate = Resource(predicateVal,result["p"]["value"].split('/')[-1],0,set)
259 |
260 | # set the properties and form the fact node
261 | predicate.isUri = True
262 |
263 | object.score = resource.score + object.score
264 | for color in resource.colors:
265 | if(color not in object.colors):
266 | object.colors.append(color)
267 |
268 | for color in predicate.colors:
269 | if(color not in object.colors):
270 | object.colors.append(color)
271 |
272 | object = SparqlClient.findObjectKeywordMatch(object)
273 |
274 | factNodeObj = FactNode(resource,predicate,object)
275 | factNodeObj.score = object.score
276 | factNodeObj.set_colors()
277 | tripletList.append(factNodeObj)
278 | '''
279 | # Return the collected fact nodes (ranking happens later, in rankResults)
280 | return tripletList
281 |
282 |
--------------------------------------------------------------------------------
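The predicate normalisation in filterPredicates above boils down to the following steps (a standalone sketch, assuming the inflection package is installed, as imported at the top of sparqlClient.py; the predicate URI is illustrative):

    import inflection

    predicate = 'http://dbpedia.org/ontology/birthPlace'   # illustrative predicate URI
    predicateValue = predicate.split('/')[-1]               # 'birthPlace'
    snake = inflection.underscore(predicateValue)           # 'birth_place'
    actualPredicateValue = ' '.join(snake.split('_'))       # 'birth place'
    print(actualPredicateValue)

The space-separated form is what gets compared against the uncovered query keywords, either by exact match or via WordSimilarity.isPredicateSimilar.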
/src/testSparqlEndPoint.py:
--------------------------------------------------------------------------------
1 | import urllib.request
2 | import sys
3 | import json
4 | import math
5 | from urllib.parse import quote
6 | from SPARQLWrapper import SPARQLWrapper, JSON , XML
7 |
8 | sparql = SPARQLWrapper("http://dbpedia.org/sparql") # Assigns an endpoint
9 | sparql.setReturnFormat(JSON) # Sets the return format to be json
10 | # Queries the endpoint to retrieve all the triples that have the pivot element as subject
11 | pivotElement = ''
12 |
13 |
14 | sparql.setQuery("""
15 | PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
16 | SELECT ?p ?label
17 | WHERE { """ + pivotElement + """ ?p ?label
18 | }
19 | """)
20 | '''
21 | sparql.setQuery("""
22 | PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
23 | SELECT ?label
24 | WHERE {
25 | rdfs:label ?label .
26 | }
27 | """)
28 | '''
29 |
30 | try:
31 | results = sparql.query().convert()
32 | except Exception as e:
33 | print(e)
34 | print(' DBPedia is down for maintenance')
35 | exit(3)
36 |
37 | # Find predicates that are semantically similar to uncovered keywords
38 | for result in results["results"]["bindings"]:
39 |
40 | # Considering only 'en' language
41 | print(result["label"]["value"])
42 |
43 |
44 | print('done')
45 |
--------------------------------------------------------------------------------
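testSparqlEndPoint.py leaves pivotElement empty; to actually run it, set it to a bracketed DBpedia IRI, for example (illustrative value only, taken from the Lisbon query explored in queries.txt):

    pivotElement = '<http://dbpedia.org/resource/Lisbon>'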
/src/wordSimilarity.py:
--------------------------------------------------------------------------------
1 | import urllib.request
2 | import sys
3 | import json
4 | import math
5 | from urllib.parse import quote
6 | from threading import Thread
7 |
8 | class WordSimilarity:
9 |
10 | scoreDictionary = {}
11 | scoreDictionary['esa'] = 0
12 | scoreDictionary['swoogle'] = 0
13 |
14 | # 1 - EasyESA client
15 | # a score of 1 or -1 indicates a perfect match
16 | # threshold values to consider: 0.07, 0.052 and 0.04
17 | def getEasyESAScore(word1,word2):
18 |
19 | WordSimilarity.scoreDictionary['esa'] = 0
20 | url = "http://vmdeb20.deri.ie:8890/esaservice?task=esa&term1="+quote(word1)+'&term2='+quote(word2)
21 | try:
22 | request = urllib.request.Request(url)
23 | response = urllib.request.urlopen(request)
24 | score = str(response.read().decode('utf-8')).replace('\"','')
25 | WordSimilarity.scoreDictionary['esa'] = float(score)
26 | except Exception as e:
27 | WordSimilarity.scoreDictionary['esa'] = 0
28 |
29 | # 2 - ws4j client
30 | def getWs4jScore(word1,word2):
31 | url = "http://ws4jdemo.appspot.com/ws4j?measure=wup&args="+quote(word1)+"%3A%3A"+quote(word2)
32 | request = urllib.request.Request(url)
33 | request.add_header('Accept', 'application/json')
34 | response = urllib.request.urlopen(request)
35 | responseStr = response.read().decode('utf-8')
36 | # fetch json from the response
37 | jsonStr = json.loads(responseStr)
38 | score = float(jsonStr['result'][0]['score'])
39 | return score
40 |
41 | # 3 - UMBC Semantic Similarity service
42 | #
43 | # Documentation available at http://swoogle.umbc.edu/SimService/api.html
44 | def getSwoogleScore(word1,word2):
45 | WordSimilarity.scoreDictionary['swoogle'] = 0
46 | url = "http://swoogle.umbc.edu/StsService/GetStsSim?operation=api&phrase1="+quote(word1)+'&phrase2='+quote(word2)
47 | try:
48 | request = urllib.request.Request(url)
49 | response = urllib.request.urlopen(request)
50 | score = str(response.read().decode('utf-8')).replace('\"','')
51 | score = float(score)
52 | WordSimilarity.scoreDictionary['swoogle'] = score
53 | except Exception as e:
54 | WordSimilarity.scoreDictionary['swoogle'] = 0
55 |
56 |
57 | # Combines the EasyESA and UMBC Swoogle scores (the two requests run in parallel threads below).
58 | # Call the ws4j client (method 2) if needed.
59 | # For ESA, a score of 1 or -1 indicates a perfect match;
60 | # threshold values to consider: 0.07, 0.052 and 0.04
61 | def isPredicateSimilar(word1,word2):
62 | #score = math.fabs(WordSimilarity.getEasyESAScore(word1,word2))
63 |
64 | esaThread = Thread(target=WordSimilarity.getEasyESAScore, args=(word1,word2,))
65 | swoogleThread = Thread(target=WordSimilarity.getSwoogleScore, args=(word1,word2,))
66 |
67 | esaThread.start()
68 | swoogleThread.start()
69 | esaThread.join()
70 | swoogleThread.join()
71 |
72 | ESAscore = WordSimilarity.scoreDictionary['esa']
73 | #WordSimilarity.getEasyESAScore(word1,word2)
74 | ESAScaledScore = 0
75 | if(ESAscore>0 and ESAscore<=0.04):
76 | ESAScaledScore = 1
77 | elif(ESAscore>0.04 and ESAscore<=0.07):
78 | ESAScaledScore = 2
79 | elif(ESAscore>0.07):
80 | ESAScaledScore = 3
81 | else:
82 | ESAScaledScore = 0
83 |
84 | SwoogleScore = WordSimilarity.scoreDictionary['swoogle']
85 | # WordSimilarity.getSwoogleScore(word1,word2)
86 | SwoogleScaledScore = 0
87 | if(SwoogleScore>0 and SwoogleScore<0.6):
88 | SwoogleScaledScore = 1
89 | elif(SwoogleScore>=0.6 and SwoogleScore<0.7):
90 | SwoogleScaledScore = 2
91 | elif(SwoogleScore>=0.7):
92 | SwoogleScaledScore = 3
93 | else:
94 | SwoogleScaledScore = 0
95 |
96 | if(ESAScaledScore>SwoogleScaledScore):
97 | score = ESAScaledScore
98 | else:
99 | score = SwoogleScaledScore
100 |
101 | if(score>=2):
102 | return score
103 | else:
104 | return -1
105 |
--------------------------------------------------------------------------------
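A minimal usage sketch of the similarity check; both the EasyESA and UMBC STS endpoints are external research services and may be slow or unavailable, in which case the individual scores silently fall back to 0:

    from wordSimilarity import WordSimilarity

    score = WordSimilarity.isPredicateSimilar('spouse', 'husband')
    print(score)   # scaled score (2 or 3) if the services judge the words similar, otherwise -1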
/src/ws4j.py:
--------------------------------------------------------------------------------
1 | import urllib.request
2 | import sys
3 | import json
4 | from urllib.parse import quote
5 |
6 | word1 = input(" Enter the word1 : ")
7 | word2 = input(" Enter the word2 : ")
8 |
9 |
10 | #url = "http://spotlight.dbpedia.org/rest/annotate?types=DBPedia:Person&text="+sentence+"&confidence=0.2&support=20"
11 | url = "http://ws4jdemo.appspot.com/ws4j?measure=wup&args="+word1+"%3A%3A"+word2
12 | #print(url)
13 | request = urllib.request.Request(url)
14 | request.add_header('Accept', 'application/json')
15 | response = urllib.request.urlopen(request)
16 |
17 | responseStr = response.read().decode('utf-8')
18 |
19 | # fetch json from the response
20 | jsonStr = json.loads(responseStr)
21 |
22 | print(jsonStr['result'][0]['score'])
23 |
--------------------------------------------------------------------------------