├── LICENSE ├── README.md ├── __init__.py ├── pymln.py ├── semantic ├── .DS_Store ├── MLN │ ├── .DS_Store │ ├── __init__.py │ ├── __pycache__ │ │ └── __init__.cpython-36.pyc │ └── src │ │ ├── Argument.py │ │ ├── Clust.py │ │ ├── MLN.py │ │ └── Part.py ├── Parse.py ├── __init__.py ├── __pycache__ │ ├── Agenda.cpython-36.pyc │ ├── Argument.cpython-36.pyc │ ├── Clust.cpython-36.pyc │ ├── Executor.cpython-36.pyc │ ├── MLN.cpython-36.pyc │ ├── Parse.cpython-36.pyc │ ├── Part.cpython-36.pyc │ ├── Scorer.cpython-36.pyc │ └── __init__.cpython-36.pyc └── src │ ├── Agenda.py │ ├── Executor.py │ ├── Scorer.py │ └── SearchOp.py ├── syntax ├── .DS_Store ├── Nodes │ ├── Article.py │ ├── Sentence.py │ ├── Token.py │ ├── TreeNode.py │ └── __pycache__ │ │ ├── Article.cpython-36.pyc │ │ ├── Sentence.cpython-36.pyc │ │ ├── Token.cpython-36.pyc │ │ └── TreeNode.cpython-36.pyc ├── Relations │ ├── .DS_Store │ ├── __init__.py │ ├── __pycache__ │ │ └── __init__.cpython-36.pyc │ └── src │ │ ├── ArgType.py │ │ ├── Path.py │ │ └── RelType.py ├── StanfordParseReader.py ├── __init__.py └── __pycache__ │ ├── Article.cpython-36.pyc │ ├── Path.cpython-36.pyc │ ├── RelType.cpython-36.pyc │ ├── Sentence.cpython-36.pyc │ ├── StanfordParseReader.cpython-36.pyc │ ├── Token.cpython-36.pyc │ ├── TreeNode.cpython-36.pyc │ └── __init__.cpython-36.pyc └── utils ├── Utils.py ├── __init__.py └── __pycache__ ├── Utils.cpython-36.pyc └── __init__.cpython-36.pyc /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Gallup Government, Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pymln 2 | Python implementation of unsupervised semantic parsing and markov logic network knowledgebase induction. This work is funded through DARPA’s ASKE program () as part of Gallup's MULTIVAC project. 3 | 4 | ## This is a work in progress. 5 | 6 | This software is derived from the USP (Beta Version) Software by the University of Washington, available here: http://alchemy.cs.washington.edu/usp/ 7 | 8 | 9 | 10 | All of the documentation and software included in the USP (Beta Version) Software is copyrighted by Hoifung Poon and Pedro Domingos. 11 | 12 | 13 | Copyright [2009-11] Hoifung Poon and Pedro Domingos. All rights reserved. 14 | 15 | 16 | Contact: Hoifung Poon (hoifung.poon@gmail.com). 17 | 18 | 19 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 20 | 21 | 22 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 23 | 24 | 25 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 26 | 27 | 28 | 3. All advertising materials mentioning features or use of this software must display the following acknowledgment: "This product includes software developed by Hoifung Poon and Pedro Domingos in the Department of Computer Science and Engineering at the University of Washington". 29 | 30 | 31 | 4. Your publications acknowledge the use or contribution made by the Software to your research using the following citation(s): 32 | 33 | Hoifung Poon and Pedro Domingos (2009). "Unsupervised Semantic Parsing", in Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP), 2009. http://alchemy.cs.washington.edu/usp. 34 | 35 | 36 | 5. Neither the name of the University of Washington nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 37 | 38 | 39 | THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF WASHINGTON AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF WASHINGTON OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 40 | 41 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 4 | # Python implementation of Unsupervised Semantic Parsing system, from: 5 | # 6 | # Hoifung Poon and Pedro Domingos (2009). "Unsupervised Semantic Parsing", 7 | # in Proceedings of the Conference on Empirical Methods in Natural Language 8 | # Processing (EMNLP), 2009. http://alchemy.cs.washington.edu/usp. 9 | # 10 | 11 | -------------------------------------------------------------------------------- /pymln.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 4 | # Python implementation of Unsupervised Semantic Parsing system, from: 5 | # 6 | # Hoifung Poon and Pedro Domingos (2009). "Unsupervised Semantic Parsing", 7 | # in Proceedings of the Conference on Empirical Methods in Natural Language 8 | # Processing (EMNLP), 2009. http://alchemy.cs.washington.edu/usp. 9 | # 10 | 11 | import argparse 12 | import os 13 | 14 | from semantic import Parse 15 | from semantic.MLN import MLN 16 | 17 | def read_input_files(DIR): 18 | '''Read files given by list of names ''' 19 | files = set() 20 | for file in os.listdir(DIR): 21 | if file.endswith(".dep"): 22 | files.add(file) 23 | 24 | return files 25 | 26 | 27 | def run(params): 28 | if os.path.isabs(params['data_dir']): 29 | data_dir = params['data_dir'] 30 | else: 31 | data_dir = os.path.join(os.getcwd(), params['data_dir']) 32 | 33 | if os.path.isabs(params['results_dir']): 34 | results_dir = params['results_dir'] 35 | else: 36 | results_dir = os.path.join(os.getcwd(), params['results_dir']) 37 | 38 | priorNumParam = params['priorNumParam'] 39 | priorNumConj = params['priorNumConj'] 40 | 41 | parser = Parse.Parse() 42 | 43 | # Get files 44 | input_files = read_input_files(data_dir) 45 | 46 | # Parse files into MLN knowledge base 47 | parser.parse(input_files) 48 | 49 | # Save knowledge base files to disk 50 | MLN.printModel(results_dir) 51 | 52 | return None 53 | 54 | 55 | if __name__ == '__main__': 56 | prs = argparse.ArgumentParser(description='Parse scientific articles into' 57 | ' Markov Logic Network knowledge base. \n' 58 | 'Usage: python -m pymln.py [-d dataDir] ' 59 | '[-r resultDir] [-p priorNumParam] [-c ' 60 | 'priorNumConj]') 61 | prs.add_argument('-d', '--data_dir', 62 | help='Directory of source files. If not specified, ' 63 | 'defaults to the current working directory.') 64 | prs.add_argument('-r', '--results_dir', 65 | help='Directory to save results files. If not specified,' 66 | ' defaults to the current working directory.') 67 | prs.add_argument('-p', '--priorNumParam', 68 | help='Prior on parameter number. If not specified,' 69 | ' defaults to 5.') 70 | prs.add_argument('-c', '--priorNumConj', 71 | help='Prior on number of conjunctive parts assigned to ' 72 | 'same cluster. If not specified, defaults to 10.') 73 | 74 | args = vars(prs.parse_args()) 75 | 76 | # Default argument values 77 | params = {'priorNumParam': 5, 'priorNumConj': 10, 'data_dir': os.getcwd(), 78 | 'results_dir': os.getcwd()} 79 | 80 | # If specified in call, override defaults 81 | for par in params: 82 | if args[par] is not None: 83 | params[par] = args[par] 84 | 85 | run(params) 86 | 87 | 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /semantic/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/semantic/.DS_Store -------------------------------------------------------------------------------- /semantic/MLN/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/semantic/MLN/.DS_Store -------------------------------------------------------------------------------- /semantic/MLN/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from syntax.Relations import RelType 3 | 4 | 5 | class MLN(object): 6 | def __init__(self): 7 | return None 8 | 9 | 10 | class Argument(object): 11 | def __init__(self, argNode, path, argPart): 12 | self._argNode = argNode 13 | self._path = path 14 | self._argPart = argPart 15 | 16 | return None 17 | 18 | def getPath(self): 19 | return self._path 20 | 21 | def getPart(self): 22 | return self._argPart 23 | 24 | def getNode(self): 25 | return self._argNode 26 | 27 | 28 | # 29 | # Part class 30 | # 31 | 32 | class Part(object): 33 | # dictionary mapping {str: Part} 34 | rootNodeId_part = {} 35 | # dictionary mapping {int: set(str)} 36 | clustIdx_partRootNodeIds = {} 37 | # dictionary mapping {(int, int): set((str, str))} 38 | pairClustIdxs_pairPartRootNodeIds = {} 39 | # dictionary mapping {int: set((int, int))} 40 | clustIdx_pairClustIdxs = {} 41 | 42 | def __init__(self, relTreeRoot): 43 | self._isDebug = False 44 | 45 | self._relTreeRoot = relTreeRoot 46 | self._relTypeIdx = RelType.getRelType(relTreeRoot) 47 | self._clustIdx = -1 48 | self._nxtArgIdx = 0 # Remember next index because _args should be ordered Dict 49 | 50 | self._parPart = None 51 | self._parArgIdx = -1 52 | 53 | # Dictionary mapping {int: Argument} 54 | self._args = {} 55 | # Dictionary mapping {int: int} 56 | self._argIdx_argClustIdx = {} 57 | # Dictionary mapping {int: set(int)} 58 | self._argClustIdx_argIdxs = {} 59 | 60 | return None 61 | 62 | def addArgument(self, arg): 63 | argIdx = self._nxtArgIdx + 1 64 | self._args[argIdx] = arg 65 | 66 | return argIdx 67 | 68 | def changeClust(self, newClustIdx, newRelTypeIdx, clust_only=False): 69 | oldClustIdx = self.getClustIdx() 70 | rootID = self.getRelTreeRoot().getId() 71 | Part.clustIdx_partRootNodeIds[oldClustIdx].remove(rootID) 72 | 73 | if clust_only: 74 | self._relTypeIdx = newRelTypeIdx 75 | else: 76 | ocl = Clust.getClust(oldClustIdx) 77 | ocl.onPartUnsetClust(self) 78 | self.setRelTypeIdx(newRelTypeIdx) 79 | 80 | self.setClust(newClustIdx, clust_only=clust_only) 81 | 82 | parent = self.getParPart() 83 | 84 | if parent is None: 85 | if newClustIdx in Clust.clustIdx_rootCnt: 86 | Clust.clustIdx_rootCnt[newClustIdx] += 1 87 | else: 88 | Clust.clustIdx_rootCnt[newClustIdx] = 1 89 | Clust.clustIdx_rootCnt[newClustIdx] -= 1 90 | else: 91 | parent_clust_id = parent.getClustIdx() 92 | paci = parent.getArgClust(self.getParArgIdx()) 93 | pcl = Clust.getClust(parent_clust_id) 94 | pac = pcl._argClusts[paci] 95 | pac._chdClustIdx_cnt[oldClustIdx] -= 1 96 | 97 | if newClustIdx in pac._chdClustIdx_cnt: 98 | pac._chdClustIdx_cnt[newClustIdx] += 1 99 | else: 100 | pac._chdClustIdx_cnt[newClustIdx] = 1 101 | 102 | pa = (parent_clust_id, paci) 103 | Clust.clustIdx_parArgs[oldClustIdx][pa] -= 1 104 | 105 | if newClustIdx not in Clust.clustIdx_parArgs: 106 | Clust.clustIdx_parArgs[newClustIdx] = {} 107 | 108 | if pa in Clust.clustIdx_parArgs[newClustIdx]: 109 | Clust.clustIdx_parArgs[newClustIdx][pa] += 1 110 | else: 111 | Clust.clustIdx_parArgs[newClustIdx][pa] = 1 112 | 113 | opci = (parent_clust_id, oldClustIdx) 114 | npci = (parent_clust_id, newClustIdx) 115 | ptnid = (parent.getRelTreeRoot().getId(), rootID) 116 | 117 | Part.pairClustIdxs_pairPartRootNodeIds[opci].remove(ptnid) 118 | 119 | if len(Part.pairClustIdxs_pairPartRootNodeIds[opci]) == 0: 120 | Part.clustIdx_pairClustIdxs[oldClustIdx].remove(opci) 121 | Part.clustIdx_pairClustIdxs[parent_clust_id].remove(opci) 122 | 123 | if npci in Part.pairClustIdxs_pairPartRootNodeIds: 124 | Part.pairClustIdxs_pairPartRootNodeIds[npci].add(ptnid) 125 | else: 126 | Part.pairClustIdxs_pairPartRootNodeIds[npci] = set(ptnid) 127 | 128 | Part.clustIdx_pairClustIdxs[parent_clust_id] = npci 129 | if newClustIdx in Part.clustIdx_pairClustIdxs: 130 | Part.clustIdx_pairClustIdxs[newClustIdx].add(npci) 131 | else: 132 | Part.clustIdx_pairClustIdxs[newClustIdx] = set(npci) 133 | 134 | return None 135 | 136 | def changeClustRemap(self, newClustIdx, argClustIdx_newArgClustIdx, clust_only=False): 137 | 138 | if not clust_only: 139 | oldClustIdx = self.getClustIdx() 140 | ocl = Clust.getClust(oldClustIdx) 141 | 142 | self.changeClust(newClustIdx, self.getRelTypeIdx(), clust_only=clust_only) 143 | 144 | argIdx_newArgClustIdx = {} 145 | 146 | for ai, arg in self._args.items(): 147 | oaci = self._argIdx_argClustIdx.pop(ai) 148 | self._argClustIdx_argIdxs[oaci].remove(ai) 149 | 150 | if len(self._argClustIdx_argIdxs[oaci]) == 0: 151 | del self._argClustIdx_argIdxs[oaci] 152 | 153 | argIdx_newArgClustIdx[ai] = argClustIdx_newArgClustIdx[oaci] 154 | 155 | if not clust_only: 156 | ocl.onPartUnsetArg(this, arg, oaci) 157 | 158 | for ai in self._args: 159 | aci = argIdx_newArgClustIdx[ai] 160 | self.setArgClust(ai, aci, clust_only=clust_only) 161 | 162 | return None 163 | 164 | def getArgument(self, argIdx): 165 | return self._args[argIdx] 166 | 167 | def getArguments(self): 168 | return self._args 169 | 170 | def getArgClust(self, argIdx): 171 | if argIdx in self._argIdx_argClustIdx: 172 | return self._argIdx_argClustIdx[argIdx] 173 | else: 174 | return None 175 | 176 | def getParArgIdx(self): 177 | return self._parArgIdx 178 | 179 | def getClustIdx(self): 180 | return self._clustIdx 181 | 182 | def getClustPartRootNodeIds(): 183 | return Part.clustIdx_partRootNodeIds 184 | 185 | def getParArgIdx(self): 186 | return self._parArgIdx 187 | 188 | def getPairPartRootNodeIds(parClustIdx=None, chdClustIdx=None): 189 | if parClustIdx is None or chdClustIdx is None: 190 | return Part.pairClustIdxs_pairPartRootNodeIds 191 | else: 192 | return Part.pairClustIdxs_pairPartRootNodeIds[(parClustIdx, 193 | chdClustIdx)] 194 | 195 | def getParPart(self): 196 | return self._parPart 197 | 198 | def getPartByRootNodeId(rnId): 199 | if rnId in Part.rootNodeId_part: 200 | return Part.rootNodeId_part[rnId] 201 | else: 202 | return None 203 | 204 | def getPartRootNodeIds(clustIdx): 205 | if clustIdx in Part.clustIdx_partRootNodeIds: 206 | return Part.clustIdx_partRootNodeIds[clustIdx] 207 | else: 208 | return None 209 | 210 | def getRelTreeRoot(self): 211 | return self._relTreeRoot 212 | 213 | def getRelTypeIdx(self): 214 | return self._relTypeIdx 215 | 216 | def removeArgument(self, argIdx, clust_only=False): 217 | arg = self.getArgument(argIdx) 218 | 219 | oldArgClustIdx = self._argIdx_argClustIdx.pop(argIdx) 220 | self._argClustIdx_argIdxs[oldArgClustIdx].remove(argIdx) 221 | 222 | if len(self._argClustIdx_argIdxs[oldArgClustIdx]) == 0: 223 | self._argClustIdx_argIdxs.remove(oldArgClustIdx) 224 | 225 | if not clust_only: 226 | cl = Clust.getClust(self.getClustIdx()) 227 | cl.onPartUnsetArg(self, arg, oldArgClustIdx) 228 | 229 | del self._args[argIdx] 230 | 231 | return None 232 | 233 | 234 | def setArgClust(self, argIdx, argClustIdx, clust_only=False): 235 | oldArgClustIdx = -1 236 | 237 | if argIdx in self._argIdx_argClustIdx: 238 | oldArgClustIdx = self.getArgClust(argIdx) 239 | 240 | if oldArgClustIdx != argClustIdx: 241 | self._argIdx_argClustIdx[argIdx] = argClustIdx 242 | 243 | if argClustIdx in self._argClustIdx_argIdxs: 244 | self._argClustIdx_argIdxs[argClustIdx].add(argIdx) 245 | else: 246 | self._argClustIdx_argIdxs[argClustIdx] = set(argIdx) 247 | 248 | arg = self.getArgument(argIdx) 249 | 250 | if not clust_only: 251 | cl = Clust.getClust(self.getClustIdx()) 252 | 253 | if oldArgClustIdx < 0: 254 | if not clust_only: 255 | cl.onPartSetArg(self, arg, argClustIdx) 256 | else: 257 | self._argClustIdx_argIdxs[oldArgClustIdx].remove(argIdx) 258 | 259 | if len(self._argClustIdx_argIdxs[oldArgClustIdx]) == 0: 260 | self._argClustIdx_argIdxs.remove(oldArgClustIdx) 261 | 262 | if not clust_only: 263 | cl.onPartSetArg(self, arg, argClustIdx, oldArgClustIdx) 264 | 265 | return None 266 | 267 | def setClust(self, clustIdx, clust_only=False): 268 | self._clustIdx = clustIdx 269 | rootID = self.getRelTreeRoot().getId() 270 | 271 | if clustIdx in Part.clustIdx_partRootNodeIds: 272 | Part.clustIdx_partRootNodeIds[clustIdx].add(rootID) 273 | else: 274 | Part.clustIdx_partRootNodeIds[clustIdx] = set(rootID) 275 | 276 | if not clust_only: 277 | cl = Clust.getClust(clustIdx) 278 | cl.onPartSetClust(self) 279 | 280 | return None 281 | 282 | def setParent(self, parPart, parArgIdx): 283 | ''' 284 | Unset previous parent if it exists 285 | ''' 286 | if self.getParPart() is not None: 287 | self.unsetParent() 288 | 289 | self._parPart = parPart 290 | self._parArgIdx = parArgIdx 291 | clustIdx = self.getClustIdx() 292 | parClustID = parPart.getClustIdx() 293 | 294 | assert (parClustID >= 0) & (clustIdx >= 0) 295 | 296 | pcci = (parClustID, clustIdx) 297 | 298 | if parClustID in Part.clustIdx_pairClustIdxs: 299 | Part.clustIdx_pairClustIdxs[parClustID].add(pcci) 300 | else: 301 | Part.clustIdx_pairClustIdxs[parClustID] = set(pcci) 302 | 303 | pids = (parPart.getRelTreeRoot().getId(), self.getRelTreeRoot().getId()) 304 | 305 | if pcci in Part.pairClustIdxs_pairPartRootNodeIds: 306 | Part.pairClustIdxs_pairPartRootNodeIds[pcci].add(pids) 307 | else: 308 | Part.pairClustIdxs_pairPartRootNodeIds[pcci] = set(pids) 309 | 310 | if parPart is not None: 311 | arg = parPart.getArgument(parArgIdx) 312 | dep = arg._path.getDep() 313 | 314 | if (parClustID != clustIdx) & dep.startswith('conj_'): 315 | if parClustID < clustIdx: 316 | pci = pcci 317 | else: 318 | pci = (pcci[1], pcci[0]) 319 | 320 | if pci not in Clust._pairClustIdxs_conjCnt: 321 | Clust.pairClustIdxs_conjCnt[pci] = 1 322 | else: 323 | Clust.pairClustIdxs_conjCnt[pci] += 1 324 | 325 | return None 326 | 327 | def setRelTypeIdx(self, newRelTypeIdx): 328 | self._relTypeIdx = newRelTypeIdx 329 | cl = Clust.getClust(self._clustIdx) 330 | cl.onPartSetRelTypeIdx(newRelTypeIdx) 331 | 332 | return None 333 | 334 | def unsetArgClust(self, argIdx, clust_only=False): 335 | oldArgClustIdx = self._argIdx_argClustIdx.pop(argIdx) 336 | arg = self.getArgument(argIdx) 337 | self._argClustIdx_argIdxs[oldArgClustIdx].remove(argIdx) 338 | 339 | if len(self._argClustIdx_argIdxs[oldArgClustIdx]) == 0: 340 | self._argClustIdx_argIdxs.remove(oldArgClustIdx) 341 | 342 | if not clust_only: 343 | cl = Clust.getClust(self.getClustIdx()) 344 | cl.onPartUnsetArg(self, arg, oldArgClustIdx) 345 | 346 | return None 347 | 348 | def unsetParent(self): 349 | ''' 350 | Remove parent-child cluster index information 351 | Remove parent-child relationship index information 352 | NEEDS ADDITIONAL FACTORING - where does Cluster come from? 353 | ''' 354 | parent = self.getParPart() 355 | clustIdx = self.getClustIdx() 356 | 357 | if parent is not None: 358 | parClustID = parent.getClustIdx() 359 | 360 | pcci = (parClustID, clustIdx) 361 | Part.clustIdx_pairClustIdxs[parClustID].remove(pcci) 362 | 363 | pids = (parent.getRelTreeRoot().getId(), 364 | self.getRelTreeRoot().getId()) 365 | Part.pairClustIdxs_pairPartRootNodeIds[pcci].remove(pids) 366 | 367 | arg = parent.getArgument(self.getParArgIdx()) 368 | dep = arg._path.getDep() 369 | 370 | if (parClustID != clustIdx) & dep.startswith('conj_'): 371 | if parClustID < clustIdx: 372 | pci = pcci 373 | else: 374 | pci = (pcci[1], pcci[0]) 375 | 376 | if pci in Clust._pairClustIdxs_conjCnt: 377 | Clust.pairClustIdxs_conjCnt[pci] -= 1 378 | if Clust.pairClustIdxs_conjCnt[pci] == 0: 379 | del Clust.pairClustIdxs_conjCnt[pci] 380 | 381 | return None 382 | 383 | def unsetRelTypeIdx(self): 384 | old_type = self._relTypeIdx 385 | cl = Clust.getClust(self._clustIdx) 386 | cl.onPartUnsetRelTypeIdx(old_type) 387 | 388 | return None 389 | 390 | 391 | # 392 | # Clust 393 | # 394 | 395 | class Clust(object): 396 | whereasClustIdx = -1 397 | nxtClustIdx = 1 398 | ttlRootCnt = 0 399 | 400 | # Dictionary mapping 401 | pairClustIdx_conjCnt = {} 402 | # Dictionary mapping {int: {(int, int): int}} 403 | clustIdx_parArgs = {} 404 | # Dictionary mapping {int: int} 405 | clustIdx_rootCnt = {} 406 | # Dictionary mapping {str: int} 407 | argComb_cnt = {} 408 | # Dictionary mapping {int: set(str)} 409 | clustIdx_argCombs = {} 410 | # Dictionary mapping {int: Clust} 411 | clusts = {} 412 | # Dictionary mapping {int: set(int)} 413 | relTypeIdx_clustIdx = {} 414 | 415 | def __init__(self): 416 | self._isDebug = False 417 | self._isStop = False 418 | self._clustIdx = -1 419 | self._ttlCnt = 0 420 | self._nxtArgClustIdx = 0 421 | self._type = '' 422 | 423 | # Dictionary mapping {int: int} 424 | self._relTypeIdx_cnt = {} 425 | # Dictionary mapping {int: set(int)} 426 | self._argTypeIdx_argClustIdxs = {} 427 | # Dictionary mapping {int: ArgClust} 428 | self._argClusts = {} 429 | 430 | def incRootCnt(self): 431 | Clust.ttlRootCnt += 1 432 | 433 | if self.getId() not in Clust.clustIdx_rootCnt: 434 | Clust.clustIdx_rootCnt[self.getId()] = 1 435 | else: 436 | Clust.clustIdx_rootCnt[self.getId()] += 1 437 | 438 | return None 439 | 440 | def decRootCnt(self): 441 | Clust.ttlRootCnt -= 1 442 | 443 | Clust.clustIdx_rootCnt[self.getId()] -= 1 444 | 445 | if Clust.clustIdx_rootCnt[self.getId()] == 0: 446 | del Clust.clustIdx_rootCnt[self.getId()] 447 | 448 | return None 449 | 450 | def onPartUnsetRelTypeIdx(self, oldRelTypeIdx): 451 | self._relTypeIdx_cnt[oldRelTypeIdx] -= 1 452 | return None 453 | 454 | def onPartSetRelTypeIdx(self, newRelTypeIdx): 455 | if newRelTypeIdx not in self._relTypeIdx_cnt: 456 | self._relTypeIdx_cnt[newRelTypeIdx] = 1 457 | else: 458 | self._relTypeIdx_cnt[newRelTypeIdx] += 1 459 | 460 | return None 461 | 462 | def onPartSetClust(self, part): 463 | self._ttlCnt += 1 464 | ridx = part.getRelTypeIdx() 465 | self.onPartSetRelTypeIdx(ridx) 466 | 467 | return None 468 | 469 | def onPartUnsetClust(self, part): 470 | self._ttlCnt -= 1 471 | ridx = part.getRelTypeIdx() 472 | self.onPartUnsetRelTypeIdx(ridx) 473 | 474 | return None 475 | 476 | def createArgClust(self, argTypeIdx): 477 | assert argTypeIdx not in self._argTypeIdx_argClustIdxs 478 | argClustIdx = self._nxtArgClustIdx 479 | self._nxtArgClustIdx += 1 480 | ac = ArgClust() 481 | self._argClusts[argClustIdx] = ac 482 | acs = set() 483 | acs.add(argClustIdx) 484 | self._argTypeIdx_argClustIdxs[argTypeIdx] = acs 485 | 486 | return argClustIdx 487 | 488 | def getType(self): 489 | return self._type 490 | 491 | def isStop(self): 492 | return self._isStop 493 | 494 | 495 | def getClustsWithRelType(relTypeIdx): 496 | if relTypeIdx in Clust.relTypeIdx_clustIdx: 497 | return Clust.relTypeIdx_clustIdx[relTypeIdx] 498 | else: 499 | return None 500 | 501 | def createClust(relTypeIdx): 502 | cl = Clust() 503 | cl._clustIdx = Clust.nxtClustIdx 504 | Clust.nxtClustIdx += 1 505 | 506 | rt = RelType.getRelType(relTypeIdx) 507 | cl._type = rt.getType() 508 | rts = rt.toString() 509 | 510 | if rts in ['(V:be)', '(N:%)', '(V:say)', '($:$)']: 511 | cl._isStop = True 512 | 513 | if Clust.whereasClustIdx == -1 and rts == '(IN:whereas)': 514 | Clust.whereasClustIdx = cl._clustIdx 515 | 516 | Clust.clusts[cl._clustIdx] = cl 517 | if relTypeIdx in Clust.relTypeIdx_clustIdx: 518 | Clust.relTypeIdx_clustIdx[relTypeIdx].add(cl._clustIdx) 519 | else: 520 | Clust.relTypeIdx_clustIdx[relTypeIdx] = set(cl._clustIdx) 521 | 522 | return cl._clustIdx 523 | 524 | def removeClust(clust): 525 | del Clust.clusts[clust._clustIdx] 526 | return None 527 | 528 | def getClust(idx): 529 | return Clust.clusts[idx] 530 | 531 | def incRootCnt(self): 532 | Clust.ttlRootCnt += 1 533 | if self.getId() in Clust.clustIdx_rootCnt: 534 | Clust.clustIdx_rootCnt[self.getId()] += 1 535 | else: 536 | Clust.clustIdx_rootCnt[self.getId()] = 1 537 | 538 | def onPartSetClust(self, part): 539 | self._ttlCnt += 1 540 | ridx = part.getRelTypeIdx() 541 | if ridx in self._relTypeIdx_cnt: 542 | self._relTypeIdx_cnt[ridx] += 1 543 | else: 544 | self._relTypeIdx_cnt[ridx] = 1 545 | 546 | return None 547 | 548 | def onPartSetRelTypeIdx(self, newRelTypeIdx): 549 | if newRelTypeIdx in self._relTypeIdx_cnt: 550 | self._relTypeIdx_cnt[newRelTypeIdx] += 1 551 | else: 552 | self._relTypeIdx_cnt[newRelTypeIdx] = 1 553 | 554 | return None 555 | 556 | def removeArgClust(self, argClustIdx): 557 | del self._argClusts[argClustIdx] 558 | toDel = set() 559 | 560 | for ati in self._argTypeIdx_argClustIdxs: 561 | self._argTypeIdx_argClustIdxs[ati].remove(argClustIdx) 562 | 563 | if len(self._argTypeIdx_argClustIdxs[ati]) == 0: 564 | del self._argTypeIdx_argClustIdxs[ati] 565 | 566 | return None 567 | 568 | def addArgComb(clustIdx, chdClustIdxs, chdClustIdx2=None): 569 | if chdClustIdx2 is not None: 570 | chdClustIdxs = [chdClustIdxs, chdClustIdx2] 571 | 572 | ac = Clust.genArgCombStr(clustIdx, chdClustIdxs) 573 | 574 | if clustIdx not in Clust.clustIdx_argCombs: 575 | Clust.clustIdx_argCombs[clustIdx] = set() 576 | 577 | Clust.clustIdx_argCombs[clustIdx].add(ac) 578 | 579 | for idx in chdClustIdxs: 580 | if idx not in Clust.clustIdx_argCombs: 581 | Clust.clustIdx_argCombs[idx] = set() 582 | 583 | Clust.clustIdx_argCombs[idx].add(ac) 584 | 585 | if ac in Clust.argComb_cnt: 586 | Clust.argComb_cnt[ac] += 1 587 | else: 588 | Clust.argComb_cnt[ac] = 1 589 | 590 | return None 591 | 592 | def genArgCombStr(clustIdx, clustIdxs): 593 | s = ':'.join([str(x) for x in [clustIdx] + clustIdxs]) 594 | 595 | return s 596 | 597 | def getArgClustIdxs(self, argTypeIdx): 598 | if argTypeIdx in self._argTypeIdx_argClustIdxs: 599 | return self._argTypeIdx_argClustIdxs[argTypeIdx] 600 | else: 601 | return None 602 | 603 | def onPartSetArg(self, part, arg, argClustIdx, oldArgClustIdx=-1): 604 | argTypeIdx = arg._path.getArgType() 605 | chdClustIdx = arg._artPart.getClusterIdx() 606 | ac = self._argClusts[argClustIdx] 607 | 608 | if argTypeIdx in ac._argTypeIdx_cnt: 609 | ac._argTypeIdx_cnt[argTypeIdx] += 1 610 | else: 611 | ac._argTypeIdx_cnt[argTypeIdx] = 1 612 | 613 | if chdClustIdx in ac._argTypeIdx_cnt: 614 | ac._argTypeIdx_cnt[chdClustIdx] += 1 615 | else: 616 | ac._argTypeIdx_cnt[chdClustIdx] = 1 617 | 618 | ac._ttlArgCnt += 1 619 | 620 | if chdClustIdx not in Clust.clustIdx_parArgs: 621 | Clust.clustIdx_parArgs[chdClustIdx] = {} 622 | 623 | cl_ac = (self.getId(), argClustIdx) 624 | 625 | if cl_ac in Clust.clustIdx_parArgs[chdClustIdx]: 626 | Clust.clustIdx_parArgs[chdClustIdx][cl_ac] += 1 627 | else: 628 | Clust.clustIdx_parArgs[chdClustIdx][cl_ac] = 1 629 | 630 | newArgNum = len(part._argClustIdx_argIdxs[argClustIdx]) 631 | 632 | if newArgNum in ac._argNum_cnt: 633 | ac._argNum_cnt[newArgNum] += 1 634 | else: 635 | ac._argNum_cnt[newArgNum] = 1 636 | 637 | if newArgNum > 1: 638 | if ac._argNum_cnt[newArgNum-1] == 1: 639 | del ac._argNum_cnt[newArgNum-1] 640 | else: 641 | ac._argNum_cnt[newArgNum-1] -= 1 642 | 643 | ac._partRootTreeNodeIds.add(part.getRelTreeRoot().getId()) 644 | 645 | if oldArgClustIdx >= 0: 646 | self.onPartUnsetArg(part, arg, oldArgClustIdx) 647 | 648 | return None 649 | 650 | def getId(self): 651 | return self._clustIdx 652 | 653 | def onPartUnsetArg(self, part, arg, argClustIdx): 654 | argTypeIdx = arg.getPath().getArgType() 655 | chdClustIdx = arg.getPart().getClustIdx() 656 | ac = self._argClusts[argClustIdx] 657 | 658 | if ac._argTypeIdx_cnt[argTypeIdx] == 1: 659 | del ac._argTypeIdx_cnt[argTypeIdx] 660 | else: 661 | ac._argTypeIdx_cnt[argTypeIdx] -= 1 662 | 663 | if ac._chdClustIdx_cnt[chdClustIdx] == 1: 664 | del ac._chdClustIdx_cnt[chdClustIdx] 665 | else: 666 | ac._chdClustIdx_cnt[chdClustIdx] -= 1 667 | 668 | ac._ttlCnt -= 1 669 | cl_ac = (self.getId(), argClustIdx) 670 | 671 | if Clust.clustIdx_parArgs[chdClustIdx][cl_ac] == 1: 672 | del Clust.clustIdx_parArgs[chdClustIdx][cl_ac] 673 | else: 674 | Clust.clustIdx_parArgs[chdClustIdx][cl_ac] -= 1 675 | 676 | if len(Clust.clustIdx_parArgs[chdClustIdx]) == 0: 677 | del Clust.clustIdx_parArgs[chdClustIdx] 678 | 679 | ac._partRootTreeNodeIds.remove(part.getRelTreeRoot().getId()) 680 | 681 | if ac._ttlArgCnt == 0: 682 | self.removeArgClust(argClustIdx) 683 | assert argClustIdx not in part._argClustIdx_argIdxs 684 | else: 685 | oldArgNum = 0 686 | 687 | if argClustIdx in part._argClustIdx_argIdxs: 688 | oldArgNum = part._argClustIdx_argIdxs[argClustIdx] 689 | 690 | if oldArgNum > 0: 691 | if oldArgNum in ac._argNum_cnt: 692 | ac._argNum_cnt[oldArgNum] += 1 693 | else: 694 | ac._argNum_cnt[oldArgNum] = 1 695 | 696 | if ac._argNum_cnt[oldArgNum+1] == 1: 697 | del ac._argNum_cnt[oldArgNum+1] 698 | else: 699 | ac._argNum_cnt[oldArgNum+1] -= 1 700 | 701 | def removePartAndUpdateStat(nid_part): 702 | for nid, p in nid_part.items(): 703 | cl = Clust.getClust(p.getClustIdx()) 704 | 705 | if p.getParPart() is None: 706 | cl.decRootCnt() 707 | 708 | for nid, p in nid_part.items(): 709 | for ai, a in p._args.items(): 710 | p.removeArgument(ai) 711 | cp = a._argPart 712 | cp.unsetParent() 713 | 714 | p.unsetRelType() 715 | 716 | for nid, p in nid_part.items(): 717 | pclust = getClustIdx() 718 | Part.clustIdx_partRootNodeIds[pclust].remove(p.getRelTreeRoot().getId()) 719 | 720 | if len(Part.clustIdx_partRootNodeIds[pclust]) == 0: 721 | del Part.clustIdx_partRootNodeIds[pclust] 722 | 723 | return None 724 | 725 | def updatePartStat(nid_part): 726 | for nid, p in nid_part.items(): 727 | cl = Clust.getClust(p.getClustIdx()) 728 | cl.onPartSetClust(p) 729 | 730 | if p.getParPart() is None: 731 | cl.incRootCnt() 732 | 733 | for ai, arg in p._args: 734 | aci = p._argTypeIdx_argClustIdxs[ai] 735 | cl.onPartSetArg(p, arg, aci) 736 | 737 | return None 738 | 739 | def toString(self): 740 | rts = ['{}:{}'.format(RelType.getRelType(rti).toString(), cnt) 741 | for x, y in self._relTypeIdx_cnt.items()] 742 | s = ',\t'.join(rts) 743 | s = '[' + s + ']' 744 | 745 | return s 746 | 747 | 748 | ''' 749 | End Clust class definitions 750 | ''' 751 | 752 | class ArgClust(object): 753 | def __init__(self): 754 | # Dictionary mapping {int: int} 755 | self._argTypeIdx_cnt = {} 756 | # Dictionary mapping {int: int} 757 | self._chdClustIdx_cnt = {} 758 | # Dictionary mapping {int: int} 759 | self._argNum_cnt = {} 760 | self._ttlArgCnt = 0 761 | self._partRootTreeNodeIds = set() 762 | 763 | def toString(self): 764 | s = '' 765 | for k, v in self._argTypeIdx_cnt.items(): 766 | if len(s) > 0: 767 | s += ' ' 768 | s += '{}:{}'.format(ArgType.getArgType(k), c) 769 | 770 | return s 771 | 772 | 773 | 774 | 775 | 776 | 777 | 778 | -------------------------------------------------------------------------------- /semantic/MLN/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/semantic/MLN/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /semantic/MLN/src/Argument.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Argument(object): 4 | def __init__(self, argNode, path, argPart): 5 | self._argNode = argNode 6 | self._path = path 7 | self._argPart = argPart 8 | 9 | return None 10 | 11 | def getPath(self): 12 | return self._path 13 | 14 | def getPart(self): 15 | return self._argPart 16 | 17 | def getNode(self): 18 | return self._argNode 19 | 20 | 21 | -------------------------------------------------------------------------------- /semantic/MLN/src/Clust.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 4 | # Clust 5 | # 6 | 7 | class Clust(object): 8 | whereasClustIdx = -1 9 | nxtClustIdx = 1 10 | ttlRootCnt = 0 11 | 12 | # Dictionary mapping 13 | pairClustIdx_conjCnt = {} 14 | # Dictionary mapping {int: {(int, int): int}} 15 | clustIdx_parArgs = {} 16 | # Dictionary mapping {int: int} 17 | clustIdx_rootCnt = {} 18 | # Dictionary mapping {str: int} 19 | argComb_cnt = {} 20 | # Dictionary mapping {int: set(str)} 21 | clustIdx_argCombs = {} 22 | # Dictionary mapping {int: Clust} 23 | clusts = {} 24 | # Dictionary mapping {int: set(int)} 25 | relTypeIdx_clustIdx = {} 26 | 27 | def __init__(self): 28 | self._isDebug = False 29 | self._isStop = False 30 | self._clustIdx = -1 31 | self._ttlCnt = 0 32 | self._nxtArgClustIdx = 0 33 | self._type = '' 34 | 35 | # Dictionary mapping {int: int} 36 | self._relTypeIdx_cnt = {} 37 | # Dictionary mapping {int: set(int)} 38 | self._argTypeIdx_argClustIdxs = {} 39 | # Dictionary mapping {int: ArgClust} 40 | self._argClusts = {} 41 | 42 | def incRootCnt(self): 43 | Clust.ttlRootCnt += 1 44 | 45 | if self.getId() not in Clust.clustIdx_rootCnt: 46 | Clust.clustIdx_rootCnt[self.getId()] = 1 47 | else: 48 | Clust.clustIdx_rootCnt[self.getId()] += 1 49 | 50 | return None 51 | 52 | def decRootCnt(self): 53 | Clust.ttlRootCnt -= 1 54 | 55 | Clust.clustIdx_rootCnt[self.getId()] -= 1 56 | 57 | if Clust.clustIdx_rootCnt[self.getId()] == 0: 58 | del Clust.clustIdx_rootCnt[self.getId()] 59 | 60 | return None 61 | 62 | def onPartUnsetRelTypeIdx(self, oldRelTypeIdx): 63 | self._relTypeIdx_cnt[oldRelTypeIdx] -= 1 64 | return None 65 | 66 | def onPartSetRelTypeIdx(self, newRelTypeIdx): 67 | if newRelTypeIdx not in self._relTypeIdx_cnt: 68 | self._relTypeIdx_cnt[newRelTypeIdx] = 1 69 | else: 70 | self._relTypeIdx_cnt[newRelTypeIdx] += 1 71 | 72 | return None 73 | 74 | def onPartSetClust(self, part): 75 | self._ttlCnt += 1 76 | ridx = part.getRelTypeIdx() 77 | self.onPartSetRelTypeIdx(ridx) 78 | 79 | return None 80 | 81 | def onPartUnsetClust(self, part): 82 | self._ttlCnt -= 1 83 | ridx = part.getRelTypeIdx() 84 | self.onPartUnsetRelTypeIdx(ridx) 85 | 86 | return None 87 | 88 | def createArgClust(self, argTypeIdx): 89 | assert argTypeIdx not in self._argTypeIdx_argClustIdxs 90 | argClustIdx = self._nxtArgClustIdx 91 | self._nxtArgClustIdx += 1 92 | ac = ArgClust() 93 | self._argClusts[argClustIdx] = ac 94 | acs = set() 95 | acs.add(argClustIdx) 96 | self._argTypeIdx_argClustIdxs[argTypeIdx] = acs 97 | 98 | return argClustIdx 99 | 100 | def getType(self): 101 | return self._type 102 | 103 | def isStop(self): 104 | return self._isStop 105 | 106 | 107 | def getClustsWithRelType(relTypeIdx): 108 | if relTypeIdx in Clust.relTypeIdx_clustIdx: 109 | return Clust.relTypeIdx_clustIdx[relTypeIdx] 110 | else: 111 | return None 112 | 113 | def createClust(relTypeIdx): 114 | cl = Clust() 115 | cl._clustIdx = Clust.nxtClustIdx 116 | Clust.nxtClustIdx += 1 117 | 118 | rt = RelType.getRelType(relTypeIdx) 119 | cl._type = rt.getType() 120 | rts = rt.toString() 121 | 122 | if rts in ['(V:be)', '(N:%)', '(V:say)', '($:$)']: 123 | cl._isStop = True 124 | 125 | if Clust.whereasClustIdx == -1 and rts == '(IN:whereas)': 126 | Clust.whereasClustIdx = cl._clustIdx 127 | 128 | Clust.clusts[cl._clustIdx] = cl 129 | if relTypeIdx in Clust.relTypeIdx_clustIdx: 130 | Clust.relTypeIdx_clustIdx[relTypeIdx].add(cl._clustIdx) 131 | else: 132 | Clust.relTypeIdx_clustIdx[relTypeIdx] = set(cl._clustIdx) 133 | 134 | return cl._clustIdx 135 | 136 | def removeClust(clust): 137 | del Clust.clusts[clust._clustIdx] 138 | return None 139 | 140 | def getClust(idx): 141 | return Clust.clusts[idx] 142 | 143 | def incRootCnt(self): 144 | Clust.ttlRootCnt += 1 145 | if self.getId() in Clust.clustIdx_rootCnt: 146 | Clust.clustIdx_rootCnt[self.getId()] += 1 147 | else: 148 | Clust.clustIdx_rootCnt[self.getId()] = 1 149 | 150 | def onPartSetClust(self, part): 151 | self._ttlCnt += 1 152 | ridx = part.getRelTypeIdx() 153 | if ridx in self._relTypeIdx_cnt: 154 | self._relTypeIdx_cnt[ridx] += 1 155 | else: 156 | self._relTypeIdx_cnt[ridx] = 1 157 | 158 | return None 159 | 160 | def onPartSetRelTypeIdx(self, newRelTypeIdx): 161 | if newRelTypeIdx in self._relTypeIdx_cnt: 162 | self._relTypeIdx_cnt[newRelTypeIdx] += 1 163 | else: 164 | self._relTypeIdx_cnt[newRelTypeIdx] = 1 165 | 166 | return None 167 | 168 | def removeArgClust(self, argClustIdx): 169 | del self._argClusts[argClustIdx] 170 | toDel = set() 171 | 172 | for ati in self._argTypeIdx_argClustIdxs: 173 | self._argTypeIdx_argClustIdxs[ati].remove(argClustIdx) 174 | 175 | if len(self._argTypeIdx_argClustIdxs[ati]) == 0: 176 | del self._argTypeIdx_argClustIdxs[ati] 177 | 178 | return None 179 | 180 | def addArgComb(clustIdx, chdClustIdxs, chdClustIdx2=None): 181 | if chdClustIdx2 is not None: 182 | chdClustIdxs = [chdClustIdxs, chdClustIdx2] 183 | 184 | ac = Clust.genArgCombStr(clustIdx, chdClustIdxs) 185 | 186 | if clustIdx not in Clust.clustIdx_argCombs: 187 | Clust.clustIdx_argCombs[clustIdx] = set() 188 | 189 | Clust.clustIdx_argCombs[clustIdx].add(ac) 190 | 191 | for idx in chdClustIdxs: 192 | if idx not in Clust.clustIdx_argCombs: 193 | Clust.clustIdx_argCombs[idx] = set() 194 | 195 | Clust.clustIdx_argCombs[idx].add(ac) 196 | 197 | if ac in Clust.argComb_cnt: 198 | Clust.argComb_cnt[ac] += 1 199 | else: 200 | Clust.argComb_cnt[ac] = 1 201 | 202 | return None 203 | 204 | def genArgCombStr(clustIdx, clustIdxs): 205 | s = ':'.join([str(x) for x in [clustIdx] + clustIdxs]) 206 | 207 | return s 208 | 209 | def getArgClustIdxs(self, argTypeIdx): 210 | if argTypeIdx in self._argTypeIdx_argClustIdxs: 211 | return self._argTypeIdx_argClustIdxs[argTypeIdx] 212 | else: 213 | return None 214 | 215 | def onPartSetArg(self, part, arg, argClustIdx, oldArgClustIdx=-1): 216 | argTypeIdx = arg._path.getArgType() 217 | chdClustIdx = arg._artPart.getClusterIdx() 218 | ac = self._argClusts[argClustIdx] 219 | 220 | if argTypeIdx in ac._argTypeIdx_cnt: 221 | ac._argTypeIdx_cnt[argTypeIdx] += 1 222 | else: 223 | ac._argTypeIdx_cnt[argTypeIdx] = 1 224 | 225 | if chdClustIdx in ac._argTypeIdx_cnt: 226 | ac._argTypeIdx_cnt[chdClustIdx] += 1 227 | else: 228 | ac._argTypeIdx_cnt[chdClustIdx] = 1 229 | 230 | ac._ttlArgCnt += 1 231 | 232 | if chdClustIdx not in Clust.clustIdx_parArgs: 233 | Clust.clustIdx_parArgs[chdClustIdx] = {} 234 | 235 | cl_ac = (self.getId(), argClustIdx) 236 | 237 | if cl_ac in Clust.clustIdx_parArgs[chdClustIdx]: 238 | Clust.clustIdx_parArgs[chdClustIdx][cl_ac] += 1 239 | else: 240 | Clust.clustIdx_parArgs[chdClustIdx][cl_ac] = 1 241 | 242 | newArgNum = len(part._argClustIdx_argIdxs[argClustIdx]) 243 | 244 | if newArgNum in ac._argNum_cnt: 245 | ac._argNum_cnt[newArgNum] += 1 246 | else: 247 | ac._argNum_cnt[newArgNum] = 1 248 | 249 | if newArgNum > 1: 250 | if ac._argNum_cnt[newArgNum-1] == 1: 251 | del ac._argNum_cnt[newArgNum-1] 252 | else: 253 | ac._argNum_cnt[newArgNum-1] -= 1 254 | 255 | ac._partRootTreeNodeIds.add(part.getRelTreeRoot().getId()) 256 | 257 | if oldArgClustIdx >= 0: 258 | self.onPartUnsetArg(part, arg, oldArgClustIdx) 259 | 260 | return None 261 | 262 | def getId(self): 263 | return self._clustIdx 264 | 265 | def onPartUnsetArg(self, part, arg, argClustIdx): 266 | argTypeIdx = arg.getPath().getArgType() 267 | chdClustIdx = arg.getPart().getClustIdx() 268 | ac = self._argClusts[argClustIdx] 269 | 270 | if ac._argTypeIdx_cnt[argTypeIdx] == 1: 271 | del ac._argTypeIdx_cnt[argTypeIdx] 272 | else: 273 | ac._argTypeIdx_cnt[argTypeIdx] -= 1 274 | 275 | if ac._chdClustIdx_cnt[chdClustIdx] == 1: 276 | del ac._chdClustIdx_cnt[chdClustIdx] 277 | else: 278 | ac._chdClustIdx_cnt[chdClustIdx] -= 1 279 | 280 | ac._ttlCnt -= 1 281 | cl_ac = (self.getId(), argClustIdx) 282 | 283 | if Clust.clustIdx_parArgs[chdClustIdx][cl_ac] == 1: 284 | del Clust.clustIdx_parArgs[chdClustIdx][cl_ac] 285 | else: 286 | Clust.clustIdx_parArgs[chdClustIdx][cl_ac] -= 1 287 | 288 | if len(Clust.clustIdx_parArgs[chdClustIdx]) == 0: 289 | del Clust.clustIdx_parArgs[chdClustIdx] 290 | 291 | ac._partRootTreeNodeIds.remove(part.getRelTreeRoot().getId()) 292 | 293 | if ac._ttlArgCnt == 0: 294 | self.removeArgClust(argClustIdx) 295 | assert argClustIdx not in part._argClustIdx_argIdxs 296 | else: 297 | oldArgNum = 0 298 | 299 | if argClustIdx in part._argClustIdx_argIdxs: 300 | oldArgNum = part._argClustIdx_argIdxs[argClustIdx] 301 | 302 | if oldArgNum > 0: 303 | if oldArgNum in ac._argNum_cnt: 304 | ac._argNum_cnt[oldArgNum] += 1 305 | else: 306 | ac._argNum_cnt[oldArgNum] = 1 307 | 308 | if ac._argNum_cnt[oldArgNum+1] == 1: 309 | del ac._argNum_cnt[oldArgNum+1] 310 | else: 311 | ac._argNum_cnt[oldArgNum+1] -= 1 312 | 313 | def removePartAndUpdateStat(nid_part): 314 | for nid, p in nid_part.items(): 315 | cl = Clust.getClust(p.getClustIdx()) 316 | 317 | if p.getParPart() is None: 318 | cl.decRootCnt() 319 | 320 | for nid, p in nid_part.items(): 321 | for ai, a in p._args.items(): 322 | p.removeArgument(ai) 323 | cp = a._argPart 324 | cp.unsetParent() 325 | 326 | p.unsetRelType() 327 | 328 | for nid, p in nid_part.items(): 329 | pclust = getClustIdx() 330 | Part.clustIdx_partRootNodeIds[pclust].remove(p.getRelTreeRoot().getId()) 331 | 332 | if len(Part.clustIdx_partRootNodeIds[pclust]) == 0: 333 | del Part.clustIdx_partRootNodeIds[pclust] 334 | 335 | return None 336 | 337 | def updatePartStat(nid_part): 338 | for nid, p in nid_part.items(): 339 | cl = Clust.getClust(p.getClustIdx()) 340 | cl.onPartSetClust(p) 341 | 342 | if p.getParPart() is None: 343 | cl.incRootCnt() 344 | 345 | for ai, arg in p._args: 346 | aci = p._argTypeIdx_argClustIdxs[ai] 347 | cl.onPartSetArg(p, arg, aci) 348 | 349 | return None 350 | 351 | def toString(self): 352 | rts = ['{}:{}'.format(RelType.getRelType(rti).toString(), cnt) 353 | for x, y in self._relTypeIdx_cnt.items()] 354 | s = ',\t'.join(rts) 355 | s = '[' + s + ']' 356 | 357 | return s 358 | 359 | 360 | ''' 361 | End Clust class definitions 362 | ''' 363 | 364 | class ArgClust(object): 365 | def __init__(self): 366 | # Dictionary mapping {int: int} 367 | self._argTypeIdx_cnt = {} 368 | # Dictionary mapping {int: int} 369 | self._chdClustIdx_cnt = {} 370 | # Dictionary mapping {int: int} 371 | self._argNum_cnt = {} 372 | self._ttlArgCnt = 0 373 | self._partRootTreeNodeIds = set() 374 | 375 | def toString(self): 376 | s = '' 377 | for k, v in self._argTypeIdx_cnt.items(): 378 | if len(s) > 0: 379 | s += ' ' 380 | s += '{}:{}'.format(ArgType.getArgType(k), c) 381 | 382 | return s 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | -------------------------------------------------------------------------------- /semantic/MLN/src/MLN.py: -------------------------------------------------------------------------------- 1 | 2 | class MLN(object): 3 | def __init__(self): 4 | return None 5 | 6 | -------------------------------------------------------------------------------- /semantic/MLN/src/Part.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 4 | # Part class 5 | # 6 | 7 | from semantic import Argument, Clust 8 | from syntax.Relations import RelType 9 | 10 | class Part(object): 11 | # dictionary mapping {str: Part} 12 | rootNodeId_part = {} 13 | # dictionary mapping {int: set(str)} 14 | clustIdx_partRootNodeIds = {} 15 | # dictionary mapping {(int, int): set((str, str))} 16 | pairClustIdxs_pairPartRootNodeIds = {} 17 | # dictionary mapping {int: set((int, int))} 18 | clustIdx_pairClustIdxs = {} 19 | 20 | def __init__(self, relTreeRoot): 21 | self._isDebug = False 22 | 23 | self._relTreeRoot = relTreeRoot 24 | self._relTypeIdx = RelType.getRelType(relTreeRoot) 25 | self._clustIdx = -1 26 | self._nxtArgIdx = 0 # Remember next index because _args should be ordered Dict 27 | 28 | self._parPart = None 29 | self._parArgIdx = -1 30 | 31 | # Dictionary mapping {int: Argument} 32 | self._args = {} 33 | # Dictionary mapping {int: int} 34 | self._argIdx_argClustIdx = {} 35 | # Dictionary mapping {int: set(int)} 36 | self._argClustIdx_argIdxs = {} 37 | 38 | return None 39 | 40 | def addArgument(self, arg): 41 | argIdx = self._nxtArgIdx + 1 42 | self._args[argIdx] = arg 43 | 44 | return argIdx 45 | 46 | def changeClust(self, newClustIdx, newRelTypeIdx, clust_only=False): 47 | oldClustIdx = self.getClustIdx() 48 | rootID = self.getRelTreeRoot().getId() 49 | Part.clustIdx_partRootNodeIds[oldClustIdx].remove(rootID) 50 | 51 | if clust_only: 52 | self._relTypeIdx = newRelTypeIdx 53 | else: 54 | ocl = Clust.getClust(oldClustIdx) 55 | ocl.onPartUnsetClust(self) 56 | self.setRelTypeIdx(newRelTypeIdx) 57 | 58 | self.setClust(newClustIdx, clust_only=clust_only) 59 | 60 | parent = self.getParPart() 61 | 62 | if parent is None: 63 | if newClustIdx in Clust.clustIdx_rootCnt: 64 | Clust.clustIdx_rootCnt[newClustIdx] += 1 65 | else: 66 | Clust.clustIdx_rootCnt[newClustIdx] = 1 67 | Clust.clustIdx_rootCnt[newClustIdx] -= 1 68 | else: 69 | parent_clust_id = parent.getClustIdx() 70 | paci = parent.getArgClust(self.getParArgIdx()) 71 | pcl = Clust.getClust(parent_clust_id) 72 | pac = pcl._argClusts[paci] 73 | pac._chdClustIdx_cnt[oldClustIdx] -= 1 74 | 75 | if newClustIdx in pac._chdClustIdx_cnt: 76 | pac._chdClustIdx_cnt[newClustIdx] += 1 77 | else: 78 | pac._chdClustIdx_cnt[newClustIdx] = 1 79 | 80 | pa = (parent_clust_id, paci) 81 | Clust.clustIdx_parArgs[oldClustIdx][pa] -= 1 82 | 83 | if newClustIdx not in Clust.clustIdx_parArgs: 84 | Clust.clustIdx_parArgs[newClustIdx] = {} 85 | 86 | if pa in Clust.clustIdx_parArgs[newClustIdx]: 87 | Clust.clustIdx_parArgs[newClustIdx][pa] += 1 88 | else: 89 | Clust.clustIdx_parArgs[newClustIdx][pa] = 1 90 | 91 | opci = (parent_clust_id, oldClustIdx) 92 | npci = (parent_clust_id, newClustIdx) 93 | ptnid = (parent.getRelTreeRoot().getId(), rootID) 94 | 95 | Part.pairClustIdxs_pairPartRootNodeIds[opci].remove(ptnid) 96 | 97 | if len(Part.pairClustIdxs_pairPartRootNodeIds[opci]) == 0: 98 | Part.clustIdx_pairClustIdxs[oldClustIdx].remove(opci) 99 | Part.clustIdx_pairClustIdxs[parent_clust_id].remove(opci) 100 | 101 | if npci in Part.pairClustIdxs_pairPartRootNodeIds: 102 | Part.pairClustIdxs_pairPartRootNodeIds[npci].add(ptnid) 103 | else: 104 | Part.pairClustIdxs_pairPartRootNodeIds[npci] = set(ptnid) 105 | 106 | Part.clustIdx_pairClustIdxs[parent_clust_id] = npci 107 | if newClustIdx in Part.clustIdx_pairClustIdxs: 108 | Part.clustIdx_pairClustIdxs[newClustIdx].add(npci) 109 | else: 110 | Part.clustIdx_pairClustIdxs[newClustIdx] = set(npci) 111 | 112 | return None 113 | 114 | def changeClustRemap(self, newClustIdx, argClustIdx_newArgClustIdx, clust_only=False): 115 | 116 | if not clust_only: 117 | oldClustIdx = self.getClustIdx() 118 | ocl = Clust.getClust(oldClustIdx) 119 | 120 | self.changeClust(newClustIdx, self.getRelTypeIdx(), clust_only=clust_only) 121 | 122 | argIdx_newArgClustIdx = {} 123 | 124 | for ai, arg in self._args.items(): 125 | oaci = self._argIdx_argClustIdx.pop(ai) 126 | self._argClustIdx_argIdxs[oaci].remove(ai) 127 | 128 | if len(self._argClustIdx_argIdxs[oaci]) == 0: 129 | del self._argClustIdx_argIdxs[oaci] 130 | 131 | argIdx_newArgClustIdx[ai] = argClustIdx_newArgClustIdx[oaci] 132 | 133 | if not clust_only: 134 | ocl.onPartUnsetArg(this, arg, oaci) 135 | 136 | for ai in self._args: 137 | aci = argIdx_newArgClustIdx[ai] 138 | self.setArgClust(ai, aci, clust_only=clust_only) 139 | 140 | return None 141 | 142 | def getArgument(self, argIdx): 143 | return self._args[argIdx] 144 | 145 | def getArguments(self): 146 | return self._args 147 | 148 | def getArgClust(self, argIdx): 149 | if argIdx in self._argIdx_argClustIdx: 150 | return self._argIdx_argClustIdx[argIdx] 151 | else: 152 | return None 153 | 154 | def getParArgIdx(self): 155 | return self._parArgIdx 156 | 157 | def getClustIdx(self): 158 | return self._clustIdx 159 | 160 | def getClustPartRootNodeIds(): 161 | return Part.clustIdx_partRootNodeIds 162 | 163 | def getParArgIdx(self): 164 | return self._parArgIdx 165 | 166 | def getPairPartRootNodeIds(parClustIdx=None, chdClustIdx=None): 167 | if parClustIdx is None or chdClustIdx is None: 168 | return Part.pairClustIdxs_pairPartRootNodeIds 169 | else: 170 | return Part.pairClustIdxs_pairPartRootNodeIds[(parClustIdx, 171 | chdClustIdx)] 172 | 173 | def getParPart(self): 174 | return self._parPart 175 | 176 | def getPartByRootNodeId(rnId): 177 | if rnId in Part.rootNodeId_part: 178 | return Part.rootNodeId_part[rnId] 179 | else: 180 | return None 181 | 182 | def getPartRootNodeIds(clustIdx): 183 | if clustIdx in Part.clustIdx_partRootNodeIds: 184 | return Part.clustIdx_partRootNodeIds[clustIdx] 185 | else: 186 | return None 187 | 188 | def getRelTreeRoot(self): 189 | return self._relTreeRoot 190 | 191 | def getRelTypeIdx(self): 192 | return self._relTypeIdx 193 | 194 | def removeArgument(self, argIdx, clust_only=False): 195 | arg = self.getArgument(argIdx) 196 | 197 | oldArgClustIdx = self._argIdx_argClustIdx.pop(argIdx) 198 | self._argClustIdx_argIdxs[oldArgClustIdx].remove(argIdx) 199 | 200 | if len(self._argClustIdx_argIdxs[oldArgClustIdx]) == 0: 201 | self._argClustIdx_argIdxs.remove(oldArgClustIdx) 202 | 203 | if not clust_only: 204 | cl = Clust.getClust(self.getClustIdx()) 205 | cl.onPartUnsetArg(self, arg, oldArgClustIdx) 206 | 207 | del self._args[argIdx] 208 | 209 | return None 210 | 211 | 212 | def setArgClust(self, argIdx, argClustIdx, clust_only=False): 213 | oldArgClustIdx = -1 214 | 215 | if argIdx in self._argIdx_argClustIdx: 216 | oldArgClustIdx = self.getArgClust(argIdx) 217 | 218 | if oldArgClustIdx != argClustIdx: 219 | self._argIdx_argClustIdx[argIdx] = argClustIdx 220 | 221 | if argClustIdx in self._argClustIdx_argIdxs: 222 | self._argClustIdx_argIdxs[argClustIdx].add(argIdx) 223 | else: 224 | self._argClustIdx_argIdxs[argClustIdx] = set(argIdx) 225 | 226 | arg = self.getArgument(argIdx) 227 | 228 | if not clust_only: 229 | cl = Clust.getClust(self.getClustIdx()) 230 | 231 | if oldArgClustIdx < 0: 232 | if not clust_only: 233 | cl.onPartSetArg(self, arg, argClustIdx) 234 | else: 235 | self._argClustIdx_argIdxs[oldArgClustIdx].remove(argIdx) 236 | 237 | if len(self._argClustIdx_argIdxs[oldArgClustIdx]) == 0: 238 | self._argClustIdx_argIdxs.remove(oldArgClustIdx) 239 | 240 | if not clust_only: 241 | cl.onPartSetArg(self, arg, argClustIdx, oldArgClustIdx) 242 | 243 | return None 244 | 245 | def setClust(self, clustIdx, clust_only=False): 246 | self._clustIdx = clustIdx 247 | rootID = self.getRelTreeRoot().getId() 248 | 249 | if clustIdx in Part.clustIdx_partRootNodeIds: 250 | Part.clustIdx_partRootNodeIds[clustIdx].add(rootID) 251 | else: 252 | Part.clustIdx_partRootNodeIds[clustIdx] = set(rootID) 253 | 254 | if not clust_only: 255 | cl = Clust.getClust(clustIdx) 256 | cl.onPartSetClust(self) 257 | 258 | return None 259 | 260 | def setParent(self, parPart, parArgIdx): 261 | ''' 262 | Unset previous parent if it exists 263 | ''' 264 | if self.getParPart() is not None: 265 | self.unsetParent() 266 | 267 | self._parPart = parPart 268 | self._parArgIdx = parArgIdx 269 | clustIdx = self.getClustIdx() 270 | parClustID = parPart.getClustIdx() 271 | 272 | assert (parClustID >= 0) & (clustIdx >= 0) 273 | 274 | pcci = (parClustID, clustIdx) 275 | 276 | if parClustID in Part.clustIdx_pairClustIdxs: 277 | Part.clustIdx_pairClustIdxs[parClustID].add(pcci) 278 | else: 279 | Part.clustIdx_pairClustIdxs[parClustID] = set(pcci) 280 | 281 | pids = (parPart.getRelTreeRoot().getId(), self.getRelTreeRoot().getId()) 282 | 283 | if pcci in Part.pairClustIdxs_pairPartRootNodeIds: 284 | Part.pairClustIdxs_pairPartRootNodeIds[pcci].add(pids) 285 | else: 286 | Part.pairClustIdxs_pairPartRootNodeIds[pcci] = set(pids) 287 | 288 | if parPart is not None: 289 | arg = parPart.getArgument(parArgIdx) 290 | dep = arg._path.getDep() 291 | 292 | if (parClustID != clustIdx) & dep.startswith('conj_'): 293 | if parClustID < clustIdx: 294 | pci = pcci 295 | else: 296 | pci = (pcci[1], pcci[0]) 297 | 298 | if pci not in Clust._pairClustIdxs_conjCnt: 299 | Clust.pairClustIdxs_conjCnt[pci] = 1 300 | else: 301 | Clust.pairClustIdxs_conjCnt[pci] += 1 302 | 303 | return None 304 | 305 | def setRelTypeIdx(self, newRelTypeIdx): 306 | self._relTypeIdx = newRelTypeIdx 307 | cl = Clust.getClust(self._clustIdx) 308 | cl.onPartSetRelTypeIdx(newRelTypeIdx) 309 | 310 | return None 311 | 312 | def unsetArgClust(self, argIdx, clust_only=False): 313 | oldArgClustIdx = self._argIdx_argClustIdx.pop(argIdx) 314 | arg = self.getArgument(argIdx) 315 | self._argClustIdx_argIdxs[oldArgClustIdx].remove(argIdx) 316 | 317 | if len(self._argClustIdx_argIdxs[oldArgClustIdx]) == 0: 318 | self._argClustIdx_argIdxs.remove(oldArgClustIdx) 319 | 320 | if not clust_only: 321 | cl = Clust.getClust(self.getClustIdx()) 322 | cl.onPartUnsetArg(self, arg, oldArgClustIdx) 323 | 324 | return None 325 | 326 | def unsetParent(self): 327 | ''' 328 | Remove parent-child cluster index information 329 | Remove parent-child relationship index information 330 | NEEDS ADDITIONAL FACTORING - where does Cluster come from? 331 | ''' 332 | parent = self.getParPart() 333 | clustIdx = self.getClustIdx() 334 | 335 | if parent is not None: 336 | parClustID = parent.getClustIdx() 337 | 338 | pcci = (parClustID, clustIdx) 339 | Part.clustIdx_pairClustIdxs[parClustID].remove(pcci) 340 | 341 | pids = (parent.getRelTreeRoot().getId(), 342 | self.getRelTreeRoot().getId()) 343 | Part.pairClustIdxs_pairPartRootNodeIds[pcci].remove(pids) 344 | 345 | arg = parent.getArgument(self.getParArgIdx()) 346 | dep = arg._path.getDep() 347 | 348 | if (parClustID != clustIdx) & dep.startswith('conj_'): 349 | if parClustID < clustIdx: 350 | pci = pcci 351 | else: 352 | pci = (pcci[1], pcci[0]) 353 | 354 | if pci in Clust._pairClustIdxs_conjCnt: 355 | Clust.pairClustIdxs_conjCnt[pci] -= 1 356 | if Clust.pairClustIdxs_conjCnt[pci] == 0: 357 | del Clust.pairClustIdxs_conjCnt[pci] 358 | 359 | return None 360 | 361 | def unsetRelTypeIdx(self): 362 | old_type = self._relTypeIdx 363 | cl = Clust.getClust(self._clustIdx) 364 | cl.onPartUnsetRelTypeIdx(old_type) 365 | 366 | return None 367 | 368 | 369 | 370 | -------------------------------------------------------------------------------- /semantic/Parse.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from semantic.MLN import Argument, Clust, Part 4 | from semantic import Agenda, Executor, Scorer 5 | from syntax import StanfordParseReader 6 | from syntax.Nodes import TreeNode 7 | from syntax.Relations import Path 8 | from utils import Utils 9 | 10 | class Parse(object): 11 | def __init__(self, priorNumParam=None, priorNumConj=None): 12 | self.debug = False 13 | self.numSents = 0 14 | self.numTkns = 0 15 | 16 | self.id_article = {} 17 | 18 | self.rootTreeNodeIds = set() 19 | self.parseReader = StanfordParseReader() 20 | self.scorer = Scorer() 21 | self.agenda = Agenda() 22 | self.executor = Executor() 23 | 24 | def createArgs(self, ai, sj, sent, idx): 25 | nid = Utils.genTreeNodeId(ai, sj, idx) 26 | node = TreeNode.getTreeNode(nid) 27 | np = Part.getPartByRootNodeId(nid) 28 | ncl = Clust.getClust(np.getClustIdx()) 29 | chds = sent.get_children(idx) 30 | 31 | if chds is not None: 32 | for dep, cidx in chds: 33 | cid = Utils.genTreeNodeId(ai, sj, cidx) 34 | p = Path(dep) 35 | argTypeIdx = p.getArgType() 36 | cp = Part.getPartByRootNodeId(cid) 37 | 38 | if cp.getParPart() is None: 39 | continue 40 | 41 | arg = Argument(node, p, cp) 42 | argIdx = np.addArgument(arg) 43 | cp.setParent(np, argIdx) 44 | argClustIdxs = ncl.getArgClustIdxs(argTypeIdx) 45 | argClustIdx = -1 46 | 47 | if argClustIdxs is None: 48 | argClustIdx = ncl.createArgClust(argTypeIdx) 49 | else: 50 | argClustIdx = next(iter(argClustIdxs)) 51 | 52 | np.setArgClust(argIdx, argClustIdx) 53 | self.createArgs(ai, sj, sent, cidx) 54 | 55 | return None 56 | 57 | def chkArgs(self): 58 | ''' 59 | To Do: for debugging purposes 60 | ''' 61 | 62 | return None 63 | 64 | def initialize(self, arts): 65 | # 66 | # Look to vectorize this 67 | # 68 | for art in arts: 69 | self.id_article[art.uid] = art 70 | self.numSents += len(art.sentences) 71 | 72 | for j, sent in enumerate(art.sentences): 73 | self.initializeSent(i, j, sent) 74 | 75 | def initializeSent(self, ai, sj, sent): 76 | self.numTkns += len(sent.get_tokens()) 77 | 78 | if len(sent.tkn_children) < 1: 79 | return None 80 | 81 | for k, tok in enumerate(sent.tokens): 82 | if self.isIgnore(sent, k): 83 | continue 84 | 85 | # from utils 86 | part, clustIdx = part_from_node(ai, sj, sent, k) 87 | 88 | part.setClust(clustIdx) 89 | 90 | roots = sent.get_children(0) 91 | assert len(roots) == 1 92 | 93 | for k, v in roots.items(): 94 | dep_idx = (k, v) 95 | idx = v 96 | sub_node_id = Utils.genTreeNodeId(ai, sj, idx) 97 | rootTreeNodeIds.add(sub_node_id) 98 | node_part = Part.getPartByRootNodeId(sub_node_id) 99 | if node_part is None: 100 | continue 101 | ncl = Clust.getClust(node_part.getClustIdx()) 102 | ncl.incRootCnt() 103 | self.createArgs(ai, sj, sent, idx) 104 | 105 | return None 106 | 107 | def part_from_node(ai, sj, sent, k): 108 | node_id = Utils.genTreeNodeId(ai,sj,k) 109 | tn = TreeNode(node_id, sent.get_tokens(k)) 110 | part = Part(tn) 111 | relTypeIdx = part.getRelTypeIdx() 112 | clustIdx = -1 113 | clustIdxs = Clust.getClustsWithRelType(relTypeIdx) 114 | 115 | if clustIdxs is not None: 116 | clustIdx = next(iter(clustIdxs)) 117 | else: 118 | clustIdx = Clust.createClust(relTypeIdx) 119 | 120 | return part, clustIdx 121 | 122 | def isIgnore(sent, k): 123 | while True: 124 | try: 125 | parent = sent.get_parent(k) 126 | except KeyError: 127 | break 128 | else: 129 | k = parent[1] 130 | 131 | return (k>0) 132 | 133 | def mergeArgs(self): 134 | for clustIdx in Clust.clusts: 135 | cl = Clust.getClust(clustIdx) 136 | newArgClusts = {} 137 | cnt_acis = [] 138 | 139 | for argClustIdx in cl._argClusts: 140 | acl = cl._argClusts[argClustIdx] 141 | cnt = acl._ttlArgCnt 142 | cnt_acis.append((cnt,argClustIdx)) 143 | 144 | cnt_acis.sort(reverse=True) 145 | 146 | for item in cnt_acis: 147 | aci = item[1] 148 | ac = cl._argClusts[aci] 149 | 150 | if len(newArgClusts) == 0: 151 | newArgClusts[aci] = ac 152 | 153 | maxScore = 0 154 | maxMap = -1 155 | 156 | for acix in newArgClusts: 157 | score = self.scorer.scoreMergeArgs(cl, acix, aci) 158 | acx = cl._argClusts[acix] 159 | 160 | if score > maxScore: 161 | maxScore = score 162 | maxMap = acix 163 | 164 | if maxMap >= 0: 165 | acx = cl._argClusts[maxMap] 166 | self.executor.mergeArg(cl, maxMap, aci) 167 | else: 168 | newArgClusts[aci] = ac 169 | 170 | cl._argClusts = newArgClusts 171 | 172 | def parse(self, files): 173 | articles = [] 174 | 175 | for file in files: 176 | a = self.parseReader.readParse(file) 177 | articles.append(a) 178 | 179 | self.initialize(articles) 180 | self.mergeArgs() 181 | self.agenda.createAgenda() 182 | self.agenda.procAgenda() 183 | 184 | return None 185 | 186 | def reparse(self, aid, si): 187 | a = id_article[aid] 188 | sent = a.sentences[si] 189 | 190 | children = sent.get_children(0) 191 | 192 | if children is None: 193 | return None 194 | elif len(children) == 0: 195 | return None 196 | else: 197 | old_nid_part = {} 198 | 199 | for ni in range(len(sent.get_tokens())): 200 | if isIgnore(sent, ni): 201 | continue 202 | nid = Utils.genTreeNodeId(aid, si, ni) 203 | np = Part.getPartByRootNodeId(nid) 204 | del Part.rootTreeNodeId_part[nid] 205 | old_nid_part[nid] = np 206 | 207 | nid_part = {} 208 | 209 | for ni in range(len(sent.get_tokens())): 210 | if isIgnore(sent, ni): 211 | continue 212 | part, clustIdx = part_from_node(aid, si, sent, ni) 213 | nid_part[Utils.genTreeNodeId(aid, si, ni)] = part 214 | part.setClust(clustIdx, clust_only=True) 215 | 216 | roots = sent.get_children(0) 217 | assert len(roots) == 1 218 | 219 | dep_idx = next(iter(roots)) 220 | idx = dep_idx[1] 221 | nid = Utils.genTreeNodeId(aid, si, idx) 222 | np = Part.getPartByRootNodeId(nid) 223 | 224 | if np is not None: 225 | setArgs(aid, si, sent, idx) 226 | 227 | maxImp = 1 228 | 229 | while maxImp > 0: 230 | rp, ap = None, None 231 | maxImp = 0 232 | 233 | for prt in nid_part.values(): 234 | for arg in prt.getArguments().values(): 235 | score = self.scorer.scoreOpComposePart(prt,arg) 236 | 237 | if score > maxImp: 238 | maxImp = score 239 | rp, ap = prt, arg 240 | 241 | if maxImp <= 0: 242 | break 243 | 244 | self.executor.execComposePart(rp, ap) 245 | del nid_part[ap.getRelTreeRoot().getId()] 246 | 247 | Clust.removePartAndUpdateStat(old_nid_part) 248 | Clust.updatePartStat(nid_part) 249 | 250 | return None 251 | 252 | def setArgs(self, aid, si, sent, idx): 253 | nid = Utils.genTreeNodeId(aid, si, idx) 254 | node = TreeNode.getTreeNode(nid) 255 | np = Part.getPartByRootNodeId(nid) 256 | ncl = Clust.getClust(np.getClustIdx()) 257 | chds = sent.get_children(idx) 258 | 259 | if chds is None: 260 | return None 261 | else: 262 | for dep, cidx in chds: 263 | cid = Utils.genTreeNodeId(aid, si, cidx) 264 | p = Path(dep) 265 | argTypeIdx = p.getArgType() 266 | cp = Part.getPartByRootNodeId(cid) 267 | 268 | if cp.getParPart() is not None: 269 | continue 270 | 271 | arg = Argument(node, p, cp) 272 | argIdx = np.addArgument(arg) 273 | cp.setParent(np, argIdx) 274 | argClustIdxs = ncl.getArgClustIdxs(argTypeIdx) 275 | argClustIdx = -1 276 | 277 | if argClustIdxs is None: 278 | argClustIdx = ncl.createArgClust(argTypeIdx) 279 | else: 280 | argClustIdx = next(iter(argClustIdxs)) 281 | 282 | np.setArgClust(argIdx, argClustIdx, clust_only=True) 283 | 284 | setArgs(aid, si, sent, cidx) 285 | 286 | return None 287 | -------------------------------------------------------------------------------- /semantic/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | class Agenda(object): 3 | def __init__(self): 4 | 5 | return None 6 | 7 | 8 | class Executor(object): 9 | def __init__(self): 10 | 11 | return None 12 | 13 | 14 | class Scorer(object): 15 | def __init__(self): 16 | 17 | return None 18 | 19 | 20 | class SearchOp(object): 21 | def __init__(self): 22 | 23 | return None 24 | 25 | -------------------------------------------------------------------------------- /semantic/__pycache__/Agenda.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/semantic/__pycache__/Agenda.cpython-36.pyc -------------------------------------------------------------------------------- /semantic/__pycache__/Argument.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/semantic/__pycache__/Argument.cpython-36.pyc -------------------------------------------------------------------------------- /semantic/__pycache__/Clust.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/semantic/__pycache__/Clust.cpython-36.pyc -------------------------------------------------------------------------------- /semantic/__pycache__/Executor.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/semantic/__pycache__/Executor.cpython-36.pyc -------------------------------------------------------------------------------- /semantic/__pycache__/MLN.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/semantic/__pycache__/MLN.cpython-36.pyc -------------------------------------------------------------------------------- /semantic/__pycache__/Parse.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/semantic/__pycache__/Parse.cpython-36.pyc -------------------------------------------------------------------------------- /semantic/__pycache__/Part.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/semantic/__pycache__/Part.cpython-36.pyc -------------------------------------------------------------------------------- /semantic/__pycache__/Scorer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/semantic/__pycache__/Scorer.cpython-36.pyc -------------------------------------------------------------------------------- /semantic/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/semantic/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /semantic/src/Agenda.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/semantic/src/Agenda.py -------------------------------------------------------------------------------- /semantic/src/Executor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/semantic/src/Executor.py -------------------------------------------------------------------------------- /semantic/src/Scorer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/semantic/src/Scorer.py -------------------------------------------------------------------------------- /semantic/src/SearchOp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/semantic/src/SearchOp.py -------------------------------------------------------------------------------- /syntax/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/syntax/.DS_Store -------------------------------------------------------------------------------- /syntax/Nodes/Article.py: -------------------------------------------------------------------------------- 1 | 2 | from . import Sentence 3 | 4 | class Article(object): 5 | ''' 6 | An Article() is merely a collection of Sentences() (represented as a list) 7 | and an article id, which can be of any particular type but should be unique 8 | in a collection of Articles. 9 | ''' 10 | def __init__(self, fn=None, sentences=[]): 11 | self.uid = fn 12 | self.sentences = sentences 13 | 14 | def __repr__(self): 15 | return str(self.__dict__) 16 | 17 | -------------------------------------------------------------------------------- /syntax/Nodes/Sentence.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 4 | # Sentence class 5 | # 6 | 7 | from . import Token 8 | 9 | class Sentence(object): 10 | def __init__(self): 11 | ''' 12 | Each sentence consists of: 13 | _tokens: A list of individual tokens in the sentence, containing POS, 14 | lemma, and actual form of the word/item. 15 | _tkn_children: A dictionary mapping parents (denoted by the integer 16 | keys) to children (sets of integer, string tuples). 17 | _tkn_par: A dictionary mapping children (denoted by integer keys) to 18 | parents (tuples of string, integer values) 19 | ''' 20 | self._tokens = [] 21 | 22 | # Dictionary mapping {int: set((int, str))} 23 | self._tkn_children = {0: set()} 24 | # Dictionary mapping {int: (str, int)} 25 | self._tkn_par = {} 26 | 27 | return None 28 | 29 | 30 | def __repr__(self): 31 | return ('Tokens: ' + str(self._tokens) + '\n' + 'Parents: ' + 32 | str(self._tkn_par) + '\n' + 'Children: ' + str(self._tkn_children)) 33 | 34 | def get_tokens(self, idx=None): 35 | ''' 36 | Return Tokens at the specified indices. 37 | ''' 38 | if idx is None: 39 | return self._tokens 40 | elif isinstance(idx, list): 41 | return [self._tokens[i] for i in idx] 42 | else: 43 | raise ValueError 44 | 45 | def get_token(self, idx): 46 | ''' 47 | Return the Token() at the specified index. 48 | ''' 49 | return self._tokens[idx] 50 | 51 | def add_token(self, tok): 52 | ''' 53 | Append the Token() to the list of _tokens. 54 | ''' 55 | assert isinstance(tok, Token) 56 | self._tokens.append(tok) 57 | 58 | return None 59 | 60 | def get_children(self, parent=None): 61 | ''' 62 | Return the child/children of the parent specified by the given key. If 63 | no key specified, return them all. 64 | ''' 65 | if parent is not None: 66 | if parent in self._tkn_children: 67 | c = self._tkn_children[parent] 68 | else: 69 | c = None 70 | else: 71 | c = self._tkn_children 72 | 73 | return c 74 | 75 | def set_children(self, parent, kids): 76 | ''' 77 | Add the child/children specified by the key/kids key/value pair. 78 | ''' 79 | assert isinstance(kids, set) 80 | self._tkn_children[parent] = kids 81 | 82 | return None 83 | 84 | def add_child(self, parent, kid): 85 | ''' 86 | Add/update the child/children specified by the key/kids key/value pair. 87 | ''' 88 | assert parent in self._tkn_children 89 | self._tkn_children[parent].add(kid) 90 | 91 | return None 92 | 93 | def get_parent(self, kid): 94 | ''' 95 | Return the parent of the child specified by the given key. 96 | ''' 97 | return self._tkn_par[kid] 98 | 99 | def set_parent(self, kid, parent): 100 | ''' 101 | Add/update the parent specified by the given key/parent value pair. 102 | ''' 103 | assert isinstance(parent, tuple) 104 | self._tkn_par[kid] = parent 105 | 106 | return None 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /syntax/Nodes/Token.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Token(object): 4 | def __init__(self, pos, lemma, form=None): 5 | self._pos = pos 6 | self._lemma = lemma 7 | 8 | if form is None: 9 | self._form = lemma 10 | else: 11 | self._form = form 12 | 13 | self._tkn_cnt = dict() 14 | 15 | def __repr__(self): 16 | return self.toString() 17 | 18 | 19 | def getForm(self): 20 | return self._form 21 | 22 | 23 | def getPOS(self): 24 | return self._pos 25 | 26 | 27 | def getLemma(self): 28 | return self._lemma 29 | 30 | 31 | def isContent(pos=None): 32 | if pos is None: 33 | pos = self._pos 34 | 35 | result = pos in ['J','R','V','N'] 36 | 37 | return result 38 | 39 | 40 | def isVerb(self): 41 | return self._pos[0] == 'V' 42 | 43 | 44 | def isNoun(self): 45 | return (self._pos[0] == 'N') | (self._pos.startswith('PRP')) 46 | 47 | 48 | def compareTo(self, t): 49 | this = sum([ord(x) for x in self._lemma]) 50 | that = sum([ord(x) for x in t.getLemma()]) 51 | result = this - that 52 | 53 | if result == 0: 54 | this = sum([ord(x) for x in self.pos]) 55 | that = sum([ord(x) for x in t.getPOS()]) 56 | result = this - that 57 | return result 58 | 59 | 60 | def equals(self, t): 61 | return (self._pos == t.getPOS()) & (self._lemma == t.getLemma()) 62 | 63 | 64 | def hashCode(self): 65 | return hash(self) 66 | 67 | 68 | def toString(self): 69 | return (self._pos + ":" + self._lemma) 70 | -------------------------------------------------------------------------------- /syntax/Nodes/TreeNode.py: -------------------------------------------------------------------------------- 1 | 2 | from . import Token 3 | 4 | class TreeNode(object): 5 | # dictionary mapping {str: TreeNode} 6 | id_treeNodes = {} 7 | 8 | def getTreeNode(idx): 9 | return TreeNode.id_treeNodes[idx] 10 | 11 | 12 | def __init__(self, idx, tkn): 13 | self._id = idx 14 | self._tkn = tkn 15 | self._children = {} 16 | TreeNode.id_treeNodes[idx] = self 17 | 18 | def addChild(self, dep, child): 19 | try: 20 | tns = self._children[dep] 21 | except KeyError: 22 | tns = set(child) 23 | self._children[dep] = tns 24 | else: 25 | self._children[dep] = tns.add(child) 26 | 27 | return None 28 | 29 | def getId(self): 30 | return self._id 31 | 32 | def getToken(self): 33 | return self._tkn 34 | 35 | def getChildren(self): 36 | return self._children 37 | 38 | def compareTo(self, z): 39 | if not isinstance(z, TreeNode): 40 | raise ValueError 41 | 42 | return self._tkn.compareTo(z.tkn_) 43 | 44 | def equals(self, o): 45 | return self.compareTo(o) == 0 46 | 47 | def toString(self): 48 | return self._tkn.toString() 49 | 50 | def getTreeStr(self): 51 | id_str = {} 52 | 53 | if (len(self._children) > 0): 54 | for dep in self._children.keys(): 55 | nodes = self._children[dep] 56 | s = '' 57 | for node in nodes: 58 | if dep.startswith('prep_') or dep.startswith('conj_'): 59 | s = dep[5:] + ' ' 60 | s = s + node.getTreeStr() 61 | id_str[node.getId()] = s 62 | 63 | id_str[self._id] = self._tkn.getLemma() 64 | result = ' '.join([id_str[x] for x in id_str.keys()]) 65 | 66 | return result 67 | 68 | 69 | -------------------------------------------------------------------------------- /syntax/Nodes/__pycache__/Article.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/syntax/Nodes/__pycache__/Article.cpython-36.pyc -------------------------------------------------------------------------------- /syntax/Nodes/__pycache__/Sentence.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/syntax/Nodes/__pycache__/Sentence.cpython-36.pyc -------------------------------------------------------------------------------- /syntax/Nodes/__pycache__/Token.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/syntax/Nodes/__pycache__/Token.cpython-36.pyc -------------------------------------------------------------------------------- /syntax/Nodes/__pycache__/TreeNode.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/syntax/Nodes/__pycache__/TreeNode.cpython-36.pyc -------------------------------------------------------------------------------- /syntax/Relations/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/syntax/Relations/.DS_Store -------------------------------------------------------------------------------- /syntax/Relations/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from ..Nodes import Token, TreeNode 3 | 4 | class RelType(object): 5 | _relTypes = [] 6 | _relTypeStr_idx = {} 7 | 8 | def __init__(self): 9 | self._str = None 10 | self._type = '' 11 | 12 | def getType(self): 13 | return self._type 14 | 15 | def getRelType(target): 16 | if target is None: 17 | return None 18 | elif isinstance(target,int): 19 | return RelType.relTypes[idx] 20 | else: 21 | s = RelType.genTypeStr(target) 22 | 23 | try: 24 | _ = _relTypeStr_idx[s] 25 | except KeyError: 26 | t = RelType() 27 | t._str = s 28 | 29 | if target.getToken().isContent(): 30 | t._type = 'C' 31 | else: 32 | t._type = 'N' 33 | 34 | RelType.relTypes.append(t) 35 | RelType.relTypeStr_idx[s] = len(RelType.relTypes) - 1 36 | 37 | return RelType.relTypeStr_idx[s] 38 | 39 | def genTypeStr(tn): 40 | type_str = '(' 41 | type_str += tn.getToken().toString() 42 | children = tn.getChildren() 43 | 44 | if len(children) > 0: 45 | for child in children: 46 | type_str += ' (' + child 47 | tns = children[child] 48 | 49 | for node in tns: 50 | type_str += ' ' + genTypeStr(node) 51 | 52 | type_str += ')' 53 | 54 | type_str += ')' 55 | 56 | return type_str 57 | 58 | def compareTo(self, z): 59 | this = sum([ord(x) for x in self._str]) 60 | that = sum([ord(x) for x in z.toString()]) 61 | result = this - that 62 | 63 | return result 64 | 65 | def equals(self, o): 66 | return self.compareTo(o)==0 67 | 68 | def toString(self): 69 | return self._str 70 | 71 | 72 | 73 | class Path(object): 74 | def __init__(self, dep, treeRoot=None, argNode=None, dep2=None): 75 | self._dep = dep 76 | self._treeRoot = treeRoot 77 | self._argNode = argNode 78 | self._dep2 = dep2 79 | self._argTypeIdx = -1 80 | self._str = None 81 | 82 | def getDep(self): 83 | return self._dep 84 | 85 | def getTreeRoot(self): 86 | return self._treeRoot 87 | 88 | def getArgNode(self): 89 | return self._argNode 90 | 91 | def getDep2(self): 92 | return self._dep2 93 | 94 | def getArgType(self): 95 | return self._argTypeIdx 96 | 97 | def toString(self): 98 | if self._str is None: 99 | self._str = self.genTypeStr() 100 | 101 | return self._str 102 | 103 | def genTypeStr(self): 104 | typ_str = '<' + self._dep 105 | 106 | if self._treeRoot is not None: 107 | rel_str = RelType.genTypeStr(self._treeRoot) 108 | typ_str += ':' + rel_str + ':' + self._dep2 109 | 110 | typ_str += '>' 111 | 112 | return typ_str 113 | 114 | 115 | 116 | class ArgType(object): 117 | argTypes = [] 118 | # Dictionary mapping {str: int} 119 | argTypeStr_idx = {} 120 | 121 | ARGTYPEIDX_SUBJ = -1 122 | ARGTYPEIDX_OBJ = -1 123 | ARGTYPEIDX_IN = -1 124 | 125 | def __init__(self): 126 | self._dep = None 127 | self._relTypeIdx = -1 128 | self._dep2 = None 129 | self._str = None 130 | 131 | def getArgType(target): 132 | if isinstance(target,int): 133 | result = ArgType.argTypes[idx] 134 | else: 135 | s = target.toString() 136 | 137 | if s not in ArgType.argTypeStr_idx: 138 | t = ArgType() 139 | t._dep = p.getDep() 140 | t._dep2 = p.getDep2() 141 | t._relTypeIdx = -1 142 | 143 | if p.getTreeRoot() is not None: 144 | t._relTypeIdx = RelType.getRelType(p.getTreeRoot()) 145 | 146 | ArgType.argTypes.append(t) 147 | ati = len(ArgType.argTypes) - 1 148 | ArgType.argTypeStr_idx[s] = ati 149 | 150 | if p.getTreeRoot() is None: 151 | if p.getDep() == 'nsubj': 152 | ARGTYPEIDX_SUBJ = ati 153 | elif p.getDep() == 'dobj': 154 | ARGTYPEIDX_OBJ = ati 155 | elif p.getDep() == 'prep_in': 156 | ARGTYPEIDX_IN = ati 157 | 158 | result = ArgType.argTypeStr_idx[s] 159 | 160 | return result 161 | 162 | def compareTo(self, z): 163 | if self._dep is None or z.GetDep() is None: 164 | return None 165 | 166 | this = sum([ord(x) for x in self._dep]) 167 | that = sum([ord(x) for x in z.getDep()]) 168 | result = this - that 169 | 170 | if result == 0: 171 | result = self._relTypeIdx - z._relTypeIdx 172 | 173 | if result == 0: 174 | if self._dep2 is not None: 175 | this = sum([ord(x) for x in self._dep2]) 176 | 177 | try: 178 | that = sum([ord(x) for x in z.getDep2()]) 179 | except TypeError: 180 | result = -1 181 | else: 182 | result = this - that 183 | 184 | return result 185 | 186 | def equals(self, o): 187 | return self.compareTo(o) == 0 188 | 189 | def toString(self): 190 | if self._str is None: 191 | self._str = '<' + self._dep 192 | 193 | if self._relTypeIdx >= 0: 194 | self._str += ':{}:{}'.format(self._relTypeIdx,self._dep2) 195 | 196 | self._str += '>' 197 | 198 | return self._str 199 | 200 | 201 | 202 | 203 | -------------------------------------------------------------------------------- /syntax/Relations/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/syntax/Relations/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /syntax/Relations/src/ArgType.py: -------------------------------------------------------------------------------- 1 | 2 | from . import RelType 3 | 4 | class ArgType(object): 5 | argTypes = [] 6 | # Dictionary mapping {str: int} 7 | argTypeStr_idx = {} 8 | 9 | ARGTYPEIDX_SUBJ = -1 10 | ARGTYPEIDX_OBJ = -1 11 | ARGTYPEIDX_IN = -1 12 | 13 | def __init__(self): 14 | self._dep = None 15 | self._relTypeIdx = -1 16 | self._dep2 = None 17 | self._str = None 18 | 19 | def getArgType(target): 20 | if isinstance(target,int): 21 | result = ArgType.argTypes[idx] 22 | else: 23 | s = target.toString() 24 | 25 | if s not in ArgType.argTypeStr_idx: 26 | t = ArgType() 27 | t._dep = p.getDep() 28 | t._dep2 = p.getDep2() 29 | t._relTypeIdx = -1 30 | 31 | if p.getTreeRoot() is not None: 32 | t._relTypeIdx = RelType.getRelType(p.getTreeRoot()) 33 | 34 | ArgType.argTypes.append(t) 35 | ati = len(ArgType.argTypes) - 1 36 | ArgType.argTypeStr_idx[s] = ati 37 | 38 | if p.getTreeRoot() is None: 39 | if p.getDep() == 'nsubj': 40 | ARGTYPEIDX_SUBJ = ati 41 | elif p.getDep() == 'dobj': 42 | ARGTYPEIDX_OBJ = ati 43 | elif p.getDep() == 'prep_in': 44 | ARGTYPEIDX_IN = ati 45 | 46 | result = ArgType.argTypeStr_idx[s] 47 | 48 | return result 49 | 50 | def compareTo(self, z): 51 | if self._dep is None or z.GetDep() is None: 52 | return None 53 | 54 | this = sum([ord(x) for x in self._dep]) 55 | that = sum([ord(x) for x in z.getDep()]) 56 | result = this - that 57 | 58 | if result == 0: 59 | result = self._relTypeIdx - z._relTypeIdx 60 | 61 | if result == 0: 62 | if self._dep2 is not None: 63 | this = sum([ord(x) for x in self._dep2]) 64 | 65 | try: 66 | that = sum([ord(x) for x in z.getDep2()]) 67 | except TypeError: 68 | result = -1 69 | else: 70 | result = this - that 71 | 72 | return result 73 | 74 | def equals(self, o): 75 | return self.compareTo(o) == 0 76 | 77 | def toString(self): 78 | if self._str is None: 79 | self._str = '<' + self._dep 80 | 81 | if self._relTypeIdx >= 0: 82 | self._str += ':{}:{}'.format(self._relTypeIdx,self._dep2) 83 | 84 | self._str += '>' 85 | 86 | return self._str 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /syntax/Relations/src/Path.py: -------------------------------------------------------------------------------- 1 | 2 | from . import RelType 3 | 4 | class Path(object): 5 | def __init__(self, dep, treeRoot=None, argNode=None, dep2=None): 6 | self._dep = dep 7 | self._treeRoot = treeRoot 8 | self._argNode = argNode 9 | self._dep2 = dep2 10 | self._argTypeIdx = -1 11 | self._str = None 12 | 13 | def getDep(self): 14 | return self._dep 15 | 16 | def getTreeRoot(self): 17 | return self._treeRoot 18 | 19 | def getArgNode(self): 20 | return self._argNode 21 | 22 | def getDep2(self): 23 | return self._dep2 24 | 25 | def getArgType(self): 26 | return self._argTypeIdx 27 | 28 | def toString(self): 29 | if self._str is None: 30 | self._str = self.genTypeStr() 31 | 32 | return self._str 33 | 34 | def genTypeStr(self): 35 | typ_str = '<' + self._dep 36 | 37 | if self._treeRoot is not None: 38 | rel_str = RelType.genTypeStr(self._treeRoot) 39 | typ_str += ':' + rel_str + ':' + self._dep2 40 | 41 | typ_str += '>' 42 | 43 | return typ_str 44 | 45 | -------------------------------------------------------------------------------- /syntax/Relations/src/RelType.py: -------------------------------------------------------------------------------- 1 | 2 | from . import Token, TreeNode 3 | 4 | class RelType(object): 5 | _relTypes = [] 6 | _relTypeStr_idx = {} 7 | 8 | def __init__(self): 9 | self._str = None 10 | self._type = '' 11 | 12 | def getType(self): 13 | return self._type 14 | 15 | def getRelType(target): 16 | if target is None: 17 | return None 18 | elif isinstance(target,int): 19 | return RelType.relTypes[idx] 20 | else: 21 | s = RelType.genTypeStr(target) 22 | 23 | try: 24 | _ = _relTypeStr_idx[s] 25 | except KeyError: 26 | t = RelType() 27 | t._str = s 28 | 29 | if target.getToken().isContent(): 30 | t._type = 'C' 31 | else: 32 | t._type = 'N' 33 | 34 | RelType.relTypes.append(t) 35 | RelType.relTypeStr_idx[s] = len(RelType.relTypes) - 1 36 | 37 | return RelType.relTypeStr_idx[s] 38 | 39 | def genTypeStr(tn): 40 | type_str = '(' 41 | type_str += tn.getToken().toString() 42 | children = tn.getChildren() 43 | 44 | if len(children) > 0: 45 | for child in children: 46 | type_str += ' (' + child 47 | tns = children[child] 48 | 49 | for node in tns: 50 | type_str += ' ' + genTypeStr(node) 51 | 52 | type_str += ')' 53 | 54 | type_str += ')' 55 | 56 | return type_str 57 | 58 | def compareTo(self, z): 59 | this = sum([ord(x) for x in self._str]) 60 | that = sum([ord(x) for x in z.toString()]) 61 | result = this - that 62 | 63 | return result 64 | 65 | def equals(self, o): 66 | return self.compareTo(o)==0 67 | 68 | def toString(self): 69 | return self._str 70 | 71 | -------------------------------------------------------------------------------- /syntax/StanfordParseReader.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | 4 | from .Nodes import Article, Sentence, Token 5 | 6 | 7 | class StanfordParseReader(object): 8 | ''' 9 | Replicates StanfordParseReader.java from USP implementation found at 10 | http://alchemy.cs.washington.edu/usp/ 11 | 12 | Given a set of dependency, POS, and morphology files parsed from source 13 | documents, this compiles lists of tokens and dictionary mappings defining 14 | the dependency relationships in the sentences in a given document. 15 | ''' 16 | def __init__(self): 17 | self._isDebug=False 18 | self._ignored_deps = set() 19 | self._ignored_deps.add("aux") 20 | self._ignored_deps.add("auxpass") 21 | self._ignored_deps.add("det") 22 | self._ignored_deps.add("cop") 23 | self._ignored_deps.add("complm") 24 | # self._ignored_deps.add("num") 25 | # self._ignored_deps.add("number") 26 | self._ignored_deps.add("preconj") 27 | self._ignored_deps.add("predet") 28 | self._ignored_deps.add("punct") 29 | # self._ignored_deps.add("quantmod") 30 | 31 | self._ignored_deps.add("expl") 32 | self._ignored_deps.add("mark") 33 | # self._ignored_deps.add("parataxis") 34 | 35 | 36 | def readParse(self, fileName, data_dir, ignoreDep=True): 37 | ''' 38 | Given a filename of the type "$FILENAME.dep" gets the file and 39 | corresponding *.morph and *.input files and reads the Tokens and 40 | Dependency relationships by sentence in those files. Each file in such 41 | a trio should contain the same number of sentences represented as blocks 42 | of text with each dependency/token on its own line, separated by blank 43 | lines. 44 | ''' 45 | file = os.path.splitext(fileName)[0] 46 | morph_file = os.path.join(data_dir, file + '.morph') 47 | input_file = os.path.join(data_dir, file + '.input') 48 | dep_file = os.path.join(data_dir, fileName) 49 | 50 | doc = Article(file) 51 | doc = self.readTokens(doc, morph_file, input_file) 52 | doc = self.readDeps(doc, dep_file, ignoreDep) 53 | 54 | return doc 55 | 56 | 57 | def readTokens(self, doc, morph_file, input_file): 58 | ''' 59 | Reads a morphology and input (POS tagged lemmas) file simultaneously, 60 | parsing single tokens from each line into a Token() object and 61 | appending each Token to its respective Sentence() object, which 62 | are collected in an Article() object "doc" and returned. 63 | ''' 64 | isNew=True 65 | 66 | with open(morph_file, 'r') as mor, open(input_file, 'r') as inp: 67 | for mline in mor.readlines(): 68 | mline = mline.strip() 69 | iline = inp.readline().strip() 70 | 71 | if iline == '': 72 | isNew = True 73 | continue 74 | 75 | ts = iline.split('_') 76 | 77 | if isNew: 78 | sent = Sentence() 79 | sent.add_token(Token('ROOT','ROOT')) 80 | doc.sentences.append(sent) 81 | isNew = False 82 | 83 | pos = ts[1] 84 | lemma = mline.replace(':','.').lower() 85 | form = iline[0] 86 | 87 | doc.sentences[-1].add_token(Token(pos,lemma,form)) 88 | 89 | return doc 90 | 91 | 92 | def readDeps(self, doc, deps_file, ignoreDep): 93 | ''' 94 | Reads a dependency relationships file and adds these relationships to 95 | their respective Sentence() objects in an Article() in the form of 96 | reciprocal python dictionaries. The updated Article() "doc" is then 97 | returned. 98 | ''' 99 | blank = False 100 | senId = 0 101 | 102 | currSent = doc.sentences[senId] 103 | currNonRoots = set() 104 | currRoots = set() 105 | 106 | with open(deps_file, 'r') as d: 107 | for line in d.readlines(): 108 | line = line.strip() 109 | 110 | if len(line) == 0: 111 | if not blank: 112 | senId += 1 113 | 114 | blank = True 115 | 116 | if currRoots is not None: 117 | dep_chds = currSent.get_children(0) 118 | for i in currRoots: 119 | dep_chds.add((i, 'ROOT')) 120 | currSent.set_parent(i, ('ROOT', 0)) 121 | currSent.set_children(0, dep_chds) 122 | doc.sentences[senId] = currSent 123 | 124 | currSent = None 125 | currNonRoots = None 126 | currRoots = None 127 | 128 | continue 129 | else: 130 | if blank: 131 | blank = False 132 | currSent = doc.sentences[senId] 133 | currNonRoots = set() 134 | currRoots = set() 135 | 136 | rel = line[:line.index("(")] 137 | items = line[line.index('('):].replace('(','').replace(')','') 138 | items = items.split(', ') 139 | gov, dep = items[0], items[1] 140 | gov = (int(gov[gov.rfind('-')+1:]), gov[:gov.rfind('-')]) 141 | dep = (int(dep[dep.rfind('-')+1:]), dep[:dep.rfind('-')]) 142 | 143 | if ('conj' not in rel) & (gov[0] == dep[0]): 144 | continue 145 | 146 | currNonRoots.add(dep[0]) 147 | if dep[0] in currRoots: 148 | currRoots.remove(dep[0]) 149 | if gov[0] not in currNonRoots: 150 | currRoots.add(gov[0]) 151 | 152 | if ignoreDep & (rel in self._ignored_deps): 153 | continue 154 | 155 | currSent.set_parent(dep[0], (rel, gov[0])) 156 | 157 | if gov[0] in currSent.get_children(): 158 | currSent.add_child(gov[0], (dep[0], rel)) 159 | else: 160 | currSent.set_children(gov[0], set()) 161 | currSent.add_child(gov[0], (dep[0], rel)) 162 | 163 | if currRoots is not None: 164 | dep_chds = currSent.get_children(0) 165 | for i in currRoots: 166 | dep_chds.add((i, 'ROOT')) 167 | currSent.set_parent(i, ('ROOT', 0)) 168 | currSent.set_children(0, dep_chds) 169 | doc.sentences[senId] = currSent 170 | 171 | currSent = None 172 | currNonRoots = None 173 | currRoots = None 174 | 175 | return doc 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | -------------------------------------------------------------------------------- /syntax/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | 4 | from .Nodes import Article, Sentence, Token 5 | 6 | 7 | class StanfordParseReader(object): 8 | ''' 9 | Replicates StanfordParseReader.java from USP implementation found at 10 | http://alchemy.cs.washington.edu/usp/ 11 | 12 | Given a set of dependency, POS, and morphology files parsed from source 13 | documents, this compiles lists of tokens and dictionary mappings defining 14 | the dependency relationships in the sentences in a given document. 15 | ''' 16 | def __init__(self): 17 | self._isDebug=False 18 | self._ignored_deps = set() 19 | self._ignored_deps.add("aux") 20 | self._ignored_deps.add("auxpass") 21 | self._ignored_deps.add("det") 22 | self._ignored_deps.add("cop") 23 | self._ignored_deps.add("complm") 24 | # self._ignored_deps.add("num") 25 | # self._ignored_deps.add("number") 26 | self._ignored_deps.add("preconj") 27 | self._ignored_deps.add("predet") 28 | self._ignored_deps.add("punct") 29 | # self._ignored_deps.add("quantmod") 30 | 31 | self._ignored_deps.add("expl") 32 | self._ignored_deps.add("mark") 33 | # self._ignored_deps.add("parataxis") 34 | 35 | 36 | def readParse(self, fileName, data_dir, ignoreDep=True): 37 | ''' 38 | Given a filename of the type "$FILENAME.dep" gets the file and 39 | corresponding *.morph and *.input files and reads the Tokens and 40 | Dependency relationships by sentence in those files. Each file in such 41 | a trio should contain the same number of sentences represented as blocks 42 | of text with each dependency/token on its own line, separated by blank 43 | lines. 44 | ''' 45 | file = os.path.splitext(fileName)[0] 46 | morph_file = os.path.join(data_dir, file + '.morph') 47 | input_file = os.path.join(data_dir, file + '.input') 48 | dep_file = os.path.join(data_dir, fileName) 49 | 50 | doc = Article(file) 51 | doc = self.readTokens(doc, morph_file, input_file) 52 | doc = self.readDeps(doc, dep_file, ignoreDep) 53 | 54 | return doc 55 | 56 | 57 | def readTokens(self, doc, morph_file, input_file): 58 | ''' 59 | Reads a morphology and input (POS tagged lemmas) file simultaneously, 60 | parsing single tokens from each line into a Token() object and 61 | appending each Token to its respective Sentence() object, which 62 | are collected in an Article() object "doc" and returned. 63 | ''' 64 | isNew=True 65 | 66 | with open(morph_file, 'r') as mor, open(input_file, 'r') as inp: 67 | for mline in mor.readlines(): 68 | mline = mline.strip() 69 | iline = inp.readline().strip() 70 | 71 | if iline == '': 72 | isNew = True 73 | continue 74 | 75 | ts = iline.split('_') 76 | 77 | if isNew: 78 | sent = Sentence() 79 | sent.add_token(Token('ROOT','ROOT')) 80 | doc.sentences.append(sent) 81 | isNew = False 82 | 83 | pos = ts[1] 84 | lemma = mline.replace(':','.').lower() 85 | form = iline[0] 86 | 87 | doc.sentences[-1].add_token(Token(pos,lemma,form)) 88 | 89 | return doc 90 | 91 | 92 | def readDeps(self, doc, deps_file, ignoreDep): 93 | ''' 94 | Reads a dependency relationships file and adds these relationships to 95 | their respective Sentence() objects in an Article() in the form of 96 | reciprocal python dictionaries. The updated Article() "doc" is then 97 | returned. 98 | ''' 99 | blank = False 100 | senId = 0 101 | 102 | currSent = doc.sentences[senId] 103 | currNonRoots = set() 104 | currRoots = set() 105 | 106 | with open(deps_file, 'r') as d: 107 | for line in d.readlines(): 108 | line = line.strip() 109 | 110 | if len(line) == 0: 111 | if not blank: 112 | senId += 1 113 | 114 | blank = True 115 | 116 | if currRoots is not None: 117 | dep_chds = currSent.get_children(0) 118 | for i in currRoots: 119 | dep_chds.add((i, 'ROOT')) 120 | currSent.set_parent(i, ('ROOT', 0)) 121 | currSent.set_children(0, dep_chds) 122 | doc.sentences[senId] = currSent 123 | 124 | currSent = None 125 | currNonRoots = None 126 | currRoots = None 127 | 128 | continue 129 | else: 130 | if blank: 131 | blank = False 132 | currSent = doc.sentences[senId] 133 | currNonRoots = set() 134 | currRoots = set() 135 | 136 | rel = line[:line.index("(")] 137 | items = line[line.index('('):].replace('(','').replace(')','') 138 | items = items.split(', ') 139 | gov, dep = items[0], items[1] 140 | gov = (int(gov[gov.rfind('-')+1:]), gov[:gov.rfind('-')]) 141 | dep = (int(dep[dep.rfind('-')+1:]), dep[:dep.rfind('-')]) 142 | 143 | if ('conj' not in rel) & (gov[0] == dep[0]): 144 | continue 145 | 146 | currNonRoots.add(dep[0]) 147 | if dep[0] in currRoots: 148 | currRoots.remove(dep[0]) 149 | if gov[0] not in currNonRoots: 150 | currRoots.add(gov[0]) 151 | 152 | if ignoreDep & (rel in self._ignored_deps): 153 | continue 154 | 155 | currSent.set_parent(dep[0], (rel, gov[0])) 156 | 157 | if gov[0] in currSent.get_children(): 158 | currSent.add_child(gov[0], (dep[0], rel)) 159 | else: 160 | currSent.set_children(gov[0], set()) 161 | currSent.add_child(gov[0], (dep[0], rel)) 162 | 163 | if currRoots is not None: 164 | dep_chds = currSent.get_children(0) 165 | for i in currRoots: 166 | dep_chds.add((i, 'ROOT')) 167 | currSent.set_parent(i, ('ROOT', 0)) 168 | currSent.set_children(0, dep_chds) 169 | doc.sentences[senId] = currSent 170 | 171 | currSent = None 172 | currNonRoots = None 173 | currRoots = None 174 | 175 | return doc 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | -------------------------------------------------------------------------------- /syntax/__pycache__/Article.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/syntax/__pycache__/Article.cpython-36.pyc -------------------------------------------------------------------------------- /syntax/__pycache__/Path.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/syntax/__pycache__/Path.cpython-36.pyc -------------------------------------------------------------------------------- /syntax/__pycache__/RelType.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/syntax/__pycache__/RelType.cpython-36.pyc -------------------------------------------------------------------------------- /syntax/__pycache__/Sentence.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/syntax/__pycache__/Sentence.cpython-36.pyc -------------------------------------------------------------------------------- /syntax/__pycache__/StanfordParseReader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/syntax/__pycache__/StanfordParseReader.cpython-36.pyc -------------------------------------------------------------------------------- /syntax/__pycache__/Token.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/syntax/__pycache__/Token.cpython-36.pyc -------------------------------------------------------------------------------- /syntax/__pycache__/TreeNode.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/syntax/__pycache__/TreeNode.cpython-36.pyc -------------------------------------------------------------------------------- /syntax/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/syntax/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /utils/Utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 4 | # Utility functions for pymln parsing 5 | # 6 | 7 | 8 | def genTreeNodeID(aid, sid, wid): 9 | node_id = ':'.join([str(x) for x in [aid, sid, wid]]) 10 | 11 | return node_id 12 | 13 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/utils/__init__.py -------------------------------------------------------------------------------- /utils/__pycache__/Utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/utils/__pycache__/Utils.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/pymln/d904fcd4f36c8c2409924e1bd4cc2b07242ae8cb/utils/__pycache__/__init__.cpython-36.pyc --------------------------------------------------------------------------------