├── .gitignore ├── .travis.yml ├── BerkeleyInterface ├── __init__.py ├── berkeleyinterface.py └── interactive.py ├── LICENSE ├── Makefile ├── README.md ├── examples ├── example.py ├── testinput.txt ├── testinput2.txt └── testinput2.txt.parsed ├── requirements.txt ├── setup.py └── tests └── __init__.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Basics 2 | *.py[cod] 3 | *.txt* 4 | *.jar 5 | *.gr 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Packages 11 | *.egg 12 | *.egg-info 13 | dist 14 | build 15 | eggs 16 | parts 17 | var 18 | sdist 19 | develop-eggs 20 | .installed.cfg 21 | lib 22 | lib64 23 | __pycache__ 24 | 25 | # Installer logs 26 | pip-log.txt 27 | 28 | # Unit test / coverage reports 29 | .coverage 30 | .tox 31 | nosetests.xml 32 | 33 | # Translations 34 | *.mo 35 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - '2.7' 4 | install: pip install -r requirements.txt --use-mirrors 5 | script: make test 6 | notifications: 7 | email: 8 | recipients: 9 | - beth.mcnany@gmail.com 10 | - benjamin@bengfort.com 11 | on_success: change 12 | on_failure: always 13 | -------------------------------------------------------------------------------- /BerkeleyInterface/__init__.py: -------------------------------------------------------------------------------- 1 | # BerkeleyInterface 2 | # A Python wrapper for the Berkeley Parser. 3 | # 4 | # Author: Benjamin Bengfort 5 | # Created: Wed Feb 05 09:42:59 2014 -0500 6 | # 7 | # Copyright (C) 2013 UMD Metacognitive Lab 8 | # For license information, see LICENSE.txt 9 | # 10 | # ID: __init__.py [] bengfort@cs.umd.edu $ 11 | 12 | """ 13 | A Python wrapper for the Berkeley Parser. 
def __outputTrees(parseTrees, outputData, parser, opts, line, sentenceID):
    '''
    Write tree information to outputData. This is a reimplementation of the
    private method of the same name from BerkeleyParser.java.

    Arguments:
        parseTrees  -- iterable of parser Tree objects for one sentence
        outputData  -- writable file-like object (needs write() and flush())
        parser      -- parser that produced the trees; queried for scores
        opts        -- options object as returned by getOpts
        line        -- the input line; only used to name the rendered image
        sentenceID  -- identifier echoed in the ec_format header line

    One line is written per tree: optional scores separated by the
    delimiter, followed by the bracketed tree, or "(())" for a failed parse.
    '''
    negInf = float("-inf")
    delimiter = "\t"

    if opts.ec_format:
        # The ec_format header counts only trees that will be printed:
        # non-empty ones with a finite log likelihood.
        parseTrees = [tree for tree in parseTrees
                      if not tree.getChildren().isEmpty()
                      and parser.getLogLikelihood(tree) != negInf]
        outputData.write("%s\t%s\n" % (len(parseTrees), sentenceID))
        delimiter = ",\t"

    for parsedTree in parseTrees:
        addDelimiter = False

        if opts.tree_likelihood:
            if parsedTree.getChildren().isEmpty():
                treeLL = negInf
            else:
                treeLL = parser.getLogLikelihood(parsedTree)
            if treeLL == negInf:
                # nothing useful to report for this tree
                continue
            outputData.write("%s" % treeLL)
            addDelimiter = True

        if opts.sentence_likelihood:
            if parsedTree.getChildren().isEmpty():
                allLL = negInf
            else:
                allLL = parser.getLogLikelihood()
            if addDelimiter:
                outputData.write(delimiter)
            addDelimiter = True
            if opts.ec_format:
                outputData.write("sentenceLikelihood ")
            outputData.write("%s" % allLL)

        if not opts.binarize:
            TreeAnnotations = jpype.JClass("edu.berkeley.nlp.PCFGLA.TreeAnnotations")
            parsedTree = TreeAnnotations.unAnnotateTree(parsedTree, opts.keepFunctionLabels)

        if opts.confidence:
            if parsedTree.getChildren().isEmpty():
                treeLL = negInf
            else:
                treeLL = parser.getConfidence(parsedTree)
            if addDelimiter:
                outputData.write(delimiter)
            addDelimiter = True
            if opts.ec_format:
                outputData.write("confidence ")
            outputData.write("%s" % treeLL)
        elif opts.modelScore:
            if parsedTree.getChildren().isEmpty():
                score = negInf
            else:
                score = parser.getModelScore(parsedTree)
            if addDelimiter:
                outputData.write(delimiter)
            addDelimiter = True
            if opts.ec_format:
                outputData.write("maxRuleScore ")
            outputData.write("%.8f" % score)

        if opts.ec_format:
            outputData.write("\n")
        elif addDelimiter:
            outputData.write(delimiter)

        children = parsedTree.getChildren()
        if not children.isEmpty():
            treeString = children.get(0).toString()
            if children.size() != 1:
                # Unexpected shape: print the whole tree with a blank root
                # label instead of silently dropping the extra children.
                sys.stderr.write("ROOT has more than one child!\n")
                parsedTree.setLabel("")
                treeString = parsedTree.toString()
            if opts.ec_format:
                outputData.write("(S1 " + treeString + " )\n")
            else:
                outputData.write("( " + treeString + " )\n")
        else:
            outputData.write("(())\n")

        if opts.render:
            # NOTE(review): writeTreeToImage is not defined in this module;
            # the Java original calls the static BerkeleyParser method of
            # that name. Assumes it is public -- confirm against the jar.
            BerkeleyParser = jpype.JClass("edu.berkeley.nlp.PCFGLA.BerkeleyParser")
            imageName = re.sub("[^a-zA-Z]", "", line) + ".png"
            try:
                BerkeleyParser.writeTreeToImage(parsedTree, imageName)
            except jpype.JException(jpype.java.lang.RuntimeException) as ex:
                # todo actually test this exception handling
                # NOTE(review): message()/stacktrace() are the accessors of
                # the JPype 0.x exception wrapper -- confirm for your version.
                sys.stderr.write("Caught the runtime exception : %s\n" % ex.message())
                sys.stderr.write("%s\n" % ex.stacktrace())

    if opts.dumpPosteriors:
        blockSize = 50
        fileName = opts.grFileName + ".posteriors"
        parser.dumpPosteriors(fileName, blockSize)

    if opts.kbest > 1:
        # blank line separates the k-best lists of consecutive sentences
        outputData.write("\n")

    outputData.flush()
def dictToArgs(d):
    '''
    Convert a dict of options to a list of command-line-style args.

    Keys are option names without the leading dash. Boolean switches (the
    names listed in boolDefaults) default to False on the parser side, so
    they are emitted as a bare "-key" only when their value is true and
    are omitted otherwise. Every other key becomes the pair "-key", str(value).

    Returns the flat list of argument strings, in the dict's iteration order.
    '''
    # these all default to False and only require the switch if True
    boolDefaults = frozenset([
        "tokenize", "binarize", "scores", "keepFunctionLabels",
        "substates", "accurate", "modelScore", "confidence",
        "sentence_likelihood", "tree_likelihood", "variational", "render",
        "chinese", "useGoldPOS", "dumpPosteriors", "ec_format",
    ])

    args = []
    for key, value in d.items():
        if key in boolDefaults:
            # A false value means "leave the switch off"; emitting the
            # switch anyway would enable the option.
            if value:
                args.append("-" + key)
        else:
            args.append("-" + key)
            args.append('%s' % value)
    return args
163 | ''' 164 | Options = jpype.JClass("edu.berkeley.nlp.PCFGLA.BerkeleyParser$Options") 165 | OptionParser = jpype.JClass("edu.berkeley.nlp.PCFGLA.OptionParser") 166 | optParser = OptionParser(Options) 167 | opts = optParser.parse(args, True) 168 | return opts 169 | 170 | def loadGrammar(opts): 171 | ''' 172 | Loads the grammar and lexicon for the parser, given options. 173 | Returns the initialized parser. 174 | ''' 175 | threshold = 1.0 176 | 177 | if opts.chinese: #todo WARNING: THIS IS UNTESTED 178 | Corpus = jpype.JClass("edu.berkeley.nlp.PCFGLA.Corpus") 179 | Corpus.myTreebank = Corpus.TreeBankType.CHINESE 180 | 181 | parser = None 182 | 183 | # load grammar 184 | if opts.nGrammars != 1: #todo 185 | print "Multiple grammars not implemented!" 186 | sys.exit(1) 187 | else: 188 | inFileName = opts.grFileName 189 | ParserData = jpype.JClass("edu.berkeley.nlp.PCFGLA.ParserData") 190 | pData = ParserData.Load(inFileName) 191 | if pData is None: 192 | print "Failed to load grammar from file '%s'."%inFileName 193 | sys.exit(1) 194 | grammar = pData.getGrammar() 195 | lexicon = pData.getLexicon() 196 | Numberer = jpype.JClass("edu.berkeley.nlp.util.Numberer") 197 | Numberer.setNumberers(pData.getNumbs()) 198 | if opts.kbest == 1: 199 | CoarseToFineMaxRuleParser = jpype.JClass("edu.berkeley.nlp.PCFGLA.CoarseToFineMaxRuleParser") 200 | parser = CoarseToFineMaxRuleParser(grammar, lexicon, threshold, -1, 201 | opts.viterbi, opts.substates, opts.scores, opts.accurate, opts.variational, 202 | True, True) 203 | else: 204 | CoarseToFineNBestParser = jpype.JClass("edu.berkeley.nlp.PCFGLA.CoarseToFineNBestParser") 205 | parser = CoarseToFineNBestParser(grammar, lexicon, opts.kbest, threshold, 206 | -1, opts.viterbi, opts.substates, opts.scores, opts.accurate, 207 | opts.variational, False, True) 208 | 209 | parser.binarization = pData.getBinarization() 210 | #end else (if nGrammars != 1) 211 | 212 | if opts.nThreads > 1:#todo 213 | m_parser = None 214 | print "Multiple threads 
def _resolveStream(explicit, optsPath, mode, fallback, method):
    '''
    Pick the stream to use and report whether this module opened it.

    explicit may be a path or an object that already has the given method
    (e.g. a StringIO); optsPath is the path from opts; fallback is used when
    neither is set. Returns (stream, owned), where owned is True only for
    files opened here.
    '''
    if explicit:
        if hasattr(explicit, method):
            return explicit, False
        return open(explicit, mode), True
    if optsPath:
        return open(optsPath, mode), True
    return fallback, False


def _readGoldSentence(firstLine, inputData):
    '''
    Read one gold-POS sentence: "word\\tPOS-..." lines up to a blank line.

    Returns (words, tags, lastLine). words is empty when firstLine is blank
    or has no tab-separated tag.
    '''
    words = []
    tags = []
    line = firstLine
    while line != '':
        fields = line.split("\t")
        if len(fields) < 2:
            break
        words.append(fields[0])
        # only the part of the tag before the first "-" is used
        tags.append(fields[1].split("-")[0])
        line = inputData.readline().strip()  # need to remove newlines
    return words, tags, line


def _toJavaList(items):
    '''Copy a sequence of strings into a java.util.ArrayList (None stays None).'''
    if items is None:
        return None
    javaList = jpype.java.util.ArrayList()
    for item in items:
        javaList.add(item)
    return javaList


def _parseStream(parser, opts, inputData, outputData):
    '''Parse every sentence read from inputData and write the trees to outputData.'''
    sentenceID = ""
    # iter(readline, '') fetches the next line on every pass, including after
    # "continue", so a skipped sentence can no longer stall the loop.
    for line in iter(inputData.readline, ''):
        line = line.strip()
        if opts.ec_format and line == "":
            continue

        posTags = None
        if opts.goldPOS:  # format: "word\tPOS-...\n"; newline between sentences
            sentence, posTags, line = _readGoldSentence(line, inputData)
            if not sentence:
                continue
        else:
            if opts.ec_format:
                # strips the sentence id wrapper around the text
                breakIndex = line.index(">")
                sentenceID = line[3:breakIndex-1]
                line = line[breakIndex+2:len(line)-5]
            if not opts.tokenize:
                sentence = re.split(r'\s+', line)
            else:
                PTBLineLexer = jpype.JClass("edu.berkeley.nlp.io.PTBLineLexer")
                tokenizer = PTBLineLexer()
                sentence = tokenizer.tokenizeLine(line)

        if len(sentence) > opts.maxLength:
            outputData.write("(())\n")
            sys.stderr.write("Skipping sentence with %s words since it is too long.\n"
                             % len(sentence))
            continue

        if opts.nThreads > 1:  # todo
            print("Multiple threads still not implemented!")
            sys.exit(-1)

        # The parser methods take java.util.List arguments, so both the
        # k-best and the single-best path get real Java lists.
        # posTags will be None unless using option goldPOS
        # len(posTags) == len(sentence)
        st = _toJavaList(sentence)
        pt = _toJavaList(posTags)

        if opts.kbest > 1:
            parsedTrees = list(parser.getKBestConstrainedParses(st, pt, opts.kbest))
            if not parsedTrees:
                Tree = jpype.JClass("edu.berkeley.nlp.syntax.Tree")
                parsedTrees.append(Tree("ROOT"))
        else:
            parsedTree = parser.getBestConstrainedParse(st, pt, None)
            if opts.goldPOS and parsedTree.getChildren().isEmpty():
                # comment in Java: "parse error when using goldPOS, try without"
                # This will ignore any given tags and just use the default tagger
                parsedTree = parser.getBestConstrainedParse(st, None, None)
            parsedTrees = [parsedTree]

        # using the reimplemented function because the Java method is private
        __outputTrees(parsedTrees, outputData, parser, opts, line, sentenceID)
    # end for

    if opts.nThreads > 1:  # todo
        print("Multiple threads still definitely not implemented!")
        sys.exit(-1)

    if opts.dumpPosteriors:
        fileName = opts.grFileName + ".posteriors"
        parser.dumpPosteriors(fileName, -1)


def parseInput(parser, opts, inputFile=None, outputFile=None):
    '''
    Uses parser with opts to parse the input file to output file.
    Optional arguments inputFile and outputFile overwrite values in opts.

    inputFile and outputFile may each be a path or an already-open
    file-like object (for example a StringIO). When neither the argument
    nor the matching opts value is set, sys.stdin / sys.stdout are used.

    Only files opened by this function are closed. Streams supplied by the
    caller, and the standard streams, are flushed but left open.

    A sentence longer than opts.maxLength is written as "(())" and skipped.
    In ec_format an input line without ">" raises ValueError.
    '''
    inputData, ownsInput = _resolveStream(
        inputFile, opts.inputFile, 'r', sys.stdin, "readline")
    try:
        outputData, ownsOutput = _resolveStream(
            outputFile, opts.outputFile, 'w', sys.stdout, "write")
        try:
            _parseStream(parser, opts, inputData, outputData)
            outputData.flush()
        finally:
            if ownsOutput:
                outputData.close()
    finally:
        if ownsInput:
            inputData.close()
#end parseInput
38 | startup(cp) 39 | 40 | # Set input arguments 41 | # See the BerkeleyParser documentation for information on arguments 42 | gr = r'C:\berkeleyparser\eng_sm6.gr' 43 | args = {"gr":gr, "tokenize":True, "kbest":kbest} 44 | 45 | # Convert args from a dict to the appropriate Java class 46 | opts = getOpts(dictToArgs(args)) 47 | 48 | # Load the grammar file and initialize the parser with our options 49 | parser = loadGrammar(opts) 50 | 51 | # Now, run the parser 52 | print "Enter your input below:\n" 53 | while True: 54 | try: 55 | # User can type into the console and the parse will be written to stdout 56 | strIn = StringIO(raw_input(" > ")) # yes, this is still 2.7... 57 | strOut = StringIO() 58 | parseInput(parser, opts, inputFile=strIn, outputFile=strOut) 59 | print strOut.getvalue() 60 | except EOFError: 61 | print "\n\nGoodbye." 62 | break 63 | 64 | # That's all, folks! 65 | shutdown() 66 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Berkeley Interface 2 | A Python Wrapper for the Berkeley Parser 3 | Copyright (C) 2013 Elizabeth McNany 4 | 5 | GNU GENERAL PUBLIC LICENSE 6 | Version 2, June 1991 7 | 8 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 9 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 10 | Everyone is permitted to copy and distribute verbatim copies 11 | of this license document, but changing it is not allowed. 12 | 13 | Preamble 14 | 15 | The licenses for most software are designed to take away your 16 | freedom to share and change it. By contrast, the GNU General Public 17 | License is intended to guarantee your freedom to share and change free 18 | software--to make sure the software is free for all its users. This 19 | General Public License applies to most of the Free Software 20 | Foundation's software and to any other program whose authors commit to 21 | using it. 
(Some other Free Software Foundation software is covered by 22 | the GNU Lesser General Public License instead.) You can apply it to 23 | your programs, too. 24 | 25 | When we speak of free software, we are referring to freedom, not 26 | price. Our General Public Licenses are designed to make sure that you 27 | have the freedom to distribute copies of free software (and charge for 28 | this service if you wish), that you receive source code or can get it 29 | if you want it, that you can change the software or use pieces of it 30 | in new free programs; and that you know you can do these things. 31 | 32 | To protect your rights, we need to make restrictions that forbid 33 | anyone to deny you these rights or to ask you to surrender the rights. 34 | These restrictions translate to certain responsibilities for you if you 35 | distribute copies of the software, or if you modify it. 36 | 37 | For example, if you distribute copies of such a program, whether 38 | gratis or for a fee, you must give the recipients all the rights that 39 | you have. You must make sure that they, too, receive or can get the 40 | source code. And you must show them these terms so they know their 41 | rights. 42 | 43 | We protect your rights with two steps: (1) copyright the software, and 44 | (2) offer you this license which gives you legal permission to copy, 45 | distribute and/or modify the software. 46 | 47 | Also, for each author's protection and ours, we want to make certain 48 | that everyone understands that there is no warranty for this free 49 | software. If the software is modified by someone else and passed on, we 50 | want its recipients to know that what they have is not the original, so 51 | that any problems introduced by others will not reflect on the original 52 | authors' reputations. 53 | 54 | Finally, any free program is threatened constantly by software 55 | patents. 
We wish to avoid the danger that redistributors of a free 56 | program will individually obtain patent licenses, in effect making the 57 | program proprietary. To prevent this, we have made it clear that any 58 | patent must be licensed for everyone's free use or not licensed at all. 59 | 60 | The precise terms and conditions for copying, distribution and 61 | modification follow. 62 | 63 | GNU GENERAL PUBLIC LICENSE 64 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 65 | 66 | 0. This License applies to any program or other work which contains 67 | a notice placed by the copyright holder saying it may be distributed 68 | under the terms of this General Public License. The "Program", below, 69 | refers to any such program or work, and a "work based on the Program" 70 | means either the Program or any derivative work under copyright law: 71 | that is to say, a work containing the Program or a portion of it, 72 | either verbatim or with modifications and/or translated into another 73 | language. (Hereinafter, translation is included without limitation in 74 | the term "modification".) Each licensee is addressed as "you". 75 | 76 | Activities other than copying, distribution and modification are not 77 | covered by this License; they are outside its scope. The act of 78 | running the Program is not restricted, and the output from the Program 79 | is covered only if its contents constitute a work based on the 80 | Program (independent of having been made by running the Program). 81 | Whether that is true depends on what the Program does. 82 | 83 | 1. 
You may copy and distribute verbatim copies of the Program's 84 | source code as you receive it, in any medium, provided that you 85 | conspicuously and appropriately publish on each copy an appropriate 86 | copyright notice and disclaimer of warranty; keep intact all the 87 | notices that refer to this License and to the absence of any warranty; 88 | and give any other recipients of the Program a copy of this License 89 | along with the Program. 90 | 91 | You may charge a fee for the physical act of transferring a copy, and 92 | you may at your option offer warranty protection in exchange for a fee. 93 | 94 | 2. You may modify your copy or copies of the Program or any portion 95 | of it, thus forming a work based on the Program, and copy and 96 | distribute such modifications or work under the terms of Section 1 97 | above, provided that you also meet all of these conditions: 98 | 99 | a) You must cause the modified files to carry prominent notices 100 | stating that you changed the files and the date of any change. 101 | 102 | b) You must cause any work that you distribute or publish, that in 103 | whole or in part contains or is derived from the Program or any 104 | part thereof, to be licensed as a whole at no charge to all third 105 | parties under the terms of this License. 106 | 107 | c) If the modified program normally reads commands interactively 108 | when run, you must cause it, when started running for such 109 | interactive use in the most ordinary way, to print or display an 110 | announcement including an appropriate copyright notice and a 111 | notice that there is no warranty (or else, saying that you provide 112 | a warranty) and that users may redistribute the program under 113 | these conditions, and telling the user how to view a copy of this 114 | License. (Exception: if the Program itself is interactive but 115 | does not normally print such an announcement, your work based on 116 | the Program is not required to print an announcement.) 
117 | 118 | These requirements apply to the modified work as a whole. If 119 | identifiable sections of that work are not derived from the Program, 120 | and can be reasonably considered independent and separate works in 121 | themselves, then this License, and its terms, do not apply to those 122 | sections when you distribute them as separate works. But when you 123 | distribute the same sections as part of a whole which is a work based 124 | on the Program, the distribution of the whole must be on the terms of 125 | this License, whose permissions for other licensees extend to the 126 | entire whole, and thus to each and every part regardless of who wrote it. 127 | 128 | Thus, it is not the intent of this section to claim rights or contest 129 | your rights to work written entirely by you; rather, the intent is to 130 | exercise the right to control the distribution of derivative or 131 | collective works based on the Program. 132 | 133 | In addition, mere aggregation of another work not based on the Program 134 | with the Program (or with a work based on the Program) on a volume of 135 | a storage or distribution medium does not bring the other work under 136 | the scope of this License. 137 | 138 | 3. 
You may copy and distribute the Program (or a work based on it, 139 | under Section 2) in object code or executable form under the terms of 140 | Sections 1 and 2 above provided that you also do one of the following: 141 | 142 | a) Accompany it with the complete corresponding machine-readable 143 | source code, which must be distributed under the terms of Sections 144 | 1 and 2 above on a medium customarily used for software interchange; or, 145 | 146 | b) Accompany it with a written offer, valid for at least three 147 | years, to give any third party, for a charge no more than your 148 | cost of physically performing source distribution, a complete 149 | machine-readable copy of the corresponding source code, to be 150 | distributed under the terms of Sections 1 and 2 above on a medium 151 | customarily used for software interchange; or, 152 | 153 | c) Accompany it with the information you received as to the offer 154 | to distribute corresponding source code. (This alternative is 155 | allowed only for noncommercial distribution and only if you 156 | received the program in object code or executable form with such 157 | an offer, in accord with Subsection b above.) 158 | 159 | The source code for a work means the preferred form of the work for 160 | making modifications to it. For an executable work, complete source 161 | code means all the source code for all modules it contains, plus any 162 | associated interface definition files, plus the scripts used to 163 | control compilation and installation of the executable. However, as a 164 | special exception, the source code distributed need not include 165 | anything that is normally distributed (in either source or binary 166 | form) with the major components (compiler, kernel, and so on) of the 167 | operating system on which the executable runs, unless that component 168 | itself accompanies the executable. 
169 | 170 | If distribution of executable or object code is made by offering 171 | access to copy from a designated place, then offering equivalent 172 | access to copy the source code from the same place counts as 173 | distribution of the source code, even though third parties are not 174 | compelled to copy the source along with the object code. 175 | 176 | 4. You may not copy, modify, sublicense, or distribute the Program 177 | except as expressly provided under this License. Any attempt 178 | otherwise to copy, modify, sublicense or distribute the Program is 179 | void, and will automatically terminate your rights under this License. 180 | However, parties who have received copies, or rights, from you under 181 | this License will not have their licenses terminated so long as such 182 | parties remain in full compliance. 183 | 184 | 5. You are not required to accept this License, since you have not 185 | signed it. However, nothing else grants you permission to modify or 186 | distribute the Program or its derivative works. These actions are 187 | prohibited by law if you do not accept this License. Therefore, by 188 | modifying or distributing the Program (or any work based on the 189 | Program), you indicate your acceptance of this License to do so, and 190 | all its terms and conditions for copying, distributing or modifying 191 | the Program or works based on it. 192 | 193 | 6. Each time you redistribute the Program (or any work based on the 194 | Program), the recipient automatically receives a license from the 195 | original licensor to copy, distribute or modify the Program subject to 196 | these terms and conditions. You may not impose any further 197 | restrictions on the recipients' exercise of the rights granted herein. 198 | You are not responsible for enforcing compliance by third parties to 199 | this License. 200 | 201 | 7. 
If, as a consequence of a court judgment or allegation of patent 202 | infringement or for any other reason (not limited to patent issues), 203 | conditions are imposed on you (whether by court order, agreement or 204 | otherwise) that contradict the conditions of this License, they do not 205 | excuse you from the conditions of this License. If you cannot 206 | distribute so as to satisfy simultaneously your obligations under this 207 | License and any other pertinent obligations, then as a consequence you 208 | may not distribute the Program at all. For example, if a patent 209 | license would not permit royalty-free redistribution of the Program by 210 | all those who receive copies directly or indirectly through you, then 211 | the only way you could satisfy both it and this License would be to 212 | refrain entirely from distribution of the Program. 213 | 214 | If any portion of this section is held invalid or unenforceable under 215 | any particular circumstance, the balance of the section is intended to 216 | apply and the section as a whole is intended to apply in other 217 | circumstances. 218 | 219 | It is not the purpose of this section to induce you to infringe any 220 | patents or other property right claims or to contest validity of any 221 | such claims; this section has the sole purpose of protecting the 222 | integrity of the free software distribution system, which is 223 | implemented by public license practices. Many people have made 224 | generous contributions to the wide range of software distributed 225 | through that system in reliance on consistent application of that 226 | system; it is up to the author/donor to decide if he or she is willing 227 | to distribute software through any other system and a licensee cannot 228 | impose that choice. 229 | 230 | This section is intended to make thoroughly clear what is believed to 231 | be a consequence of the rest of this License. 232 | 233 | 8. 
If the distribution and/or use of the Program is restricted in 234 | certain countries either by patents or by copyrighted interfaces, the 235 | original copyright holder who places the Program under this License 236 | may add an explicit geographical distribution limitation excluding 237 | those countries, so that distribution is permitted only in or among 238 | countries not thus excluded. In such case, this License incorporates 239 | the limitation as if written in the body of this License. 240 | 241 | 9. The Free Software Foundation may publish revised and/or new versions 242 | of the General Public License from time to time. Such new versions will 243 | be similar in spirit to the present version, but may differ in detail to 244 | address new problems or concerns. 245 | 246 | Each version is given a distinguishing version number. If the Program 247 | specifies a version number of this License which applies to it and "any 248 | later version", you have the option of following the terms and conditions 249 | either of that version or of any later version published by the Free 250 | Software Foundation. If the Program does not specify a version number of 251 | this License, you may choose any version ever published by the Free Software 252 | Foundation. 253 | 254 | 10. If you wish to incorporate parts of the Program into other free 255 | programs whose distribution conditions are different, write to the author 256 | to ask for permission. For software which is copyrighted by the Free 257 | Software Foundation, write to the Free Software Foundation; we sometimes 258 | make exceptions for this. Our decision will be guided by the two goals 259 | of preserving the free status of all derivatives of our free software and 260 | of promoting the sharing and reuse of software generally. 261 | 262 | NO WARRANTY 263 | 264 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 265 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN 266 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 267 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 268 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 269 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 270 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 271 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 272 | REPAIR OR CORRECTION. 273 | 274 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 275 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 276 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 277 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 278 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 279 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 280 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 281 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 282 | POSSIBILITY OF SUCH DAMAGES. 
283 | 284 | END OF TERMS AND CONDITIONS 285 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL := /bin/sh 2 | 3 | LOCALPATH := $(CURDIR) 4 | TESTPATH := $(LOCALPATH)/tests 5 | 6 | .PHONY: test 7 | 8 | test: 9 | nosetests -v --with-coverage --cover-package=BerkleyInterface --cover-inclusive --cover-erase tests 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python-Enabled Berkeley Parser [![Build Status][status_image]][travis_link] # 2 | 3 | [status_image]: https://travis-ci.org/mclumd/berkeleyinterface.png?branch=master 4 | [travis_link]: https://travis-ci.org/mclumd/berkeleyinterface 5 | 6 | This has the advantage over other implementations which essentially automate a call to the jar file: this actually duplicates the `main` method, allowing multiple parse calls and ability to modify options without the overhead of loading the grammar file each time (and without having to use Java!) 7 | 8 | **N.B.** several features completely ignored (notably, `-nThreads`, `-nGrammars`, `-kbest` options) and some theoretically implemented but untested (`-chinese`) 9 | 10 | See example.py for a short running demo of the interface. 11 | 12 | ## Environment ## 13 | 14 | Note that this package requires the Berkeley Parser 1.6 or Berkeley Parser 1.7 JAR file (depending on your version of Java) as well as a grammar file. You can tell the Python module the location of these files via an ENVVAR: 15 | 16 | export BERKELEY_PARSER_JAR=/path/to/berkeley/parser.jar 17 | export BERKELEY_PARSER_GRM=/path/to/berkeley/english.gr 18 | 19 | Otherwise the default is currently set to C:\berkeleyparser\ for Windows systems. In the future we'll add a search path to perform lookups for sane places on OS X and Linux as well.
20 | 21 | ## Installation and Dependencies ## 22 | 23 | Although a package script has been set up to install the Python module, one dependency cannot currently be fulfilled by PyPI: the core bridge between Python and Java, JPype. The authors have submitted a support request to have JPype included in PyPI, but until that time, you'll have to download and install this dependency yourself. 24 | 25 | Download the JPype package from [JPype 0.5.4.2](http://jpype.sourceforge.net/) and unpack it in your current working directory. Run the command: 26 | 27 | pip install JPype-0.5.4.2 28 | 29 | Which should begin the installation process (we highly recommend that you use virtualenv and virtualenvwrapper to do this). Then simply run: 30 | 31 | python setup.py install 32 | 33 | And the BerkeleyInterface package should be installed into your Python path. 34 | 35 | *Note*: For Mac users, you may have to modify JPype's settings a bit, according to this [Stack Overflow question](http://stackoverflow.com/questions/18524501/installing-jpype-in-mountain-lion). Modify the `JPype-0.5.4.2/setup.py` file to include the following line: 36 | 37 | def setupInclusion(self): 38 | self.includeDirs = [ 39 | self.javaHome+"/include", 40 | self.javaHome+"/include/"+self.jdkInclude, 41 | "src/native/common/include", 42 | "src/native/python/include", 43 | 44 | #I added this line below. The folder contains a jni.h 45 | "/System/Library/Frameworks/JavaVM.framework/Versions/A/Headers/" 46 | ] 47 | 48 | Then run the `pip install JPype-0.5.4.2` command and it should work.
49 | -------------------------------------------------------------------------------- /examples/example.py: -------------------------------------------------------------------------------- 1 | # example 2 | # Basic example demonstrating usage of the interface 3 | # 4 | # Author: Elizabeth McNany 5 | # Created: Tue Jul 09 14:20:34 2013 -0400 6 | # 7 | # Copyright (C) 2013 UMD Metacognitive Lab 8 | # For license information, see LICENSE.txt 9 | # 10 | # ID: example.py [] beth@cs.umd.edu $ 11 | 12 | """ 13 | Basic example demonstrating usage of the interface 14 | """ 15 | 16 | ########################################################################## 17 | ## Imports 18 | ########################################################################## 19 | 20 | import os 21 | from BerkeleyInterface import * 22 | 23 | JAR_PATH = r'C:\berkeleyparser\BerkeleyParser-1.7.jar' 24 | GRM_PATH = r'C:\berkeleyparser\eng_sm6.gr' 25 | 26 | # This should be the path to the Berkeley Parser jar file 27 | 28 | cp = os.environ.get("BERKELEY_PARSER_JAR", JAR_PATH) 29 | 30 | # Always start the JVM first! 
31 | startup(cp) 32 | 33 | # Set input arguments 34 | # See the BerkeleyParser documentation for information on arguments 35 | # Notably: a grammar file ("gr") is required, and if inputFile / outputFile 36 | # are not given, it will default to stdin/stdout 37 | gr = os.environ.get("BERKELEY_PARSER_GRM", GRM_PATH) 38 | args = {"gr":gr, "tokenize":True, "inputFile":"testinput.txt"} 39 | 40 | # Convert args from a dict to the appropriate Java class 41 | opts = getOpts(dictToArgs(args)) 42 | 43 | # Load the grammar file and initialize the parser with our options 44 | parser = loadGrammar(opts) 45 | 46 | # Now, actually parse the input file 47 | # (Since we didn't specify an output file, it will go to stdout) 48 | parseInput(parser, opts) 49 | 50 | # At this point, we have done the equivalent of running from the command line: 51 | # java -client -jar C:\berkeleyparser\BerkeleyParser-1.7.jar\BerkeleyParser-1.7.jar -gr eng_sm6.gr -inputFile testinput.txt -tokenize 52 | 53 | # We can change opts between parses, in this case to change input/outputs 54 | # See documentation for getOpts for a list of options which are safe to modify 55 | # and will not require reinitializing the parser 56 | opts.inputFile = "testinput2.txt" 57 | opts.outputFile = opts.inputFile + ".parsed" 58 | 59 | # Or, we could modify the original args dictionary: 60 | args["inputFile"] = "testinput2.txt" 61 | args["outputFile"] = args["inputFile"] + ".parsed" 62 | opts = getOpts(dictToArgs(args)) 63 | 64 | # Parse again, with our modified options 65 | parseInput(parser, opts) 66 | 67 | # We can also take advantage of Python's built-in StringIO class, 68 | # which allows us to use strings like files 69 | from StringIO import StringIO 70 | strIn = StringIO("Hello, world!\nThe quick brown fox jumped over the lazy dogs.") 71 | strOut = StringIO() 72 | parseInput(parser, opts, outputFile=strOut) 73 | 74 | # Now we can retrieve the output as a string: 75 | result = strOut.getvalue() 76 | print "\nStringIO 
Result:\n",result 77 | 78 | # That's all, folks! 79 | shutdown() 80 | -------------------------------------------------------------------------------- /examples/testinput.txt: -------------------------------------------------------------------------------- 1 | The man hit the building with the bat. 2 | -------------------------------------------------------------------------------- /examples/testinput2.txt: -------------------------------------------------------------------------------- 1 | Mary had a little lamb whose fleece was white as snow. 2 | -------------------------------------------------------------------------------- /examples/testinput2.txt.parsed: -------------------------------------------------------------------------------- 1 | ( (S (NP (NNP Mary)) (VP (VBD had) (NP (NP (DT a) (JJ little) (NN lamb)) (SBAR (WHNP (WP$ whose) (NN fleece)) (S (VP (VBD was) (ADJP (JJ white)) (PP (IN as) (NP (NN snow)))))))) (. .)) ) 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | #JPype==0.5.4.2 2 | coverage==3.7.1 3 | nose==1.3.0 4 | python-dateutil==2.2 5 | six==1.5.2 6 | wsgiref==0.1.2 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # setup 3 | # Setup script for BerkeleyInterface 4 | # 5 | # Author: Benjamin Bengfort 6 | # Created: Wed Feb 05 09:07:42 2014 -0500 7 | # 8 | # Copyright (C) 2013 UMD Metacognitive Lab 9 | # For license information, see LICENSE.txt 10 | # 11 | # ID: setup.py [] bengfort@cs.umd.edu $ 12 | 13 | """ 14 | Setup script for BerkeleyInterface 15 | """ 16 | 17 | ########################################################################## 18 | ## Imports 19 | ########################################################################## 20 | 21 | try: 22 | 
from setuptools import setup 23 | from setuptools import find_packages 24 | except ImportError: 25 | raise ImportError("Could not import \"setuptools\"." 26 | "Please install the setuptools package.") 27 | 28 | ########################################################################## 29 | ## Package Information 30 | ########################################################################## 31 | 32 | packages = find_packages(where=".", exclude=("tests", "bin", "docs", "fixtures",)) 33 | requires = [] 34 | 35 | with open('requirements.txt', 'r') as reqfile: 36 | for line in reqfile: 37 | if line.startswith('#'): continue 38 | requires.append(line.strip()) 39 | 40 | classifiers = ( 41 | 'Development Status :: 3 - Alpha', 42 | 'Environment :: MacOS X', 43 | 'Environment :: Console', 44 | 'Environment :: Other Environment', 45 | 'Intended Audience :: Science/Research', 46 | 'License :: OSI Approved :: GNU General Public License v2 (GPLv2)', 47 | 'Natural Language :: English', 48 | 'Operating System :: MacOS :: MacOS X', 49 | 'Operating System :: POSIX :: Linux', 50 | 'Programming Language :: Python :: 2.7', 51 | 'Programming Language :: Java', 52 | 'Topic :: Scientific/Engineering', 53 | 'Topic :: Scientific/Engineering :: Information Analysis', 54 | 'Topic :: Software Development :: Libraries :: Java Libraries', 55 | 'Topic :: Software Development :: Libraries :: Python Modules', 56 | 'Topic :: Text Processing', 57 | 'Topic :: Text Processing :: Linguistic', 58 | ) 59 | 60 | config = { 61 | "name": "BerkeleyInterface", 62 | "version": "0.2", 63 | "description": "A Python wrapper for the Berkeley Parser", 64 | "author": "Elizabeth McNany", 65 | "author_email": "beth@cs.umd.edu", 66 | "url": "https://github.com/mclumd/berkeleyinterface", 67 | "packages": packages, 68 | "install_requires": requires, 69 | "classifiers": classifiers, 70 | "zip_safe": True, 71 | "scripts": [], 72 | } 73 | 74 | ########################################################################## 75 | 
## Run setup script 76 | ########################################################################## 77 | 78 | if __name__ == '__main__': 79 | setup(**config) 80 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # tests 2 | # Testing for the Berkeley Interface Package 3 | # 4 | # Author: Benjamin Bengfort 5 | # Created: Wed Feb 05 09:05:45 2014 -0500 6 | # 7 | # Copyright (C) 2013 UMD Metacognitive Lab 8 | # For license information, see LICENSE.txt 9 | # 10 | # ID: __init__.py [] bengfort@cs.umd.edu $ 11 | 12 | """ 13 | Testing for the Berkeley Interface Package 14 | """ 15 | 16 | ########################################################################## 17 | ## Imports 18 | ########################################################################## 19 | 20 | import os 21 | import unittest 22 | 23 | ########################################################################## 24 | ## TestCases 25 | ########################################################################## 26 | 27 | class InitializationTest(unittest.TestCase): 28 | 29 | def test_initialization(self): 30 | """ 31 | Test a simple world fact to kick off testing 32 | """ 33 | self.assertEqual(2**3, 8) 34 | 35 | @unittest.skip("Need to find way to install JPype on Travis") 36 | def test_import(self): 37 | """ 38 | We are able to import our packages 39 | """ 40 | try: 41 | import BerkeleyInterface as berkeley 42 | except ImportError: 43 | self.fail("Unable to import the BerkeleyInterface module!") 44 | --------------------------------------------------------------------------------