├── .document ├── .gitignore ├── Gemfile ├── Gemfile.lock ├── LICENSE ├── README.orig.rdoc ├── README.rdoc ├── Rakefile ├── TESTS_STATUS.rdoc ├── VERSION.yml ├── examples └── stanford-sentence-parser.rb ├── lib ├── stanfordparser.rb └── stanfordparser │ └── java_object.rb ├── stanfordparser.gemspec └── test └── test_stanfordparser.rb /.document: -------------------------------------------------------------------------------- 1 | README.rdoc 2 | lib/**/*.rb 3 | bin/* 4 | features/**/*.feature 5 | LICENSE 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## MAC OS 2 | .DS_Store 3 | 4 | ## TEXTMATE 5 | *.tmproj 6 | tmtags 7 | 8 | ## EMACS 9 | *~ 10 | \#* 11 | .\#* 12 | 13 | ## VIM 14 | *.swp 15 | 16 | ## RubyMine 17 | /.idea 18 | 19 | ## PROJECT::GENERAL 20 | coverage 21 | rdoc 22 | pkg 23 | 24 | ## PROJECT::SPECIFIC 25 | .bundle 26 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source :gemcutter 2 | gem "rjb" 3 | gem "rake" 4 | gem "jeweler" 5 | gem "treebank", ">= 3.0.0" 6 | gem "rspec", ">= 1.2.9" 7 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | --- 2 | dependencies: 3 | rake: 4 | group: 5 | - :default 6 | version: ">= 0" 7 | rspec: 8 | group: 9 | - :default 10 | version: ">= 1.2.9" 11 | jeweler: 12 | group: 13 | - :default 14 | version: ">= 0" 15 | treebank: 16 | group: 17 | - :default 18 | version: ">= 3.0.0" 19 | rjb: 20 | group: 21 | - :default 22 | version: ">= 0" 23 | specs: 24 | - rake: 25 | version: 0.8.7 26 | - json_pure: 27 | version: 1.4.3 28 | - gemcutter: 29 | version: 0.5.0 30 | - git: 31 | version: 1.2.5 32 | - rubyforge: 33 | version: 2.0.4 34 | - jeweler: 
35 | version: 1.4.0 36 | - rjb: 37 | version: 1.2.5 38 | - rspec: 39 | version: 1.3.0 40 | - treebank: 41 | version: 3.0.0 42 | hash: 264a823adfd7bb2231dd1037e95b74038b67283d 43 | sources: 44 | - Rubygems: 45 | uri: http://gemcutter.org 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2007-2008 William Patrick McNeill 2 | Copyright (c) 2010 John Wilkinson 3 | 4 | This file is part of the Stanford Parser Ruby Wrapper. 5 | 6 | The Stanford Parser Ruby Wrapper is free software; you can redistribute it 7 | and/or modify it under the terms of the GNU General Public License as 8 | published by the Free Software Foundation; either version 2 of the License, 9 | or (at your option) any later version. 10 | 11 | The Stanford Parser Ruby Wrapper is distributed in the hope that it will be 12 | useful, but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General 14 | Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License along with 17 | editalign; if not, write to the Free Software Foundation, Inc., 51 Franklin 18 | St, Fifth Floor, Boston, MA 02110-1301 USA -------------------------------------------------------------------------------- /README.orig.rdoc: -------------------------------------------------------------------------------- 1 | = Stanford Natural Language Parser Wrapper 2 | 3 | This module is a wrapper for the {Stanford Natural Language Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml]. 4 | 5 | The Stanford Natural Language Parser is a Java implementation of a probabilistic PCFG and dependency parser for English, German, Chinese, and Arabic. This module provides a thin wrapper around the Java code to make it accessible from Ruby along with pure Ruby objects that enable standoff parsing. 
6 | 7 | 8 | = Installation and Configuration 9 | 10 | In addition to the Ruby gems it requires, to run this module you must manually install the {Stanford Natural Language Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml]. 11 | 12 | This module expects the parser to be installed in the /usr/local/stanford-parser/current directory on UNIX platforms and in the C:\stanford-parser\current directory on Windows platforms. This is the directory that contains the stanford-parser.jar file. When the module is loaded, it adds this directory to the Java classpath and launches the Java VM with the arguments -server -Xmx150m. 13 | 14 | These defaults can be overridden by creating the configuration file /etc/ruby-stanford-parser.yaml on UNIX platforms and C:\stanford-parser\ruby-stanford-parser.yaml on Windows platforms. This file is in the Ruby YAML[http://ruby-doc.org/stdlib/libdoc/yaml/rdoc/index.html] format, and may contain two values: root and jvmargs. For example, the file might look like the following: 15 | 16 | root: /usr/local/stanford-parser/other/location 17 | jvmargs: -Xmx100m -verbose 18 | 19 | 20 | =Tokenization and Parsing 21 | 22 | Use the StanfordParser::DocumentPreprocessor class to tokenize text and files into sentences and words. 23 | 24 | >> require "stanfordparser" 25 | => true 26 | >> preproc = StanfordParser::DocumentPreprocessor.new 27 | => 28 | >> puts preproc.getSentencesFromString("This is a sentence. So is this.") 29 | This is a sentence . 30 | So is this . 31 | 32 | Use the StanfordParser::LexicalizedParser class to parse sentences. 33 | 34 | >> parser = StanfordParser::LexicalizedParser.new 35 | Loading parser from serialized file /usr/local/stanford-parser/current/englishPCFG.ser.gz ... done [5.5 sec]. 
36 | => edu.stanford.nlp.parser.lexparser.LexicalizedParser 37 | >> puts parser.apply("This is a sentence.") 38 | (ROOT 39 | (S [24.917] 40 | (NP [6.139] (DT [2.300] This)) 41 | (VP [17.636] (VBZ [0.144] is) 42 | (NP [12.299] (DT [1.419] a) (NN [8.897] sentence))) 43 | (. [0.002] .))) 44 | 45 | For complete details about the use of these classes, see the documentation on the Stanford Natural Language Parser website. 46 | 47 | 48 | =Standoff Tokenization and Parsing 49 | 50 | This module also contains support for standoff tokenization and parsing, in which the terminal nodes of parse trees contain information about the text that was used to generate them. 51 | 52 | Use StanfordParser::StandoffDocumentPreprocessor class to tokenize text and files into sentences and words. 53 | 54 | >> preproc = StanfordParser::StandoffDocumentPreprocessor.new 55 | => 56 | >> s = preproc.getSentencesFromString("This is a sentence. So is this.") 57 | => [This is a sentence., So is this.] 58 | 59 | The standoff preprocessor returns StanfordParser::StandoffToken objects, which contain character offsets into the original text along with information about spacing characters that came before and after the token. 60 | 61 | >> puts s 62 | This [0,4] 63 | is [5,7] 64 | a [8,9] 65 | sentence [10,18] 66 | . [18,19] 67 | So [21,23] 68 | is [24,26] 69 | this [27,31] 70 | . [31,32] 71 | >> "This is a sentence. So is this."[27..31] 72 | => "this." 73 | 74 | This is the same information contained in the edu.stanford.nlp.ling.FeatureLabel class in the Stanford Parser Java implementation. 75 | 76 | Similarly, use the StanfordParser::StandoffParsedText object to parse a block of text into StanfordParser::StandoffNode parse trees whose terminal nodes are StanfordParser::StandoffToken objects. 77 | 78 | >> t = StanfordParser::StandoffParsedText.new("This is a sentence. So is this.") 79 | Loading parser from serialized file /usr/local/stanford-parser/current/englishPCFG.ser.gz ... done [4.9 sec]. 
80 | => 81 | >> puts t.first 82 | (ROOT 83 | (S 84 | (NP (DT This [0,4])) 85 | (VP (VBZ is [5,7]) 86 | (NP (DT a [8,9]) (NN sentence [10,18]))) 87 | (. . [18,19]))) 88 | 89 | Standoff parse trees can reproduce the text from which they were generated verbatim. 90 | 91 | >> t.first.to_original_string 92 | => "This is a sentence. " 93 | 94 | They can also reproduce the original text with brackets inserted around the yields of specified parse nodes. 95 | 96 | >> t.first.to_bracketed_string([[0,0,0], [0,1,1]]) 97 | => "[This] is [a sentence]. " 98 | 99 | The format of the coordinates used to specify individual nodes is described in the documentation for the Ruby Treebank[http://rubyforge.org/projects/treebank/] gem. 100 | 101 | See the documentation of the individual classes in this module for more details. 102 | 103 | Unlike their parents StanfordParser::DocumentPreprocessor and StanfordParser::LexicalizedParser, which produce Ruby wrappers around Java objects, StanfordParser::StandoffDocumentPreprocessor and StanfordParser::StandoffParsedText produce pure Ruby objects. This is to facilitate serialization of these objects using tools like the Marshal module, which cannot serialize Java objects. 104 | 105 | = History 106 | 107 | 1.0.0:: Initial release 108 | 1.1.0:: Make module initialization function private. Add example code. 109 | 1.2.0:: Read Java VM arguments from the configuration file. Add Word class. 110 | 2.0.0:: Add support for standoff parsing. Change the way Rjb::JavaObjectWrapper wraps returned values: see wrap_java_object for details. Rjb::JavaObjectWrapper supports static members. Minor changes to stanford-sentence-parser script. 111 | 2.1.0:: Different default paths for Windows machines; Minor changes to StandoffToken definition 112 | 2.2.0:: Add parent information to StandoffNode 113 | 114 | = Copyright 115 | 116 | Copyright 2007-2008, William Patrick McNeill 117 | 118 | This program is distributed under the GNU General Public License. 
119 | 120 | 121 | = Author 122 | 123 | W.P. McNeill mailto:billmcn@gmail.com -------------------------------------------------------------------------------- /README.rdoc: -------------------------------------------------------------------------------- 1 | = stanfordparser 2 | 3 | This is an upload/extension of Bill McNeill's stanfordparser rubyforge gem, check it out at its homepage (seems to be partially in French) 4 | 5 | http://rubyforge.org/projects/stanfordparser/ 6 | 7 | or its rdocs 8 | 9 | http://stanfordparser.rubyforge.org/ 10 | 11 | I've been having issues trying to use this gem so I decided to upload it to github and try to organize it to be a little more gem-friendly, especially using jeweler. 12 | 13 | AFAIK there aren't other copies of this on github, please correct me if I'm mistaken. The only similar one I can see is http://github.com/tiendung/ruby-nlp which has much less code and I can only assume to be something else. 14 | 15 | It seems like using version 1.6.1 of the Java StanfordParser package is your best bet for compatibility. 16 | 17 | See README.orig.rdoc for Bill's readme, which includes dependencies, installation, and usage. 18 | 19 | == Branches 20 | 21 | * master - Jeweler and Bundler integrated along with slight reorganization of files to be more gem-standard. This is the branch you should use if you want to source the gem straight from github. I will leave this branch alone for the most part unless I find/come up with stable and useful additions. All changes will be backwards compatible. 22 | * stock - Almost untouched from Bill's version, except for the README. Use this branch if that's what you're looking for. 23 | * fixing_tests - The tests are currently broken, this branch is trying to address that. Once the tests are fixed it will be merged back into master. Help appreciated! I'll keep a TESTS_STATUS.rdoc keeping track of progress. 24 | * experimental - I'll be putting in some code as examples and testing out some ideas. 
Do not use this branch as a gem. You are very encouraged, however, to fork it and add some code/make my code better. I'll try to integrate all the pull requests I get, if not in that branch into another. 25 | 26 | == Note on Patches/Pull Requests 27 | 28 | * Fork the project. 29 | * Make your feature addition or bug fix. 30 | * Add tests for it. I would prefer rSpec, but TestUnit is acceptable as well since there are some of those from the original author. 31 | * Commit. 32 | * Send me a pull request. Bonus points for topic branches. 33 | 34 | == Copyright 35 | 36 | Copyright (c) 2010 John Wilkinson. See LICENSE for details. 37 | Copyright 2007-2008, William Patrick McNeill. See README.orig for details. 38 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'rubygems' 2 | require 'rake' 3 | 4 | $LOAD_PATH.unshift('lib') 5 | 6 | begin 7 | require 'jeweler' 8 | Jeweler::Tasks.new do |gem| 9 | gem.name = "stanfordparser" 10 | gem.summary = "GitHub upload/extension of Bill McNeal's stanfordparser rubygem" 11 | gem.description = "Ruby wrapper of the Stanford Parser, a NLP parser built in Java." 12 | gem.email = "jcwilk@gmail.com" 13 | gem.homepage = "http://github.com/jcwilk/stanfordparser" 14 | gem.authors = ["John Wilkinson","Bill McNeal"] 15 | 16 | gem.add_dependency "rjb", ">= 1.2.5" 17 | gem.add_dependency "treebank", ">= 3.0.0" 18 | gem.add_development_dependency "rspec", ">= 1.2.9" 19 | end 20 | Jeweler::GemcutterTasks.new 21 | rescue LoadError 22 | puts "Jeweler (or a dependency) not available. 
Install it with: gem install jeweler" 23 | end 24 | 25 | require 'rake/testtask' 26 | Rake::TestTask.new(:test) do |test| 27 | test.test_files = FileList.new('test/**/test_*.rb') do |list| 28 | list.exclude 'test/test_helper.rb' 29 | end 30 | test.libs << 'test' 31 | test.verbose = true 32 | end 33 | 34 | require 'spec/rake/spectask' 35 | Spec::Rake::SpecTask.new(:spec) do |spec| 36 | spec.libs << 'lib' << 'spec' 37 | spec.spec_files = FileList['spec/**/*_spec.rb'] 38 | end 39 | 40 | Spec::Rake::SpecTask.new(:rcov) do |spec| 41 | spec.libs << 'lib' << 'spec' 42 | spec.pattern = 'spec/**/*_spec.rb' 43 | spec.rcov = true 44 | end 45 | 46 | task :test => :check_dependencies 47 | 48 | task :spec => :check_dependencies 49 | 50 | task :default => :test 51 | 52 | require 'rake/rdoctask' 53 | Rake::RDocTask.new do |rdoc| 54 | version = File.exist?('VERSION') ? File.read('VERSION') : "" 55 | 56 | rdoc.rdoc_dir = 'rdoc' 57 | rdoc.title = "stanfordparser #{version}" 58 | rdoc.rdoc_files.include('README*') 59 | rdoc.rdoc_files.include('lib/**/*.rb') 60 | end 61 | -------------------------------------------------------------------------------- /TESTS_STATUS.rdoc: -------------------------------------------------------------------------------- 1 | = Status of Tests 2 | * Please see fixing_tests branches for efforts towards this goal. 3 | -------------------------------------------------------------------------------- /VERSION.yml: -------------------------------------------------------------------------------- 1 | --- 2 | :major: 2 3 | :minor: 2 4 | :patch: 1 5 | :build: s -------------------------------------------------------------------------------- /examples/stanford-sentence-parser.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | #-- 4 | 5 | # Copyright 2007-2008 William Patrick McNeill 6 | # 7 | # This file is part of the Stanford Parser Ruby Wrapper. 
8 | # 9 | # The Stanford Parser Ruby Wrapper is free software; you can redistribute it 10 | # and/or modify it under the terms of the GNU General Public License as 11 | # published by the Free Software Foundation; either version 2 of the License, 12 | # or (at your option) any later version. 13 | # 14 | # The Stanford Parser Ruby Wrapper is distributed in the hope that it will be 15 | # useful, but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General 17 | # Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License along with 20 | # editalign; if not, write to the Free Software Foundation, Inc., 51 Franklin 21 | # St, Fifth Floor, Boston, MA 02110-1301 USA 22 | # 23 | #++ 24 | 25 | # == Synopsis 26 | # 27 | # Parse a sentence passed in on the command line. 28 | # 29 | # == Usage 30 | # 31 | # stanford-sentence-parser.rb [options] sentence 32 | # 33 | # options:: 34 | # See the Java Stanford Parser documentation for details 35 | # 36 | # sentence:: 37 | # A sentence to parse. This must appear after all the options and be quoted. 38 | 39 | require 'rubygems' 40 | require "stanfordparser" 41 | 42 | # The last argument is the sentence. The rest of the command line is passed 43 | # along to the parser object. 
44 | sentence = ARGV.pop 45 | parser = StanfordParser::LexicalizedParser.new(StanfordParser::ENGLISH_PCFG_MODEL, ARGV) 46 | puts parser.apply(sentence) 47 | -------------------------------------------------------------------------------- /lib/stanfordparser.rb: -------------------------------------------------------------------------------- 1 | require 'rubygems' 2 | 3 | require "pathname" 4 | require "rjb" 5 | require "singleton" 6 | begin 7 | require "treebank" 8 | gem "treebank", ">= 3.0.0" 9 | rescue LoadError 10 | require "treebank" 11 | end 12 | require "yaml" 13 | 14 | # Wrapper for the {Stanford Natural Language 15 | # Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml]. 16 | module StanfordParser 17 | 18 | require "stanfordparser/java_object" 19 | 20 | VERSION = "2.2.1" 21 | 22 | # The default sentence segmenter and tokenizer. This is an English-language 23 | # tokenizer with support for Penn Treebank markup. 24 | EN_PENN_TREEBANK_TOKENIZER = "edu.stanford.nlp.process.PTBTokenizer" 25 | 26 | # Path to an English PCFG model that comes with the Stanford Parser. The 27 | # location is relative to the parser root directory. This is a valid value 28 | # for the grammar parameter of the LexicalizedParser constructor. 29 | ENGLISH_PCFG_MODEL = "$(ROOT)/englishPCFG.ser.gz" 30 | 31 | # This function is executed once when the module is loaded. It initializes 32 | # the Java virtual machine in which the Stanford parser will run. By 33 | # default, it adds the parser installation root to the Java classpath and 34 | # launches the VM with the arguments -server -Xmx150m. Different 35 | # values may be specified with the ruby-stanford-parser.yaml 36 | # configuration file. 
#
# This function determines which operating system we are running on and sets
# default pathnames accordingly:
#
# UNIX::    /usr/local/stanford-parser/current, /etc/ruby-stanford-parser.yaml
# Windows:: C:\stanford-parser\current,
#           C:\stanford-parser\ruby-stanford-parser.yaml
#
# If the configuration file exists, a non-nil +root+ entry replaces the
# default installation root and a non-nil +jvmargs+ entry replaces the default
# JVM arguments.  +jvmargs+ may be a whitespace-separated string or a list of
# strings.  A configuration file that does not contain a YAML mapping (for
# example an empty file) is ignored.
#
# This function returns the path of the parser installation root.
def StanfordParser.initialize_on_load
  # The original /(win|w)32$/ test is kept and widened so that platform
  # strings that merely contain "mswin" or "mingw" are recognized as well.
  if RUBY_PLATFORM =~ /(win|w)32$|mswin|mingw/
    # No trailing whitespace here: the jar file name is appended to this path.
    root = Pathname.new("C:\\stanford-parser\\current")
    config = Pathname.new("C:\\stanford-parser\\ruby-stanford-parser.yaml")
  else
    root = Pathname.new("/usr/local/stanford-parser/current")
    config = Pathname.new("/etc/ruby-stanford-parser.yaml")
  end
  jvmargs = ["-server", "-Xmx150m"]
  if config.file?
    configuration = config.open { |f| YAML.load(f) }
    # YAML.load does not return a Hash for an empty or scalar-only file.
    configuration = {} unless configuration.is_a?(Hash)
    unless configuration["root"].nil?
      root = Pathname.new(configuration["root"].to_s)
    end
    unless configuration["jvmargs"].nil?
      jvmargs = Array(configuration["jvmargs"]).map { |arg| arg.to_s.split }.flatten
    end
  end
  Rjb::load((root + "stanford-parser.jar").to_s, jvmargs)
  root
end

private_class_method :initialize_on_load

# The root directory of the Stanford parser installation.
ROOT = initialize_on_load

#--
# The documentation below is for the original Rjb::JavaObjectWrapper object.
# It is reproduced here because rdoc only takes the last document block
# defined. If Rjb is moved into its own gem, this documentation should go
# with it, and the following should be written as documentation for this
# class:
#
# Extension of the generic Ruby-Java Bridge wrapper object for the
# StanfordParser module.
#++
# A generic wrapper for a Java object loaded via the {Ruby-Java
# Bridge}[http://rjb.rubyforge.org/].
# The wrapper class handles initialization and stringification, and passes
# other method calls down to the underlying Java object.  Objects returned by
# the underlying Java object are converted to the appropriate Ruby object.
#
# Other modules may extend the list of Java objects that are converted by
# adding their own converter functions.  See wrap_java_object for details.
#
# This object is enumerable, yielding items in the order defined by the
# underlying Java object's iterator.
class Rjb::JavaObjectWrapper
  # FeatureLabel objects go inside a FeatureLabel wrapper.
  def wrap_edu_stanford_nlp_ling_FeatureLabel(object)
    StanfordParser::FeatureLabel.new(object)
  end

  # Tree objects go inside a Tree wrapper.  Various tree types are aliased
  # to this function.
  def wrap_edu_stanford_nlp_trees_Tree(object)
    StanfordParser::Tree.new(object)
  end

  alias :wrap_edu_stanford_nlp_trees_LabeledScoredTreeLeaf :wrap_edu_stanford_nlp_trees_Tree
  alias :wrap_edu_stanford_nlp_trees_LabeledScoredTreeNode :wrap_edu_stanford_nlp_trees_Tree
  alias :wrap_edu_stanford_nlp_trees_SimpleTree :wrap_edu_stanford_nlp_trees_Tree
  alias :wrap_edu_stanford_nlp_trees_TreeGraphNode :wrap_edu_stanford_nlp_trees_Tree

  protected :wrap_edu_stanford_nlp_trees_Tree, :wrap_edu_stanford_nlp_ling_FeatureLabel
end # Rjb::JavaObjectWrapper


# Lexicalized probabilistic parser.
#
# This is a wrapper for the
# edu.stanford.nlp.parser.lexparser.LexicalizedParser object.
class LexicalizedParser < Rjb::JavaObjectWrapper
  # The grammar used by the parser (a Pathname).
  attr_reader :grammar

  # Create the parser given a grammar and options.  The grammar
  # argument is a path to a grammar file.  This path may contain the string
  # $(ROOT), which will be replaced with the root directory of the
  # Stanford Parser.  By default, an English PCFG grammar is loaded.
  #
  # The options argument is a list of string arguments as they
  # would appear on a command line.  See the documentation of
  # edu.stanford.nlp.parser.lexparser.Options.setOptions for more
  # details.
  def initialize(grammar = ENGLISH_PCFG_MODEL, options = [])
    # ROOT is a Pathname, which String#gsub does not accept as a replacement,
    # so it is converted explicitly.  The block form is used because a
    # replacement *string* would have its backslashes interpreted, which is
    # unsafe for Windows paths.
    expanded = grammar.to_s.gsub("$(ROOT)") { ROOT.to_s }
    @grammar = Pathname.new(expanded)
    super("edu.stanford.nlp.parser.lexparser.LexicalizedParser", @grammar.to_s)
    @java_object.setOptionFlags(options)
  end

  # Stringify with the file name of the grammar.
  def to_s
    "LexicalizedParser(#{grammar.basename})"
  end
end # LexicalizedParser


# A singleton instance of the default Stanford Natural Language parser.  A
# singleton is used because the parser can take a few seconds to load.
class DefaultParser < StanfordParser::LexicalizedParser
  include Singleton
end


# This is a wrapper for
# edu.stanford.nlp.trees.Tree objects.  It customizes
# stringification.
class Tree < Rjb::JavaObjectWrapper
  def initialize(obj = "edu.stanford.nlp.trees.Tree")
    super(obj)
  end

  # Return the label along with the score if there is one.
  def inspect
    # Fetch the score once; each access is forwarded to the Java object.
    node_score = score
    suffix = node_score.nan? ? "" : " [#{sprintf '%.2f', node_score}]"
    "(#{label}#{suffix})"
  end

  # The Penn treebank representation.  This prints with indenting instead of
  # putting everything on one line.
  def to_s
    "#{pennString}"
  end
end # Tree


# This is a wrapper for
# edu.stanford.nlp.ling.Word objects.  It customizes
# stringification and adds an equivalence operator.
class Word < Rjb::JavaObjectWrapper
  def initialize(obj = "edu.stanford.nlp.ling.Word", *args)
    super(obj, *args)
  end

  # See the word values.
  def inspect
    to_s
  end

  # Equivalence is defined relative to the word value.
  def ==(other)
    word == other
  end
end # Word


# This is a wrapper for edu.stanford.nlp.ling.FeatureLabel objects.
# It customizes stringification.
class FeatureLabel < Rjb::JavaObjectWrapper
  def initialize(obj = "edu.stanford.nlp.ling.FeatureLabel")
    super
  end

  # Stringify with just the token and its begin and end position.
  def to_s
    # BUGBUG The position values come back as java.lang.Integer though I
    # would expect Rjb to convert them to Ruby integers.
    begin_position = get(self.BEGIN_POSITION_KEY)
    end_position = get(self.END_POSITION_KEY)
    "#{current} [#{begin_position},#{end_position}]"
  end

  # More verbose stringification with all the fields and their values.
  def inspect
    toString
  end
end # FeatureLabel


# Tokenizes documents into words and sentences.
#
# This is a wrapper for the
# edu.stanford.nlp.process.DocumentPreprocessor object.
class DocumentPreprocessor < Rjb::JavaObjectWrapper
  def initialize(suppressEscaping = false)
    super("edu.stanford.nlp.process.DocumentPreprocessor", suppressEscaping)
  end

  # Returns a list of sentences in a string.
  def getSentencesFromString(s)
    # The Java method takes a java.io.Reader, so wrap the Ruby string in a
    # StringReader and pass the raw Java object with an explicit signature.
    reader = Rjb::JavaObjectWrapper.new("java.io.StringReader", s)
    _invoke(:getSentencesFromText, "Ljava.io.Reader;", reader.java_object)
  end

  # The unqualified class name in angle brackets.
  def inspect
    "<#{self.class.to_s.split('::').last}>"
  end

  def to_s
    inspect
  end
end # DocumentPreprocessor

# A text token that contains raw and normalized token identity (e.g. "(" and
# "-LRB-"), an offset span, and the characters immediately preceding and
# following the token.  Given a list of these objects it is possible to
# recreate the text from which they came verbatim.
244 | class StandoffToken < Struct.new(:current, :word, :before, :after, 245 | :begin_position, :end_position) 246 | def to_s 247 | "#{current} [#{begin_position},#{end_position}]" 248 | end 249 | end 250 | 251 | 252 | # A preprocessor that segments text into sentences and tokens that contain 253 | # character offset and token context information that can be used for 254 | # standoff annotation. 255 | class StandoffDocumentPreprocessor < DocumentPreprocessor 256 | def initialize(tokenizer = EN_PENN_TREEBANK_TOKENIZER) 257 | # PTBTokenizer.factory is a static function, so use RJB to call it 258 | # directly instead of going through a JavaObjectWrapper. We do it this 259 | # way because the Standford parser Java code does not provide a 260 | # constructor that allows you to specify the second parameter, 261 | # invertible, to true, and we need this to write character offset 262 | # information into the tokens. 263 | ptb_tokenizer_class = Rjb::import(tokenizer) 264 | # See the documentation for 265 | # edu.stanford.nlp.process.DocumentPreprocessor for a 266 | # description of these parameters. 267 | ptb_tokenizer_factory = ptb_tokenizer_class.factory(false, true, false) 268 | super(ptb_tokenizer_factory) 269 | end 270 | 271 | # Returns a list of sentences in a string. This wraps the returned 272 | # sentences in a StandoffSentence object. 273 | def getSentencesFromString(s) 274 | super(s).map!{|s| StandoffSentence.new(s)} 275 | end 276 | end 277 | 278 | 279 | # A sentence is an array of StandoffToken objects. 280 | class StandoffSentence < Array 281 | # Construct an array of StandoffToken objects from a Java list sentence 282 | # object returned by the preprocessor. 283 | def initialize(stanford_parser_sentence) 284 | # Convert FeatureStructure wrappers to StandoffToken objects. 
285 | s = stanford_parser_sentence.to_a.collect do |fs| 286 | current = fs.current 287 | word = fs.word 288 | before = fs.before 289 | after = fs.after 290 | # The to_s.to_i is necessary because the get function returns 291 | # java.lang.Integer objects instead of Ruby integers. 292 | begin_position = fs.get(fs.BEGIN_POSITION_KEY).to_s.to_i 293 | end_position = fs.get(fs.END_POSITION_KEY).to_s.to_i 294 | StandoffToken.new(current, word, before, after, 295 | begin_position, end_position) 296 | end 297 | super(s) 298 | end 299 | 300 | # Return the original string verbatim. 301 | def to_s 302 | self[0..-2].inject(""){|s, word| s + word.current + word.after} + last.current 303 | end 304 | 305 | # Return the original string verbatim. 306 | def inspect 307 | to_s 308 | end 309 | end 310 | 311 | 312 | # Standoff syntactic annotation of natural language text which may contain 313 | # multiple sentences. 314 | # 315 | # This is an Array of StandoffNode objects, one for each sentence in the 316 | # text. 317 | class StandoffParsedText < Array 318 | # Parse the text and create the standoff annotation. 319 | # 320 | # The default parser is a singleton instance of the English language 321 | # Stanford Natural Langugage parser. There may be a delay of a few 322 | # seconds for it to load the first time it is created. 323 | def initialize(text, nodetype = StandoffNode, 324 | tokenizer = EN_PENN_TREEBANK_TOKENIZER, 325 | parser = DefaultParser.instance) 326 | preprocessor = StandoffDocumentPreprocessor.new(tokenizer) 327 | # Segment the text into sentences. Parse each sentence, writing 328 | # standoff annotation information into the terminal nodes. 329 | preprocessor.getSentencesFromString(text).map do |sentence| 330 | parse = parser.apply(sentence.to_s) 331 | push(nodetype.new(parse, sentence)) 332 | end 333 | end 334 | 335 | # Print class name and number of sentences. 336 | def inspect 337 | "<#{self.class.name}, #{length} sentences>" 338 | end 339 | 340 | # Print parses. 
# Standoff syntactic tree annotation of text. Terminal nodes are labeled
# with the appropriate StandoffToken objects. Standoff parses can reproduce
# the original string from which they were generated verbatim, optionally
# with brackets around the yields of specified non-terminal nodes.
class StandoffNode < Treebank::ParentedNode
  # Create the standoff tree from a tree returned by the Stanford parser.
  #
  # _stanford_parser_node_:: the parser tree node mirrored by this node
  # _tokens_:: for non-terminal nodes, a StandoffSentence containing the
  #            StandoffToken objects representing all the tokens beneath and
  #            after this node; for terminal nodes, a single StandoffToken
  def initialize(stanford_parser_node, tokens)
    # Annotate this node with a non-terminal label or a StandoffToken as
    # appropriate.
    super(tokens.instance_of?(StandoffSentence) ? stanford_parser_node.value : tokens)
    # Enumerate the children depth-first. Tokens are removed from the list
    # left-to-right as terminal nodes are added to the tree, so the shared
    # +tokens+ list is deliberately mutated here.
    stanford_parser_node.children.each do |child|
      subtree = self.class.new(child, child.leaf? ? tokens.shift : tokens)
      attach_child!(subtree)
    end
  end

  # Return the original text string dominated by this node: every leaf's
  # text followed by the spacing recorded after it.
  def to_original_string
    leaves.map { |leaf| leaf.label.current + leaf.label.after }.join
  end

  # Return the original string with brackets around word spans dominated by
  # the specified constituents.
  #
  # The constituents to bracket are specified by passing a list of node
  # coordinates, which are arrays of integers of the form returned by the
  # tree enumerators of Treebank::Node objects.
  #
  # _coords_:: the coordinates of the nodes around which to place brackets
  # _open_:: the open bracket symbol
  # _close_:: the close bracket symbol
  def to_bracketed_string(coords, open = "[", close = "]")
    # Get a list of all the leaf nodes and their coordinates.
    items = depth_first_enumerator(true).find_all { |n| n.first.leaf? }
    # Enumerate over all the matching constituents inserting open and close
    # brackets around their yields in the items list.
    coords.each do |matching|
      # Insert using a simple state machine with three states: :start,
      # :open, and :close.
      state = :start
      # Enumerate over the items list looking for nodes that are the
      # children of the matching constituent. The list is intentionally
      # modified while it is walked: an inserted bracket pushes the current
      # terminal one slot to the right, so it is visited again in the new
      # state on the next iteration.
      items.each_with_index do |item, index|
        # Skip inserted bracket characters.
        next if item.is_a? String
        # Handle terminal node items with the state machine.
        _node, terminal_coordinate = item
        if state == :start
          next unless in_yield?(matching, terminal_coordinate)
          items.insert(index, open)
          state = :open
        else # state == :open
          next if in_yield?(matching, terminal_coordinate)
          items.insert(index, close)
          state = :close
          break
        end
      end # items.each_with_index
      # Handle the case where a matching constituent is flush with the end
      # of the sentence.
      items << close if state == :open
    end # each
    # Replace terminal nodes with their string representations. Insert
    # spacing characters in the list.
    items.each_with_index do |item, index|
      next if item.is_a? String
      text = item.first.label.current
      spacing = item.first.label.after
      # Replace the terminal node with its text.
      items[index] = text
      # Insert the spacing that comes after this text before the first
      # non-close bracket character. When only close brackets follow, the
      # spacing belongs after all of them, hence the explicit default.
      following = items[(index + 1)..-1]
      close_pos = find_index(following, following.length) { |entry| entry != close }
      items.insert(index + close_pos + 1, spacing)
    end
    items.join
  end # to_bracketed_string

  # Find the index of the first item in _list_ for which _block_ is true.
  # Return _default_ (0 unless given) if no items are found.
  def find_index(list, default = 0, &block)
    list.index(&block) || default
  end

  # Is the node at _terminal_ in the yield of the node at _node_?
  def in_yield?(node, terminal)
    # If node A's coordinates match the prefix of node B's coordinates, node
    # B is in the yield of node A.
    terminal.first(node.length) == node
  end

  private :in_yield?, :find_index
end # StandoffNode
#
# You should have received a copy of the GNU General Public License along with
# editalign; if not, write to the Free Software Foundation, Inc., 51 Franklin
# St, Fifth Floor, Boston, MA 02110-1301 USA

# Extensions to the {Ruby-Java Bridge}[http://rjb.rubyforge.org/] module that
# add a generic Java object wrapper class.
module Rjb

  #--
  # The documentation for this class appears next to its extension inside the
  # StanfordParser module in stanfordparser.rb.  This should be changed if Rjb
  # is ever moved into its own gem.  See the documentation in stanfordparser.rb
  # for more details.
  #++
  class JavaObjectWrapper
    include Enumerable

    # The underlying Java object.
    attr_reader :java_object

    # Initialize with a Java object <em>obj</em>.  If <em>obj</em> is a
    # String, treat it as a Java class name and instantiate it with
    # <em>args</em>.  Otherwise, treat <em>obj</em> as an instance of a Java
    # object.
    def initialize(obj, *args)
      @java_object = obj.is_a?(String) ? Rjb::import(obj).send(:new, *args) : obj
    end

    # Enumerate all the items in the object using its iterator.  If the object
    # has no iterator, this function yields nothing.  Called without a block,
    # it returns an Enumerator instead of raising LocalJumpError.
    def each
      return enum_for(:each) unless block_given?
      return unless @java_object.getClass.getMethods.any? { |m| m.getName == "iterator" }
      i = @java_object.iterator
      yield wrap_java_object(i.next) while i.hasNext
    end # each

    # Reflect unhandled method calls to the underlying Java object and wrap
    # the return value in the appropriate Ruby object.
    def method_missing(m, *args)
      wrap_java_object(@java_object.send(m, *args))
    rescue RuntimeError => e
      # Only an "unknown method name" failure means the instance method does
      # not exist.  Any other error is a genuine failure and is re-raised
      # rather than being turned into a nil return value.
      raise unless e.message =~ /^Fail: unknown method name/
      # The instance method failed.  See if this is a static method.
      getClass.send(m, *args)
    end

    # Convert a value returned by a call to the underlying Java object to the
    # appropriate Ruby object.
    #
    # If the value is a Ruby-Java Bridge object, convert it using a protected
    # function with the name wrap_ followed by the underlying object's
    # classname with the Java path delimiters converted to underscores. For
    # example, a <tt>java.util.ArrayList</tt> would be converted by a function
    # called wrap_java_util_ArrayList.
    #
    # If the value lacks the appropriate converter function, wrap it in a
    # generic JavaObjectWrapper.
    #
    # If the value is not a Ruby-Java Bridge object, return it unchanged.
    #
    # This function is called recursively for every element in an Array.
    def wrap_java_object(object)
      if object.kind_of?(Array)
        object.collect { |item| wrap_java_object(item) }
      elsif object.respond_to?(:_classname)
        # Ruby-Java Bridge Java objects all have a _classname member which
        # tells the name of their Java class.  Convert this to the
        # corresponding wrapper function name.
        wrapper_name = ("wrap_" + object._classname.gsub(/\./, "_")).to_sym
        # The converters are protected, and respond_to? ignores protected
        # methods unless include_all is true.
        respond_to?(wrapper_name, true) ? send(wrapper_name, object) : JavaObjectWrapper.new(object)
      else
        object
      end
    end

    # Convert <tt>java.util.ArrayList</tt> objects to Ruby Array objects.
    def wrap_java_util_ArrayList(object)
      Array.new(object.size) { |i| wrap_java_object(object.get(i)) }
    end

    # Convert <tt>java.util.HashSet</tt> objects to Ruby Set objects.
    def wrap_java_util_HashSet(object)
      set = Set.new
      i = object.iterator
      set << wrap_java_object(i.next) while i.hasNext
      set
    end

    # Show the classname of the underlying Java object.
    def inspect
      "<#{@java_object._classname}>"
    end

    # Use the underlying Java object's stringification.
    def to_s
      toString
    end

    protected :wrap_java_object, :wrap_java_util_ArrayList, :wrap_java_util_HashSet

  end # JavaObjectWrapper

end # Rjb
46 | "examples/connection_finder.rb", 47 | "examples/stanford-sentence-parser.rb" 48 | ] 49 | 50 | if s.respond_to? :specification_version then 51 | current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION 52 | s.specification_version = 3 53 | 54 | if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then 55 | s.add_runtime_dependency(%q, [">= 1.2.5"]) 56 | s.add_runtime_dependency(%q, [">= 3.0.0"]) 57 | s.add_development_dependency(%q, [">= 1.2.9"]) 58 | else 59 | s.add_dependency(%q, [">= 1.2.5"]) 60 | s.add_dependency(%q, [">= 3.0.0"]) 61 | s.add_dependency(%q, [">= 1.2.9"]) 62 | end 63 | else 64 | s.add_dependency(%q, [">= 1.2.5"]) 65 | s.add_dependency(%q, [">= 3.0.0"]) 66 | s.add_dependency(%q, [">= 1.2.9"]) 67 | end 68 | end 69 | 70 | -------------------------------------------------------------------------------- /test/test_stanfordparser.rb: -------------------------------------------------------------------------------- 1 | #!/bin/env ruby 2 | 3 | #-- 4 | 5 | # Copyright 2007-2008 William Patrick McNeill 6 | # 7 | # This file is part of the Stanford Parser Ruby Wrapper. 8 | # 9 | # The Stanford Parser Ruby Wrapper is free software; you can redistribute it 10 | # and/or modify it under the terms of the GNU General Public License as 11 | # published by the Free Software Foundation; either version 2 of the License, 12 | # or (at your option) any later version. 13 | # 14 | # The Stanford Parser Ruby Wrapper is distributed in the hope that it will be 15 | # useful, but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General 17 | # Public License for more details. 
#
# You should have received a copy of the GNU General Public License along with
# editalign; if not, write to the Free Software Foundation, Inc., 51 Franklin
# St, Fifth Floor, Boston, MA 02110-1301 USA
#
#++

# Test cases for the Stanford Parser module

require "test/unit"
require "set"
require "singleton"
require "stanfordparser"


# Expected standoff tokens for the text "He (John) is tall. So is she.",
# shared by the preprocessor and parsed-text test cases.
module StandoffTokenExpectations
  # Each row is [current, word, begin_position, end_position].
  FIRST_SENTENCE_TOKENS = [
    ["He",   "He",    0,  2],
    ["(",    "-LRB-", 3,  4],
    ["John", "John",  4,  8],
    [")",    "-RRB-", 8,  9],
    ["is",   "is",    10, 12],
    ["tall", "tall",  13, 17],
    [".",    ".",     17, 18]
  ].freeze

  SECOND_SENTENCE_TOKENS = [
    ["So",  "So",  20, 22],
    ["is",  "is",  23, 25],
    ["she", "she", 26, 29],
    [".",   ".",   29, 30]
  ].freeze

  # Assert that _tokens_ carry the text and character offsets listed in
  # _expected_, position by position.
  def assert_tokens(expected, tokens)
    expected.each_with_index do |(current, word, begin_position, end_position), i|
      token = tokens[i]
      assert_equal [current, word], [token.current, token.word],
                   "text of token #{i}"
      assert_equal [begin_position, end_position],
                   [token.begin_position, token.end_position],
                   "span of token #{i}"
    end
  end
end


class LexicalizedParserTestCase < Test::Unit::TestCase
  def test_root_path
    assert_equal Pathname, StanfordParser::ROOT.class
  end

  def setup
    @parser = StanfordParser::DefaultParser.instance
    @tree = @parser.apply("This is a sentence.")
  end

  def test_parser
    assert_equal StanfordParser::ROOT + "englishPCFG.ser.gz", @parser.grammar
    assert_equal StanfordParser::Tree, @tree.class
  end

  def test_localTrees
    # The following call exercises the conversion from java.util.HashSet
    # objects to Ruby sets.
    l = @tree.localTrees
    assert_equal 5, l.size
    assert_equal Set.new(["S", "NP", "VP", "ROOT", "NP"]),
                 Set.new(l.collect { |t| t.label.to_s })
  end

  def test_enumerable
    # StanfordParser::LexicalizedParser is not an enumerable object.
    assert_equal [], @parser.map
  end
end # LexicalizedParserTestCase


class TreeTestCase < Test::Unit::TestCase
  def setup
    @parser = StanfordParser::DefaultParser.instance
    @tree = @parser.apply("This is a sentence.")
  end

  def test_enumerable
    assert(@tree.all? { |n| n.class == StanfordParser::Tree })
    node_classes = ["edu.stanford.nlp.trees.LabeledScoredTreeNode",
                    "edu.stanford.nlp.trees.LabeledScoredTreeLeaf"]
    assert(@tree.all? { |n| node_classes.include?(n._classname) })
    assert_equal ["ROOT", "S", "NP", "DT", "This", "VP", "VBZ", "is", "NP",
                  "DT", "a", "NN", "sentence", ".", "."],
                 @tree.map { |n| n.label.to_s }
  end
end # TreeTestCase


class FeatureLabelTestCase < Test::Unit::TestCase
  def test_feature_label
    f = StanfordParser::FeatureLabel.new
    assert_equal "BEGIN_POS", f.BEGIN_POSITION_KEY
    f.put(f.BEGIN_POSITION_KEY, 3)
    assert_equal "END_POS", f.END_POSITION_KEY
    f.put(f.END_POSITION_KEY, 7)
    assert_equal "current", f.CURRENT_KEY
    f.put(f.CURRENT_KEY, "word")
    assert_equal "{BEGIN_POS=3, END_POS=7, current=word}", f.inspect
    assert_equal "word [3,7]", f.to_s
  end
end


class DocumentPreprocessorTestCase < Test::Unit::TestCase
  include StandoffTokenExpectations

  def setup
    @preproc = StanfordParser::DocumentPreprocessor.new
    @standoff_preproc = StanfordParser::StandoffDocumentPreprocessor.new
  end

  def test_get_sentences_from_string
    # The following call exercises the conversion from java.util.ArrayList
    # objects to Ruby arrays.
    s = @preproc.getSentencesFromString("This is a sentence.  So is this.")
    assert_equal "This is a sentence .", s[0].to_s
    assert_equal "So is this .", s[1].to_s
  end

  def test_enumerable
    # StanfordParser::DocumentPreprocessor is not an enumerable object.
    assert_equal [], @preproc.map
  end

  # Segment and tokenize text containing two sentences.
  def test_standoff_document_preprocessor
    sentences = @standoff_preproc.getSentencesFromString("He (John) is tall. So is she.")
    # Recognize two sentences.
    assert_equal 2, sentences.length
    assert(sentences.all? { |sentence| sentence.instance_of? StanfordParser::StandoffSentence })
    assert_equal "He (John) is tall.", sentences.first.to_s
    assert_equal 7, sentences.first.length
    assert(sentences[0].all? { |token| token.instance_of? StanfordParser::StandoffToken })
    assert_equal "So is she.", sentences.last.to_s
    assert_equal 4, sentences.last.length
    assert(sentences[1].all? { |token| token.instance_of? StanfordParser::StandoffToken })
    # Get the correct token information for both sentences.
    assert_tokens FIRST_SENTENCE_TOKENS, sentences[0]
    assert_tokens SECOND_SENTENCE_TOKENS, sentences[1]
  end

  def test_stringification
    # NOTE(review): the expected strings below are empty in this copy of the
    # file; they look like angle-bracketed class names that were stripped
    # somewhere along the way -- confirm against the upstream repository
    # before relying on this test.
    assert_equal "", @preproc.inspect
    assert_equal "", @preproc.to_s
    assert_equal "", @standoff_preproc.inspect
    assert_equal "", @standoff_preproc.to_s
  end

end # DocumentPreprocessorTestCase


class StandoffParsedTextTestCase < Test::Unit::TestCase
  include StandoffTokenExpectations

  def setup
    @text = "He (John) is tall. So is she."
  end

  def test_parse_text_default_nodetype
    parsed_text = StanfordParser::StandoffParsedText.new(@text)
    verify_parsed_text(parsed_text, StanfordParser::StandoffNode)
  end

  # Verify correct parsing with variable node types for text containing two sentences.
  def verify_parsed_text(parsed_text, nodetype)
    # Verify that there are two sentences.
    assert_equal 2, parsed_text.length
    assert(parsed_text.all? { |sentence| sentence.instance_of? nodetype })
    # Verify the tokens in the leaf nodes of both sentences.
    assert_tokens FIRST_SENTENCE_TOKENS, parsed_text[0].leaves.collect { |node| node.label }
    assert_tokens SECOND_SENTENCE_TOKENS, parsed_text[1].leaves.collect { |node| node.label }
    # Verify that the original string is recoverable.
    assert_equal "He (John) is tall. ", parsed_text[0].to_original_string
    assert_equal "So is she.", parsed_text[1].to_original_string
    # Draw < and > brackets around 3 constituents: "He (John)", "John" and
    # "tall".
    b = parsed_text[0].to_bracketed_string([[0,0], [0,0,1,1], [0,1,1]], "<", ">")
    assert_equal "<He (<John>)> is <tall>. ", b
  end
end


class MiscPreprocessorTestCase < Test::Unit::TestCase
  def test_model_location
    assert_equal "$(ROOT)/englishPCFG.ser.gz", StanfordParser::ENGLISH_PCFG_MODEL
  end

  def test_word
    assert StanfordParser::Word.new("edu.stanford.nlp.ling.Word", "dog") == "dog"
  end
end # MiscPreprocessorTestCase