├── .document
├── .gitignore
├── Gemfile
├── Gemfile.lock
├── LICENSE
├── README.orig.rdoc
├── README.rdoc
├── Rakefile
├── TESTS_STATUS.rdoc
├── VERSION.yml
├── examples
└── stanford-sentence-parser.rb
├── lib
├── stanfordparser.rb
└── stanfordparser
│ └── java_object.rb
├── stanfordparser.gemspec
└── test
└── test_stanfordparser.rb
/.document:
--------------------------------------------------------------------------------
1 | README.rdoc
2 | lib/**/*.rb
3 | bin/*
4 | features/**/*.feature
5 | LICENSE
6 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## MAC OS
2 | .DS_Store
3 |
4 | ## TEXTMATE
5 | *.tmproj
6 | tmtags
7 |
8 | ## EMACS
9 | *~
10 | \#*
11 | .\#*
12 |
13 | ## VIM
14 | *.swp
15 |
16 | ## RubyMine
17 | /.idea
18 |
19 | ## PROJECT::GENERAL
20 | coverage
21 | rdoc
22 | pkg
23 |
24 | ## PROJECT::SPECIFIC
25 | .bundle
26 |
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
# Gem dependencies for the Stanford Parser Ruby wrapper.
#
# Symbol sources such as :gemcutter are deprecated by Bundler (they resolve to
# a plain-HTTP index); name the RubyGems index explicitly over HTTPS instead.
source "https://rubygems.org"

gem "rjb"
gem "rake"
gem "jeweler"
gem "treebank", ">= 3.0.0"
gem "rspec", ">= 1.2.9"
7 |
--------------------------------------------------------------------------------
/Gemfile.lock:
--------------------------------------------------------------------------------
1 | ---
2 | dependencies:
3 | rake:
4 | group:
5 | - :default
6 | version: ">= 0"
7 | rspec:
8 | group:
9 | - :default
10 | version: ">= 1.2.9"
11 | jeweler:
12 | group:
13 | - :default
14 | version: ">= 0"
15 | treebank:
16 | group:
17 | - :default
18 | version: ">= 3.0.0"
19 | rjb:
20 | group:
21 | - :default
22 | version: ">= 0"
23 | specs:
24 | - rake:
25 | version: 0.8.7
26 | - json_pure:
27 | version: 1.4.3
28 | - gemcutter:
29 | version: 0.5.0
30 | - git:
31 | version: 1.2.5
32 | - rubyforge:
33 | version: 2.0.4
34 | - jeweler:
35 | version: 1.4.0
36 | - rjb:
37 | version: 1.2.5
38 | - rspec:
39 | version: 1.3.0
40 | - treebank:
41 | version: 3.0.0
42 | hash: 264a823adfd7bb2231dd1037e95b74038b67283d
43 | sources:
44 | - Rubygems:
45 | uri: http://gemcutter.org
46 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2007-2008 William Patrick McNeill
2 | Copyright (c) 2010 John Wilkinson
3 |
4 | This file is part of the Stanford Parser Ruby Wrapper.
5 |
6 | The Stanford Parser Ruby Wrapper is free software; you can redistribute it
7 | and/or modify it under the terms of the GNU General Public License as
8 | published by the Free Software Foundation; either version 2 of the License,
9 | or (at your option) any later version.
10 |
11 | The Stanford Parser Ruby Wrapper is distributed in the hope that it will be
12 | useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
14 | Public License for more details.
15 |
16 | You should have received a copy of the GNU General Public License along with
17 | editalign; if not, write to the Free Software Foundation, Inc., 51 Franklin
18 | St, Fifth Floor, Boston, MA 02110-1301 USA
--------------------------------------------------------------------------------
/README.orig.rdoc:
--------------------------------------------------------------------------------
1 | = Stanford Natural Language Parser Wrapper
2 |
3 | This module is a wrapper for the {Stanford Natural Language Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml].
4 |
5 | The Stanford Natural Language Parser is a Java implementation of a probabilistic PCFG and dependency parser for English, German, Chinese, and Arabic. This module provides a thin wrapper around the Java code to make it accessible from Ruby along with pure Ruby objects that enable standoff parsing.
6 |
7 |
8 | = Installation and Configuration
9 |
10 | In addition to the Ruby gems it requires, to run this module you must manually install the {Stanford Natural Language Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml].
11 |
12 | This module expects the parser to be installed in the /usr/local/stanford-parser/current directory on UNIX platforms and in the C:\stanford-parser\current directory on Windows platforms. This is the directory that contains the stanford-parser.jar file. When the module is loaded, it adds this directory to the Java classpath and launches the Java VM with the arguments -server -Xmx150m.
13 |
14 | These defaults can be overridden by creating the configuration file /etc/ruby-stanford-parser.yaml on UNIX platforms and C:\stanford-parser\ruby-stanford-parser.yaml on Windows platforms. This file is in the Ruby YAML[http://ruby-doc.org/stdlib/libdoc/yaml/rdoc/index.html] format, and may contain two values: root and jvmargs. For example, the file might look like the following:
15 |
16 | root: /usr/local/stanford-parser/other/location
17 | jvmargs: -Xmx100m -verbose
18 |
19 |
20 | =Tokenization and Parsing
21 |
22 | Use the StanfordParser::DocumentPreprocessor class to tokenize text and files into sentences and words.
23 |
24 | >> require "stanfordparser"
25 | => true
26 | >> preproc = StanfordParser::DocumentPreprocessor.new
27 | =>
28 | >> puts preproc.getSentencesFromString("This is a sentence. So is this.")
29 | This is a sentence .
30 | So is this .
31 |
32 | Use the StanfordParser::LexicalizedParser class to parse sentences.
33 |
34 | >> parser = StanfordParser::LexicalizedParser.new
35 | Loading parser from serialized file /usr/local/stanford-parser/current/englishPCFG.ser.gz ... done [5.5 sec].
36 | => edu.stanford.nlp.parser.lexparser.LexicalizedParser
37 | >> puts parser.apply("This is a sentence.")
38 | (ROOT
39 | (S [24.917]
40 | (NP [6.139] (DT [2.300] This))
41 | (VP [17.636] (VBZ [0.144] is)
42 | (NP [12.299] (DT [1.419] a) (NN [8.897] sentence)))
43 | (. [0.002] .)))
44 |
45 | For complete details about the use of these classes, see the documentation on the Stanford Natural Language Parser website.
46 |
47 |
48 | =Standoff Tokenization and Parsing
49 |
50 | This module also contains support for standoff tokenization and parsing, in which the terminal nodes of parse trees contain information about the text that was used to generate them.
51 |
52 | Use the StanfordParser::StandoffDocumentPreprocessor class to tokenize text and files into sentences and words.
53 |
54 | >> preproc = StanfordParser::StandoffDocumentPreprocessor.new
55 | =>
56 | >> s = preproc.getSentencesFromString("This is a sentence. So is this.")
57 | => [This is a sentence., So is this.]
58 |
59 | The standoff preprocessor returns StanfordParser::StandoffToken objects, which contain character offsets into the original text along with information about spacing characters that came before and after the token.
60 |
61 | >> puts s
62 | This [0,4]
63 | is [5,7]
64 | a [8,9]
65 | sentence [10,18]
66 | . [18,19]
67 | So [21,23]
68 | is [24,26]
69 | this [27,31]
70 | . [31,32]
71 | >> "This is a sentence. So is this."[27..31]
72 | => "this."
73 |
74 | This is the same information contained in the edu.stanford.nlp.ling.FeatureLabel class in the Stanford Parser Java implementation.
75 |
76 | Similarly, use the StanfordParser::StandoffParsedText object to parse a block of text into StanfordParser::StandoffNode parse trees whose terminal nodes are StanfordParser::StandoffToken objects.
77 |
78 | >> t = StanfordParser::StandoffParsedText.new("This is a sentence. So is this.")
79 | Loading parser from serialized file /usr/local/stanford-parser/current/englishPCFG.ser.gz ... done [4.9 sec].
80 | =>
81 | >> puts t.first
82 | (ROOT
83 | (S
84 | (NP (DT This [0,4]))
85 | (VP (VBZ is [5,7])
86 | (NP (DT a [8,9]) (NN sentence [10,18])))
87 | (. . [18,19])))
88 |
89 | Standoff parse trees can reproduce the text from which they were generated verbatim.
90 |
91 | >> t.first.to_original_string
92 | => "This is a sentence. "
93 |
94 | They can also reproduce the original text with brackets inserted around the yields of specified parse nodes.
95 |
96 | >> t.first.to_bracketed_string([[0,0,0], [0,1,1]])
97 | => "[This] is [a sentence]. "
98 |
99 | The format of the coordinates used to specify individual nodes is described in the documentation for the Ruby Treebank[http://rubyforge.org/projects/treebank/] gem.
100 |
101 | See the documentation of the individual classes in this module for more details.
102 |
103 | Unlike their parents StanfordParser::DocumentPreprocessor and StanfordParser::LexicalizedParser, which produce Ruby wrappers around Java objects, StanfordParser::StandoffDocumentPreprocessor and StanfordParser::StandoffParsedText produce pure Ruby objects. This is to facilitate serialization of these objects using tools like the Marshal module, which cannot serialize Java objects.
104 |
105 | = History
106 |
107 | 1.0.0:: Initial release
108 | 1.1.0:: Make module initialization function private. Add example code.
109 | 1.2.0:: Read Java VM arguments from the configuration file. Add Word class.
110 | 2.0.0:: Add support for standoff parsing. Change the way Rjb::JavaObjectWrapper wraps returned values: see wrap_java_object for details. Rjb::JavaObjectWrapper supports static members. Minor changes to stanford-sentence-parser script.
111 | 2.1.0:: Different default paths for Windows machines; Minor changes to StandoffToken definition
112 | 2.2.0:: Add parent information to StandoffNode
113 |
114 | = Copyright
115 |
116 | Copyright 2007-2008, William Patrick McNeill
117 |
118 | This program is distributed under the GNU General Public License.
119 |
120 |
121 | = Author
122 |
123 | W.P. McNeill mailto:billmcn@gmail.com
--------------------------------------------------------------------------------
/README.rdoc:
--------------------------------------------------------------------------------
1 | = stanfordparser
2 |
3 | This is an upload/extension of Bill McNeill's stanfordparser rubyforge gem. Check it out at its homepage (which seems to be partially in French)
4 |
5 | http://rubyforge.org/projects/stanfordparser/
6 |
7 | or its rdocs
8 |
9 | http://stanfordparser.rubyforge.org/
10 |
11 | I've been having issues trying to use this gem so I decided to upload it to github and try to organize it to be a little more gem-friendly, especially using jeweler.
12 |
13 | AFAIK there aren't other copies of this on github, please correct me if I'm mistaken. The only similar one I can see is http://github.com/tiendung/ruby-nlp which has much less code and I can only assume to be something else.
14 |
15 | It seems like using version 1.6.1 of the java StanfordParser package is your best bet for compatibility.
16 |
17 | See README.orig.rdoc for Bill's readme, which includes dependencies, installation, and usage.
18 |
19 | == Branches
20 |
21 | * master - Jeweler and Bundler integrated along with slight reorganization of files to be more gem-standard. This is the branch you should use if you want to source the gem straight from github. I will leave this branch alone for the most part unless I find/come up with stable and useful additions. All changes will be backwards compatible.
22 | * stock - Almost untouched from Bill's version, except for the README. Use this branch if that's what you're looking for.
23 | * fixing_tests - The tests are currently broken, this branch is trying to address that. Once the tests are fixed it will be merged back into master. Help appreciated! I'll keep a TESTS_STATUS.rdoc keeping track of progress.
24 | * experimental - I'll be putting in some code as examples and testing out some ideas. Do not use this branch as a gem. You are very encouraged, however, to fork it and add some code/make my code better. I'll try to integrate all the pull requests I get, if not in that branch into another.
25 |
26 | == Note on Patches/Pull Requests
27 |
28 | * Fork the project.
29 | * Make your feature addition or bug fix.
30 | * Add tests for it. I would prefer rSpec, but TestUnit is acceptable as well since there are some of those from the original author.
31 | * Commit.
32 | * Send me a pull request. Bonus points for topic branches.
33 |
34 | == Copyright
35 |
36 | Copyright (c) 2010 John Wilkinson. See LICENSE for details.
37 | Copyright 2007-2008, William Patrick McNeill. See README.orig for details.
38 |
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | require 'rubygems'
2 | require 'rake'
3 |
4 | $LOAD_PATH.unshift('lib')
5 |
# Gem packaging tasks.  Jeweler is optional: when it (or one of its
# dependencies) cannot be loaded, a hint is printed and the packaging tasks are
# simply not defined.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "stanfordparser"
    # The original author's surname is McNeill (see LICENSE and
    # README.orig.rdoc); the gem metadata previously misspelled it.
    gem.summary = "GitHub upload/extension of Bill McNeill's stanfordparser rubygem"
    gem.description = "Ruby wrapper of the Stanford Parser, a NLP parser built in Java."
    gem.email = "jcwilk@gmail.com"
    gem.homepage = "http://github.com/jcwilk/stanfordparser"
    gem.authors = ["John Wilkinson", "Bill McNeill"]

    # Runtime dependencies, followed by what is only needed for development.
    gem.add_dependency "rjb", ">= 1.2.5"
    gem.add_dependency "treebank", ">= 3.0.0"
    gem.add_development_dependency "rspec", ">= 1.2.9"
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end
24 |
# Test::Unit suite: runs every test/**/test_*.rb file, leaving out the shared
# helper file.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.test_files = FileList.new('test/**/test_*.rb') do |list|
    list.exclude 'test/test_helper.rb'
  end
  test.libs << 'test'
  test.verbose = true
end

# RSpec suite: runs every spec/**/*_spec.rb file with lib and spec on the load
# path.
require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

# Same spec files as above, run with the rcov flag switched on.
Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

# NOTE(review): :check_dependencies is not defined in this file; presumably
# Jeweler::Tasks provides it -- confirm, since these prerequisites cannot be
# resolved when Jeweler failed to load above.
task :test => :check_dependencies

task :spec => :check_dependencies

# Running rake without arguments runs the Test::Unit suite.
task :default => :test
51 |
# RDoc generation: documents the README files and everything under lib/ into
# the rdoc directory.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # The version may live in a plain-text VERSION file or, as in this project,
  # in VERSION.yml with :major/:minor/:patch entries.  Looking only for
  # VERSION left the title without a version number, so support both.  The
  # YAML file is read with a regular expression to avoid depending on how the
  # YAML library treats symbol keys.
  version =
    if File.exist?('VERSION')
      File.read('VERSION').strip
    elsif File.exist?('VERSION.yml')
      fields = Hash[File.read('VERSION.yml').scan(/^:(major|minor|patch):\s*(\S+)/)]
      %w[major minor patch].map { |part| fields[part] }.compact.join('.')
    else
      ""
    end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "stanfordparser #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
61 |
--------------------------------------------------------------------------------
/TESTS_STATUS.rdoc:
--------------------------------------------------------------------------------
1 | = Status of Tests
2 | * Please see the fixing_tests branch for efforts towards this goal.
3 |
--------------------------------------------------------------------------------
/VERSION.yml:
--------------------------------------------------------------------------------
1 | ---
2 | :major: 2
3 | :minor: 2
4 | :patch: 1
5 | :build: s
--------------------------------------------------------------------------------
/examples/stanford-sentence-parser.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | #--
4 |
5 | # Copyright 2007-2008 William Patrick McNeill
6 | #
7 | # This file is part of the Stanford Parser Ruby Wrapper.
8 | #
9 | # The Stanford Parser Ruby Wrapper is free software; you can redistribute it
10 | # and/or modify it under the terms of the GNU General Public License as
11 | # published by the Free Software Foundation; either version 2 of the License,
12 | # or (at your option) any later version.
13 | #
14 | # The Stanford Parser Ruby Wrapper is distributed in the hope that it will be
15 | # useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
17 | # Public License for more details.
18 | #
19 | # You should have received a copy of the GNU General Public License along with
20 | # editalign; if not, write to the Free Software Foundation, Inc., 51 Franklin
21 | # St, Fifth Floor, Boston, MA 02110-1301 USA
22 | #
23 | #++
24 |
25 | # == Synopsis
26 | #
27 | # Parse a sentence passed in on the command line.
28 | #
29 | # == Usage
30 | #
31 | # stanford-sentence-parser.rb [options] sentence
32 | #
33 | # options::
34 | # See the Java Stanford Parser documentation for details
35 | #
36 | # sentence::
37 | # A sentence to parse. This must appear after all the options and be quoted.
38 |
39 | require 'rubygems'
40 | require "stanfordparser"
41 |
# The last argument is the sentence.  The rest of the command line is passed
# along to the parser object.
sentence = ARGV.pop
# Without any argument there is nothing to parse; print the usage line and
# exit with a failure status instead of handing nil to the parser.
abort "Usage: stanford-sentence-parser.rb [options] sentence" if sentence.nil?
parser = StanfordParser::LexicalizedParser.new(StanfordParser::ENGLISH_PCFG_MODEL, ARGV)
puts parser.apply(sentence)
47 |
--------------------------------------------------------------------------------
/lib/stanfordparser.rb:
--------------------------------------------------------------------------------
1 | require 'rubygems'
2 |
3 | require "pathname"
4 | require "rjb"
5 | require "singleton"
6 | begin
7 | require "treebank"
8 | gem "treebank", ">= 3.0.0"
9 | rescue LoadError
10 | require "treebank"
11 | end
12 | require "yaml"
13 |
14 | # Wrapper for the {Stanford Natural Language
15 | # Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml].
16 | module StanfordParser
17 |
18 | require "stanfordparser/java_object"
19 |
20 | VERSION = "2.2.1"
21 |
22 | # The default sentence segmenter and tokenizer. This is an English-language
23 | # tokenizer with support for Penn Treebank markup.
24 | EN_PENN_TREEBANK_TOKENIZER = "edu.stanford.nlp.process.PTBTokenizer"
25 |
26 | # Path to an English PCFG model that comes with the Stanford Parser. The
27 | # location is relative to the parser root directory. This is a valid value
28 | # for the grammar parameter of the LexicalizedParser constructor.
29 | ENGLISH_PCFG_MODEL = "$(ROOT)/englishPCFG.ser.gz"
30 |
# This function is executed once when the module is loaded.  It initializes
# the Java virtual machine in which the Stanford parser will run.  By
# default, it adds the parser installation root to the Java classpath and
# launches the VM with the arguments -server -Xmx150m.  Different
# values may be specified with the ruby-stanford-parser.yaml
# configuration file, which may define the keys "root" (installation
# directory) and "jvmargs" (whitespace-separated VM arguments).  A missing,
# empty or non-mapping configuration file leaves the defaults in place.
#
# This function determines which operating system we are running on and sets
# default pathnames accordingly:
#
# UNIX:: /usr/local/stanford-parser/current, /etc/ruby-stanford-parser.yaml
# Windows:: C:\stanford-parser\current,
#           C:\stanford-parser\ruby-stanford-parser.yaml
#
# This function returns the path of the parser installation root as a
# Pathname.
def StanfordParser.initialize_on_load
  if RUBY_PLATFORM =~ /(win|w)32$/
    # The directory name must not carry trailing whitespace, otherwise the jar
    # path built below contains a stray space.
    root = Pathname.new("C:\\stanford-parser\\current")
    config = Pathname.new("C:\\stanford-parser\\ruby-stanford-parser.yaml")
  else
    root = Pathname.new("/usr/local/stanford-parser/current")
    config = Pathname.new("/etc/ruby-stanford-parser.yaml")
  end
  jvmargs = ["-server", "-Xmx150m"]
  if config.file?
    configuration = config.open { |f| YAML.load(f) }
    # An empty file loads as nil/false and a scalar document is not a Hash;
    # treat both as "no overrides" instead of failing on a missing method.
    configuration = {} unless configuration.is_a?(Hash)
    root = Pathname.new(configuration["root"]) unless configuration["root"].nil?
    jvmargs = configuration["jvmargs"].split unless configuration["jvmargs"].nil?
  end
  # Start the JVM with the parser jar as its classpath.
  Rjb::load((root + "stanford-parser.jar").to_s, jvmargs)
  root
end
67 |
68 | private_class_method :initialize_on_load
69 |
70 | # The root directory of the Stanford parser installation.
71 | ROOT = initialize_on_load
72 |
73 | #--
74 | # The documentation below is for the original Rjb::JavaObjectWrapper object.
75 | # It is reproduced here because rdoc only takes the last document block
76 | # defined. If Rjb is moved into its own gem, this documentation should go
77 | # with it, and the following should be written as documentation for this
78 | # class:
79 | #
80 | # Extension of the generic Ruby-Java Bridge wrapper object for the
81 | # StanfordParser module.
82 | #++
83 | # A generic wrapper for a Java object loaded via the {Ruby-Java
84 | # Bridge}[http://rjb.rubyforge.org/]. The wrapper class handles
85 | # initialization and stringification, and passes other method calls down to
86 | # the underlying Java object. Objects returned by the underlying Java
87 | # object are converted to the appropriate Ruby object.
88 | #
89 | # Other modules may extend the list of Java objects that are converted by
90 | # adding their own converter functions. See wrap_java_object for details.
91 | #
92 | # This object is enumerable, yielding items in the order defined by the
93 | # underlying Java object's iterator.
class Rjb::JavaObjectWrapper
  # Converter for edu.stanford.nlp.ling.FeatureLabel values: they are placed
  # inside a FeatureLabel wrapper.
  def wrap_edu_stanford_nlp_ling_FeatureLabel(object)
    StanfordParser::FeatureLabel.new(object)
  end

  # Converter for edu.stanford.nlp.trees.Tree values: they are placed inside
  # a Tree wrapper.
  def wrap_edu_stanford_nlp_trees_Tree(object)
    Tree.new(object)
  end

  # The concrete tree classes below all share the Tree converter.
  alias_method :wrap_edu_stanford_nlp_trees_LabeledScoredTreeLeaf, :wrap_edu_stanford_nlp_trees_Tree
  alias_method :wrap_edu_stanford_nlp_trees_LabeledScoredTreeNode, :wrap_edu_stanford_nlp_trees_Tree
  alias_method :wrap_edu_stanford_nlp_trees_SimpleTree, :wrap_edu_stanford_nlp_trees_Tree
  alias_method :wrap_edu_stanford_nlp_trees_TreeGraphNode, :wrap_edu_stanford_nlp_trees_Tree

  # Only the two methods defined above are made protected; the aliases were
  # created beforehand and are not listed here.
  protected :wrap_edu_stanford_nlp_trees_Tree, :wrap_edu_stanford_nlp_ling_FeatureLabel
end # Rjb::JavaObjectWrapper
113 |
114 |
# Lexicalized probabilistic parser.
#
# This is a wrapper for the
# edu.stanford.nlp.parser.lexparser.LexicalizedParser object.
class LexicalizedParser < Rjb::JavaObjectWrapper
  # The grammar used by the parser, as a Pathname
  attr_reader :grammar

  # Create the parser given a grammar and options.  The grammar
  # argument is a path to a grammar file.  This path may contain the string
  # $(ROOT), which will be replaced with the root directory of the
  # Stanford Parser.  By default, an English PCFG grammar is loaded.
  #
  # The options argument is a list of string arguments as they
  # would appear on a command line.  See the documentation of
  # edu.stanford.nlp.parser.lexparser.Options.setOptions for more
  # details.
  def initialize(grammar = ENGLISH_PCFG_MODEL, options = [])
    # ROOT is a Pathname, which String#gsub does not accept as a replacement
    # (there is no implicit conversion to String), so convert it explicitly.
    # The block form also inserts the path literally, so backslashes in a
    # Windows root are not read as replacement escapes.
    @grammar = Pathname.new(grammar.gsub(/\$\(ROOT\)/) { ROOT.to_s })
    super("edu.stanford.nlp.parser.lexparser.LexicalizedParser", @grammar.to_s)
    @java_object.setOptionFlags(options)
  end

  # Show the class name together with the grammar file name.
  def to_s
    "LexicalizedParser(#{grammar.basename})"
  end
end # LexicalizedParser
142 |
143 |
144 | # A singleton instance of the default Stanford Natural Language parser. A
145 | # singleton is used because the parser can take a few seconds to load.
class DefaultParser < StanfordParser::LexicalizedParser
  # The standard library Singleton mixin provides DefaultParser.instance.  No
  # constructor is defined here, so the instance is built with the default
  # arguments of LexicalizedParser#initialize.
  include Singleton
end
149 |
150 |
151 | # This is a wrapper for
152 | # edu.stanford.nlp.trees.Tree objects. It customizes
153 | # stringification.
class Tree < Rjb::JavaObjectWrapper
  # Wrap _obj_, which defaults to the edu.stanford.nlp.trees.Tree class name.
  def initialize(obj = "edu.stanford.nlp.trees.Tree")
    super(obj)
  end

  # Short form: the node label in parentheses, followed by the score with two
  # decimals unless the score is NaN.
  def inspect
    text = "#{label}"
    text += " [#{sprintf '%.2f', score}]" unless score.nan?
    "(#{text})"
  end

  # The Penn treebank representation.  This prints with indenting instead of
  # putting everything on one line.
  def to_s
    pennString.to_s
  end
end # Tree
171 |
172 |
173 | # This is a wrapper for
174 | # edu.stanford.nlp.ling.Word objects. It customizes
175 | # stringification and adds an equivalence operator.
class Word < Rjb::JavaObjectWrapper
  # Build the wrapper.  _obj_ defaults to the edu.stanford.nlp.ling.Word class
  # name; it and any additional arguments are handed unchanged to the
  # superclass constructor.
  # NOTE(review): presumably the superclass accepts either a Java class name
  # (plus constructor arguments) or an existing Java object -- confirm against
  # Rjb::JavaObjectWrapper#initialize.
  def initialize(obj = "edu.stanford.nlp.ling.Word", *args)
    super(obj, *args)
  end

  # See the word values.  Same text as to_s.
  def inspect
    to_s
  end

  # Equivalence is defined relative to the word value: the result of +word+
  # is compared with _other_ using that value's own == operator.
  def ==(other)
    word == other
  end
end # Word
191 |
192 |
193 | # This is a wrapper for edu.stanford.nlp.ling.FeatureLabel objects.
194 | # It customizes stringification.
class FeatureLabel < Rjb::JavaObjectWrapper
  # Build the wrapper.  _obj_ defaults to the
  # edu.stanford.nlp.ling.FeatureLabel class name; the bare +super+ forwards
  # it to the superclass constructor as is.
  def initialize(obj = "edu.stanford.nlp.ling.FeatureLabel")
    super
  end

  # Stringify with just the token and its begin and end position, in the form
  # "token [begin,end]".
  def to_s
    # BUGBUG The position values come back as java.lang.Integer though I
    # would expect Rjb to convert them to Ruby integers.
    # The position keys are looked up on this object and then used to fetch
    # the corresponding values with +get+.
    begin_position = get(self.BEGIN_POSITION_KEY)
    end_position = get(self.END_POSITION_KEY)
    "#{current} [#{begin_position},#{end_position}]"
  end

  # More verbose stringification with all the fields and their values.
  # NOTE(review): presumably toString is forwarded to the wrapped Java object
  # -- confirm against Rjb::JavaObjectWrapper.
  def inspect
    toString
  end
end
214 |
215 |
216 | # Tokenizes documents into words and sentences.
217 | #
218 | # This is a wrapper for the
219 | # edu.stanford.nlp.process.DocumentPreprocessor object.
class DocumentPreprocessor < Rjb::JavaObjectWrapper
  # Create the Java preprocessor, passing _suppressEscaping_ through as its
  # constructor argument.
  def initialize(suppressEscaping = false)
    super("edu.stanford.nlp.process.DocumentPreprocessor", suppressEscaping)
  end

  # Returns a list of sentences in a string.  The string is wrapped in a
  # java.io.StringReader so that the Reader overload of getSentencesFromText
  # can be invoked explicitly.
  def getSentencesFromString(s)
    reader = Rjb::JavaObjectWrapper.new("java.io.StringReader", s)
    _invoke(:getSentencesFromText, "Ljava.io.Reader;", reader.java_object)
  end

  # The class name without its module prefix, in angle brackets.
  def inspect
    short_name = self.class.to_s.split('::').last
    "<#{short_name}>"
  end

  # Same text as inspect.
  def to_s
    inspect
  end
end # DocumentPreprocessor
239 |
240 | # A text token that contains raw and normalized token identity (e.g. "(" and
241 | # "-LRB-"), an offset span, and the characters immediately preceding and
242 | # following the token. Given a list of these objects it is possible to
243 | # recreate the text from which they came verbatim.
class StandoffToken < Struct.new(:current, :word, :before, :after,
                                 :begin_position, :end_position)
  # Render the raw token followed by its character span, for example
  # "sentence [10,18]".
  def to_s
    format("%s [%s,%s]", current, begin_position, end_position)
  end
end
250 |
251 |
252 | # A preprocessor that segments text into sentences and tokens that contain
253 | # character offset and token context information that can be used for
254 | # standoff annotation.
class StandoffDocumentPreprocessor < DocumentPreprocessor
  # Build a preprocessor around a tokenizer factory obtained from the class
  # named by _tokenizer_.
  def initialize(tokenizer = EN_PENN_TREEBANK_TOKENIZER)
    # The factory method is static, so the class is imported with RJB and
    # called directly rather than through a JavaObjectWrapper.  This is done
    # because the Stanford parser Java code has no constructor that lets the
    # second parameter, invertible, be set to true, and that setting is needed
    # to write character offset information into the tokens.
    tokenizer_class = Rjb::import(tokenizer)
    # See the documentation for edu.stanford.nlp.process.DocumentPreprocessor
    # for a description of the factory parameters.
    super(tokenizer_class.factory(false, true, false))
  end

  # Returns a list of sentences in a string.  Each sentence returned by the
  # parent class is replaced in place with a StandoffSentence built from it.
  def getSentencesFromString(s)
    sentences = super(s)
    sentences.map! { |java_sentence| StandoffSentence.new(java_sentence) }
  end
end
277 |
278 |
279 | # A sentence is an array of StandoffToken objects.
class StandoffSentence < Array
  # Construct an array of StandoffToken objects from a Java list sentence
  # object returned by the preprocessor.
  #
  # _stanford_parser_sentence_:: anything whose +to_a+ yields feature label
  #                              wrappers responding to +current+, +word+,
  #                              +before+, +after+, +get+ and the two
  #                              position key accessors
  def initialize(stanford_parser_sentence)
    # Convert FeatureLabel wrappers to StandoffToken objects.
    tokens = stanford_parser_sentence.to_a.map do |fs|
      # The to_s.to_i is necessary because the get function returns
      # java.lang.Integer objects instead of Ruby integers.
      begin_position = fs.get(fs.BEGIN_POSITION_KEY).to_s.to_i
      end_position = fs.get(fs.END_POSITION_KEY).to_s.to_i
      StandoffToken.new(fs.current, fs.word, fs.before, fs.after,
                        begin_position, end_position)
    end
    super(tokens)
  end

  # Return the original string verbatim: every token followed by the spacing
  # that came after it, except that the spacing after the final token is
  # dropped.  An empty sentence gives an empty string.
  def to_s
    # Without this guard +last+ is nil for an empty sentence and the method
    # would raise NoMethodError.
    return "" if empty?
    self[0..-2].map { |token| token.current + token.after }.join + last.current
  end

  # Return the original string verbatim.
  def inspect
    to_s
  end
end
310 |
311 |
312 | # Standoff syntactic annotation of natural language text which may contain
313 | # multiple sentences.
314 | #
315 | # This is an Array of StandoffNode objects, one for each sentence in the
316 | # text.
class StandoffParsedText < Array
  # Parse the text and create the standoff annotation.
  #
  # The default parser is a singleton instance of the English language
  # Stanford Natural Language parser.  There may be a delay of a few
  # seconds for it to load the first time it is created.
  #
  # _text_:: the text to segment and parse
  # _nodetype_:: class instantiated with (parse, sentence) for each sentence
  # _tokenizer_:: tokenizer handed to the StandoffDocumentPreprocessor
  # _parser_:: object whose +apply+ method parses one sentence string
  def initialize(text, nodetype = StandoffNode,
                 tokenizer = EN_PENN_TREEBANK_TOKENIZER,
                 parser = DefaultParser.instance)
    # Segment the text into sentences first, then parse them one by one and
    # append a node carrying the standoff annotation for each.
    sentences = StandoffDocumentPreprocessor.new(tokenizer).getSentencesFromString(text)
    sentences.each do |sentence|
      self << nodetype.new(parser.apply(sentence.to_s), sentence)
    end
  end

  # Print class name and number of sentences.
  def inspect
    format("<%s, %d sentences>", self.class.name, length)
  end

  # Print parses, separated by single spaces.
  def to_s
    flatten.join(" ")
  end
end
345 |
346 |
347 | # Standoff syntactic tree annotation of text. Terminal nodes are labeled
348 | # with the appropriate StandoffToken objects. Standoff parses can reproduce
349 | # the original string from which they were generated verbatim, optionally
350 | # with brackets around the yields of specified non-terminal nodes.
351 | class StandoffNode < Treebank::ParentedNode
# Create the standoff tree from a tree returned by the Stanford parser.
# For non-terminal nodes, the tokens argument will be a
# StandoffSentence containing the StandoffToken objects representing all
# the tokens beneath and after this node.  For terminal nodes, the
# tokens argument will be a StandoffToken.
def initialize(stanford_parser_node, tokens)
  # A node that received a whole StandoffSentence is a non-terminal and takes
  # the parser node's value as its label; otherwise the token itself is the
  # label.
  label = tokens.instance_of?(StandoffSentence) ? stanford_parser_node.value : tokens
  super(label)
  # Walk the children depth-first.  Each leaf child consumes the leftmost
  # remaining token; every other child receives the shared token list.
  stanford_parser_node.children.each do |child|
    child_tokens = child.leaf? ? tokens.shift : tokens
    attach_child!(self.class.new(child, child_tokens))
  end
end
369 |
# Return the original text string dominated by this node: the raw text of
# every leaf token, each followed by the spacing recorded after it.
def to_original_string
  text = ""
  leaves.each do |leaf|
    token = leaf.label
    text += token.current + token.after
  end
  text
end
376 |
# Return the original string with brackets around word spans dominated by
# the specified constituents.
#
# The constituents to bracket are specified by passing a list of node
# coordinates, which are arrays of integers of the form returned by the
# tree enumerators of Treebank::Node objects.
#
# _coords_:: the coordinates of the nodes around which to place brackets
# _open_:: the open bracket symbol
# _close_:: the close bracket symbol
#
# NOTE(review): in_yield? and find_index are helpers not visible in this part
# of the file; in_yield? presumably tests whether a terminal coordinate lies
# beneath the given constituent coordinate -- confirm against its definition.
def to_bracketed_string(coords, open = "[", close = "]")
  # Get a list of all the leaf nodes and their coordinates.
  items = depth_first_enumerator(true).find_all {|n| n.first.leaf?}
  # Enumerate over all the matching constituents inserting open and close
  # brackets around their yields in the items list.
  coords.each do |matching|
    # Insert using a simple state machine with three states: :start,
    # :open, and :close.
    state = :start
    # Enumerate over the items list looking for nodes that are the
    # children of the matching constituent.
    items.each_with_index do |item, index|
      # Skip inserted bracket characters.
      next if item.is_a? String
      # Handle terminal node items with the state machine.
      node, terminal_coordinate = item
      if state == :start
        next if not in_yield?(matching, terminal_coordinate)
        # The list is modified while it is being walked: the bracket takes the
        # current slot and the current terminal moves one position to the
        # right.
        items.insert(index, open)
        state = :open
      else # state == :open
        next if in_yield?(matching, terminal_coordinate)
        # First terminal outside the constituent: close the bracket in front
        # of it and stop scanning for this constituent.
        items.insert(index, close)
        state = :close
        break
      end
    end # items.each_with_index
    # Handle the case where a matching constituent is flush with the end
    # of the sentence.
    items << close if state == :open
  end # each
  # Replace terminal nodes with their string representations.  Insert
  # spacing characters in the list.
  items.each_with_index do |item, index|
    # Brackets, already converted text and inserted spacing are all strings.
    next if item.is_a? String
    text = item.first.label.current
    spacing = item.first.label.after
    # Replace the terminal node with its text.
    items[index] = text
    # Insert the spacing that comes after this text before the first
    # non-close bracket character.
    # NOTE(review): this relies on find_index returning an integer even when
    # nothing but close brackets (or nothing at all) follows -- confirm.
    close_pos = find_index(items[index+1..-1]) {|item| not item == close}
    items.insert(index + close_pos + 1, spacing)
  end
  items.join
end # to_bracketed_string
433 |
434 | # Find the index of the first item in _list_ for which _block_ is true.
435 | # Return 0 if no items are found.
436 | def find_index(list, &block)
437 | list.each_with_index do |item, index|
438 | return index if block.call(item)
439 | end
440 | 0
441 | end
442 |
443 | # Is the node at _terminal_ in the yield of the node at _node_?
444 | def in_yield?(node, terminal)
445 | # If node A's coordinates match the prefix of node B's coordinates, node
446 | # B is in the yield of node A.
447 | terminal.first(node.length) == node
448 | end
449 |
450 | private :in_yield?, :find_index
451 | end # StandoffNode
452 |
453 | end # StanfordParser
454 |
--------------------------------------------------------------------------------
/lib/stanfordparser/java_object.rb:
--------------------------------------------------------------------------------
1 | # Copyright 2007-2008 William Patrick McNeill
2 | #
3 | # This file is part of the Stanford Parser Ruby Wrapper.
4 | #
5 | # The Stanford Parser Ruby Wrapper is free software; you can redistribute it
6 | # and/or modify it under the terms of the GNU General Public License as
7 | # published by the Free Software Foundation; either version 2 of the License,
8 | # or (at your option) any later version.
9 | #
10 | # The Stanford Parser Ruby Wrapper is distributed in the hope that it will be
11 | # useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
13 | # Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License along with
16 | # editalign; if not, write to the Free Software Foundation, Inc., 51 Franklin
17 | # St, Fifth Floor, Boston, MA 02110-1301 USA
18 |
# Set is used by the java.util.HashSet converter below.
require "set"

# Extensions to the {Ruby-Java Bridge}[http://rjb.rubyforge.org/] module that
# add a generic Java object wrapper class.
module Rjb

  #--
  # The documentation for this class appears next to its extension inside the
  # StanfordParser module in stanfordparser.rb.  This should be changed if Rjb
  # is ever moved into its own gem.  See the documentation in stanfordparser.rb
  # for more details.
  #++
  class JavaObjectWrapper
    include Enumerable

    # The underlying Java object.
    attr_reader :java_object

    # Initialize with a Java object <em>obj</em>.  If <em>obj</em> is a
    # String, treat it as a Java class name and instantiate it with
    # <em>args</em>.  Otherwise, treat <em>obj</em> as an instance of a Java
    # object.
    def initialize(obj, *args)
      @java_object = obj.class == String ?
                     Rjb::import(obj).send(:new, *args) : obj
    end

    # Enumerate all the items in the object using its iterator.  If the object
    # has no iterator, this function yields nothing.
    def each
      if @java_object.getClass.getMethods.any? {|m| m.getName == "iterator"}
        i = @java_object.iterator
        while i.hasNext
          yield wrap_java_object(i.next)
        end
      end
    end # each

    # Reflect unhandled method calls to the underlying Java object and wrap
    # the return value in the appropriate Ruby object.
    #
    # If the instance call fails with an "unknown method name" RuntimeError,
    # the call is retried on the result of getClass so that static members
    # can be reached.  Any other RuntimeError is re-raised rather than being
    # swallowed and turned into a nil return value.
    def method_missing(m, *args)
      wrap_java_object(@java_object.send(m, *args))
    rescue RuntimeError => e
      # Only an unknown-method failure is a reason to try the class.
      raise unless e.message =~ /^Fail: unknown method name/
      getClass.send(m, *args)
    end

    # Convert a value returned by a call to the underlying Java object to the
    # appropriate Ruby object.
    #
    # If the value is a Java object (it responds to <tt>_classname</tt>),
    # convert it using a protected function with the name <tt>wrap_</tt>
    # followed by the underlying object's classname with the Java path
    # delimiters converted to underscores.  For example, a
    # <tt>java.util.ArrayList</tt> would be converted by a function called
    # <tt>wrap_java_util_ArrayList</tt>.
    #
    # If the value lacks the appropriate converter function, wrap it in a
    # generic JavaObjectWrapper.
    #
    # If the value is not a Java object, return it unchanged.
    #
    # This function is called recursively for every element in an Array.
    def wrap_java_object(object)
      if object.kind_of?(Array)
        object.collect {|item| wrap_java_object(item)}
      elsif object.respond_to?(:_classname)
        # Ruby-Java Bridge Java objects all have a _classname member which
        # tells the name of their Java class.  Convert this to the
        # corresponding wrapper function name.
        wrapper_name = ("wrap_" + object._classname.gsub(/\./, "_")).to_sym
        # The converters are protected, and respond_to? ignores protected
        # methods on Ruby 2.0 and later unless the second argument is true.
        if respond_to?(wrapper_name, true)
          send(wrapper_name, object)
        else
          JavaObjectWrapper.new(object)
        end
      else
        object
      end
    end

    # Convert <tt>java.util.ArrayList</tt> objects to Ruby Array objects.
    def wrap_java_util_ArrayList(object)
      array_list = []
      object.size.times do |i|
        array_list << wrap_java_object(object.get(i))
      end
      array_list
    end

    # Convert <tt>java.util.HashSet</tt> objects to Ruby Set objects.
    def wrap_java_util_HashSet(object)
      set = Set.new
      i = object.iterator
      while i.hasNext
        set << wrap_java_object(i.next)
      end
      set
    end

    # Show the classname of the underlying Java object.
    def inspect
      "<#{@java_object._classname}>"
    end

    # Use the underlying Java object's stringification.
    def to_s
      toString
    end

    protected :wrap_java_object, :wrap_java_util_ArrayList, :wrap_java_util_HashSet

  end # JavaObjectWrapper

end # Rjb
130 |
--------------------------------------------------------------------------------
/stanfordparser.gemspec:
--------------------------------------------------------------------------------
# Generated by jeweler
# DO NOT EDIT THIS FILE DIRECTLY
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
# -*- encoding: utf-8 -*-

# Gem specification for the stanfordparser gem: metadata, packaged files and
# dependencies (rjb and treebank at runtime, rspec for development).
Gem::Specification.new do |s|
  s.name = %q{stanfordparser}
  s.version = "2.2.1.s"

  s.required_rubygems_version = Gem::Requirement.new("> 1.3.1") if s.respond_to? :required_rubygems_version=
  s.authors = ["John Wilkinson", "Bill McNeal"]
  s.date = %q{2010-06-21}
  s.description = %q{Ruby wrapper of the Stanford Parser, a NLP parser built in Java.}
  s.email = %q{jcwilk@gmail.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.orig",
    "README.rdoc"
  ]
  s.files = [
    ".document",
    ".gitignore",
    "LICENSE",
    "README.orig",
    "README.rdoc",
    "Rakefile",
    "VERSION.yml",
    "examples/connection_finder.rb",
    "examples/stanford-sentence-parser.rb",
    "lib/stanfordparser.rb",
    "lib/stanfordparser/java_object.rb",
    "spec/spec.opts",
    "spec/spec_helper.rb",
    "spec/stanfordparser_spec.rb",
    "test/test_stanfordparser.rb"
  ]
  s.homepage = %q{http://github.com/jcwilk/stanfordparser}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.7}
  s.summary = %q{GitHub upload/extension of Bill McNeal's stanfordparser rubygem}
  s.test_files = [
    "spec/spec_helper.rb",
    "spec/stanfordparser_spec.rb",
    "test/test_stanfordparser.rb",
    "examples/connection_finder.rb",
    "examples/stanford-sentence-parser.rb"
  ]

  # Declare dependencies with the most specific API the running RubyGems
  # offers; old versions only know the generic add_dependency.
  if s.respond_to? :specification_version then
    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
    s.specification_version = 3

    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
      s.add_runtime_dependency(%q<rjb>, [">= 1.2.5"])
      s.add_runtime_dependency(%q<treebank>, [">= 3.0.0"])
      s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
    else
      s.add_dependency(%q<rjb>, [">= 1.2.5"])
      s.add_dependency(%q<treebank>, [">= 3.0.0"])
      s.add_dependency(%q<rspec>, [">= 1.2.9"])
    end
  else
    s.add_dependency(%q<rjb>, [">= 1.2.5"])
    s.add_dependency(%q<treebank>, [">= 3.0.0"])
    s.add_dependency(%q<rspec>, [">= 1.2.9"])
  end
end
69 |
70 |
--------------------------------------------------------------------------------
/test/test_stanfordparser.rb:
--------------------------------------------------------------------------------
1 | #!/bin/env ruby
2 |
3 | #--
4 |
5 | # Copyright 2007-2008 William Patrick McNeill
6 | #
7 | # This file is part of the Stanford Parser Ruby Wrapper.
8 | #
9 | # The Stanford Parser Ruby Wrapper is free software; you can redistribute it
10 | # and/or modify it under the terms of the GNU General Public License as
11 | # published by the Free Software Foundation; either version 2 of the License,
12 | # or (at your option) any later version.
13 | #
14 | # The Stanford Parser Ruby Wrapper is distributed in the hope that it will be
15 | # useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
17 | # Public License for more details.
18 | #
19 | # You should have received a copy of the GNU General Public License along with
20 | # editalign; if not, write to the Free Software Foundation, Inc., 51 Franklin
21 | # St, Fifth Floor, Boston, MA 02110-1301 USA
22 | #
23 | #++
24 |
25 | # Test cases for the Stanford Parser module
26 |
27 | require "test/unit"
28 | require "set"
29 | require "singleton"
30 | require "stanfordparser"
31 |
32 |
# Tests for the default parser and the tree it produces for a short
# sentence.  Assertions pass the expected value first, as Test::Unit's
# failure messages assume.
class LexicalizedParserTestCase < Test::Unit::TestCase
  # Parse one sentence with the shared default parser before each test.
  def setup
    @parser = StanfordParser::DefaultParser.instance
    @tree = @parser.apply("This is a sentence.")
  end

  def test_root_path
    assert_instance_of Pathname, StanfordParser::ROOT
  end

  def test_parser
    assert_equal StanfordParser::ROOT + "englishPCFG.ser.gz", @parser.grammar
    assert_instance_of StanfordParser::Tree, @tree
  end

  def test_localTrees
    # The following call exercises the conversion from java.util.HashSet
    # objects to Ruby sets.
    l = @tree.localTrees
    assert_equal 5, l.size
    # "NP" is listed twice; the Set collapses the duplicate.
    assert_equal Set.new(["S", "NP", "VP", "ROOT", "NP"]),
                 Set.new(l.collect {|t| "#{t.label}"})
  end

  def test_enumerable
    # StanfordParser::LexicalizedParser is not an enumerable object, so
    # enumerating it yields nothing.  to_a is used because map without a
    # block returns an Enumerator on Ruby 1.9 and later, which never equals [].
    assert_equal [], @parser.to_a
  end
end # LexicalizedParserTestCase
62 |
63 |
# Tests that a parse tree enumerates its nodes as StanfordParser::Tree
# wrappers in the expected order.
class TreeTestCase < Test::Unit::TestCase
  # Parse one sentence with the shared default parser before each test.
  def setup
    @parser = StanfordParser::DefaultParser.instance
    @tree = @parser.apply("This is a sentence.")
  end

  def test_enumerable
    # Every enumerated node is wrapped in the Ruby Tree class...
    assert @tree.all? {|n| n.class == StanfordParser::Tree}
    # ...and is backed by one of the two Java tree node classes.
    assert @tree.all? {|n|
      n._classname == "edu.stanford.nlp.trees.LabeledScoredTreeNode" or
      n._classname == "edu.stanford.nlp.trees.LabeledScoredTreeLeaf"
    }
    # Expected value first, so a failure reports expected/actual correctly.
    expected_labels = ["ROOT", "S", "NP", "DT", "This", "VP", "VBZ", "is",
                       "NP", "DT", "a", "NN", "sentence", ".", "."]
    assert_equal expected_labels, @tree.map {|n| "#{n.label}"}
  end
end # TreeTestCase
81 |
82 |
# Checks the key names exposed by StanfordParser::FeatureLabel and the
# strings it produces once values have been stored under those keys.
class FeatureLabelTestCase < Test::Unit::TestCase
  def test_feature_label
    label = StanfordParser::FeatureLabel.new
    # Each row: key accessor, the key name it must return, the value to store.
    rows = [
      [:BEGIN_POSITION_KEY, "BEGIN_POS", 3],
      [:END_POSITION_KEY, "END_POS", 7],
      [:CURRENT_KEY, "current", "word"]
    ]
    rows.each do |accessor, key_name, value|
      assert_equal key_name, label.send(accessor)
      label.put(label.send(accessor), value)
    end
    assert_equal "{BEGIN_POS=3, END_POS=7, current=word}", label.inspect
    assert_equal "word [3,7]", label.to_s
  end
end
96 |
97 |
# Tests for sentence segmentation and tokenization by the plain and the
# standoff document preprocessors.
class DocumentPreprocessorTestCase < Test::Unit::TestCase
  def setup
    @preproc = StanfordParser::DocumentPreprocessor.new
    @standoff_preproc = StanfordParser::StandoffDocumentPreprocessor.new
  end

  def test_get_sentences_from_string
    # The following call exercises the conversion from java.util.ArrayList
    # objects to Ruby arrays.
    s = @preproc.getSentencesFromString("This is a sentence. So is this.")
    assert_equal "This is a sentence .", "#{s[0]}"
    assert_equal "So is this .", "#{s[1]}"
  end

  def test_enumerable
    # StanfordParser::DocumentPreprocessor is not an enumerable object, so
    # enumerating it yields nothing.  to_a is used because map without a
    # block returns an Enumerator on Ruby 1.9 and later, which never equals [].
    assert_equal [], @preproc.to_a
  end

  # Segment and tokenize text containing two sentences.
  def test_standoff_document_preprocessor
    # Two spaces separate the sentences: the offsets asserted below place
    # "So" at 20-22, directly after the period at 17-18 and two blanks.
    sentences = @standoff_preproc.getSentencesFromString("He (John) is tall.  So is she.")
    # Recognize two sentences.
    assert_equal 2, sentences.length
    assert sentences.all? {|sentence| sentence.instance_of? StanfordParser::StandoffSentence}
    assert_equal "He (John) is tall.", sentences.first.to_s
    assert_equal 7, sentences.first.length
    assert sentences[0].all? {|token| token.instance_of? StanfordParser::StandoffToken}
    assert_equal "So is she.", sentences.last.to_s
    assert_equal 4, sentences.last.length
    assert sentences[1].all? {|token| token.instance_of? StanfordParser::StandoffToken}
    # Get the correct token information for the first sentence.
    assert_tokens [
      ["He", "He", [0, 2]],
      ["(", "-LRB-", [3, 4]],
      ["John", "John", [4, 8]],
      [")", "-RRB-", [8, 9]],
      ["is", "is", [10, 12]],
      ["tall", "tall", [13, 17]],
      [".", ".", [17, 18]]
    ], sentences[0]
    # Get the correct token information for the second sentence.
    assert_tokens [
      ["So", "So", [20, 22]],
      ["is", "is", [23, 25]],
      ["she", "she", [26, 29]],
      [".", ".", [29, 30]]
    ], sentences[1]
  end

  def test_stringification
    # NOTE(review): every expected value here is the empty string.  In this
    # copy of the file that looks like lost text rather than the intended
    # expectation -- TODO confirm against the upstream test before relying
    # on these assertions.
    assert_equal "", @preproc.inspect
    assert_equal "", @preproc.to_s
    assert_equal "", @standoff_preproc.inspect
    assert_equal "", @standoff_preproc.to_s
  end

  private

  # Assert that tokens[i] has the surface text, normalized word and
  # [begin, end] character span given by the i-th row of _expected_.
  def assert_tokens(expected, tokens)
    expected.each_with_index do |(current, word, span), i|
      token = tokens[i]
      assert_equal [current, word], [token.current, token.word]
      assert_equal span, [token.begin_position, token.end_position]
    end
  end

end # DocumentPreprocessorTestCase
163 |
164 |
# Tests that text parsed into standoff trees keeps token offsets, can
# reproduce the original string and can bracket selected constituents.
class StandoffParsedTextTestCase < Test::Unit::TestCase
  def setup
    # Two spaces separate the sentences: the offsets asserted below place
    # "So" at 20-22, directly after the period at 17-18 and two blanks.
    @text = "He (John) is tall.  So is she."
  end

  def test_parse_text_default_nodetype
    parsed_text = StanfordParser::StandoffParsedText.new(@text)
    verify_parsed_text(parsed_text, StanfordParser::StandoffNode)
  end

  # Verify correct parsing with variable node types for text containing two
  # sentences.
  #
  # _parsed_text_:: the parsed text; indexable, one tree per sentence
  # _nodetype_:: the class every sentence tree must be an instance of
  def verify_parsed_text(parsed_text, nodetype)
    # Verify that there are two sentences.
    assert_equal 2, parsed_text.length
    assert parsed_text.all? {|sentence| sentence.instance_of? nodetype}
    # Verify the tokens in the leaf node of the first sentence.
    assert_leaf_tokens [
      ["He", "He", [0, 2]],
      ["(", "-LRB-", [3, 4]],
      ["John", "John", [4, 8]],
      [")", "-RRB-", [8, 9]],
      ["is", "is", [10, 12]],
      ["tall", "tall", [13, 17]],
      [".", ".", [17, 18]]
    ], parsed_text[0]
    # Verify the tokens in the leaf node of the second sentence.
    assert_leaf_tokens [
      ["So", "So", [20, 22]],
      ["is", "is", [23, 25]],
      ["she", "she", [26, 29]],
      [".", ".", [29, 30]]
    ], parsed_text[1]
    # Verify that the original string is recoverable, including the two
    # blanks that follow the first sentence.
    assert_equal "He (John) is tall.  ", parsed_text[0].to_original_string
    assert_equal "So is she.", parsed_text[1].to_original_string
    # Draw < and > brackets around 3 constituents.
    b = parsed_text[0].to_bracketed_string([[0,0], [0,0,1,1], [0,1,1]], "<", ">")
    assert_equal "<He (<John>)> is <tall>.  ", b
  end

  private

  # Assert that the i-th leaf of _tree_ is labeled with a token whose
  # surface text, normalized word and [begin, end] character span match the
  # i-th row of _expected_.
  def assert_leaf_tokens(expected, tree)
    leaves = tree.leaves.collect {|node| node.label}
    expected.each_with_index do |(current, word, span), i|
      assert_equal [current, word], [leaves[i].current, leaves[i].word]
      assert_equal span, [leaves[i].begin_position, leaves[i].end_position]
    end
  end
end
214 |
215 |
# Miscellaneous checks: the default model location constant and comparing a
# wrapped Word with a plain Ruby string.
class MiscPreprocessorTestCase < Test::Unit::TestCase
  def test_model_location
    expected_location = "$(ROOT)/englishPCFG.ser.gz"
    assert_equal expected_location, StanfordParser::ENGLISH_PCFG_MODEL
  end

  def test_word
    # The comparison is made from the Word's side, so Word#== is what runs.
    dog = StanfordParser::Word.new("edu.stanford.nlp.ling.Word", "dog")
    assert(dog == "dog")
  end
end # MiscPreprocessorTestCase
225 |
--------------------------------------------------------------------------------