├── .document
├── .gitignore
├── Gemfile
├── Gemfile.lock
├── LICENSE
├── README.orig.rdoc
├── README.rdoc
├── Rakefile
├── TESTS_STATUS.rdoc
├── VERSION.yml
├── examples
    └── stanford-sentence-parser.rb
├── lib
    ├── stanfordparser.rb
    └── stanfordparser
    │   └── java_object.rb
├── stanfordparser.gemspec
└── test
    └── test_stanfordparser.rb


/.document:
--------------------------------------------------------------------------------
1 | README.rdoc
2 | lib/**/*.rb
3 | bin/*
4 | features/**/*.feature
5 | LICENSE
6 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | ## MAC OS
 2 | .DS_Store
 3 | 
 4 | ## TEXTMATE
 5 | *.tmproj
 6 | tmtags
 7 | 
 8 | ## EMACS
 9 | *~
10 | \#*
11 | .\#*
12 | 
13 | ## VIM
14 | *.swp
15 | 
16 | ## RubyMine
17 | /.idea
18 | 
19 | ## PROJECT::GENERAL
20 | coverage
21 | rdoc
22 | pkg
23 | 
24 | ## PROJECT::SPECIFIC
25 | .bundle
26 | 


--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source :gemcutter
2 | gem "rjb"
3 | gem "rake"
4 | gem "jeweler"
5 | gem "treebank", ">= 3.0.0"
6 | gem "rspec", ">= 1.2.9"
7 | 


--------------------------------------------------------------------------------
/Gemfile.lock:
--------------------------------------------------------------------------------
 1 | --- 
 2 | dependencies: 
 3 |   rake: 
 4 |     group: 
 5 |     - :default
 6 |     version: ">= 0"
 7 |   rspec: 
 8 |     group: 
 9 |     - :default
10 |     version: ">= 1.2.9"
11 |   jeweler: 
12 |     group: 
13 |     - :default
14 |     version: ">= 0"
15 |   treebank: 
16 |     group: 
17 |     - :default
18 |     version: ">= 3.0.0"
19 |   rjb: 
20 |     group: 
21 |     - :default
22 |     version: ">= 0"
23 | specs: 
24 | - rake: 
25 |     version: 0.8.7
26 | - json_pure: 
27 |     version: 1.4.3
28 | - gemcutter: 
29 |     version: 0.5.0
30 | - git: 
31 |     version: 1.2.5
32 | - rubyforge: 
33 |     version: 2.0.4
34 | - jeweler: 
35 |     version: 1.4.0
36 | - rjb: 
37 |     version: 1.2.5
38 | - rspec: 
39 |     version: 1.3.0
40 | - treebank: 
41 |     version: 3.0.0
42 | hash: 264a823adfd7bb2231dd1037e95b74038b67283d
43 | sources: 
44 | - Rubygems: 
45 |     uri: http://gemcutter.org
46 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright 2007-2008 William Patrick McNeill
 2 | Copyright (c) 2010 John Wilkinson
 3 | 
 4 | This file is part of the Stanford Parser Ruby Wrapper.
 5 | 
 6 | The Stanford Parser Ruby Wrapper is free software; you can redistribute it
 7 | and/or modify it under the terms of the GNU General Public License as
 8 | published by the Free Software Foundation; either version 2 of the License,
 9 | or (at your option) any later version.
10 | 
11 | The Stanford Parser Ruby Wrapper is distributed in the hope that it will be
12 | useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
14 | Public License for more details.
15 | 
16 | You should have received a copy of the GNU General Public License along with
17 | editalign; if not, write to the Free Software Foundation, Inc., 51 Franklin
18 | St, Fifth Floor, Boston, MA 02110-1301 USA


--------------------------------------------------------------------------------
/README.orig.rdoc:
--------------------------------------------------------------------------------
  1 | = Stanford Natural Language Parser Wrapper
  2 | 
  3 | This module is a wrapper for the {Stanford Natural Language Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml].
  4 | 
  5 | The Stanford Natural Language Parser is a Java implementation of a probabilistic PCFG and dependency parser for English, German, Chinese, and Arabic.  This module provides a thin wrapper around the Java code to make it accessible from Ruby along with pure Ruby objects that enable standoff parsing.
  6 | 
  7 | 
  8 | = Installation and Configuration
  9 | 
 10 | In addition to the Ruby gems it requires, to run this module you must manually install the {Stanford Natural Language Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml].
 11 | 
 12 | This module expects the parser to be installed in the <tt>/usr/local/stanford-parser/current</tt> directory on UNIX platforms and in the <tt>C:\stanford-parser\current</tt> directory on Windows platforms.  This is the directory that contains the <tt>stanford-parser.jar</tt> file.  When the module is loaded, it adds this directory to the Java classpath and launches the Java VM with the arguments <tt>-server -Xmx150m</tt>.
 13 | 
 14 | These defaults can be overridden by creating the configuration file <tt>/etc/ruby-stanford-parser.yaml</tt> on UNIX platforms and <tt>C:\stanford-parser\ruby-stanford-parser.yaml</tt> on Windows platforms.  This file is in the Ruby YAML[http://ruby-doc.org/stdlib/libdoc/yaml/rdoc/index.html] format, and may contain two values: <tt>root</tt> and <tt>jvmargs</tt>. For example, the file might look like the following:
 15 | 
 16 | 	root: /usr/local/stanford-parser/other/location
 17 | 	jvmargs: -Xmx100m -verbose
 18 | 
 19 | 
 20 | = Tokenization and Parsing
 21 | 
 22 | Use the StanfordParser::DocumentPreprocessor class to tokenize text and files into sentences and words.
 23 | 
 24 | 	>> require "stanfordparser"
 25 | 	=> true
 26 | 	>> preproc = StanfordParser::DocumentPreprocessor.new
 27 | 	=> <DocumentPreprocessor>
 28 | 	>> puts preproc.getSentencesFromString("This is a sentence.  So is this.")
 29 | 	This is a sentence .
 30 | 	So is this .
 31 | 
 32 | Use the StanfordParser::LexicalizedParser class to parse sentences.
 33 | 
 34 | 	>> parser = StanfordParser::LexicalizedParser.new
 35 | 	Loading parser from serialized file /usr/local/stanford-parser/current/englishPCFG.ser.gz ... done [5.5 sec].
 36 | 	=> edu.stanford.nlp.parser.lexparser.LexicalizedParser
 37 | 	>> puts parser.apply("This is a sentence.")
 38 | 	(ROOT
 39 | 	  (S [24.917]
 40 | 	    (NP [6.139] (DT [2.300] This))
 41 | 	    (VP [17.636] (VBZ [0.144] is)
 42 | 	      (NP [12.299] (DT [1.419] a) (NN [8.897] sentence)))
 43 | 	    (. [0.002] .)))
 44 | 
 45 | For complete details about the use of these classes, see the documentation on the Stanford Natural Language Parser website.
 46 | 
 47 | 
 48 | = Standoff Tokenization and Parsing
 49 | 
 50 | This module also contains support for standoff tokenization and parsing, in which the terminal nodes of parse trees contain information about the text that was used to generate them.
 51 | 
 52 | Use the StanfordParser::StandoffDocumentPreprocessor class to tokenize text and files into sentences and words.
 53 | 
 54 | 	>> preproc = StanfordParser::StandoffDocumentPreprocessor.new
 55 | 	=> <StandoffDocumentPreprocessor>
 56 | 	>> s = preproc.getSentencesFromString("This is a sentence.  So is this.")
 57 | 	=> [This is a sentence., So is this.]
 58 | 
 59 | The standoff preprocessor returns StanfordParser::StandoffToken objects, which contain character offsets into the original text along with information about spacing characters that came before and after the token.
 60 | 
 61 |  	>> puts s
 62 | 	This [0,4]
 63 | 	is [5,7]
 64 | 	a [8,9]
 65 | 	sentence [10,18]
 66 | 	. [18,19]
 67 | 	So [21,23]
 68 | 	is [24,26]
 69 | 	this [27,31]
 70 | 	. [31,32]
 71 | 	>> "This is a sentence.  So is this."[27..31]
 72 | 	=> "this."
 73 | 
 74 | This is the same information contained in the <tt>edu.stanford.nlp.ling.FeatureLabel</tt> class in the Stanford Parser Java implementation.  
 75 | 
 76 | Similarly, use the StanfordParser::StandoffParsedText object to parse a block of text into StanfordParser::StandoffNode parse trees whose terminal nodes are StanfordParser::StandoffToken objects.
 77 | 
 78 | 	>> t = StanfordParser::StandoffParsedText.new("This is a sentence.  So is this.")
 79 | 	Loading parser from serialized file /usr/local/stanford-parser/current/englishPCFG.ser.gz ... done [4.9 sec].
 80 | 	=> <StanfordParser::StandoffParsedText, 2 sentences>
 81 | 	>> puts t.first
 82 | 	(ROOT
 83 | 	  (S
 84 | 	    (NP (DT This [0,4]))
 85 | 	    (VP (VBZ is [5,7])
 86 | 	      (NP (DT a [8,9]) (NN sentence [10,18])))
 87 | 	    (. . [18,19])))
 88 | 
 89 | Standoff parse trees can reproduce the text from which they were generated verbatim.
 90 | 
 91 | 	>> t.first.to_original_string
 92 | 	=> "This is a sentence.  "
 93 | 
 94 | They can also reproduce the original text with brackets inserted around the yields of specified parse nodes.
 95 | 
 96 | 	>> t.first.to_bracketed_string([[0,0,0], [0,1,1]])
 97 | 	=> "[This] is [a sentence].  "
 98 | 
 99 | The format of the coordinates used to specify individual nodes is described in the documentation for the Ruby Treebank[http://rubyforge.org/projects/treebank/] gem.
100 | 
101 | See the documentation of the individual classes in this module for more details.
102 | 
103 | Unlike their parents StanfordParser::DocumentPreprocessor and StanfordParser::LexicalizedParser, which produce Ruby wrappers around Java objects, StanfordParser::StandoffDocumentPreprocessor and StanfordParser::StandoffParsedText produce pure Ruby objects.  This is to facilitate serialization of these objects using tools like the Marshal module, which cannot serialize Java objects.
104 | 
105 | = History
106 | 
107 | 1.0.0:: Initial release
108 | 1.1.0:: Make module initialization function private.  Add example code.
109 | 1.2.0:: Read Java VM arguments from the configuration file.  Add Word class.
110 | 2.0.0:: Add support for standoff parsing.  Change the way Rjb::JavaObjectWrapper wraps returned values: see wrap_java_object for details.  Rjb::JavaObjectWrapper supports static members.  Minor changes to stanford-sentence-parser script.
111 | 2.1.0:: Different default paths for Windows machines; Minor changes to StandoffToken definition
112 | 2.2.0:: Add parent information to StandoffNode
113 | 
114 | = Copyright
115 | 
116 | Copyright 2007-2008, William Patrick McNeill
117 | 
118 | This program is distributed under the GNU General Public License.
119 | 
120 | 
121 | = Author
122 | 
123 | W.P. McNeill mailto:billmcn@gmail.com


--------------------------------------------------------------------------------
/README.rdoc:
--------------------------------------------------------------------------------
 1 | = stanfordparser
 2 | 
 3 | This is an upload/extension of Bill McNeill's stanfordparser RubyForge gem; check it out at its homepage (which seems to be partially in French)
 4 | 
 5 | http://rubyforge.org/projects/stanfordparser/
 6 | 
 7 | or its rdocs
 8 | 
 9 | http://stanfordparser.rubyforge.org/
10 | 
11 | I've been having issues trying to use this gem so I decided to upload it to github and try to organize it to be a little more gem-friendly, especially using jeweler.
12 | 
13 | AFAIK there aren't other copies of this on github, please correct me if I'm mistaken. The only similar one I can see is http://github.com/tiendung/ruby-nlp which has much less code and I can only assume to be something else.
14 | 
15 | It seems like using version 1.6.1 of the Java StanfordParser package is your best bet for compatibility.
16 | 
17 | See README.orig.rdoc for Bill's readme, which includes dependencies, installation, and usage.
18 | 
19 | == Branches
20 | 
21 | * master - Jeweler and Bundler integrated along with slight reorganization of files to be more gem-standard. This is the branch you should use if you want to source the gem straight from github. I will leave this branch alone for the most part unless I find/come up with stable and useful additions. All changes will be backwards compatible.
22 | * stock - Almost untouched from Bill's version, except for the README. Use this branch if that's what you're looking for.
23 | * fixing_tests - The tests are currently broken, this branch is trying to address that. Once the tests are fixed it will be merged back into master. Help appreciated! I'll keep a TESTS_STATUS.rdoc keeping track of progress.
24 | * experimental - I'll be putting in some code as examples and testing out some ideas. Do not use this branch as a gem. You are very encouraged, however, to fork it and add some code/make my code better. I'll try to integrate all the pull requests I get, if not in that branch into another. 
25 | 
26 | == Note on Patches/Pull Requests
27 |  
28 | * Fork the project.
29 | * Make your feature addition or bug fix.
30 | * Add tests for it. I would prefer rSpec, but TestUnit is acceptable as well since there are some of those from the original author.
31 | * Commit.
32 | * Send me a pull request. Bonus points for topic branches.
33 | 
34 | == Copyright
35 | 
36 | Copyright (c) 2010 John Wilkinson. See LICENSE for details.
37 | Copyright 2007-2008, William Patrick McNeill. See README.orig.rdoc for details.
38 | 


--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
 1 | require 'rubygems'
 2 | require 'rake'
 3 | 
 4 | $LOAD_PATH.unshift('lib')
 5 | 
 6 | begin
 7 |   require 'jeweler'
 8 |   Jeweler::Tasks.new do |gem|
 9 |     gem.name = "stanfordparser"
10 |     gem.summary = "GitHub upload/extension of Bill McNeal's stanfordparser rubygem"
11 |     gem.description = "Ruby wrapper of the Stanford Parser, a NLP parser built in Java."
12 |     gem.email = "jcwilk@gmail.com"
13 |     gem.homepage = "http://github.com/jcwilk/stanfordparser"
14 |     gem.authors = ["John Wilkinson","Bill McNeal"]
15 | 
16 |     gem.add_dependency "rjb", ">= 1.2.5"
17 |     gem.add_dependency "treebank", ">= 3.0.0"
18 |     gem.add_development_dependency "rspec", ">= 1.2.9"
19 |   end
20 |   Jeweler::GemcutterTasks.new
21 | rescue LoadError
22 |   puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
23 | end
24 | 
25 | require 'rake/testtask'
26 | Rake::TestTask.new(:test) do |test|
27 |   test.test_files = FileList.new('test/**/test_*.rb') do |list|
28 |     list.exclude 'test/test_helper.rb'
29 |   end
30 |   test.libs << 'test'
31 |   test.verbose = true
32 | end
33 | 
34 | require 'spec/rake/spectask'
35 | Spec::Rake::SpecTask.new(:spec) do |spec|
36 |   spec.libs << 'lib' << 'spec'
37 |   spec.spec_files = FileList['spec/**/*_spec.rb']
38 | end
39 | 
40 | Spec::Rake::SpecTask.new(:rcov) do |spec|
41 |   spec.libs << 'lib' << 'spec'
42 |   spec.pattern = 'spec/**/*_spec.rb'
43 |   spec.rcov = true
44 | end
45 | 
46 | task :test => :check_dependencies
47 | 
48 | task :spec => :check_dependencies
49 | 
50 | task :default => :test
51 | 
52 | require 'rake/rdoctask'
53 | Rake::RDocTask.new do |rdoc|
54 |   version = File.exist?('VERSION') ? File.read('VERSION') : ""
55 | 
56 |   rdoc.rdoc_dir = 'rdoc'
57 |   rdoc.title = "stanfordparser #{version}"
58 |   rdoc.rdoc_files.include('README*')
59 |   rdoc.rdoc_files.include('lib/**/*.rb')
60 | end
61 | 


--------------------------------------------------------------------------------
/TESTS_STATUS.rdoc:
--------------------------------------------------------------------------------
1 | = Status of Tests
2 | * Please see the fixing_tests branch for efforts towards this goal.
3 | 


--------------------------------------------------------------------------------
/VERSION.yml:
--------------------------------------------------------------------------------
1 | ---
2 | :major: 2
3 | :minor: 2
4 | :patch: 1
5 | :build: s


--------------------------------------------------------------------------------
/examples/stanford-sentence-parser.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | 
 3 | #--
 4 | 
 5 | # Copyright 2007-2008 William Patrick McNeill
 6 | #
 7 | # This file is part of the Stanford Parser Ruby Wrapper.
 8 | #
 9 | # The Stanford Parser Ruby Wrapper is free software; you can redistribute it
10 | # and/or modify it under the terms of the GNU General Public License as
11 | # published by the Free Software Foundation; either version 2 of the License,
12 | # or (at your option) any later version.
13 | #
14 | # The Stanford Parser Ruby Wrapper is distributed in the hope that it will be
15 | # useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
17 | # Public License for more details.
18 | #
19 | # You should have received a copy of the GNU General Public License along with
20 | # editalign; if not, write to the Free Software Foundation, Inc., 51 Franklin
21 | # St, Fifth Floor, Boston, MA 02110-1301 USA
22 | #
23 | #++
24 | 
25 | # == Synopsis
26 | #
27 | # Parse a sentence passed in on the command line.
28 | #
29 | # == Usage
30 | #
31 | # stanford-sentence-parser.rb [options] sentence
32 | #
33 | # options::
34 | #    See the Java Stanford Parser documentation for details
35 | #
36 | # sentence::
37 | #    A sentence to parse.  This must appear after all the options and be quoted.
38 | 
require 'rubygems'
require "stanfordparser"

# The last argument is the sentence.  The rest of the command line is passed
# along to the parser object.
sentence = ARGV.pop

# Without a sentence there is nothing to parse.  Stop with a usage message
# before constructing the parser instead of handing nil to it.
abort "Usage: #{File.basename($0)} [options] sentence" if sentence.nil?

parser = StanfordParser::LexicalizedParser.new(StanfordParser::ENGLISH_PCFG_MODEL, ARGV)
puts parser.apply(sentence)
47 | 


--------------------------------------------------------------------------------
/lib/stanfordparser.rb:
--------------------------------------------------------------------------------
  1 | require 'rubygems'
  2 | 
  3 | require "pathname"
  4 | require "rjb"
  5 | require "singleton"
  6 | begin
  7 |   require "treebank"
  8 |   gem "treebank", ">= 3.0.0"
  9 | rescue LoadError
 10 |   require "treebank"
 11 | end
 12 | require "yaml"
 13 | 
 14 | # Wrapper for the {Stanford Natural Language
 15 | # Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml].
 16 | module StanfordParser
 17 | 
 18 |   require "stanfordparser/java_object"
 19 | 
 20 |   VERSION = "2.2.1"
 21 | 
 22 |   # The default sentence segmenter and tokenizer.  This is an English-language
 23 |   # tokenizer with support for Penn Treebank markup.
 24 |   EN_PENN_TREEBANK_TOKENIZER = "edu.stanford.nlp.process.PTBTokenizer"
 25 | 
 26 |   # Path to an English PCFG model that comes with the Stanford Parser.  The
 27 |   # location is relative to the parser root directory.  This is a valid value
 28 |   # for the <em>grammar</em> parameter of the LexicalizedParser constructor.
 29 |   ENGLISH_PCFG_MODEL = "$(ROOT)/englishPCFG.ser.gz"
 30 | 
 31 |   # This function is executed once when the module is loaded.  It initializes
 32 |   # the Java virtual machine in which the Stanford parser will run.  By
 33 |   # default, it adds the parser installation root to the Java classpath and
 34 |   # launches the VM with the arguments <tt>-server -Xmx150m</tt>.  Different
 35 |   # values may be specified with the <tt>ruby-stanford-parser.yaml</tt>
 36 |   # configuration file.
 37 |   #
 38 |   # This function determines which operating system we are running on and sets
 39 |   # default pathnames accordingly:
 40 |   #
 41 |   # UNIX:: /usr/local/stanford-parser/current, /etc/ruby-stanford-parser.yaml
 42 |   # Windows:: C:\stanford-parser\current,
 43 |   #           C:\stanford-parser\ruby-stanford-parser.yaml
 44 |   #
 45 |   # This function returns the path of the parser installation root.
 46 |   def StanfordParser.initialize_on_load
 47 |     if RUBY_PLATFORM =~ /(win|w)32$/
 48 |       root = Pathname.new("C:\\stanford-parser\\current ")
 49 |       config = Pathname.new("C:\\stanford-parser\\ruby-stanford-parser.yaml")
 50 |     else
 51 |       root = Pathname.new("/usr/local/stanford-parser/current")
 52 |       config = Pathname.new("/etc/ruby-stanford-parser.yaml")
 53 |     end
 54 |     jvmargs = ["-server", "-Xmx150m"]
 55 |     if config.file?
 56 |       configuration = open(config) {|f| YAML.load(f)}
 57 |       if configuration.key?("root") and not configuration["root"].nil?
 58 |         root = Pathname.new(configuration["root"])
 59 |       end
 60 |       if configuration.key?("jvmargs") and not configuration["jvmargs"].nil?
 61 |         jvmargs = configuration["jvmargs"].split
 62 |       end
 63 |     end
 64 |     Rjb::load(classpath = (root + "stanford-parser.jar").to_s, jvmargs)
 65 |     root
 66 |   end
 67 | 
 68 |   private_class_method :initialize_on_load
 69 | 
 70 |   # The root directory of the Stanford parser installation.
 71 |   ROOT = initialize_on_load
 72 | 
 73 |   #--
 74 |   # The documentation below is for the original Rjb::JavaObjectWrapper object.
 75 |   # It is reproduced here because rdoc only takes the last document block
 76 |   # defined.  If Rjb is moved into its own gem, this documentation should go
 77 |   # with it, and the following should be written as documentation for this
 78 |   # class:
 79 |   #
 80 |   # Extension of the generic Ruby-Java Bridge wrapper object for the
 81 |   # StanfordParser module.
 82 |   #++
 83 |   # A generic wrapper for a Java object loaded via the {Ruby-Java
 84 |   # Bridge}[http://rjb.rubyforge.org/].  The wrapper class handles
 85 |   # initialization and stringification, and passes other method calls down to
 86 |   # the underlying Java object.  Objects returned by the underlying Java
 87 |   # object are converted to the appropriate Ruby object.
 88 |   #
 89 |   # Other modules may extend the list of Java objects that are converted by
 90 |   # adding their own converter functions.  See wrap_java_object for details.
 91 |   #
 92 |   # This object is enumerable, yielding items in the order defined by the
 93 |   # underlying Java object's iterator.
 94 |   class Rjb::JavaObjectWrapper
 95 |     # FeatureLabel objects go inside a FeatureLabel wrapper.
 96 |     def wrap_edu_stanford_nlp_ling_FeatureLabel(object)
 97 |       StanfordParser::FeatureLabel.new(object)
 98 |     end
 99 | 
100 |     # Tree objects go inside a Tree wrapper.  Various tree types are aliased
101 |     # to this function.
102 |     def wrap_edu_stanford_nlp_trees_Tree(object)
103 |       Tree.new(object)
104 |     end
105 | 
106 |     alias :wrap_edu_stanford_nlp_trees_LabeledScoredTreeLeaf :wrap_edu_stanford_nlp_trees_Tree
107 |     alias :wrap_edu_stanford_nlp_trees_LabeledScoredTreeNode :wrap_edu_stanford_nlp_trees_Tree
108 |     alias :wrap_edu_stanford_nlp_trees_SimpleTree            :wrap_edu_stanford_nlp_trees_Tree
109 |     alias :wrap_edu_stanford_nlp_trees_TreeGraphNode         :wrap_edu_stanford_nlp_trees_Tree
110 | 
111 |     protected :wrap_edu_stanford_nlp_trees_Tree, :wrap_edu_stanford_nlp_ling_FeatureLabel
112 |   end # Rjb::JavaObjectWrapper
113 | 
114 | 
115 |   # Lexicalized probabalistic parser.
116 |   #
117 |   # This is an wrapper for the
118 |   # <tt>edu.stanford.nlp.parser.lexparser.LexicalizedParser</tt> object.
119 |   class LexicalizedParser < Rjb::JavaObjectWrapper
120 |     # The grammar used by the parser
121 |     attr_reader :grammar
122 | 
123 |     # Create the parser given a grammar and options.  The <em>grammar</em>
124 |     # argument is a path to a grammar file.  This path may contain the string
125 |     # <tt>$(ROOT)</tt>, which will be replaced with the root directory of the
126 |     # Stanford Parser. By default, an English PCFG grammar is loaded.
127 |     #
128 |     # The <em>options</em> argument is a list of string arguments as they
129 |     # would appear on a command line.  See the documentaion of
130 |     # <tt>edu.stanford.nlp.parser.lexparser.Options.setOptions</tt> for more
131 |     # details.
132 |     def initialize(grammar = ENGLISH_PCFG_MODEL, options = [])
133 |       @grammar = Pathname.new(grammar.gsub(/\$\(ROOT\)/, ROOT))
134 |       super("edu.stanford.nlp.parser.lexparser.LexicalizedParser", @grammar.to_s)
135 |       @java_object.setOptionFlags(options)
136 |     end
137 | 
138 |     def to_s
139 |       "LexicalizedParser(#{grammar.basename})"
140 |     end
141 |   end # LexicalizedParser
142 | 
143 | 
144 |   # A singleton instance of the default Stanford Natural Language parser.  A
145 |   # singleton is used because the parser can take a few seconds to load.
146 |   class DefaultParser < StanfordParser::LexicalizedParser
147 |     include Singleton
148 |   end
149 | 
150 | 
151 |   # This is a wrapper for
152 |   # <tt>edu.stanford.nlp.trees.Tree</tt> objects.  It customizes
153 |   # stringification.
154 |   class Tree < Rjb::JavaObjectWrapper
155 |     def initialize(obj = "edu.stanford.nlp.trees.Tree")
156 |       super(obj)
157 |     end
158 | 
159 |     # Return the label along with the score if there is one.
160 |     def inspect
161 |       s = "#{label}" + (score.nan? ? "" : " [#{sprintf '%.2f', score}]")
162 |       "(#{s})"
163 |     end
164 | 
165 |     # The Penn treebank representation.  This prints with indenting instead of
166 |     # putting everything on one line.
167 |     def to_s
168 |       "#{pennString}"
169 |     end
170 |   end # Tree
171 | 
172 | 
173 |   # This is a wrapper for
174 |   # <tt>edu.stanford.nlp.ling.Word</tt> objects.  It customizes
175 |   # stringification and adds an equivalence operator.
176 |   class Word < Rjb::JavaObjectWrapper
177 |     def initialize(obj = "edu.stanford.nlp.ling.Word", *args)
178 |       super(obj, *args)
179 |     end
180 | 
181 |     # See the word values.
182 |     def inspect
183 |       to_s
184 |     end
185 | 
186 |     # Equivalence is defined relative to the word value.
187 |     def ==(other)
188 |       word == other
189 |     end
190 |   end # Word
191 | 
192 | 
193 |   # This is a wrapper for <tt>edu.stanford.nlp.ling.FeatureLabel</tt> objects.
194 |   # It customizes stringification.
195 |   class FeatureLabel < Rjb::JavaObjectWrapper
196 |     def initialize(obj = "edu.stanford.nlp.ling.FeatureLabel")
197 |       super
198 |     end
199 | 
200 |     # Stringify with just the token and its begin and end position.
201 |     def to_s
202 |       # BUGBUG The position values come back as java.lang.Integer though I
203 |       # would expect Rjb to convert them to Ruby integers.
204 |       begin_position = get(self.BEGIN_POSITION_KEY)
205 |       end_position = get(self.END_POSITION_KEY)
206 |       "#{current} [#{begin_position},#{end_position}]"
207 |     end
208 | 
209 |     # More verbose stringification with all the fields and their values.
210 |     def inspect
211 |       toString
212 |     end
213 |   end
214 | 
215 | 
216 |   # Tokenizes documents into words and sentences.
217 |   #
218 |   # This is a wrapper for the
219 |   # <tt>edu.stanford.nlp.process.DocumentPreprocessor</tt> object.
220 |   class DocumentPreprocessor < Rjb::JavaObjectWrapper
221 |     def initialize(suppressEscaping = false)
222 |       super("edu.stanford.nlp.process.DocumentPreprocessor", suppressEscaping)
223 |     end
224 | 
225 |     # Returns a list of sentences in a string.
226 |     def getSentencesFromString(s)
227 |       s = Rjb::JavaObjectWrapper.new("java.io.StringReader", s)
228 |       _invoke(:getSentencesFromText, "Ljava.io.Reader;", s.java_object)
229 |     end
230 |     
231 |     def inspect
232 |       "<#{self.class.to_s.split('::').last}>"
233 |     end
234 |     
235 |     def to_s
236 |       inspect
237 |     end
238 |   end # DocumentPreprocessor
239 | 
240 |   # A text token that contains raw and normalized token identity (.e.g "(" and
241 |   # "-LRB-"), an offset span, and the characters immediately preceding and
242 |   # following the token.  Given a list of these objects it is possible to
243 |   # recreate the text from which they came verbatim.
244 |   class StandoffToken < Struct.new(:current, :word, :before, :after,
245 |                                    :begin_position, :end_position)
246 |     def to_s
247 |       "#{current} [#{begin_position},#{end_position}]"
248 |     end
249 |   end
250 | 
251 | 
252 |   # A preprocessor that segments text into sentences and tokens that contain
253 |   # character offset and token context information that can be used for
254 |   # standoff annotation.
255 |   class StandoffDocumentPreprocessor < DocumentPreprocessor
256 |     def initialize(tokenizer = EN_PENN_TREEBANK_TOKENIZER)
257 |       # PTBTokenizer.factory is a static function, so use RJB to call it
258 |       # directly instead of going through a JavaObjectWrapper.  We do it this
259 |       # way because the Standford parser Java code does not provide a
260 |       # constructor that allows you to specify the second parameter,
261 |       # invertible, to true, and we need this to write character offset
262 |       # information into the tokens.
263 |       ptb_tokenizer_class = Rjb::import(tokenizer)
264 |       # See the documentation for
265 |       # <tt>edu.stanford.nlp.process.DocumentPreprocessor</tt> for a
266 |       # description of these parameters.
267 |       ptb_tokenizer_factory = ptb_tokenizer_class.factory(false, true, false)
268 |       super(ptb_tokenizer_factory)
269 |     end
270 | 
271 |     # Returns a list of sentences in a string.  This wraps the returned
272 |     # sentences in a StandoffSentence object.
273 |     def getSentencesFromString(s)
274 |       super(s).map!{|s| StandoffSentence.new(s)}
275 |     end
276 |   end
277 | 
278 | 
279 |   # A sentence is an array of StandoffToken objects.
280 |   class StandoffSentence < Array
281 |     # Construct an array of StandoffToken objects from a Java list sentence
282 |     # object returned by the preprocessor.
283 |     def initialize(stanford_parser_sentence)
284 |       # Convert FeatureStructure wrappers to StandoffToken objects.
285 |       s = stanford_parser_sentence.to_a.collect do |fs|
286 |         current = fs.current
287 |         word = fs.word
288 |         before = fs.before
289 |         after = fs.after
290 |         # The to_s.to_i is necessary because the get function returns
291 |         # java.lang.Integer objects instead of Ruby integers.
292 |         begin_position = fs.get(fs.BEGIN_POSITION_KEY).to_s.to_i
293 |         end_position = fs.get(fs.END_POSITION_KEY).to_s.to_i
294 |         StandoffToken.new(current, word, before, after,
295 |                           begin_position, end_position)
296 |       end
297 |       super(s)
298 |     end
299 | 
300 |     # Return the original string verbatim.
301 |     def to_s
302 |       self[0..-2].inject(""){|s, word| s + word.current + word.after} + last.current
303 |     end
304 | 
305 |     # Return the original string verbatim.
306 |     def inspect
307 |       to_s
308 |     end
309 |   end
310 | 
311 | 
312 |   # Standoff syntactic annotation of natural language text which may contain
313 |   # multiple sentences.
314 |   #
315 |   # This is an Array of StandoffNode objects, one for each sentence in the
316 |   # text.
317 |   class StandoffParsedText < Array
318 |     # Parse the text and create the standoff annotation.
319 |     #
320 |     # The default parser is a singleton instance of the English language
321 |     # Stanford Natural Langugage parser.  There may be a delay of a few
322 |     # seconds for it to load the first time it is created.
323 |     def initialize(text, nodetype = StandoffNode,
324 |                    tokenizer = EN_PENN_TREEBANK_TOKENIZER,
325 |                    parser = DefaultParser.instance)
326 |       preprocessor = StandoffDocumentPreprocessor.new(tokenizer)
327 |       # Segment the text into sentences.  Parse each sentence, writing
328 |       # standoff annotation information into the terminal nodes.
329 |       preprocessor.getSentencesFromString(text).map do |sentence|
330 |         parse = parser.apply(sentence.to_s)
331 |         push(nodetype.new(parse, sentence))
332 |       end
333 |     end
334 | 
335 |     # Print class name and number of sentences.
336 |     def inspect
337 |       "<#{self.class.name}, #{length} sentences>"
338 |     end
339 | 
340 |     # Print parses.
341 |     def to_s
342 |       flatten.join(" ")
343 |     end
344 |   end
345 | 
346 | 
347 |   # Standoff syntactic tree annotation of text.  Terminal nodes are labeled
348 |   # with the appropriate StandoffToken objects.  Standoff parses can reproduce
349 |   # the original string from which they were generated verbatim, optionally
350 |   # with brackets around the yields of specified non-terminal nodes.
351 |   class StandoffNode < Treebank::ParentedNode
352 |     # Create the standoff tree from a tree returned by the Stanford parser.
353 |     # For non-terminal nodes, the <em>tokens</em> argument will be a
354 |     # StandoffSentence containing the StandoffToken objects representing all
355 |     # the tokens beneath and after this node.  For terminal nodes, the
356 |     # <em>tokens</em> argument will be a StandoffToken.
357 |     def initialize(stanford_parser_node, tokens)
358 |       # Label this node with the parser node's value when given a whole
359 |       # StandoffSentence, otherwise with the StandoffToken itself.
360 |       super(tokens.instance_of?(StandoffSentence) ?
361 |             stanford_parser_node.value : tokens)
362 |       # Build the child subtrees recursively.  Each leaf child takes the next
363 |       # token with shift, so the shared token list is consumed left-to-right.
364 |       stanford_parser_node.children.each do |child|
365 |         subtree = self.class.new(child, child.leaf? ? tokens.shift : tokens)
366 |         attach_child!(subtree)
367 |       end
368 |     end
369 | 
370 |     # Return the original text under this node, trailing spacing included.
371 |     def to_original_string
372 |       leaves.inject("") do |s, leaf|
373 |         s += leaf.label.current + leaf.label.after
374 |       end
375 |     end
376 | 
377 |     # Return the original string with brackets around word spans dominated by
378 |     # the specified constituents.
379 |     #
380 |     # The constituents to bracket are specified by passing a list of node
381 |     # coordinates, which are arrays of integers of the form returned by the
382 |     # tree enumerators of Treebank::Node objects.
383 |     #
384 |     # _coords_:: the coordinates of the nodes around which to place brackets
385 |     # _open_:: the open bracket symbol
386 |     # _close_:: the close bracket symbol
387 |     def to_bracketed_string(coords, open = "[", close = "]")
388 |       # Collect the [node, coordinates] pairs of all the leaf nodes.
389 |       items = depth_first_enumerator(true).find_all {|n| n.first.leaf?}
390 |       # Enumerate over all the matching constituents inserting open and close
391 |       # brackets around their yields in the items list.
392 |       coords.each do |matching|
393 |         # Insert using a simple state machine with three states: :start,
394 |         # :open, and :close.
395 |         state = :start
396 |         # Scan the items for leaves in the yield of the matching constituent.
397 |         # Note that the list is modified in place while it is enumerated.
398 |         items.each_with_index do |item, index|
399 |           # Skip inserted bracket characters.
400 |           next if item.is_a? String
401 |           # Handle terminal node items with the state machine.
402 |           node, terminal_coordinate = item
403 |           if state == :start
404 |             next if not in_yield?(matching, terminal_coordinate)
405 |             items.insert(index, open)
406 |             state = :open
407 |           else # state == :open
408 |             next if in_yield?(matching, terminal_coordinate)
409 |             items.insert(index, close)
410 |             state = :close
411 |             break
412 |           end
413 |         end # items.each_with_index
414 |         # Handle the case where a matching constituent is flush with the end
415 |         # of the sentence.
416 |         items << close if state == :open
417 |       end # each
418 |       # Replace terminal nodes with their string representations.  Insert
419 |       # spacing characters in the list.
420 |       items.each_with_index do |item, index|
421 |         next if item.is_a? String
422 |         text = item.first.label.current
423 |         spacing = item.first.label.after
424 |         # Replace the terminal node with its text.
425 |         items[index] = text
426 |         # Insert the spacing that comes after this text before the first
427 |         # item that is not a close bracket.
428 |         close_pos = find_index(items[index+1..-1]) {|item| not item == close}
429 |         items.insert(index + close_pos + 1, spacing)
430 |       end
431 |       items.join
432 |     end # to_bracketed_string
433 | 
434 |     # Find the index of the first item in _list_ for which _block_ is true.
435 |     # Return 0 if no items are found (indistinguishable from a match at 0).
436 |     def find_index(list, &block)
437 |       list.each_with_index do |item, index|
438 |         return index if block.call(item)
439 |       end
440 |       0
441 |     end
442 | 
443 |     # Is the node at _terminal_ in the yield of the node at _node_?
444 |     def in_yield?(node, terminal)
445 |       # If node A's coordinates match the prefix of node B's coordinates, node
446 |       # B is in the yield of node A.
447 |       terminal.first(node.length) == node
448 |     end
449 | 
450 |     private :in_yield?, :find_index
451 |   end # StandoffNode
452 | 
453 | end # StanfordParser
454 | 


--------------------------------------------------------------------------------
/lib/stanfordparser/java_object.rb:
--------------------------------------------------------------------------------
  1 | # Copyright 2007-2008 William Patrick McNeill
  2 | #
  3 | # This file is part of the Stanford Parser Ruby Wrapper.
  4 | #
  5 | # The Stanford Parser Ruby Wrapper is free software; you can redistribute it
  6 | # and/or modify it under the terms of the GNU General Public License as
  7 | # published by the Free Software Foundation; either version 2 of the License,
  8 | # or (at your option) any later version.
  9 | #
 10 | # The Stanford Parser Ruby Wrapper is distributed in the hope that it will be
 11 | # useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
 13 | # Public License for more details.
 14 | #
 15 | # You should have received a copy of the GNU General Public License along with
 16 | # editalign; if not, write to the Free Software Foundation, Inc., 51 Franklin
 17 | # St, Fifth Floor, Boston, MA 02110-1301 USA
 18 | 
 19 | # Extenions to the {Ruby-Java Bridge}[http://rjb.rubyforge.org/] module that
 20 | # add a generic Java object wrapper class.
 21 | module Rjb
 22 | 
 23 |   #--
 24 |   # The documentation for this class appears next to its extension inside the
 25 |   # StanfordParser module in stanfordparser.rb.  This should be changed if Rjb
 26 |   # is ever moved into its own gem.  See the documention in stanfordparser.rb
 27 |   # for more details.  
 28 |   #++
 29 |   class JavaObjectWrapper
 30 |     include Enumerable
 31 | 
 32 |     # The underlying Java object.
 33 |     attr_reader :java_object
 34 | 
 35 |     # Initialize with a Java object <em>obj</em>.  If <em>obj</em> is a
 36 |     # String, treat it as a Java class name and instantiate it.  Otherwise,
 37 |     # treat <em>obj</em> as an instance of a Java object.
 38 |     def initialize(obj, *args)
 39 |       @java_object = obj.class == String ?
 40 |       Rjb::import(obj).send(:new, *args) : obj
 41 |     end
 42 | 
 43 |     # Enumerate all the items in the object using its iterator.  If the object
 44 |     # has no iterator, this function yields nothing.
 45 |     def each
 46 |       if @java_object.getClass.getMethods.any? {|m| m.getName == "iterator"}
 47 |         i = @java_object.iterator
 48 |         while i.hasNext
 49 |           yield wrap_java_object(i.next)
 50 |         end
 51 |       end
 52 |     end # each
 53 | 
 54 |     # Reflect unhandled method calls to the underlying Java object and wrap
 55 |     # the return value in the appropriate Ruby object.
 56 |     def method_missing(m, *args)
 57 |       begin
 58 |         wrap_java_object(@java_object.send(m, *args))
 59 |       rescue RuntimeError => e
 60 |         # The instance method failed.  See if this is a static method.
 61 |         if not e.message.match(/^Fail: unknown method name/).nil?
 62 |           getClass.send(m, *args)
 63 |         end
 64 |       end
 65 |     end
 66 | 
 67 |     # Convert a value returned by a call to the underlying Java object to the
 68 |     # appropriate Ruby object.
 69 |     #
 70 |     # If the value is a JavaObjectWrapper, convert it using a protected
 71 |     # function with the name wrap_ followed by the underlying object's
 72 |     # classname with the Java path delimiters converted to underscores. For
 73 |     # example, a <tt>java.util.ArrayList</tt> would be converted by a function
 74 |     # called wrap_java_util_ArrayList.
 75 |     #
 76 |     # If the value lacks the appropriate converter function, wrap it in a
 77 |     # generic JavaObjectWrapper.
 78 |     #
 79 |     # If the value is not a JavaObjectWrapper, return it unchanged.
 80 |     #
 81 |     # This function is called recursively for every element in an Array.
 82 |     def wrap_java_object(object)
 83 |       if object.kind_of?(Array)
 84 |         object.collect {|item| wrap_java_object(item)}
 85 |       elsif object.respond_to?(:_classname)
 86 |         # Ruby-Java Bridge Java objects all have a _classname member which
 87 |         # tells the name of their Java class.  Convert this to the
 88 |         # corresponding wrapper function name.
 89 |         wrapper_name = ("wrap_" + object._classname.gsub(/\./, "_")).to_sym
 90 |         respond_to?(wrapper_name) ? send(wrapper_name, object) : JavaObjectWrapper.new(object)
 91 |       else
 92 |         object
 93 |       end
 94 |     end
 95 | 
 96 |     # Convert <tt>java.util.ArrayList</tt> objects to Ruby Array objects.
 97 |     def wrap_java_util_ArrayList(object)
 98 |       array_list = []
 99 |       object.size.times do
100 |         |i| array_list << wrap_java_object(object.get(i))
101 |       end
102 |       array_list
103 |     end
104 | 
105 |     # Convert <tt>java.util.HashSet</tt> objects to Ruby Set objects.
106 |     def wrap_java_util_HashSet(object)
107 |       set = Set.new
108 |       i = object.iterator
109 |       while i.hasNext
110 |         set << wrap_java_object(i.next)
111 |       end
112 |       set
113 |     end
114 | 
115 |     # Show the classname of the underlying Java object.
116 |     def inspect
117 |       "<#{@java_object._classname}>"
118 |     end
119 | 
120 |     # Use the underlying Java object's stringification.
121 |     def to_s
122 |       toString
123 |     end
124 | 
125 |     protected :wrap_java_object, :wrap_java_util_ArrayList, :wrap_java_util_HashSet
126 | 
127 |   end # JavaObjectWrapper
128 | 
129 | end # Rjb
130 | 


--------------------------------------------------------------------------------
/stanfordparser.gemspec:
--------------------------------------------------------------------------------
 1 | # Generated by jeweler
 2 | # DO NOT EDIT THIS FILE DIRECTLY
 3 | # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
 4 | # -*- encoding: utf-8 -*-
 5 | 
 6 | Gem::Specification.new do |s|
 7 |   s.name = %q{stanfordparser}
 8 |   s.version = "2.2.1.s"
 9 | 
10 |   s.required_rubygems_version = Gem::Requirement.new("> 1.3.1") if s.respond_to? :required_rubygems_version=
11 |   s.authors = ["John Wilkinson", "Bill McNeal"]
12 |   s.date = %q{2010-06-21}
13 |   s.description = %q{Ruby wrapper of the Stanford Parser, a NLP parser built in Java.}
14 |   s.email = %q{jcwilk@gmail.com}
15 |   s.extra_rdoc_files = [
16 |     "LICENSE",
17 |      "README.orig",
18 |      "README.rdoc"
19 |   ]
20 |   s.files = [ # NOTE(review): README.orig, spec/* and examples/connection_finder.rb are not in the repository tree; regenerate
21 |     ".document",
22 |      ".gitignore",
23 |      "LICENSE",
24 |      "README.orig",
25 |      "README.rdoc",
26 |      "Rakefile",
27 |      "VERSION.yml",
28 |      "examples/connection_finder.rb",
29 |      "examples/stanford-sentence-parser.rb",
30 |      "lib/stanfordparser.rb",
31 |      "lib/stanfordparser/java_object.rb",
32 |      "spec/spec.opts",
33 |      "spec/spec_helper.rb",
34 |      "spec/stanfordparser_spec.rb",
35 |      "test/test_stanfordparser.rb"
36 |   ]
37 |   s.homepage = %q{http://github.com/jcwilk/stanfordparser}
38 |   s.rdoc_options = ["--charset=UTF-8"]
39 |   s.require_paths = ["lib"]
40 |   s.rubygems_version = %q{1.3.7}
41 |   s.summary = %q{GitHub upload/extension of Bill McNeal's stanfordparser rubygem}
42 |   s.test_files = [
43 |     "spec/spec_helper.rb",
44 |      "spec/stanfordparser_spec.rb",
45 |      "test/test_stanfordparser.rb",
46 |      "examples/connection_finder.rb",
47 |      "examples/stanford-sentence-parser.rb"
48 |   ]
49 |   # Declare dependencies with the most specific API this RubyGems offers.
50 |   if s.respond_to? :specification_version then
51 |     current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION # assigned but not used below
52 |     s.specification_version = 3
53 | 
54 |     if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
55 |       s.add_runtime_dependency(%q<rjb>, [">= 1.2.5"])
56 |       s.add_runtime_dependency(%q<treebank>, [">= 3.0.0"])
57 |       s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
58 |     else
59 |       s.add_dependency(%q<rjb>, [">= 1.2.5"])
60 |       s.add_dependency(%q<treebank>, [">= 3.0.0"])
61 |       s.add_dependency(%q<rspec>, [">= 1.2.9"])
62 |     end
63 |   else
64 |     s.add_dependency(%q<rjb>, [">= 1.2.5"])
65 |     s.add_dependency(%q<treebank>, [">= 3.0.0"])
66 |     s.add_dependency(%q<rspec>, [">= 1.2.9"])
67 |   end
68 | end
69 | 
70 | 


--------------------------------------------------------------------------------
/test/test_stanfordparser.rb:
--------------------------------------------------------------------------------
  1 | #!/bin/env ruby
  2 | 
  3 | #--
  4 | 
  5 | # Copyright 2007-2008 William Patrick McNeill
  6 | #
  7 | # This file is part of the Stanford Parser Ruby Wrapper.
  8 | #
  9 | # The Stanford Parser Ruby Wrapper is free software; you can redistribute it
 10 | # and/or modify it under the terms of the GNU General Public License as
 11 | # published by the Free Software Foundation; either version 2 of the License,
 12 | # or (at your option) any later version.
 13 | #
 14 | # The Stanford Parser Ruby Wrapper is distributed in the hope that it will be
 15 | # useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
 17 | # Public License for more details.
 18 | #
 19 | # You should have received a copy of the GNU General Public License along with
 20 | # editalign; if not, write to the Free Software Foundation, Inc., 51 Franklin
 21 | # St, Fifth Floor, Boston, MA 02110-1301 USA
 22 | #
 23 | #++
 24 | 
 25 | # Test cases for the Stanford Parser module
 26 | 
 27 | require "test/unit"
 28 | require "set"
 29 | require "singleton"
 30 | require "stanfordparser"
 31 | 
 32 | # Tests for the default lexicalized parser and the trees it produces.
 33 | class LexicalizedParserTestCase < Test::Unit::TestCase
 34 |   def test_root_path
 35 |     assert_equal StanfordParser::ROOT.class, Pathname # NOTE(review): Test::Unit's order is (expected, actual)
 36 |   end
 37 | 
 38 |   def setup # Parse a sample sentence with the shared default parser.
 39 |     @parser = StanfordParser::DefaultParser.instance
 40 |     @tree = @parser.apply("This is a sentence.")
 41 |   end
 42 | 
 43 |   def test_parser
 44 |     assert_equal @parser.grammar, StanfordParser::ROOT + "englishPCFG.ser.gz"
 45 |     assert_equal @tree.class, StanfordParser::Tree
 46 |   end
 47 | 
 48 |   def test_localTrees
 49 |     # Exercises the conversion from java.util.HashSet objects to Ruby sets.
 50 |     # The five local trees carry four distinct labels (NP occurs twice).
 51 |     l = @tree.localTrees
 52 |     assert_equal l.size, 5
 53 |     assert_equal Set.new(l.collect {|t| "#{t.label}"}),
 54 |                  Set.new(["S", "NP", "VP", "ROOT", "NP"])
 55 |   end
 56 | 
 57 |   def test_enumerable
 58 |     # StanfordParser::LexicalizedParser is not an enumerable object, so map is [].
 59 |     assert_equal @parser.map, []
 60 |   end
 61 | end # LexicalizedParserTestCase
 62 | 
 63 | # Tests that a parse tree enumerates its nodes as wrapped Tree objects.
 64 | class TreeTestCase < Test::Unit::TestCase
 65 |   def setup # Parse a sample sentence with the shared default parser.
 66 |     @parser = StanfordParser::DefaultParser.instance
 67 |     @tree = @parser.apply("This is a sentence.")
 68 |   end
 69 | 
 70 |   def test_enumerable # Labels are expected in depth-first, parent-before-child order.
 71 |     assert @tree.all? {|n| n.class == StanfordParser::Tree}
 72 |     assert @tree.all? {|n|
 73 |       n._classname == "edu.stanford.nlp.trees.LabeledScoredTreeNode" or
 74 |       n._classname == "edu.stanford.nlp.trees.LabeledScoredTreeLeaf"
 75 |     }
 76 |     assert_equal @tree.map {|n| "#{n.label}"},
 77 |       ["ROOT", "S", "NP", "DT", "This", "VP", "VBZ", "is", "NP", "DT", "a", \
 78 |        "NN", "sentence", ".", "."]
 79 |   end
 80 | end # TreeTestCase
 81 | 
 82 | # Tests the key constants, put, and stringification of FeatureLabel.
 83 | class FeatureLabelTestCase < Test::Unit::TestCase
 84 |   def test_feature_label # Fill in three features, then check inspect and to_s.
 85 |     f = StanfordParser::FeatureLabel.new
 86 |     assert_equal "BEGIN_POS", f.BEGIN_POSITION_KEY
 87 |     f.put(f.BEGIN_POSITION_KEY, 3)
 88 |     assert_equal "END_POS", f.END_POSITION_KEY
 89 |     f.put(f.END_POSITION_KEY, 7)
 90 |     assert_equal "current", f.CURRENT_KEY
 91 |     f.put(f.CURRENT_KEY, "word")
 92 |     assert_equal "{BEGIN_POS=3, END_POS=7, current=word}", f.inspect
 93 |     assert_equal "word [3,7]", f.to_s
 94 |   end
 95 | end
 96 | 
 97 | # Tests sentence segmentation by the plain and standoff preprocessors.
 98 | class DocumentPreprocessorTestCase < Test::Unit::TestCase
 99 |   def setup # Create one preprocessor of each kind for every test.
100 |     @preproc = StanfordParser::DocumentPreprocessor.new
101 |     @standoff_preproc = StanfordParser::StandoffDocumentPreprocessor.new
102 |   end
103 | 
104 |   def test_get_sentences_from_string
105 |     # The following call exercises the conversion from java.util.ArrayList
106 |     # objects to Ruby arrays.
107 |     s = @preproc.getSentencesFromString("This is a sentence.  So is this.")
108 |     assert_equal "#{s[0]}", "This is a sentence ."
109 |     assert_equal "#{s[1]}", "So is this ."
110 |   end
111 | 
112 |   def test_enumerable
113 |     # StanfordParser::DocumentPreprocessor is not an enumerable object.
114 |     assert_equal @preproc.map, []
115 |   end
116 | 
117 |   # Segment and tokenize text containing two sentences.
118 |   def test_standoff_document_preprocessor
119 |     sentences = @standoff_preproc.getSentencesFromString("He (John) is tall.  So is she.")
120 |     # Recognize two sentences.
121 |     assert_equal 2, sentences.length
122 |     assert sentences.all? {|sentence| sentence.instance_of? StanfordParser::StandoffSentence}
123 |     assert_equal "He (John) is tall.", sentences.first.to_s
124 |     assert_equal 7, sentences.first.length
125 |     assert sentences[0].all? {|token| token.instance_of? StanfordParser::StandoffToken}
126 |     assert_equal "So is she.", sentences.last.to_s
127 |     assert_equal 4, sentences.last.length
128 |     assert sentences[1].all? {|token| token.instance_of? StanfordParser::StandoffToken}
129 |     # First sentence: positions are end-exclusive character offsets in the text.
130 |     assert_equal ["He", "He"], [sentences[0][0].current(), sentences[0][0].word()]
131 |     assert_equal [0,2],        [sentences[0][0].begin_position(), sentences[0][0].end_position()]
132 |     assert_equal ["(", "-LRB-"], [sentences[0][1].current(), sentences[0][1].word()]
133 |     assert_equal [3,4],          [sentences[0][1].begin_position(), sentences[0][1].end_position()]
134 |     assert_equal ["John", "John"], [sentences[0][2].current(), sentences[0][2].word()]
135 |     assert_equal [4,8],            [sentences[0][2].begin_position(), sentences[0][2].end_position()]
136 |     assert_equal [")", "-RRB-"], [sentences[0][3].current(), sentences[0][3].word()]
137 |     assert_equal [8,9],          [sentences[0][3].begin_position(), sentences[0][3].end_position()]
138 |     assert_equal ["is", "is"], [sentences[0][4].current(), sentences[0][4].word()]
139 |     assert_equal [10,12],      [sentences[0][4].begin_position(), sentences[0][4].end_position()]
140 |     assert_equal ["tall", "tall"], [sentences[0][5].current(), sentences[0][5].word()]
141 |     assert_equal [13,17],          [sentences[0][5].begin_position(), sentences[0][5].end_position()]
142 |     assert_equal [".", "."], [sentences[0][6].current(), sentences[0][6].word()]
143 |     assert_equal [17,18],    [sentences[0][6].begin_position(), sentences[0][6].end_position()]
144 |     # Second sentence: offsets continue from the start of the whole text.
145 |     assert_equal ["So", "So"], [sentences[1][0].current(), sentences[1][0].word()]
146 |     assert_equal [20,22],      [sentences[1][0].begin_position(), sentences[1][0].end_position()]
147 |     assert_equal ["is", "is"], [sentences[1][1].current(), sentences[1][1].word()]
148 |     assert_equal [23,25],      [sentences[1][1].begin_position(), sentences[1][1].end_position()]
149 |     assert_equal ["she", "she"], [sentences[1][2].current(), sentences[1][2].word()]
150 |     assert_equal [26,29],        [sentences[1][2].begin_position(), sentences[1][2].end_position()]
151 |     assert_equal [".", "."], [sentences[1][3].current(), sentences[1][3].word()]
152 |     assert_equal [29,30],    [sentences[1][3].begin_position(), sentences[1][3].end_position()]
153 |   end
154 | 
155 |   def test_stringification
156 |     assert_equal "<DocumentPreprocessor>", @preproc.inspect
157 |     assert_equal "<DocumentPreprocessor>", @preproc.to_s
158 |     assert_equal "<StandoffDocumentPreprocessor>", @standoff_preproc.inspect
159 |     assert_equal "<StandoffDocumentPreprocessor>", @standoff_preproc.to_s
160 |   end
161 | 
162 | end # DocumentPreprocessorTestCase
163 | 
164 | # Tests standoff parsing of a two-sentence text into trees of StandoffTokens.
165 | class StandoffParsedTextTestCase < Test::Unit::TestCase
166 |   def setup
167 |     @text = "He (John) is tall.  So is she."
168 |   end
169 | 
170 |   def test_parse_text_default_nodetype
171 |     parsed_text = StanfordParser::StandoffParsedText.new(@text)
172 |     verify_parsed_text(parsed_text, StanfordParser::StandoffNode)
173 |   end
174 | 
175 |   # Verify correct parsing with variable node types for text containing two sentences.
176 |   def verify_parsed_text(parsed_text, nodetype)
177 |     # Verify that there are two sentences.
178 |     assert_equal 2, parsed_text.length
179 |     assert parsed_text.all? {|sentence| sentence.instance_of? nodetype}
180 |     # Verify the tokens in the leaf nodes of the first sentence.
181 |     leaves = parsed_text[0].leaves.collect {|node| node.label}
182 |     assert_equal ["He", "He"], [leaves[0].current(), leaves[0].word()]
183 |     assert_equal [0,2],        [leaves[0].begin_position(), leaves[0].end_position()]
184 |     assert_equal ["(", "-LRB-"], [leaves[1].current(), leaves[1].word()]
185 |     assert_equal [3,4],          [leaves[1].begin_position(), leaves[1].end_position()]
186 |     assert_equal ["John", "John"], [leaves[2].current(), leaves[2].word()]
187 |     assert_equal [4,8],            [leaves[2].begin_position(), leaves[2].end_position()]
188 |     assert_equal [")", "-RRB-"], [leaves[3].current(), leaves[3].word()]
189 |     assert_equal [8,9],          [leaves[3].begin_position(), leaves[3].end_position()]
190 |     assert_equal ["is", "is"], [leaves[4].current(), leaves[4].word()]
191 |     assert_equal [10,12],      [leaves[4].begin_position(), leaves[4].end_position()]
192 |     assert_equal ["tall", "tall"], [leaves[5].current(), leaves[5].word()]
193 |     assert_equal [13,17],          [leaves[5].begin_position(), leaves[5].end_position()]
194 |     assert_equal [".", "."], [leaves[6].current(), leaves[6].word()]
195 |     assert_equal [17,18],    [leaves[6].begin_position(), leaves[6].end_position()]
196 |     # Verify the tokens in the leaf nodes of the second sentence.
197 |     leaves = parsed_text[1].leaves.collect {|node| node.label}
198 |     assert_equal ["So", "So"], [leaves[0].current(), leaves[0].word()]
199 |     assert_equal [20,22],      [leaves[0].begin_position(), leaves[0].end_position()]
200 |     assert_equal ["is", "is"], [leaves[1].current(), leaves[1].word()]
201 |     assert_equal [23,25],      [leaves[1].begin_position(), leaves[1].end_position()]
202 |     assert_equal ["she", "she"], [leaves[2].current(), leaves[2].word()]
203 |     assert_equal [26,29],        [leaves[2].begin_position(), leaves[2].end_position()]
204 |     assert_equal [".", "."], [leaves[3].current(), leaves[3].word()]
205 |     assert_equal [29,30],    [leaves[3].begin_position(), leaves[3].end_position()]
206 |     # Verify that the original string, inter-sentence spacing included, is recoverable.
207 |     assert_equal "He (John) is tall.  ", parsed_text[0].to_original_string
208 |     assert_equal "So is she."          , parsed_text[1].to_original_string
209 |     # Draw < and > brackets around 3 constituents given by tree coordinates.
210 |     b = parsed_text[0].to_bracketed_string([[0,0], [0,0,1,1], [0,1,1]], "<", ">")
211 |     assert_equal "<He (<John>)> is <tall>.  ", b
212 |   end
213 | end
214 | 
215 | # Tests the model location constant and Word comparison against a String.
216 | class MiscPreprocessorTestCase < Test::Unit::TestCase
217 |   def test_model_location
218 |     assert_equal "$(ROOT)/englishPCFG.ser.gz", StanfordParser::ENGLISH_PCFG_MODEL
219 |   end
220 | 
221 |   def test_word # The Word is the receiver of ==, so its own comparison is what runs.
222 |     assert StanfordParser::Word.new("edu.stanford.nlp.ling.Word", "dog") ==  "dog"
223 |   end
224 | end # MiscPreprocessorTestCase
225 | 


--------------------------------------------------------------------------------