├── .autotest ├── History.rdoc ├── Manifest.txt ├── README.rdoc ├── Rakefile ├── bin ├── ruby_parse └── ruby_parse_extract_error ├── compare └── normalize.rb ├── debugging.md ├── gauntlet.md ├── lib ├── .document ├── rp_extensions.rb ├── rp_stringscanner.rb ├── ruby_lexer.rb ├── ruby_lexer.rex ├── ruby_lexer_strings.rb ├── ruby_parser.rb ├── ruby_parser2.yy ├── ruby_parser3.yy └── ruby_parser_extras.rb ├── test ├── test_ruby_lexer.rb ├── test_ruby_parser.rb └── test_ruby_parser_extras.rb └── tools ├── munge.rb └── ripper.rb /.autotest: -------------------------------------------------------------------------------- 1 | # -*- ruby -*- 2 | 3 | require 'autotest/restart' 4 | # require 'autotest/isolate' 5 | require 'autotest/rcov' if ENV['RCOV'] 6 | 7 | Autotest.add_hook :initialize do |at| 8 | at.extra_files << "../../sexp_processor/dev/lib/pt_testcase.rb" 9 | at.libs << ":../../sexp_processor/dev/lib" 10 | at.add_exception ".diff" 11 | at.add_exception ".rdoc" 12 | at.add_exception ".yml" 13 | at.add_exception ".txt" 14 | at.add_exception ".output" 15 | at.add_exception "bin" 16 | at.add_exception "compare" 17 | at.add_exception "coverage" 18 | at.add_exception "coverage.info" 19 | at.add_exception "misc" 20 | 21 | Dir["lib/ruby??_parser.{rb,y}"].each do |f| 22 | at.add_exception f 23 | end 24 | 25 | Dir["gauntlet*"].each do |f| 26 | at.add_exception f 27 | end 28 | 29 | at.libs << ':../../minitest/dev/lib' 30 | at.testlib = "minitest/autorun" 31 | 32 | at.add_mapping(/^lib\/.*\.y$/) do |f, _| 33 | g = File.basename(f, ".y").gsub("_", "_?").gsub(/2\d/, '') 34 | at.files_matching %r%^test/.*#{g}.rb$% 35 | end 36 | 37 | at.add_mapping(/^lib\/.*\.yy$/) do |f, _| 38 | g = File.basename(f, ".yy").gsub("_", "_?") 39 | at.files_matching %r%^test/.*#{g}.rb$% 40 | end 41 | 42 | at.add_mapping(/^lib\/ruby_lexer\.rex\.rb$/) do |f, _| 43 | at.files_matching %r%^test/.*ruby_lexer\.rb$% 44 | end 45 | 46 | at.add_mapping(/^lib\/.*\.rex$/) do |f, _| 47 | g = File.basename(f, ".rex").gsub("_", "_?") 48 | at.files_matching %r%^test/.*#{g}.rb$% 49 | end 50 | 51 | at.add_mapping(/pt_testcase.rb/) do |f, _| 52 | at.files_matching(/test_.*rb$/) 53 | end 54 | 55 | %w(TestEnvironment TestStackState TestValueExpr).each do |klass| 56 | at.extra_class_map[klass] = "test/test_ruby_parser_extras.rb" 57 | end 58 | 59 | Dir["lib/ruby??_parser.rb"].each do |s| 60 | n = s[/\d+/] 61 | at.extra_class_map["TestRubyParserV#{n}"] = "test/test_ruby_parser.rb" 62 | end 63 | end 64 | 65 | Autotest.add_hook :run_command do |at, _| 66 | system "rake parser lexer DEBUG=1" 67 | end 68 | -------------------------------------------------------------------------------- /Manifest.txt: -------------------------------------------------------------------------------- 1 | .autotest 2 | History.rdoc 3 | Manifest.txt 4 | README.rdoc 5 | Rakefile 6 | bin/ruby_parse 7 | bin/ruby_parse_extract_error 8 | compare/normalize.rb 9 | debugging.md 10 | gauntlet.md 11 | lib/.document 12 | lib/rp_extensions.rb 13 | lib/rp_stringscanner.rb 14 | lib/ruby_lexer.rb 15 | lib/ruby_lexer.rex 16 | lib/ruby_lexer.rex.rb 17 | lib/ruby_lexer_strings.rb 18 | lib/ruby_parser.rb 19 | lib/ruby_parser2.yy 20 | lib/ruby_parser20.rb 21 | lib/ruby_parser21.rb 22 | lib/ruby_parser22.rb 23 | lib/ruby_parser23.rb 24 | lib/ruby_parser24.rb 25 | lib/ruby_parser25.rb 26 | lib/ruby_parser26.rb 27 | lib/ruby_parser27.rb 28 | lib/ruby_parser3.yy 29 | lib/ruby_parser30.rb 30 | lib/ruby_parser31.rb 31 | lib/ruby_parser32.rb 32 | lib/ruby_parser33.rb 33 | lib/ruby_parser34.rb 34 | 
lib/ruby_parser_extras.rb 35 | test/test_ruby_lexer.rb 36 | test/test_ruby_parser.rb 37 | test/test_ruby_parser_extras.rb 38 | tools/munge.rb 39 | tools/ripper.rb 40 | -------------------------------------------------------------------------------- /README.rdoc: -------------------------------------------------------------------------------- 1 | = ruby_parser 2 | 3 | home :: https://github.com/seattlerb/ruby_parser 4 | bugs :: https://github.com/seattlerb/ruby_parser/issues 5 | rdoc :: http://docs.seattlerb.org/ruby_parser 6 | 7 | == DESCRIPTION: 8 | 9 | ruby_parser (RP) is a ruby parser written in pure ruby (utilizing 10 | racc--which does by default use a C extension). It outputs 11 | s-expressions which can be manipulated and converted back to ruby via 12 | the ruby2ruby gem. 13 | 14 | As an example: 15 | 16 | def conditional1 arg1 17 | return 1 if arg1 == 0 18 | return 0 19 | end 20 | 21 | becomes: 22 | 23 | s(:defn, :conditional1, s(:args, :arg1), 24 | s(:if, 25 | s(:call, s(:lvar, :arg1), :==, s(:lit, 0)), 26 | s(:return, s(:lit, 1)), 27 | nil), 28 | s(:return, s(:lit, 0))) 29 | 30 | Tested against 801,039 files from the latest of all rubygems (as of 2013-05): 31 | 32 | * 1.8 parser is at 99.9739% accuracy, 3.651 sigma 33 | * 1.9 parser is at 99.9940% accuracy, 4.013 sigma 34 | * 2.0 parser is at 99.9939% accuracy, 4.008 sigma 35 | * 2.6 parser is at 99.9972% accuracy, 4.191 sigma 36 | * 3.0 parser has a 100% parse rate. 37 | * Tested against 2,672,412 unique ruby files across 167k gems. 38 | * As do all the others now, basically. 39 | 40 | == FEATURES/PROBLEMS: 41 | 42 | * Pure ruby, no compiles. 43 | * Includes preceding comment data for defn/defs/class/module nodes! 44 | * Incredibly simple interface. 45 | * Output is 100% equivalent to ParseTree. 46 | * Can utilize PT's SexpProcessor and UnifiedRuby for language processing. 47 | * Known Issue: Speed is now pretty good, but can always improve: 48 | * RP parses a corpus of 3702 files in 125s (avg 108 Kb/s) 49 | * MRI+PT parsed the same in 67.38s (avg 200.89 Kb/s) 50 | * Known Issue: Code is much better, but still has a long way to go. 51 | * Known Issue: Totally awesome. 52 | * Known Issue: line number values can be slightly off. Parsing LR sucks. 53 | 54 | == SYNOPSIS: 55 | 56 | RubyParser.new.parse "1+1" 57 | # => s(:call, s(:lit, 1), :+, s(:lit, 1)) 58 | 59 | You can also use Ruby19Parser, Ruby18Parser, or RubyParser.for_current_ruby: 60 | 61 | RubyParser.for_current_ruby.parse "1+1" 62 | # => s(:call, s(:lit, 1), :+, s(:lit, 1)) 63 | 64 | == DEVELOPER NOTES: 65 | 66 | To add a new version: 67 | 68 | * New parser should be generated from lib/ruby_parser[23].yy. 69 | * Extend lib/ruby_parser[23].yy with new class name. 70 | * Add new version number to V2/V3 in Rakefile for rule creation. 71 | * Add new `ruby_parse "x.y.z"` line to Rakefile for rake compare (line ~300). 72 | * Require generated parser in lib/ruby_parser.rb. 73 | * Add new V## = ::Ruby##Parser; end to ruby_parser.rb (bottom of file). 74 | * Add empty TestRubyParserShared##Plus module and TestRubyParserV## to test/test_ruby_parser.rb. 75 | * Extend Manifest.txt with generated file names. 76 | * Add new version number to sexp_processor's pt_testcase.rb in all_versions. 77 | 78 | Until all of these are done, you won't have a clean test run. 79 | 80 | == REQUIREMENTS: 81 | 82 | * ruby. woot. 83 | * sexp_processor for Sexp and SexpProcessor classes, and testing. 84 | * racc full package for parser development (compiling .y to .rb). 
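The Sexp objects the parser returns come from sexp_processor. As a minimal sketch of poking at a parse result (only +parse+ from the synopsis above plus plain Sexp accessors provided by sexp_processor are used here):

  require "ruby_parser"

  sexp = RubyParser.new.parse "def add a, b\n  a + b\nend"

  sexp.sexp_type # => :defn
  sexp[1]        # => :add
  sexp.line      # => 1
  sexp.last      # => s(:call, s(:lvar, :a), :+, s(:lvar, :b))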
85 | 86 | == INSTALL: 87 | 88 | * sudo gem install ruby_parser 89 | 90 | == LICENSE: 91 | 92 | (The MIT License) 93 | 94 | Copyright (c) Ryan Davis, seattle.rb 95 | 96 | Permission is hereby granted, free of charge, to any person obtaining 97 | a copy of this software and associated documentation files (the 98 | 'Software'), to deal in the Software without restriction, including 99 | without limitation the rights to use, copy, modify, merge, publish, 100 | distribute, sublicense, and/or sell copies of the Software, and to 101 | permit persons to whom the Software is furnished to do so, subject to 102 | the following conditions: 103 | 104 | The above copyright notice and this permission notice shall be 105 | included in all copies or substantial portions of the Software. 106 | 107 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, 108 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 109 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 110 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 111 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 112 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 113 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 114 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # -*- ruby -*- 2 | 3 | require "hoe" 4 | 5 | Hoe.plugin :seattlerb 6 | Hoe.plugin :racc 7 | Hoe.plugin :isolate 8 | Hoe.plugin :rdoc 9 | 10 | Hoe.add_include_dirs "lib" 11 | Hoe.add_include_dirs "../../sexp_processor/dev/lib" 12 | Hoe.add_include_dirs "../../minitest/dev/lib" 13 | Hoe.add_include_dirs "../../oedipus_lex/dev/lib" 14 | Hoe.add_include_dirs "../../ruby2ruby/dev/lib" 15 | 16 | V2 = %w[20 21 22 23 24 25 26 27] 17 | V3 = %w[30 31 32 33 34] 18 | 19 | VERS = V2 + V3 20 | 21 | ENV["FAST"] = VERS.last if ENV["FAST"] && !VERS.include?(ENV["FAST"]) 22 | VERS.replace [ENV["FAST"]] if ENV["FAST"] 23 | 24 | racc_flags = nil 25 | 26 | Hoe.spec "ruby_parser" do 27 | developer "Ryan Davis", "ryand-ruby@zenspider.com" 28 | 29 | license "MIT" 30 | 31 | dependency "sexp_processor", "~> 4.16" 32 | dependency "racc", "~> 1.5" 33 | dependency "rake", [">= 10", "< 15"], :developer 34 | dependency "oedipus_lex", "~> 2.6", :developer 35 | 36 | require_ruby_version [">= 2.6", "< 4"] 37 | 38 | if plugin? :perforce then # generated files 39 | VERS.each do |n| 40 | self.perforce_ignore << "lib/ruby_parser#{n}.rb" 41 | end 42 | 43 | VERS.each do |n| 44 | self.perforce_ignore << "lib/ruby_parser#{n}.y" 45 | end 46 | 47 | self.perforce_ignore << "lib/ruby_lexer.rex.rb" 48 | end 49 | 50 | if plugin?(:racc) 51 | self.racc_flags << " -t" if ENV["DEBUG"] 52 | self.racc_flags << " --superclass RubyParser::Parser" 53 | racc_flags = self.racc_flags 54 | end 55 | end 56 | 57 | def maybe_add_to_top path, string 58 | file = File.read path 59 | 60 | return if file.start_with? string 61 | 62 | warn "Altering top of #{path}" 63 | tmp_path = "#{path}.tmp" 64 | File.open(tmp_path, "w") do |f| 65 | f.puts string 66 | f.puts 67 | 68 | f.write file 69 | # TODO: make this deal with encoding comments properly? 70 | end 71 | File.rename tmp_path, path 72 | end 73 | 74 | def unifdef? 75 | @unifdef ||= system("which unifdef") or abort <<~EOM 76 | unifdef not found! 77 | 78 | Please install 'unifdef' package on your system or `rake generate` on a mac. 
79 | EOM 80 | end 81 | 82 | def racc? 83 | @racc ||= system("which racc") or abort <<~EOM 84 | racc not found! `gem install racc` 85 | EOM 86 | end 87 | 88 | generate_parser = proc do |t| 89 | unifdef? 90 | racc? 91 | n = t.name[/\d+/] 92 | sh "unifdef -tk -DV=%s %s | racc %s /dev/stdin -o %s" % [n, t.source, racc_flags, t.name] 93 | maybe_add_to_top t.name, "# frozen_string_literal: true" 94 | end 95 | 96 | V2.each do |n| 97 | file "lib/ruby_parser#{n}.rb" => "lib/ruby_parser2.yy", &generate_parser 98 | end 99 | 100 | V3.each do |n| 101 | file "lib/ruby_parser#{n}.rb" => "lib/ruby_parser3.yy", &generate_parser 102 | end 103 | 104 | file "lib/ruby_lexer.rex.rb" => "lib/ruby_lexer.rex" 105 | 106 | task :generate => [:lexer, :parser] 107 | 108 | task :clean do 109 | rm_rf(Dir["**/*~"] + 110 | Dir["diff.diff"] + # not all diffs. bit me too many times 111 | Dir["coverage.info"] + 112 | Dir["coverage"] + 113 | Dir["lib/ruby_parser2*.y"] + 114 | Dir["lib/ruby_parser3*.y"] + 115 | Dir["lib/*.output"]) 116 | end 117 | 118 | task :sort do 119 | sh "grepsort '^ +def' lib/ruby_lexer.rb" 120 | sh "grepsort '^ +def (test|util)' test/test_ruby_lexer.rb" 121 | end 122 | 123 | desc "what was that command again?" 124 | task :huh? do 125 | puts "ruby #{Hoe::RUBY_FLAGS} bin/ruby_parse -q -g ..." 126 | end 127 | 128 | def (task(:phony)).timestamp 129 | Time.at 0 130 | end 131 | 132 | task :isolate => :phony 133 | 134 | def dl v, f 135 | dir = v[/^\d+\.\d+/] 136 | url = "https://cache.ruby-lang.org/pub/ruby/#{dir}/ruby-#{v}.tar.xz" 137 | 138 | warn "Downloading ruby #{v}" 139 | system "curl -s -o #{f} #{url}" 140 | end 141 | 142 | task :parser => :isolate 143 | 144 | multitask :compare_build 145 | task :compare_build => :generate 146 | task :compare => :compare_build 147 | 148 | def ruby_parse version 149 | v = version[/^\d+\.\d+/].delete "." 150 | diff = "compare/diff#{v}.diff" 151 | rp_txt = "compare/rp#{v}.txt" 152 | mri_txt = "compare/mri#{v}.txt" 153 | parse_y = "compare/parse#{v}.y" 154 | tarball = "compare/ruby-#{version}.tar.xz" 155 | ruby_dir = "compare/ruby-#{version}" 156 | rp_out = "lib/ruby_parser#{v}.output" 157 | rp_y_rb = "lib/ruby_parser#{v}.rb" 158 | normalize = "compare/normalize.rb" 159 | 160 | file tarball do 161 | dl version, tarball 162 | end 163 | 164 | desc "fetch all tarballs" 165 | task :fetch => tarball 166 | 167 | file ruby_dir => tarball do 168 | extract_glob = case 169 | when version > "3.3" then 170 | "{id.h,parse.y,tool/{id2token.rb,lrama},defs/id.def}" 171 | when version > "3.2" then 172 | "{id.h,parse.y,tool/id2token.rb,defs/id.def}" 173 | when version > "2.7" then 174 | "{id.h,parse.y,tool/{id2token.rb,lib/vpath.rb}}" 175 | else 176 | "{id.h,parse.y,tool/{id2token.rb,vpath.rb}}" 177 | end 178 | system "tar xf #{tarball} -C compare #{File.basename ruby_dir}/#{extract_glob}" 179 | end 180 | 181 | file parse_y => ruby_dir do 182 | # env -u RUBYOPT rake compare/parse33.y 183 | warn "Warning: RUBYOPT is set! Use 'env -u RUBYOPT rake'" if ENV["RUBYOPT"] 184 | 185 | # Debugging a new parse build system: 186 | # 187 | # Unpack the ruby tarball in question, configure, and run the following: 188 | # 189 | # % [ -e Makefile ] || ./configure ; make -n -W parse.y parse.c 190 | # ... 
191 | # echo generating parse.c 192 | # ruby --disable=gems ./tool/id2token.rb parse.y | \ 193 | # ruby ./tool/lrama/exe/lrama -oparse.c -Hparse.h - parse.y 194 | # 195 | # Then integrate these commands into the mess below: 196 | 197 | d = ruby_dir 198 | cmd = if version > "3.2" then 199 | "ruby #{d}/tool/id2token.rb #{d}/parse.y | expand > #{parse_y}" 200 | else 201 | "ruby #{d}/tool/id2token.rb --path-separator=.:./ #{d}/id.h #{d}/parse.y | expand | ruby -pe 'gsub(/^%pure-parser/, \"%define api.pure\")' > #{parse_y}" 202 | end 203 | 204 | sh cmd 205 | end 206 | 207 | bison = Dir["/opt/homebrew/opt/bison/bin/bison", 208 | "/usr/local/opt/bison/bin/bison", 209 | `which bison`.chomp, 210 | ].first 211 | 212 | file mri_txt => [parse_y, normalize] do 213 | d = ruby_dir 214 | if version > "3.3" then 215 | sh "./#{d}/tool/lrama/exe/lrama -r states --report-file=compare/parse#{v}.output -ocompare/parse#{v}.tab.c #{parse_y}" 216 | else 217 | sh "#{bison} -r all #{parse_y}" 218 | mv Dir["parse#{v}.*"], "compare" 219 | end 220 | 221 | sh "#{normalize} compare/parse#{v}.output > #{mri_txt}" 222 | rm ["compare/parse#{v}.output", "compare/parse#{v}.tab.c"] 223 | end 224 | 225 | file rp_out => rp_y_rb 226 | 227 | file rp_txt => [rp_out, normalize] do 228 | sh "#{normalize} #{rp_out} > #{rp_txt}" 229 | end 230 | 231 | compare = "compare#{v}" 232 | compare_build = "compare_build#{v}" 233 | 234 | desc "Compare all grammars to MRI" 235 | task :compare => compare 236 | task :compare_build => compare_build 237 | 238 | task compare_build => diff 239 | 240 | file diff => [mri_txt, rp_txt] do 241 | sh "diff -du #{mri_txt} #{rp_txt} > #{diff}; true" 242 | end 243 | 244 | desc "Compare #{v} grammar to MRI #{version}" 245 | task compare => diff do 246 | system "wc -l #{diff}" 247 | end 248 | 249 | task :clean do 250 | rm_f Dir[mri_txt, rp_txt, ruby_dir] 251 | end 252 | 253 | task :realclean do 254 | rm_f Dir[parse_y, tarball] 255 | end 256 | end 257 | 258 | task :versions do 259 | require "open-uri" 260 | require "net/http" # avoid require issues in threads 261 | require "net/https" 262 | 263 | versions = VERS.map { |s| s.split(//).join "." 
} 264 | 265 | base_url = "https://cache.ruby-lang.org/pub/ruby" 266 | 267 | class Array 268 | def human_sort 269 | sort_by { |item| item.to_s.split(/(\d+)/).map { |e| [e.to_i, e] } } 270 | end 271 | end 272 | 273 | versions = versions.map { |ver| 274 | Thread.new { 275 | URI 276 | .parse("#{base_url}/#{ver}/") 277 | .read 278 | .scan(/ruby-\d+\.\d+\.\d+[-\w.]*?.tar.gz/) 279 | .reject { |s| s =~ /-(?:rc|preview)\d/ } 280 | .human_sort 281 | .last 282 | .delete_prefix("ruby-") 283 | .delete_suffix ".tar.gz" 284 | } 285 | }.map(&:value).sort 286 | 287 | puts versions.map { |v| "ruby_parse %p" % [v] } 288 | end 289 | 290 | ruby_parse "2.0.0-p648" 291 | ruby_parse "2.1.10" 292 | ruby_parse "2.2.10" 293 | ruby_parse "2.3.8" 294 | ruby_parse "2.4.10" 295 | ruby_parse "2.5.9" 296 | ruby_parse "2.6.10" 297 | ruby_parse "2.7.8" 298 | ruby_parse "3.0.6" 299 | ruby_parse "3.1.7" 300 | ruby_parse "3.2.8" 301 | ruby_parse "3.3.7" 302 | ruby_parse "3.4.2" 303 | 304 | task :debug => :isolate do 305 | ENV["V"] ||= VERS.last 306 | Rake.application[:parser].invoke # this way we can have DEBUG set 307 | Rake.application[:lexer].invoke # this way we can have DEBUG set 308 | 309 | $:.unshift "lib" 310 | require "ruby_parser" 311 | require "pp" 312 | 313 | klass = Object.const_get("Ruby#{ENV["V"]}Parser") rescue nil 314 | raise "Unsupported version #{ENV["V"]}" unless klass 315 | parser = klass.new 316 | 317 | time = (ENV["RP_TIMEOUT"] || 10).to_i 318 | 319 | n = ENV["BUG"] 320 | file = (n && "bug#{n}.rb") || ENV["F"] || ENV["FILE"] || "debug.rb" 321 | ruby = ENV["R"] || ENV["RUBY"] 322 | 323 | if ruby then 324 | file = "env" 325 | else 326 | ruby = File.read file 327 | end 328 | 329 | 330 | begin 331 | pp parser.process(ruby, file, time) 332 | rescue ArgumentError, Racc::ParseError => e 333 | p e 334 | puts e.backtrace.join "\n " 335 | ss = parser.lexer.ss 336 | src = ss.string 337 | lines = src[0..ss.pos].split(/\n/) 338 | abort "on #{file}:#{lines.size}" 339 | end 340 | end 341 | 342 | task :debug3 do 343 | file = ENV["F"] || "debug.rb" 344 | version = ENV["V"] || "" 345 | verbose = ENV["VERBOSE"] ? 
"-v" : "" 346 | munge = "./tools/munge.rb #{verbose}" 347 | 348 | abort "Need a file to parse, via: F=path.rb" unless file 349 | 350 | ENV.delete "V" 351 | 352 | ruby = "ruby#{version}" 353 | 354 | sh "#{ruby} -v" 355 | sh "#{ruby} -y #{file} 2>&1 | #{munge} > tmp/ruby" 356 | sh "#{ruby} ./tools/ripper.rb -d #{file} | #{munge} > tmp/rip" 357 | sh "rake debug F=#{file} DEBUG=1 2>&1 | #{munge} > tmp/rp" 358 | sh "diff -U 999 -d tmp/{ruby,rp}" 359 | end 360 | 361 | task :cmp do 362 | sh %(emacsclient --eval '(ediff-files "tmp/ruby" "tmp/rp")') 363 | end 364 | 365 | task :cmp3 do 366 | sh %(emacsclient --eval '(ediff-files3 "tmp/ruby" "tmp/rip" "tmp/rp")') 367 | end 368 | 369 | task :extract => :isolate do 370 | ENV["V"] ||= VERS.last 371 | Rake.application[:parser].invoke # this way we can have DEBUG set 372 | 373 | file = ENV["F"] || ENV["FILE"] || abort("Need to provide F=") 374 | 375 | ruby "-Ilib", "bin/ruby_parse_extract_error", file 376 | end 377 | 378 | task :parse => :isolate do 379 | ENV["V"] ||= VERS.last 380 | Rake.application[:parser].invoke # this way we can have DEBUG set 381 | 382 | file = ENV["F"] || ENV["FILE"] || abort("Need to provide F=") 383 | 384 | ruby "-Ilib", "bin/ruby_parse", file 385 | end 386 | 387 | task :bugs do 388 | sh "for f in bug*.rb bad*.rb ; do #{Gem.ruby} -S rake debug F=$f && rm $f ; done" 389 | end 390 | 391 | # vim: syntax=Ruby 392 | -------------------------------------------------------------------------------- /bin/ruby_parse: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby -s 2 | 3 | $q ||= false 4 | $g ||= false 5 | 6 | require 'rubygems' 7 | require 'ruby_parser' 8 | require 'pp' 9 | 10 | good = bad = 0 11 | 12 | multi = ARGV.size != 1 13 | total_time = 0 14 | total_loc = 0 15 | total_kbytes = 0 16 | times = {} 17 | locs = {} 18 | kbytes = {} 19 | 20 | begin 21 | time = (ENV["RP_TIMEOUT"] || 10).to_i 22 | 23 | ARGV.each do |file| 24 | rp = RubyParser.new 25 | loc = `wc -l #{file}`.strip.to_i 26 | size = `wc -c #{file}`.strip.to_i / 1024.0 27 | locs[file] = loc 28 | kbytes[file] = size 29 | total_loc += loc 30 | total_kbytes += size 31 | if $q then 32 | $stderr.print "." 
33 | else 34 | warn "# file = #{file} loc = #{loc}" 35 | end 36 | GC.start if $g 37 | 38 | t = Time.now 39 | begin 40 | begin 41 | rp.reset 42 | r = rp.process(File.binread(file), file, time) 43 | pp r unless $q 44 | good += 1 45 | rescue SyntaxError => e 46 | warn "SyntaxError for #{file}: #{e.message}" 47 | bad += 1 48 | end 49 | rescue => e 50 | warn "#{e.backtrace.first} #{e.inspect.gsub(/\n/, ' ')} for #{file}" 51 | warn " #{e.backtrace.join("\n ")}" 52 | bad += 1 53 | end 54 | 55 | t = Time.now - t 56 | times[file] = t 57 | total_time += t 58 | end 59 | rescue Interrupt 60 | # do nothing 61 | end 62 | 63 | warn "done" 64 | 65 | total = 0 66 | times.values.each do |t| 67 | total += t 68 | end 69 | 70 | puts 71 | puts "good = #{good} bad = #{bad}" if multi 72 | puts 73 | 74 | format = "%5.2fs:%9.2f l/s:%8.2f Kb/s:%5d Kb:%5d loc:%s" 75 | 76 | times.sort_by { |f, t| -t }.each do |f, t| 77 | next if t < 0.005 78 | loc = locs[f] 79 | size = kbytes[f] 80 | puts format % [t, loc / t, size / t, size, loc, f] 81 | end 82 | 83 | puts 84 | 85 | puts format % [total_time, 86 | total_loc / total_time, 87 | total_kbytes / total_time, 88 | total_kbytes, 89 | total_loc, 90 | "TOTAL"] unless total_time == 0 91 | -------------------------------------------------------------------------------- /bin/ruby_parse_extract_error: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby -ws 2 | 3 | $d ||= ENV["DELETE"] || false 4 | $t ||= ENV["DELETE_TIMEOUT"] || false 5 | $m ||= ENV["MOVE_TIMEOUT"] || false 6 | $q ||= ENV["QUIET"] || false 7 | $s ||= ENV["SPEED"] || false 8 | 9 | require 'rubygems' 10 | require 'ruby_parser' 11 | require 'fileutils' 12 | 13 | ARGV.push "-" if ARGV.empty? 14 | 15 | class RubyParser 16 | def extract_defs 17 | ss = current.lexer.ss 18 | 19 | raise "can't access source. possible encoding issue" unless ss 20 | 21 | src = ss.string 22 | pre_error = src[0...ss.pos] 23 | 24 | defs = pre_error.lines.grep(/^ *(?:def|it)/) 25 | 26 | raise "can't figure out where the bad code starts" unless defs.last 27 | 28 | last_def_indent = defs.last[/^ */] 29 | 30 | post_error = src[ss.pos..-1] 31 | idx = post_error =~ /^#{last_def_indent}end.*/ 32 | 33 | raise "can't figure out where the bad code ends" unless idx 34 | 35 | src = pre_error + post_error[0..idx+$&.length] 36 | 37 | src.scan(/^(( *)(?:def|it) .*?^\2end)/m) 38 | end 39 | 40 | def retest_for_errors defs 41 | parser = self.class.new 42 | 43 | parser.process(defs.join("\n\n")) 44 | rescue SyntaxError, StandardError 45 | nil 46 | end 47 | end 48 | 49 | def expand path 50 | if File.directory? path then 51 | require 'find' 52 | 53 | files = [] 54 | 55 | Find.find(*Dir[path]) do |f| 56 | files << f if File.file? f 57 | end 58 | 59 | files.sort 60 | else 61 | Dir.glob path 62 | end 63 | end 64 | 65 | def process_error parser 66 | defs = parser.extract_defs 67 | 68 | if parser.retest_for_errors defs then 69 | warn "Can't reproduce error with just methods, punting..." 70 | return 71 | end 72 | 73 | catch :extract_done do 74 | (1..defs.size).each do |perm_size| 75 | defs.combination(perm_size).each do |trial| 76 | unless parser.retest_for_errors trial then 77 | puts trial.join "\n" 78 | throw :extract_done 79 | end 80 | end 81 | end 82 | end 83 | rescue RuntimeError, Racc::ParseError => e 84 | warn "# process error: #{e.message.strip}" 85 | warn "# #{e.backtrace.first}" 86 | end 87 | 88 | def process file 89 | ruby = file == "-" ? 
$stdin.binread : File.binread(file) 90 | time = (ENV["RP_TIMEOUT"] || 10).to_i 91 | 92 | $stderr.print "# Validating #{file}: " 93 | parser = RubyParser.new 94 | t0 = Time.now if $s 95 | parser.process(ruby, file, time) 96 | if $s then 97 | warn "good: #{Time.now - t0}" 98 | else 99 | warn "good" 100 | end 101 | File.unlink file if $d 102 | rescue Timeout::Error 103 | $exit = 1 104 | warn "TIMEOUT parsing #{file}. Skipping." 105 | 106 | if $m then 107 | base_dir, *rest = file.split("/") 108 | base_dir.sub!(/\.slow\.?.*/, "") 109 | base_dir += ".slow.#{time}" 110 | 111 | new_file = File.join(base_dir, *rest) 112 | 113 | FileUtils.mkdir_p File.dirname(new_file) 114 | FileUtils.move file, new_file, verbose:true 115 | elsif $t then 116 | File.unlink file 117 | end 118 | rescue StandardError, SyntaxError, Racc::ParseError => e 119 | $exit = 1 120 | warn "" 121 | warn "# error: #{e.message.strip}" unless $q 122 | warn "# #{e.backtrace.first}" 123 | warn "" 124 | return if $q 125 | 126 | process_error parser 127 | end 128 | 129 | $exit = 0 130 | $stdout.sync = true 131 | 132 | ARGV.each do |path| 133 | expand(path).each do |file| 134 | next unless File.file? file # omg... why would you name a dir support.rb? 135 | process file 136 | end 137 | end 138 | 139 | exit $exit 140 | -------------------------------------------------------------------------------- /compare/normalize.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby -w 2 | 3 | good = false 4 | 5 | rules = Hash.new { |h,k| h[k] = [] } 6 | rule = nil 7 | order = [] 8 | 9 | def munge s 10 | renames = [ 11 | # unquote... wtf? 12 | /`(.+?)'/, proc { $1 }, 13 | /"'(.+?)'"/, proc { "\"#{$1}\"" }, 14 | 15 | "'='", "tEQL", 16 | "'!'", "tBANG", 17 | "'%'", "tPERCENT", 18 | "'&'", "tAMPER2", 19 | "'('", "tLPAREN2", 20 | "')'", "tRPAREN", 21 | "'*'", "tSTAR2", 22 | "'+'", "tPLUS", 23 | "','", "tCOMMA", 24 | "'-'", "tMINUS", 25 | "'.'", "tDOT", 26 | "'/'", "tDIVIDE", 27 | "';'", "tSEMI", 28 | "':'", "tCOLON", 29 | "'<'", "tLT", 30 | "'>'", "tGT", 31 | "'?'", "tEH", 32 | "'['", "tLBRACK", 33 | "'\\n'", "tNL", 34 | "']'", "tRBRACK", 35 | "'^'", "tCARET", 36 | "'`'", "tBACK_REF2", 37 | "'{'", "tLCURLY", 38 | "'|'", "tPIPE", 39 | "'}'", "tRCURLY", 40 | "'~'", "tTILDE", 41 | '"["', "tLBRACK", 42 | 43 | # 2.0 changes? 
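    # newer MRI grammar listings show operator tokens by their quoted
    # aliases ("<=>", "==", ...); map them back to ruby_parser's t* names
    # so the two .output files diff cleanly.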
44 | '"<=>"', "tCMP", 45 | '"=="', "tEQ", 46 | '"==="', "tEQQ", 47 | '"!~"', "tNMATCH", 48 | '"=~"', "tMATCH", 49 | '">="', "tGEQ", 50 | '"<="', "tLEQ", 51 | '"!="', "tNEQ", 52 | '"<<"', "tLSHFT", 53 | '">>"', "tRSHFT", 54 | '"*"', "tSTAR", 55 | 56 | '".."', "tDOT2", 57 | 58 | '"&"', "tAMPER", 59 | '"&&"', "tANDOP", 60 | '"&."', "tLONELY", 61 | '"||"', "tOROP", 62 | 63 | '"..."', "tDOT3", 64 | '"**"', "tPOW", 65 | '"unary+"', "tUPLUS", 66 | '"unary-"', "tUMINUS", 67 | '"[]"', "tAREF", 68 | '"[]="', "tASET", 69 | '"::"', "tCOLON2", 70 | '"{ arg"', "tLBRACE_ARG", 71 | '"( arg"', "tLPAREN_ARG", 72 | '"("', "tLPAREN", 73 | 'rparen', "tRPAREN", 74 | '"{"', "tLBRACE", 75 | '"=>"', "tASSOC", 76 | '"->"', "tLAMBDA", 77 | '":: at EXPR_BEG"', "tCOLON3", 78 | '"**arg"', "tDSTAR", 79 | '","', "tCOMMA", 80 | 81 | # other 82 | 83 | 'tLBRACK2', "tLBRACK", # HACK 84 | 85 | "' '", "tSPACE", # needs to be later to avoid bad hits 86 | 87 | "ε", "none", # bison 3+ 88 | "%empty", "none", # newer bison 89 | "/* empty */", "none", 90 | /^\s*$/, "none", 91 | 92 | "keyword_BEGIN", "klBEGIN", 93 | "keyword_END", "klEND", 94 | /keyword_(\w+)/, proc { "k#{$1.upcase}" }, 95 | /\bk_([a-z_]+)/, proc { "k#{$1.upcase}" }, 96 | /modifier_(\w+)/, proc { "k#{$1.upcase}_MOD" }, 97 | "kVARIABLE", "keyword_variable", # ugh 98 | 99 | # 2.6 collapses klBEGIN to kBEGIN 100 | "klBEGIN", "kBEGIN", 101 | "klEND", "kEND", 102 | 103 | /keyword_(\w+)/, proc { "k#{$1.upcase}" }, 104 | /\bk_([^_][a-z_]+)/, proc { "k#{$1.upcase}" }, 105 | /modifier_(\w+)/, proc { "k#{$1.upcase}_MOD" }, 106 | 107 | "kVARIABLE", "keyword_variable", # ugh: this is a rule name 108 | 109 | # 2.7 changes: 110 | 111 | '"global variable"', "tGVAR", 112 | '"operator-assignment"', "tOP_ASGN", 113 | '"back reference"', "tBACK_REF", 114 | '"numbered reference"', "tNTH_REF", 115 | '"local variable or method"', "tIDENTIFIER", 116 | '"constant"', "tCONSTANT", 117 | 118 | '"(.."', "tBDOT2", 119 | '"(..."', "tBDOT3", 120 | '"char literal"', "tCHAR", 121 | '"literal content"', "tSTRING_CONTENT", 122 | '"string literal"', "tSTRING_BEG", 123 | '"symbol literal"', "tSYMBEG", 124 | '"backtick literal"', "tXSTRING_BEG", 125 | '"regexp literal"', "tREGEXP_BEG", 126 | '"word list"', "tWORDS_BEG", 127 | '"verbatim word list"', "tQWORDS_BEG", 128 | '"symbol list"', "tSYMBOLS_BEG", 129 | '"verbatim symbol list"', "tQSYMBOLS_BEG", 130 | 131 | '"float literal"', "tFLOAT", 132 | '"imaginary literal"', "tIMAGINARY", 133 | '"integer literal"', "tINTEGER", 134 | '"rational literal"', "tRATIONAL", 135 | 136 | '"instance variable"', "tIVAR", 137 | '"class variable"', "tCVAR", 138 | '"terminator"', "tSTRING_END", # TODO: switch this? 
139 | '"method"', "tFID", 140 | '"}"', "tSTRING_DEND", 141 | 142 | '"do for block"', "kDO_BLOCK", 143 | '"do for condition"', "kDO_COND", 144 | '"do for lambda"', "kDO_LAMBDA", 145 | "tLABEL", "kLABEL", 146 | 147 | # UGH 148 | "k_LINE__", "k__LINE__", 149 | "k_FILE__", "k__FILE__", 150 | "k_ENCODING__", "k__ENCODING__", 151 | 152 | '"defined?"', "kDEFINED", 153 | 154 | '"do (for condition)"', "kDO_COND", 155 | '"do (for lambda)"', "kDO_LAMBDA", 156 | %("'do' for block"), "kDO_BLOCK", # 3.4 157 | %("'do' for lambda"), "kDO_LAMBDA", # 3.4 158 | %("'do' for condition"),"kDO_COND", # 3.4 159 | %q("#{"), "tSTRING_DBEG", # 3.4 160 | '"do (for block)"', "kDO_BLOCK", # 3.4 161 | 162 | /\"'(\w+)' \(?modifier\)?\"/, proc { |x| "k#{$1.upcase}_MOD" }, # 3.4 163 | /\"(\w+) \(?modifier\)?\"/, proc { |x| "k#{$1.upcase}_MOD" }, 164 | /\"((?!k)\w+)\"/, proc { |x| "k#{$1.upcase}" }, 165 | 166 | /\$?@(\d+)(\s+|$)/, "", # newer bison 167 | 168 | # 3.4(ish?) changes: 169 | "option_tNL", "opt_nl", # ruby 3.4 170 | 171 | # TODO: remove for 3.0 work: 172 | "lex_ctxt ", "" # 3.0 production that's mostly noise right now 173 | ] 174 | 175 | renames.each_slice(2) do |(a, b)| 176 | if Proc === b then 177 | s.gsub!(a, &b) 178 | else 179 | s.gsub!(a, b) 180 | end 181 | end 182 | 183 | s.strip 184 | end 185 | 186 | ARGF.each_line do |line| 187 | next unless good or line =~ /^-* ?Grammar|\$accept : / 188 | 189 | case line.strip # TODO: .delete %q["'()] 190 | when /^$/ then 191 | when /^(\d+) (\$?[@\w]+): (.*)/ then # yacc 192 | rule = $2 193 | order << rule unless rules.has_key? rule 194 | rules[rule] << munge($3) 195 | when /^(\d+) (\$?[@\w]+'(?: |\\n)'): (.*)/ then # munges both sides 196 | rule = $2 197 | order << rule unless rules.has_key? rule 198 | rules[munge(rule)] << munge($3) 199 | when /^(\d+) \s+\| (.*)/ then # yacc 200 | rules[rule] << munge($2) 201 | when /^(\d+) (@\d+): (.*)/ then # yacc 202 | rule = $2 203 | order << rule unless rules.has_key? rule 204 | rules[rule] << munge($3) 205 | when /^rule (\d+) (@?\w+):(.*)/ then # racc 206 | rule = $2 207 | order << rule unless rules.has_key? rule 208 | rules[rule] << munge($3) 209 | when /\$accept/ then # byacc? 210 | good = true 211 | when /Grammar/ then # both 212 | good = true 213 | when /^-+ Symbols/ then # racc 214 | break 215 | when /^Terminals/ then # yacc 216 | break 217 | when /^State \d/ then # lrama 218 | break 219 | when /^\cL/ then # byacc 220 | break 221 | else 222 | warn "unparsed: #{$.}: #{line.strip.inspect}" 223 | end 224 | end 225 | 226 | require 'yaml' 227 | 228 | order.each do |k| 229 | next if k =~ /@/ 230 | puts 231 | puts "#{k}:" 232 | puts rules[k].map { |r| " #{r}" }.join "\n" 233 | end 234 | -------------------------------------------------------------------------------- /debugging.md: -------------------------------------------------------------------------------- 1 | # Quick Notes to Help with Debugging 2 | 3 | ## Reducing 4 | 5 | One of the most important steps is reducing the code sample to a 6 | minimal reproduction. For example, one thing I'm debugging right now 7 | was reported as: 8 | 9 | ```ruby 10 | a, b, c, d, e, f, g, h, i, j = 1, *[p1, p2, p3], *[p1, p2, p3], *[p4, p5, p6] 11 | ``` 12 | 13 | This original sample has 10 items on the left-hand-side (LHS) and 1 + 14 | 3 groups of 3 (calls) on the RHS + 3 arrays + 3 splats. That's a lot. 15 | 16 | It's already been reported (perhaps incorrectly) that this has to do 17 | with multiple splats on the RHS, so let's focus on that. 
At a minimum 18 | the code can be reduced to 2 splats on the RHS and some 19 | experimentation shows that it needs a non-splat item to fail: 20 | 21 | ``` 22 | _, _, _ = 1, *[2], *[3] 23 | ``` 24 | 25 | and some intuition further removed the arrays: 26 | 27 | ``` 28 | _, _, _ = 1, *2, *3 29 | ``` 30 | 31 | the difference is huge and will make a ton of difference when 32 | debugging. 33 | 34 | ## Getting something to compare 35 | 36 | ``` 37 | % rake debug3 F=file.rb 38 | ``` 39 | 40 | TODO 41 | 42 | ## Comparing against ruby / ripper: 43 | 44 | ``` 45 | % rake cmp3 F=file.rb 46 | ``` 47 | 48 | This compiles the parser & lexer and then parses file.rb using both 49 | ruby, ripper, and ruby_parser in debug modes. The output is munged to 50 | be as uniform as possible and diffable. I'm using emacs' 51 | `ediff-files3` to compare these files (via `rake cmp3`) all at once, 52 | but regular `diff -u tmp/{ruby,rp}` will suffice for most tasks. 53 | 54 | From there? Good luck. I'm currently trying to backtrack from rule 55 | reductions to state change differences. I'd like to figure out a way 56 | to go from this sort of diff to a reasonable test that checks state 57 | changes but I don't have that set up at this point. 58 | 59 | ## Adding New Grammar Productions 60 | 61 | Ruby adds stuff to the parser ALL THE TIME. It's actually hard to keep 62 | up with, but I've added some tools and shown what a typical workflow 63 | looks like. Let's say you want to add ruby 2.7's "beginless range" (eg 64 | `..42`). 65 | 66 | Whenever there's a language feature missing, I start with comparing 67 | the parse trees between MRI and RP: 68 | 69 | ### Structural Comparing 70 | 71 | There's a bunch of rake tasks `compare27`, `compare26`, etc that try 72 | to normalize and diff MRI's parse.y parse tree (just the structure of 73 | the tree in yacc) to ruby\_parser's parse tree (racc). It's the first 74 | thing I do when I'm adding a new version. Stub out all the version 75 | differences, and then start to diff the structure and move 76 | ruby\_parser towards the new changes. 77 | 78 | Some differences are just gonna be there... but here's an example of a 79 | real diff between MRI 2.7 and ruby_parser as of today: 80 | 81 | ```diff 82 | arg tDOT3 arg 83 | arg tDOT2 84 | arg tDOT3 85 | - tBDOT2 arg 86 | - tBDOT3 arg 87 | arg tPLUS arg 88 | arg tMINUS arg 89 | arg tSTAR2 arg 90 | ``` 91 | 92 | This is a new language feature that ruby_parser doesn't handle yet. 93 | It's in MRI (the left hand side of the diff) but not ruby\_parser (the 94 | right hand side) so it is a `-` or missing line. 95 | 96 | Some other diffs will have both `+` and `-` lines. That usually 97 | happens when MRI has been refactoring the grammar. Sometimes I choose 98 | to adapt those refactorings and sometimes it starts to get too 99 | difficult to maintain multiple versions of ruby parsing in a single 100 | file. 101 | 102 | But! This structural comparing is always a place you should look when 103 | ruby_parser is failing to parse something. Maybe it just hasn't been 104 | implemented yet and the easiest place to look is the diff. 105 | 106 | ### Starting Test First 107 | 108 | The next thing I do is to add a parser test to cover that feature. I 109 | usually start with the parser and work backwards towards the lexer as 110 | needed, as I find it structures things properly and keeps things goal 111 | oriented. 112 | 113 | So, make a new parser test, usually in the versioned section of the 114 | parser tests. 
115 | 116 | ``` 117 | def test_beginless2 118 | rb = "..10\n; ..a\n; c" 119 | pt = s(:block, 120 | s(:dot2, nil, s(:lit, 0).line(1)).line(1), 121 | s(:dot2, nil, s(:call, nil, :a).line(2)).line(2), 122 | s(:call, nil, :c).line(3)).line(1) 123 | 124 | assert_parse_line rb, pt, 1 125 | 126 | flunk "not done yet" 127 | end 128 | ``` 129 | 130 | (In this case copied and modified the tests for open ranges from 2.6) 131 | and run it to get my first error: 132 | 133 | ``` 134 | % rake N=/beginless/ 135 | 136 | ... 137 | 138 | E 139 | 140 | Finished in 0.021814s, 45.8421 runs/s, 0.0000 assertions/s. 141 | 142 | 1) Error: 143 | TestRubyParserV27#test_whatevs: 144 | Racc::ParseError: (string):1 :: parse error on value ".." (tDOT2) 145 | GEMS/2.7.0/gems/racc-1.5.0/lib/racc/parser.rb:538:in `on_error' 146 | WORK/ruby_parser/dev/lib/ruby_parser_extras.rb:1304:in `on_error' 147 | (eval):3:in `_racc_do_parse_c' 148 | (eval):3:in `do_parse' 149 | WORK/ruby_parser/dev/lib/ruby_parser_extras.rb:1329:in `block in process' 150 | RUBY/lib/ruby/2.7.0/timeout.rb:95:in `block in timeout' 151 | RUBY/lib/ruby/2.7.0/timeout.rb:33:in `block in catch' 152 | RUBY/lib/ruby/2.7.0/timeout.rb:33:in `catch' 153 | RUBY/lib/ruby/2.7.0/timeout.rb:33:in `catch' 154 | RUBY/lib/ruby/2.7.0/timeout.rb:110:in `timeout' 155 | WORK/ruby_parser/dev/lib/ruby_parser_extras.rb:1317:in `process' 156 | WORK/ruby_parser/dev/test/test_ruby_parser.rb:4198:in `assert_parse' 157 | WORK/ruby_parser/dev/test/test_ruby_parser.rb:4221:in `assert_parse_line' 158 | WORK/ruby_parser/dev/test/test_ruby_parser.rb:4451:in `test_whatevs' 159 | ``` 160 | 161 | For starters, we know the missing production is for `tBDOT2 arg`. It 162 | is currently blowing up because it is getting `tDOT2` and simply 163 | doesn't know what to do with it, so it raises the error. As the diff 164 | suggests, that's the wrong token to begin with, so it is probably time 165 | to also create a lexer test: 166 | 167 | ``` 168 | def test_yylex_bdot2 169 | assert_lex3("..42", 170 | s(:dot2, nil, s(:lit, 42)), 171 | 172 | :tBDOT2, "..", EXPR_BEG, 173 | :tINTEGER, "42", EXPR_NUM) 174 | 175 | flunk "not done yet" 176 | end 177 | ``` 178 | 179 | This one is mostly speculative at this point. It says "if we're lexing 180 | this string, we should get this sexp if we fully parse it, and the 181 | lexical stream should look like this"... That last bit is mostly made 182 | up at this point. Sometimes I don't know exactly what expression state 183 | things should be in until I start really digging in. 184 | 185 | At this point, I have 2 failing tests that are directing me in the 186 | right direction. It's now a matter of digging through 187 | `compare/parse26.y` to see how the lexer differs and implementing 188 | it... 189 | 190 | But this is a good start to the doco for now. I'll add more later. 191 | -------------------------------------------------------------------------------- /gauntlet.md: -------------------------------------------------------------------------------- 1 | # Running the Gauntlet 2 | 3 | ## Maintaining a Gem Mirror 4 | 5 | I use rubygems-mirror to keep an archive of all the latest rubygems on 6 | an external disk. 
Here is the config: 7 | 8 | ``` 9 | --- 10 | - from: https://rubygems.org 11 | to: /Volumes/StuffA/gauntlet/mirror 12 | parallelism: 10 13 | retries: 3 14 | delete: true 15 | skiperror: true 16 | hashdir: true 17 | ``` 18 | 19 | And I update using rake: 20 | 21 | ``` 22 | % cd GIT/rubygems/rubygems-mirror 23 | % git down 24 | % rake mirror:latest 25 | % /Volumes/StuffA/gauntlet/bin/cleanup.rb -y -v 26 | ``` 27 | 28 | This rather quickly updates my mirror to the latest versions of 29 | everything and then deletes all old versions. I then run a cleanup 30 | script that fixes the file dates to their publication date and deletes 31 | any gems that have invalid specs. This can argue with the mirror a 32 | bit, but it is pretty minimal (currently ~20 bad gems). 33 | 34 | ## Curating an Archive of Ruby Files 35 | 36 | Next, I process the gem mirror into a much more digestable structure 37 | using `unpack_gems.rb`. 38 | 39 | ``` 40 | % cd RP/gauntlet 41 | % time caffeinate /Volumes/StuffA/gauntlet/bin/unpack_gems.rb -v [-a] ; say done 42 | ... waaaait ... 43 | % DIR=gauntlet.$(today).(all|new).noindex 44 | % mv hashed.noindex $DIR 45 | % tar vc -T <(fd -tf . $DIR | sort) | zstd -5 -T0 --long > archives/$DIR.tar.zst ; say done 46 | % ./bin/sync.sh 47 | ``` 48 | 49 | This script filters all the newer (< 1 year old) gems (unless `-a` is 50 | used), unpacks them, finds all the files that look like they're valid 51 | ruby, ensures they're valid ruby (using the current version of ruby to 52 | compile them), and then moves them into a SHA dir structure that looks 53 | something like this: 54 | 55 | ``` 56 | hashed.noindex/a/b/c/.rb 57 | ``` 58 | 59 | This removes all duplicates and puts everything in a fairly even, 60 | wide, flat directory layout. 61 | 62 | This process takes a very long time, even with a lot of 63 | parallelization. There are currently about 160k gems in the mirror. 64 | Unpacking, validating, SHA'ing everything is disk and CPU intensive. 65 | The `.noindex` extension stops spotlight from indexing the continous 66 | churn of files being unpacked and moved and saves time. 67 | 68 | Finally, I rename and archive it all up (currently using zstd to 69 | compress). 70 | 71 | ### Stats 72 | 73 | ``` 74 | 9696 % find gauntlet.$(today).noindex -type f | lc 75 | 561270 76 | 3.5G gauntlet.2021-08-06.noindex 77 | 239M gauntlet.2021-08-06.noindex.tar.zst 78 | ``` 79 | 80 | So I wind up with a little over half a million unique ruby files to 81 | parse. It's about 3.5g but compresses very nicely down to 240m 82 | 83 | ## Running the Gauntlet 84 | 85 | Assuming you're starting from scratch, unpack the archive once: 86 | 87 | ``` 88 | % zstdcat gauntlet.$(today).noindex.tar.zst | tar x 89 | ``` 90 | 91 | Then, either run a single process (easier to read): 92 | 93 | ``` 94 | % ./gauntlet/bin/gauntlet.rb gauntlet/*.noindex/? 95 | ``` 96 | 97 | Or max out your machine using xargs (note the `-P 16` and choose accordingly): 98 | 99 | ``` 100 | % ls -d gauntlet/*.noindex/?/? | time xargs -n 1 -P 16 ./gauntlet/bin/gauntlet.rb 101 | ``` 102 | 103 | In another terminal I usually monitor the progress like so: 104 | 105 | ``` 106 | % while true ; do clear; fd . -t d -t e gauntlet/*.noindex -X rmdir -p 2> /dev/null ; for D in gauntlet/*.noindex/? 
; do echo -n "$D: "; fd .rb $D | wc -l ; done ; echo ; sleep 30 ; done 107 | ``` 108 | -------------------------------------------------------------------------------- /lib/.document: -------------------------------------------------------------------------------- 1 | *.rb 2 | -------------------------------------------------------------------------------- /lib/rp_extensions.rb: -------------------------------------------------------------------------------- 1 | # :stopdoc: 2 | # WHY do I have to do this?!? 3 | class Regexp 4 | ONCE = 0 unless defined? ONCE # FIX: remove this - it makes no sense 5 | 6 | unless defined? ENC_NONE then 7 | ENC_NONE = /x/n.options 8 | ENC_EUC = /x/e.options 9 | ENC_SJIS = /x/s.options 10 | ENC_UTF8 = /x/u.options 11 | end 12 | end 13 | # :startdoc: 14 | 15 | class Array 16 | def prepend *vals 17 | self[0,0] = vals 18 | end 19 | end unless [].respond_to?(:prepend) 20 | 21 | # :stopdoc: 22 | class Symbol 23 | def end_with? o 24 | self.to_s.end_with? o 25 | end 26 | end unless :woot.respond_to?(:end_with?) 27 | # :startdoc: 28 | 29 | ############################################################ 30 | # HACK HACK HACK HACK HACK HACK HACK HACK HACK HACK HACK HACK 31 | 32 | class String 33 | def clean_caller 34 | self.sub(File.dirname(__FILE__), "./lib").sub(/:in.*/, "") 35 | end if $DEBUG 36 | end 37 | 38 | require "sexp" 39 | 40 | class Sexp 41 | attr_writer :paren # TODO: retire 42 | 43 | def paren 44 | @paren ||= false 45 | end 46 | 47 | def block_pass? 48 | any? { |s| Sexp === s && s.sexp_type == :block_pass } 49 | end 50 | end 51 | 52 | # END HACK 53 | ############################################################ 54 | -------------------------------------------------------------------------------- /lib/rp_stringscanner.rb: -------------------------------------------------------------------------------- 1 | require "strscan" 2 | 3 | class RPStringScanner < StringScanner 4 | if ENV["DEBUG"] || ENV["TALLY"] then 5 | def getch 6 | c = super 7 | where = caller.drop_while { |s| s =~ /(getch|nextc).$/ }.first 8 | where = where.split(/:/).first(2).join(":") 9 | if ENV["TALLY"] then 10 | d getch:where 11 | else 12 | d getch:[c, where] 13 | end 14 | c 15 | end 16 | 17 | def scan re 18 | s = super 19 | where = caller.drop_while { |x| x =~ /scan.$/ }.first 20 | where = where.split(/:/).first(2).join(":") 21 | if ENV["TALLY"] then 22 | d scan:[where] 23 | else 24 | d scan:[s, where] if s 25 | end 26 | s 27 | end 28 | 29 | def d o 30 | STDERR.puts o.inspect 31 | end 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /lib/ruby_lexer.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | # encoding: UTF-8 3 | 4 | $DEBUG = true if ENV["DEBUG"] 5 | 6 | class RubyLexer 7 | # :stopdoc: 8 | EOF = :eof_haha! 9 | 10 | ESCAPES = { 11 | "a" => "\007", 12 | "b" => "\010", 13 | "e" => "\033", 14 | "f" => "\f", 15 | "n" => "\n", 16 | "r" => "\r", 17 | "s" => " ", 18 | "t" => "\t", 19 | "v" => "\13", 20 | "\\" => '\\', 21 | "\n" => "", 22 | "C-\?" => 127.chr, 23 | "c\?" => 127.chr, 24 | } 25 | 26 | BTOKENS = { 27 | ".." => :tBDOT2, 28 | "..." => :tBDOT3, 29 | } 30 | 31 | TOKENS = { 32 | "!" => :tBANG, 33 | "!=" => :tNEQ, 34 | "!@" => :tBANG, 35 | "!~" => :tNMATCH, 36 | "," => :tCOMMA, 37 | ".." => :tDOT2, 38 | "..." 
=> :tDOT3, 39 | "=" => :tEQL, 40 | "==" => :tEQ, 41 | "===" => :tEQQ, 42 | "=>" => :tASSOC, 43 | "=~" => :tMATCH, 44 | "->" => :tLAMBDA, 45 | } 46 | 47 | PERCENT_END = { 48 | "(" => ")", 49 | "[" => "]", 50 | "{" => "}", 51 | "<" => ">", 52 | } 53 | 54 | SIMPLE_RE_META = /[\$\*\+\.\?\^\|\)\]\}\>]/ 55 | 56 | @@regexp_cache = Hash.new { |h, k| h[k] = Regexp.new(Regexp.escape(k)) } 57 | @@regexp_cache[nil] = nil 58 | 59 | def regexp_cache 60 | @@regexp_cache 61 | end 62 | 63 | if $DEBUG then 64 | attr_reader :lex_state 65 | 66 | def lex_state= o 67 | return if @lex_state == o 68 | 69 | from = "" 70 | if ENV["VERBOSE"] 71 | path = caller[0] 72 | path = caller[1] if path =~ /result/ 73 | path, line, *_ = path.split(/:/) 74 | path.delete_prefix! File.dirname File.dirname __FILE__ 75 | from = " at .%s:%s" % [path, line] 76 | end 77 | 78 | warn "lex_state: %p -> %p%s" % [lex_state, o, from] 79 | 80 | @lex_state = o 81 | end 82 | end 83 | 84 | # :startdoc: 85 | 86 | attr_accessor :lex_state unless $DEBUG 87 | 88 | attr_accessor :brace_nest 89 | attr_accessor :cmdarg 90 | attr_accessor :command_start 91 | attr_accessor :cmd_state # temporary--ivar to avoid passing everywhere 92 | attr_accessor :last_state 93 | attr_accessor :cond 94 | attr_accessor :old_ss 95 | attr_accessor :old_lineno 96 | 97 | # these are generated via ruby_lexer.rex: ss, lineno 98 | 99 | ## 100 | # Additional context surrounding tokens that both the lexer and 101 | # grammar use. 102 | 103 | attr_accessor :lex_strterm 104 | attr_accessor :lpar_beg 105 | attr_accessor :paren_nest 106 | attr_accessor :parser # HACK for very end of lexer... *sigh* 107 | attr_accessor :space_seen 108 | attr_accessor :string_buffer 109 | attr_accessor :string_nest 110 | 111 | # Last token read via next_token. 112 | attr_accessor :token 113 | 114 | # Last comment lexed, or nil 115 | attr_accessor :comment 116 | 117 | def initialize _ = nil 118 | @lex_state = nil # remove one warning under $DEBUG 119 | @lex_state = EXPR_NONE 120 | 121 | self.cond = RubyParserStuff::StackState.new(:cond, $DEBUG) 122 | self.cmdarg = RubyParserStuff::StackState.new(:cmdarg, $DEBUG) 123 | self.ss = RPStringScanner.new "" 124 | 125 | reset 126 | end 127 | 128 | def arg_ambiguous 129 | self.warning "Ambiguous first argument. make sure." 130 | end 131 | 132 | def arg_state 133 | is_after_operator? ? EXPR_ARG : EXPR_BEG 134 | end 135 | 136 | def debug n 137 | raise "debug #{n}" 138 | end 139 | 140 | def expr_dot? 141 | lex_state =~ EXPR_DOT 142 | end 143 | 144 | def expr_fname? # REFACTOR 145 | lex_state =~ EXPR_FNAME 146 | end 147 | 148 | def expr_result token, text 149 | cond.push false 150 | cmdarg.push false 151 | result EXPR_BEG, token, text 152 | end 153 | 154 | def in_fname? # REFACTOR 155 | lex_state =~ EXPR_FNAME 156 | end 157 | 158 | def int_with_base base 159 | rb_compile_error "Invalid numeric format" if matched =~ /__/ 160 | 161 | text = matched 162 | case 163 | when text.end_with?("ri") 164 | result EXPR_NUM, :tIMAGINARY, Complex(0, Rational(text.chop.chop.to_i(base))) 165 | when text.end_with?("r") 166 | result EXPR_NUM, :tRATIONAL, Rational(text.chop.to_i(base)) 167 | when text.end_with?("i") 168 | result EXPR_NUM, :tIMAGINARY, Complex(0, text.chop.to_i(base)) 169 | else 170 | result EXPR_NUM, :tINTEGER, text.to_i(base) 171 | end 172 | end 173 | 174 | def is_after_operator? 175 | lex_state =~ EXPR_FNAME|EXPR_DOT 176 | end 177 | 178 | def is_arg? 179 | lex_state =~ EXPR_ARG_ANY 180 | end 181 | 182 | def is_beg? 
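    # true when the lexer is at a spot where an expression can begin
    # (EXPR_LAB counts as such a spot, too)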
183 | lex_state =~ EXPR_BEG_ANY || lex_state == EXPR_LAB # yes, == EXPR_LAB 184 | end 185 | 186 | def is_end? 187 | lex_state =~ EXPR_END_ANY 188 | end 189 | 190 | def is_label_possible? 191 | (lex_state =~ EXPR_LABEL|EXPR_ENDFN && !cmd_state) || is_arg? 192 | end 193 | 194 | def is_label_suffix? 195 | check(/:(?!:)/) 196 | end 197 | 198 | def is_space_arg? c = "x" 199 | is_arg? and space_seen and c !~ /\s/ 200 | end 201 | 202 | def lambda_beginning? 203 | lpar_beg && lpar_beg == paren_nest 204 | end 205 | 206 | def is_local_id id 207 | # maybe just make this false for now 208 | self.parser.env[id.to_sym] == :lvar # HACK: this isn't remotely right 209 | end 210 | 211 | def lvar_defined? id 212 | # TODO: (dyna_in_block? && dvar_defined?(id)) || local_id?(id) 213 | self.parser.env[id.to_sym] == :lvar 214 | end 215 | 216 | def not_end? 217 | not is_end? 218 | end 219 | 220 | def possibly_escape_string text, check 221 | content = match[1] 222 | 223 | if text =~ check then 224 | unescape_string content 225 | else 226 | content.gsub(/\\\\/, "\\").gsub(/\\\'/, "'") 227 | end 228 | end 229 | 230 | def process_amper text 231 | token = if is_arg? && space_seen && !check(/\s/) then 232 | warning("`&' interpreted as argument prefix") 233 | :tAMPER 234 | elsif lex_state =~ EXPR_BEG|EXPR_MID then 235 | :tAMPER 236 | else 237 | :tAMPER2 238 | end 239 | 240 | result :arg_state, token, "&" 241 | end 242 | 243 | def process_backref text 244 | token = match[1].to_sym 245 | # TODO: can't do lineno hack w/ symbol 246 | result EXPR_END, :tBACK_REF, token 247 | end 248 | 249 | def process_begin text 250 | self.comment ||= +"" 251 | self.comment << matched 252 | 253 | unless scan(/.*?\n=end( |\t|\f)*[^\n]*(\n|\z)/m) then 254 | self.comment = nil 255 | rb_compile_error("embedded document meets end of file") 256 | end 257 | 258 | self.comment << matched 259 | self.lineno += matched.count("\n") # HACK? 260 | 261 | nil # TODO 262 | end 263 | 264 | # TODO: make all tXXXX terminals include lexer.lineno ... enforce it somehow? 265 | 266 | def process_brace_close text 267 | case matched 268 | when "}" then 269 | self.brace_nest -= 1 270 | return :tSTRING_DEND, matched if brace_nest < 0 271 | end 272 | 273 | # matching compare/parse26.y:8099 274 | cond.pop 275 | cmdarg.pop 276 | 277 | case matched 278 | when "}" then 279 | self.lex_state = ruby24minus? ? EXPR_ENDARG : EXPR_END 280 | return :tRCURLY, matched 281 | when "]" then 282 | self.paren_nest -= 1 283 | self.lex_state = ruby24minus? ? EXPR_ENDARG : EXPR_END 284 | return :tRBRACK, matched 285 | when ")" then 286 | self.paren_nest -= 1 287 | self.lex_state = EXPR_ENDFN 288 | return :tRPAREN, matched 289 | else 290 | raise "Unknown bracing: #{matched.inspect}" 291 | end 292 | end 293 | 294 | def process_brace_open text 295 | # matching compare/parse23.y:8694 296 | self.brace_nest += 1 297 | 298 | if lambda_beginning? then 299 | self.lpar_beg = nil 300 | self.paren_nest -= 1 # close arg list when lambda opens body 301 | 302 | return expr_result(:tLAMBEG, "{") 303 | end 304 | 305 | token = case 306 | when lex_state =~ EXPR_LABELED then 307 | :tLBRACE # hash 308 | when lex_state =~ EXPR_ARG_ANY|EXPR_END|EXPR_ENDFN then 309 | :tLCURLY # block (primary) "{" in parse.y 310 | when lex_state =~ EXPR_ENDARG then 311 | :tLBRACE_ARG # block (expr) 312 | else 313 | :tLBRACE # hash 314 | end 315 | 316 | state = token == :tLBRACE_ARG ? 
EXPR_BEG : EXPR_PAR 317 | self.command_start = true if token != :tLBRACE 318 | 319 | cond.push false 320 | cmdarg.push false 321 | result state, token, text 322 | end 323 | 324 | def process_colon1 text 325 | # ?: / then / when 326 | if is_end? || check(/\s/) then 327 | return result EXPR_BEG, :tCOLON, text 328 | end 329 | 330 | case 331 | when scan(/\'/) then 332 | string STR_SSYM, matched 333 | when scan(/\"/) then 334 | string STR_DSYM, matched 335 | end 336 | 337 | result EXPR_FNAME, :tSYMBEG, text 338 | end 339 | 340 | def process_colon2 text 341 | if is_beg? || lex_state =~ EXPR_CLASS || is_space_arg? then 342 | result EXPR_BEG, :tCOLON3, text 343 | else 344 | result EXPR_DOT, :tCOLON2, text 345 | end 346 | end 347 | 348 | def process_dots text # parse32.y:10216 349 | is_beg = self.is_beg? 350 | self.lex_state = EXPR_BEG 351 | 352 | return result EXPR_ENDARG, :tBDOT3, text if 353 | parser.in_argdef && text == "..." # TODO: version check? 354 | 355 | tokens = ruby27plus? && is_beg ? BTOKENS : TOKENS 356 | 357 | result EXPR_BEG, tokens[text], text 358 | end 359 | 360 | def process_float text 361 | rb_compile_error "Invalid numeric format" if text =~ /__/ 362 | 363 | case 364 | when text.end_with?("ri") 365 | result EXPR_NUM, :tIMAGINARY, Complex(0, Rational(text.chop.chop)) 366 | when text.end_with?("i") 367 | result EXPR_NUM, :tIMAGINARY, Complex(0, text.chop.to_f) 368 | when text.end_with?("r") 369 | result EXPR_NUM, :tRATIONAL, Rational(text.chop) 370 | else 371 | result EXPR_NUM, :tFLOAT, text.to_f 372 | end 373 | end 374 | 375 | def process_gvar text 376 | if parser.class.version > 20 && text == "$-" then 377 | rb_compile_error "unexpected $undefined" 378 | end 379 | 380 | result EXPR_END, :tGVAR, text 381 | end 382 | 383 | def process_gvar_oddity text 384 | rb_compile_error "#{text.inspect} is not allowed as a global variable name" 385 | end 386 | 387 | def process_ivar text 388 | tok_id = text =~ /^@@/ ? :tCVAR : :tIVAR 389 | result EXPR_END, tok_id, text 390 | end 391 | 392 | def process_label text 393 | symbol = possibly_escape_string text, /^\"/ 394 | 395 | result EXPR_LAB, :tLABEL, symbol 396 | end 397 | 398 | def process_label_or_string text 399 | if @was_label && text =~ /:\Z/ then 400 | @was_label = nil 401 | return process_label text 402 | elsif text =~ /:\Z/ then 403 | self.pos -= 1 # put back ":" 404 | text = text[0..-2] 405 | end 406 | 407 | orig_line = lineno 408 | str = text[1..-2].gsub(/\\\\/, "\\").gsub(/\\\'/, "\'") 409 | self.lineno += str.count("\n") 410 | 411 | result EXPR_END, :tSTRING, str, orig_line 412 | end 413 | 414 | def process_lchevron text 415 | if (lex_state !~ EXPR_DOT|EXPR_CLASS && 416 | !is_end? && 417 | (!is_arg? || lex_state =~ EXPR_LABELED || space_seen)) then 418 | tok = self.heredoc_identifier 419 | return tok if tok 420 | end 421 | 422 | if is_after_operator? then 423 | self.lex_state = EXPR_ARG 424 | else 425 | self.command_start = true if lex_state =~ EXPR_CLASS 426 | self.lex_state = EXPR_BEG 427 | end 428 | 429 | result lex_state, :tLSHFT, "\<\<" 430 | end 431 | 432 | def process_newline_or_comment text # ../compare/parse30.y:9126 ish 433 | c = matched 434 | 435 | if c == "#" then 436 | self.pos -= 1 437 | 438 | while scan(/\s*\#.*(\n+|\z)/) do 439 | self.lineno += matched.count "\n" 440 | self.comment ||= +"" 441 | self.comment << matched.gsub(/^ +#/, "#").gsub(/^ +$/, "") 442 | end 443 | 444 | return nil if end_of_stream? 
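      # comment line(s) consumed without hitting EOF; fall through to the
      # regular newline handling below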
445 | end 446 | 447 | c = (lex_state =~ EXPR_BEG|EXPR_CLASS|EXPR_FNAME|EXPR_DOT && 448 | lex_state !~ EXPR_LABELED) 449 | if c || self.lex_state == EXPR_LAB then # yes, == EXPR_LAB 450 | # ignore if !fallthrough? 451 | if !c && parser.in_kwarg then 452 | # normal newline 453 | self.command_start = true 454 | return result EXPR_BEG, :tNL, nil 455 | else 456 | maybe_pop_stack 457 | return # goto retry 458 | end 459 | end 460 | 461 | if scan(/[\ \t\r\f\v]+/) then 462 | self.space_seen = true 463 | end 464 | 465 | if check(/#/) then 466 | return # goto retry 467 | elsif check(/&\.|\.(?!\.)/) then # C version is a hellish obfuscated xnor 468 | return # goto retry 469 | end 470 | 471 | self.command_start = true 472 | 473 | result EXPR_BEG, :tNL, nil 474 | end 475 | 476 | def process_nthref text 477 | # TODO: can't do lineno hack w/ number 478 | result EXPR_END, :tNTH_REF, match[1].to_i 479 | end 480 | 481 | def process_paren text 482 | token = if is_beg? then 483 | :tLPAREN 484 | elsif !space_seen then 485 | # foo( ... ) => method call, no ambiguity 486 | :tLPAREN2 487 | elsif is_space_arg? then 488 | :tLPAREN_ARG 489 | elsif lex_state =~ EXPR_ENDFN && !lambda_beginning? then 490 | # TODO: 491 | # warn("parentheses after method name is interpreted as " \ 492 | # "an argument list, not a decomposed argument") 493 | :tLPAREN2 494 | else 495 | :tLPAREN2 # plain "(" in parse.y 496 | end 497 | 498 | self.paren_nest += 1 499 | 500 | cond.push false 501 | cmdarg.push false 502 | result EXPR_PAR, token, text 503 | end 504 | 505 | def process_percent text 506 | case 507 | when is_beg? then 508 | process_percent_quote 509 | when scan(/\=/) 510 | result EXPR_BEG, :tOP_ASGN, "%" 511 | when is_space_arg?(check(/\s/)) || (lex_state =~ EXPR_FITEM && check(/s/)) 512 | process_percent_quote 513 | else 514 | result :arg_state, :tPERCENT, "%" 515 | end 516 | end 517 | 518 | def process_plus_minus text 519 | sign = matched 520 | utype, type = if sign == "+" then 521 | [:tUPLUS, :tPLUS] 522 | else 523 | [:tUMINUS, :tMINUS] 524 | end 525 | 526 | if is_after_operator? then 527 | if scan(/@/) then 528 | return result(EXPR_ARG, utype, "#{sign}@") 529 | else 530 | return result(EXPR_ARG, type, sign) 531 | end 532 | end 533 | 534 | return result(EXPR_BEG, :tOP_ASGN, sign) if scan(/\=/) 535 | 536 | if is_beg? || (is_arg? && space_seen && !check(/\s/)) then 537 | arg_ambiguous if is_arg? 538 | 539 | if check(/\d/) then 540 | return nil if utype == :tUPLUS 541 | return result EXPR_BEG, :tUMINUS_NUM, sign 542 | end 543 | 544 | return result EXPR_BEG, utype, sign 545 | end 546 | 547 | result EXPR_BEG, type, sign 548 | end 549 | 550 | def process_questionmark text 551 | if is_end? then 552 | return result EXPR_BEG, :tEH, "?" 553 | end 554 | 555 | if end_of_stream? then 556 | rb_compile_error "incomplete character syntax: parsed #{text.inspect}" 557 | end 558 | 559 | if check(/\s|\v/) then 560 | unless is_arg? then 561 | c2 = { " " => "s", 562 | "\n" => "n", 563 | "\t" => "t", 564 | "\v" => "v", 565 | "\r" => "r", 566 | "\f" => "f" }[matched] 567 | 568 | if c2 then 569 | warning("invalid character syntax; use ?\\" + c2) 570 | end 571 | end 572 | 573 | # ternary 574 | return result EXPR_BEG, :tEH, "?" 575 | elsif check(/\w(?=\w)/) then # ternary, also 576 | return result EXPR_BEG, :tEH, "?" 
577 | end 578 | 579 | c = if scan(/\\/) then 580 | self.read_escape 581 | else 582 | getch 583 | end 584 | 585 | result EXPR_END, :tSTRING, c 586 | end 587 | 588 | def process_simple_string text 589 | orig_line = lineno 590 | self.lineno += text.count("\n") 591 | 592 | str = unescape_string text[1..-2] 593 | 594 | result EXPR_END, :tSTRING, str, orig_line 595 | end 596 | 597 | def process_slash text 598 | if is_beg? then 599 | string STR_REGEXP, matched 600 | 601 | return result nil, :tREGEXP_BEG, "/" 602 | end 603 | 604 | if scan(/\=/) then 605 | return result(EXPR_BEG, :tOP_ASGN, "/") 606 | end 607 | 608 | if is_arg? && space_seen then 609 | unless scan(/\s/) then 610 | arg_ambiguous 611 | string STR_REGEXP, "/" 612 | return result(nil, :tREGEXP_BEG, "/") 613 | end 614 | end 615 | 616 | result :arg_state, :tDIVIDE, "/" 617 | end 618 | 619 | def process_square_bracket text 620 | self.paren_nest += 1 621 | 622 | token = nil 623 | 624 | if is_after_operator? then 625 | case 626 | when scan(/\]\=/) then 627 | self.paren_nest -= 1 # HACK? I dunno, or bug in MRI 628 | return result EXPR_ARG, :tASET, "[]=" 629 | when scan(/\]/) then 630 | self.paren_nest -= 1 # HACK? I dunno, or bug in MRI 631 | return result EXPR_ARG, :tAREF, "[]" 632 | else 633 | rb_compile_error "unexpected '['" 634 | end 635 | elsif is_beg? then 636 | token = :tLBRACK 637 | elsif is_arg? && (space_seen || lex_state =~ EXPR_LABELED) then 638 | token = :tLBRACK 639 | else 640 | token = :tLBRACK2 641 | end 642 | 643 | cond.push false 644 | cmdarg.push false 645 | result EXPR_PAR, token, text 646 | end 647 | 648 | def process_symbol text 649 | symbol = possibly_escape_string text, /^:\"/ # stupid emacs 650 | 651 | result EXPR_LIT, :tSYMBOL, symbol 652 | end 653 | 654 | def process_token text 655 | # matching: parse_ident in compare/parse23.y:7989 656 | # FIX: remove: self.last_state = lex_state 657 | 658 | token = self.token = text 659 | token << matched if scan(/[\!\?](?!=)/) 660 | 661 | tok_id = 662 | case 663 | when token =~ /[!?]$/ then 664 | :tFID 665 | when lex_state =~ EXPR_FNAME && scan(/=(?:(?![~>=])|(?==>))/) then 666 | # ident=, not =~ => == or followed by => 667 | # TODO test lexing of a=>b vs a==>b 668 | token << matched 669 | :tIDENTIFIER 670 | when token =~ /^[A-Z]/ then 671 | :tCONSTANT 672 | else 673 | :tIDENTIFIER 674 | end 675 | 676 | if is_label_possible? and is_label_suffix? then 677 | scan(/:/) 678 | return result EXPR_LAB, :tLABEL, token 679 | end 680 | 681 | # TODO: mb == ENC_CODERANGE_7BIT && lex_state !~ EXPR_DOT 682 | if lex_state !~ EXPR_DOT then 683 | # See if it is a reserved word. 684 | keyword = RubyParserStuff::Keyword.keyword token 685 | 686 | return process_token_keyword keyword if keyword 687 | end 688 | 689 | # matching: compare/parse32.y:9031 690 | state = if lex_state =~ EXPR_BEG_ANY|EXPR_ARG_ANY|EXPR_DOT then 691 | cmd_state ? 
EXPR_CMDARG : EXPR_ARG 692 | elsif lex_state =~ EXPR_FNAME then 693 | EXPR_ENDFN 694 | else 695 | EXPR_END 696 | end 697 | self.lex_state = state 698 | 699 | tok_id = :tIDENTIFIER if tok_id == :tCONSTANT && is_local_id(token) 700 | 701 | if last_state !~ EXPR_DOT|EXPR_FNAME and 702 | (tok_id == :tIDENTIFIER) and # not EXPR_FNAME, not attrasgn 703 | lvar_defined?(token) then 704 | state = EXPR_END|EXPR_LABEL 705 | end 706 | 707 | result state, tok_id, token 708 | end 709 | 710 | def process_token_keyword keyword 711 | # matching MIDDLE of parse_ident in compare/parse32.y:9695 712 | state = lex_state 713 | 714 | return result(EXPR_ENDFN, keyword.id0, token) if lex_state =~ EXPR_FNAME 715 | 716 | self.lex_state = keyword.state 717 | self.command_start = true if lex_state =~ EXPR_BEG 718 | 719 | case 720 | when keyword.id0 == :kDO then # parse32.y line 9712 721 | case 722 | when lambda_beginning? then 723 | self.lpar_beg = nil # lambda_beginning? == FALSE in the body of "-> do ... end" 724 | self.paren_nest -= 1 # TODO: question this? 725 | result lex_state, :kDO_LAMBDA, token 726 | when cond.is_in_state then 727 | result lex_state, :kDO_COND, token 728 | when cmdarg.is_in_state && state != EXPR_CMDARG then 729 | result lex_state, :kDO_BLOCK, token 730 | else 731 | result lex_state, :kDO, token 732 | end 733 | when state =~ EXPR_PAD then 734 | result lex_state, keyword.id0, token 735 | when keyword.id0 != keyword.id1 then 736 | result EXPR_PAR, keyword.id1, token 737 | else 738 | result lex_state, keyword.id1, token 739 | end 740 | end 741 | 742 | def process_underscore text 743 | self.unscan # put back "_" 744 | 745 | if beginning_of_line? && scan(/\__END__(\r?\n|\Z)/) then 746 | ss.terminate 747 | [RubyLexer::EOF, RubyLexer::EOF] 748 | elsif scan(/#{IDENT_CHAR}+/) then 749 | process_token matched 750 | end 751 | end 752 | 753 | def rb_compile_error msg 754 | msg += ". near line #{self.lineno}: #{self.rest[/^.*/].inspect}" 755 | raise RubyParser::SyntaxError, msg 756 | end 757 | 758 | def reset 759 | self.lineno = 1 760 | self.brace_nest = 0 761 | self.command_start = true 762 | self.comment = nil 763 | self.lex_state = EXPR_NONE 764 | self.lex_strterm = nil 765 | self.lpar_beg = nil 766 | self.paren_nest = 0 767 | self.space_seen = false 768 | self.string_nest = 0 769 | self.token = nil 770 | self.string_buffer = [] 771 | self.old_ss = nil 772 | self.old_lineno = nil 773 | 774 | self.cond.reset 775 | self.cmdarg.reset 776 | end 777 | 778 | def result new_state, token, text, line = self.lineno # :nodoc: 779 | new_state = self.arg_state if new_state == :arg_state 780 | self.lex_state = new_state if new_state 781 | 782 | [token, [text, line]] 783 | end 784 | 785 | def ruby22_label? 786 | ruby22plus? and is_label_possible? 787 | end 788 | 789 | def ruby22plus? 790 | parser.class.version >= 22 791 | end 792 | 793 | def ruby23plus? 794 | parser.class.version >= 23 795 | end 796 | 797 | def ruby24minus? 798 | parser.class.version <= 24 799 | end 800 | 801 | def ruby27plus? 802 | parser.class.version >= 27 803 | end 804 | 805 | def space_vs_beginning space_type, beg_type, fallback 806 | if is_space_arg? check(/./m) then 807 | warning "`**' interpreted as argument prefix" 808 | space_type 809 | elsif is_beg? then 810 | beg_type 811 | else 812 | # TODO: warn_balanced("**", "argument prefix"); 813 | fallback 814 | end 815 | end 816 | 817 | def unescape_string str 818 | str = str.gsub(ESC) { unescape($1).b.force_encoding Encoding::UTF_8 } 819 | if str.valid_encoding? 
820 | str 821 | else 822 | str.b 823 | end 824 | end 825 | 826 | def unescape s 827 | r = ESCAPES[s] 828 | 829 | return r if r 830 | 831 | x = case s 832 | when /^[0-7]{1,3}/ then 833 | ($&.to_i(8) & 0xFF).chr 834 | when /^x([0-9a-fA-F]{1,2})/ then 835 | $1.to_i(16).chr 836 | when /^M-(.)/ then 837 | ($1[0].ord | 0x80).chr 838 | when /^(C-|c)(.)/ then 839 | ($2[0].ord & 0x9f).chr 840 | when /^[89a-f]/i then # bad octal or hex... ignore? that's what MRI does :( 841 | s 842 | when /^[McCx0-9]/ then 843 | rb_compile_error("Invalid escape character syntax") 844 | when /u(\h{4})/ then 845 | [$1.delete("{}").to_i(16)].pack("U") 846 | when /u(\h{1,3})/ then 847 | rb_compile_error("Invalid escape character syntax") 848 | when /u\{(\h+(?:\s+\h+)*)\}/ then 849 | $1.split.map { |cp| cp.to_i(16) }.pack("U*") 850 | else 851 | s 852 | end 853 | x 854 | end 855 | 856 | def warning s 857 | # do nothing for now 858 | end 859 | 860 | def was_label? 861 | @was_label = ruby22_label? 862 | true 863 | end 864 | 865 | class State 866 | attr_accessor :n 867 | attr_accessor :names 868 | 869 | # TODO: take a shared hash of strings for inspect/to_s 870 | def initialize o, names 871 | raise ArgumentError, "bad state: %p" % [o] unless Integer === o # TODO: remove 872 | 873 | self.n = o 874 | self.names = names 875 | end 876 | 877 | def == o 878 | self.equal?(o) || (o.class == self.class && o.n == self.n) 879 | end 880 | 881 | def =~ v 882 | (self.n & v.n) != 0 883 | end 884 | 885 | def | v 886 | raise ArgumentError, "Incompatible State: %p vs %p" % [self, v] unless 887 | self.names == v.names 888 | self.class.new(self.n | v.n, self.names) 889 | end 890 | 891 | def inspect 892 | return "EXPR_NONE" if n.zero? # HACK? 893 | 894 | names.map { |v, k| k if self =~ v }. 895 | compact. 896 | join("|"). 
897 | gsub(/(?:EXPR_|STR_(?:FUNC_)?)/, "") 898 | end 899 | 900 | alias to_s inspect 901 | 902 | module Values 903 | expr_names = {} 904 | 905 | EXPR_NONE = State.new 0x0, expr_names 906 | EXPR_BEG = State.new 0x1, expr_names 907 | EXPR_END = State.new 0x2, expr_names 908 | EXPR_ENDARG = State.new 0x4, expr_names 909 | EXPR_ENDFN = State.new 0x8, expr_names 910 | EXPR_ARG = State.new 0x10, expr_names 911 | EXPR_CMDARG = State.new 0x20, expr_names 912 | EXPR_MID = State.new 0x40, expr_names 913 | EXPR_FNAME = State.new 0x80, expr_names 914 | EXPR_DOT = State.new 0x100, expr_names 915 | EXPR_CLASS = State.new 0x200, expr_names 916 | EXPR_LABEL = State.new 0x400, expr_names 917 | EXPR_LABELED = State.new 0x800, expr_names 918 | EXPR_FITEM = State.new 0x1000, expr_names 919 | 920 | EXPR_BEG_ANY = EXPR_BEG | EXPR_MID | EXPR_CLASS 921 | EXPR_ARG_ANY = EXPR_ARG | EXPR_CMDARG 922 | EXPR_END_ANY = EXPR_END | EXPR_ENDARG | EXPR_ENDFN 923 | 924 | # extra fake lex_state names to make things a bit cleaner 925 | 926 | EXPR_LAB = EXPR_ARG|EXPR_LABELED 927 | EXPR_LIT = EXPR_END|EXPR_ENDARG 928 | EXPR_PAR = EXPR_BEG|EXPR_LABEL 929 | EXPR_PAD = EXPR_BEG|EXPR_LABELED 930 | 931 | EXPR_NUM = EXPR_LIT 932 | 933 | expr_names.merge!(EXPR_NONE => "EXPR_NONE", 934 | EXPR_BEG => "EXPR_BEG", 935 | EXPR_END => "EXPR_END", 936 | EXPR_ENDARG => "EXPR_ENDARG", 937 | EXPR_ENDFN => "EXPR_ENDFN", 938 | EXPR_ARG => "EXPR_ARG", 939 | EXPR_CMDARG => "EXPR_CMDARG", 940 | EXPR_MID => "EXPR_MID", 941 | EXPR_FNAME => "EXPR_FNAME", 942 | EXPR_DOT => "EXPR_DOT", 943 | EXPR_CLASS => "EXPR_CLASS", 944 | EXPR_LABEL => "EXPR_LABEL", 945 | EXPR_LABELED => "EXPR_LABELED", 946 | EXPR_FITEM => "EXPR_FITEM") 947 | 948 | # ruby constants for strings 949 | 950 | str_func_names = {} 951 | 952 | STR_FUNC_BORING = State.new 0x00, str_func_names 953 | STR_FUNC_ESCAPE = State.new 0x01, str_func_names 954 | STR_FUNC_EXPAND = State.new 0x02, str_func_names 955 | STR_FUNC_REGEXP = State.new 0x04, str_func_names 956 | STR_FUNC_QWORDS = State.new 0x08, str_func_names 957 | STR_FUNC_SYMBOL = State.new 0x10, str_func_names 958 | STR_FUNC_INDENT = State.new 0x20, str_func_names # <<-HEREDOC 959 | STR_FUNC_LABEL = State.new 0x40, str_func_names 960 | STR_FUNC_LIST = State.new 0x4000, str_func_names 961 | STR_FUNC_TERM = State.new 0x8000, str_func_names 962 | STR_FUNC_DEDENT = State.new 0x10000, str_func_names # <<~HEREDOC 963 | 964 | # TODO: check parser25.y on how they do STR_FUNC_INDENT 965 | 966 | STR_SQUOTE = STR_FUNC_BORING 967 | STR_DQUOTE = STR_FUNC_EXPAND 968 | STR_XQUOTE = STR_FUNC_EXPAND 969 | STR_REGEXP = STR_FUNC_REGEXP | STR_FUNC_ESCAPE | STR_FUNC_EXPAND 970 | STR_SWORD = STR_FUNC_QWORDS | STR_FUNC_LIST 971 | STR_DWORD = STR_FUNC_QWORDS | STR_FUNC_EXPAND | STR_FUNC_LIST 972 | STR_SSYM = STR_FUNC_SYMBOL 973 | STR_DSYM = STR_FUNC_SYMBOL | STR_FUNC_EXPAND 974 | STR_LABEL = STR_FUNC_LABEL 975 | 976 | str_func_names.merge!(STR_FUNC_ESCAPE => "STR_FUNC_ESCAPE", 977 | STR_FUNC_EXPAND => "STR_FUNC_EXPAND", 978 | STR_FUNC_REGEXP => "STR_FUNC_REGEXP", 979 | STR_FUNC_QWORDS => "STR_FUNC_QWORDS", 980 | STR_FUNC_SYMBOL => "STR_FUNC_SYMBOL", 981 | STR_FUNC_INDENT => "STR_FUNC_INDENT", 982 | STR_FUNC_LABEL => "STR_FUNC_LABEL", 983 | STR_FUNC_LIST => "STR_FUNC_LIST", 984 | STR_FUNC_TERM => "STR_FUNC_TERM", 985 | STR_FUNC_DEDENT => "STR_FUNC_DEDENT", 986 | STR_SQUOTE => "STR_SQUOTE") 987 | end 988 | 989 | include Values 990 | end 991 | 992 | include State::Values 993 | end 994 | 995 | class RubyLexer 996 | module SSWrapper 997 | def string= s 998 | ss.string= s 
999 | end 1000 | 1001 | def beginning_of_line? 1002 | ss.bol? 1003 | end 1004 | 1005 | alias bol? beginning_of_line? # to make .rex file more readable 1006 | 1007 | def check re 1008 | maybe_pop_stack 1009 | 1010 | ss.check re 1011 | end 1012 | 1013 | def end_of_stream? 1014 | ss.eos? 1015 | end 1016 | 1017 | alias eos? end_of_stream? 1018 | 1019 | def getch 1020 | c = ss.getch 1021 | c = ss.getch if c == "\r" && ss.peek(1) == "\n" 1022 | c 1023 | end 1024 | 1025 | def match 1026 | ss 1027 | end 1028 | 1029 | def matched 1030 | ss.matched 1031 | end 1032 | 1033 | def in_heredoc? 1034 | !!self.old_ss 1035 | end 1036 | 1037 | def maybe_pop_stack 1038 | if ss.eos? && in_heredoc? then 1039 | self.ss_pop 1040 | self.lineno_pop 1041 | end 1042 | end 1043 | 1044 | def pos 1045 | ss.pos 1046 | end 1047 | 1048 | def pos= n 1049 | ss.pos = n 1050 | end 1051 | 1052 | def rest 1053 | ss.rest 1054 | end 1055 | 1056 | def scan re 1057 | maybe_pop_stack 1058 | 1059 | ss.scan re 1060 | end 1061 | 1062 | def scanner_class # TODO: design this out of oedipus_lex. or something. 1063 | RPStringScanner 1064 | end 1065 | 1066 | def ss_string 1067 | ss.string 1068 | end 1069 | 1070 | def ss_string= s 1071 | raise "Probably not" 1072 | ss.string = s 1073 | end 1074 | 1075 | def unscan 1076 | ss.unscan 1077 | end 1078 | end 1079 | 1080 | include SSWrapper 1081 | end 1082 | 1083 | class RubyLexer 1084 | module SSStackish 1085 | def lineno_push new_lineno 1086 | self.old_lineno = self.lineno 1087 | self.lineno = new_lineno 1088 | end 1089 | 1090 | def lineno_pop 1091 | self.lineno = self.old_lineno 1092 | self.old_lineno = nil 1093 | end 1094 | 1095 | def ss= o 1096 | raise "Clearing ss while in heredoc!?!" if in_heredoc? 1097 | @old_ss = nil 1098 | super 1099 | end 1100 | 1101 | def ss_push new_ss 1102 | @old_ss = self.ss 1103 | @ss = new_ss 1104 | end 1105 | 1106 | def ss_pop 1107 | @ss = self.old_ss 1108 | @old_ss = nil 1109 | end 1110 | end 1111 | 1112 | prepend SSStackish 1113 | end 1114 | 1115 | if ENV["RP_STRTERM_DEBUG"] then 1116 | class RubyLexer 1117 | def d o 1118 | $stderr.puts o.inspect 1119 | end 1120 | 1121 | alias old_lex_strterm= lex_strterm= 1122 | 1123 | def lex_strterm= o 1124 | self.old_lex_strterm= o 1125 | where = caller.first.split(/:/).first(2).join(":") 1126 | $stderr.puts 1127 | d :lex_strterm => [o, where] 1128 | end 1129 | end 1130 | end 1131 | 1132 | require_relative "./ruby_lexer.rex.rb" 1133 | require_relative "./ruby_lexer_strings.rb" 1134 | 1135 | if ENV["RP_LINENO_DEBUG"] then 1136 | class RubyLexer 1137 | def d o 1138 | $stderr.puts o.inspect 1139 | end 1140 | 1141 | alias old_lineno= lineno= 1142 | 1143 | def lineno= n 1144 | self.old_lineno= n 1145 | where = caller.first.split(/:/).first(2).join(":") 1146 | $stderr.puts 1147 | d :lineno => [n, where] 1148 | end 1149 | end 1150 | end 1151 | -------------------------------------------------------------------------------- /lib/ruby_lexer.rex: -------------------------------------------------------------------------------- 1 | # encoding: UTF-8 2 | # 3 | # lexical scanner definition for ruby 4 | 5 | class RubyLexer 6 | 7 | option 8 | 9 | lineno 10 | column 11 | 12 | macro 13 | 14 | IDENT_CHAR /[a-zA-Z0-9_[:^ascii:]]/ 15 | 16 | ESC /\\((?>[0-7]{1,3}|x\h{1,2}|M-[^\\]|(C-|c)[^\\]|u\h{1,4}|u\{\h+(?:\s+\h+)*\}|[^0-7xMCc]))/ 17 | SIMPLE_STRING /((#{ESC}|\#(#{ESC}|[^\{\#\@\$\"\\])|[^\"\\\#])*)/o 18 | SSTRING /((\\.|[^\'])*)/ 19 | 20 | INT_DEC /[+]?(?:(?:[1-9][\d_]*|0)(?!\.\d)(ri|r|i)?\b|0d[0-9_]+)(ri|r|i)?/i 21 | INT_HEX 
/[+]?0x[a-f0-9_]+(ri|r|i)?/i 22 | INT_BIN /[+]?0b[01_]+(ri|r|i)?/i 23 | INT_OCT /[+]?0o?[0-7_]+(ri|r|i)?|0o(ri|r|i)?/i 24 | FLOAT /[+]?\d[\d_]*\.[\d_]+(e[+-]?[\d_]+)?(?:(ri|r|i)\b)?|[+]?[\d_]+e[+-]?[\d_]+(?:(ri|r|i)\b)?/i 25 | INT_DEC2 /[+]?\d[0-9_]*(?![e])((ri|r|i)\b)?/i 26 | 27 | NUM_BAD /[+]?0[xbd]\b/i 28 | INT_OCT_BAD /[+]?0o?[0-7_]*[89]/i 29 | FLOAT_BAD /[+]?\d[\d_]*_(e|\.)/i 30 | 31 | start 32 | 33 | maybe_pop_stack 34 | return process_string_or_heredoc if lex_strterm 35 | 36 | self.cmd_state = self.command_start 37 | self.command_start = false 38 | self.space_seen = false # TODO: rename token_seen? 39 | self.last_state = lex_state 40 | 41 | rule 42 | 43 | # [:state] pattern [actions] 44 | 45 | # \s - \n + \v 46 | /[\ \t\r\f\v]+/ { self.space_seen = true; next } 47 | 48 | /\n|\#/ process_newline_or_comment 49 | 50 | /[\]\)\}]/ process_brace_close 51 | 52 | : /\!/ 53 | | is_after_operator? /\!\@/ { result EXPR_ARG, TOKENS[text], text } 54 | | /\![=~]?/ { result :arg_state, TOKENS[text], text } 55 | 56 | : /\./ 57 | | /\.\.\.?/ process_dots 58 | | /\.\d/ { rb_compile_error "no . floating literal anymore put 0 before dot" } 59 | | /\./ { self.lex_state = EXPR_BEG; result EXPR_DOT, :tDOT, "." } 60 | 61 | /\(/ process_paren 62 | 63 | /\,/ { result EXPR_PAR, TOKENS[text], text } 64 | 65 | : /=/ 66 | | /\=\=\=|\=\=|\=~|\=>|\=(?!begin\b)/ { result arg_state, TOKENS[text], text } 67 | | bol? /\=begin(?=\s)/ process_begin 68 | | /\=(?=begin\b)/ { result arg_state, TOKENS[text], text } 69 | 70 | ruby22_label? /\"#{SIMPLE_STRING}\":/o process_label 71 | /\"(#{SIMPLE_STRING})\"/o process_simple_string 72 | /\"/ { string STR_DQUOTE, '"'; result nil, :tSTRING_BEG, text } 73 | 74 | /\@\@?\d/ { rb_compile_error "`#{text}` is not allowed as a variable name" } 75 | /\@\@?#{IDENT_CHAR}+/o process_ivar 76 | 77 | : /:/ 78 | | not_end? /:([a-zA-Z_]#{IDENT_CHAR}*(?:[?]|[!](?!=)|=(?==>)|=(?![=>]))?)/o process_symbol 79 | | not_end? /\:\"(#{SIMPLE_STRING})\"/o process_symbol 80 | | not_end? /\:\'(#{SSTRING})\'/o process_symbol 81 | | /\:\:/ process_colon2 82 | | /\:/ process_colon1 83 | 84 | /->/ { result EXPR_ENDFN, :tLAMBDA, text } 85 | 86 | /[+-]/ process_plus_minus 87 | 88 | : /[+\d]/ 89 | | /#{NUM_BAD}/o { rb_compile_error "Invalid numeric format" } 90 | | /#{INT_DEC}/o { int_with_base 10 } 91 | | /#{INT_HEX}/o { int_with_base 16 } 92 | | /#{INT_BIN}/o { int_with_base 2 } 93 | | /#{INT_OCT_BAD}/o { rb_compile_error "Illegal octal digit." } 94 | | /#{INT_OCT}/o { int_with_base 8 } 95 | | /#{FLOAT_BAD}/o { rb_compile_error "Trailing '_' in number." } 96 | | /#{FLOAT}/o process_float 97 | | /#{INT_DEC2}/o { int_with_base 10 } 98 | | /[0-9]/ { rb_compile_error "Bad number format" } 99 | 100 | /\[/ process_square_bracket 101 | 102 | was_label? /\'#{SSTRING}\':?/o process_label_or_string 103 | /\'/ { string STR_SQUOTE, "'"; result nil, :tSTRING_BEG, text } 104 | 105 | : /\|/ 106 | | /\|\|\=/ { result EXPR_BEG, :tOP_ASGN, "||" } 107 | | /\|\|/ { result EXPR_BEG, :tOROP, "||" } 108 | | /\|\=/ { result EXPR_BEG, :tOP_ASGN, "|" } 109 | | /\|/ { state = is_after_operator? ? 
EXPR_ARG : EXPR_PAR; result state, :tPIPE, "|" } 110 | 111 | /\{/ process_brace_open 112 | 113 | : /\*/ 114 | | /\*\*=/ { result EXPR_BEG, :tOP_ASGN, "**" } 115 | | /\*\*/ { result :arg_state, space_vs_beginning(:tDSTAR, :tDSTAR, :tPOW), "**" } 116 | | /\*\=/ { result EXPR_BEG, :tOP_ASGN, "*" } 117 | | /\*/ { result :arg_state, space_vs_beginning(:tSTAR, :tSTAR, :tSTAR2), "*" } 118 | 119 | # TODO: fix result+process_lchevron to set command_start = true 120 | : /</ 121 | | /\<\=\>/ { result :arg_state, :tCMP, "<=>" } 122 | | /\<\=/ { result :arg_state, :tLEQ, "<=" } 123 | | /\<\<\=/ { result EXPR_BEG, :tOP_ASGN, "<<" } 124 | | /\<\</ process_lchevron 125 | | /\</ { result :arg_state, :tLT, "<" } 126 | 127 | : />/ 128 | | /\>\=/ { result :arg_state, :tGEQ, ">=" } 129 | | /\>\>=/ { result EXPR_BEG, :tOP_ASGN, ">>" } 130 | | /\>\>/ { result :arg_state, :tRSHFT, ">>" } 131 | | /\>/ { result :arg_state, :tGT, ">" } 132 | 133 | : /\`/ 134 | | expr_fname? /\`/ { result EXPR_END, :tBACK_REF2, "`" } 135 | | expr_dot? /\`/ { result((cmd_state ? EXPR_CMDARG : EXPR_ARG), :tBACK_REF2, "`") } 136 | | /\`/ { string STR_XQUOTE, '`'; result nil, :tXSTRING_BEG, "`" } 137 | 138 | /\?/ process_questionmark 139 | 140 | : /&/ 141 | | /\&\&\=/ { result EXPR_BEG, :tOP_ASGN, "&&" } 142 | | /\&\&/ { result EXPR_BEG, :tANDOP, "&&" } 143 | | /\&\=/ { result EXPR_BEG, :tOP_ASGN, "&" } 144 | | /\&\./ { result EXPR_DOT, :tLONELY, "&." } 145 | | /\&/ process_amper 146 | 147 | /\// process_slash 148 | 149 | : /\^/ 150 | | /\^=/ { result EXPR_BEG, :tOP_ASGN, "^" } 151 | | /\^/ { result :arg_state, :tCARET, "^" } 152 | 153 | /\;/ { self.command_start = true; result EXPR_BEG, :tSEMI, ";" } 154 | 155 | : /~/ 156 | | is_after_operator? /\~@/ { result :arg_state, :tTILDE, "~" } 157 | | /\~/ { result :arg_state, :tTILDE, "~" } 158 | 159 | : /\\/ 160 | | /\\\r?\n/ { self.lineno += 1; self.space_seen = true; next } 161 | | /\\/ { rb_compile_error "bare backslash only allowed before newline" } 162 | 163 | /\%/ process_percent 164 | 165 | : /\$/ 166 | | /\$_\w+/ process_gvar 167 | | /\$_/ process_gvar 168 | | /\$[~*$?!@\/\\;,.=:<>\"]|\$-\w?/ process_gvar 169 | | in_fname? /\$([\&\`\'\+])/ process_gvar 170 | | /\$([\&\`\'\+])/ process_backref 171 | | in_fname? /\$([1-9]\d*)/ process_gvar 172 | | /\$([1-9]\d*)/ process_nthref 173 | | /\$0/ process_gvar 174 | | /\$#{IDENT_CHAR}+/ process_gvar 175 | | /\$\W/ process_gvar_oddity 176 | 177 | /\_/ process_underscore 178 | 179 | /#{IDENT_CHAR}+/o process_token 180 | 181 | /\004|\032|\000|\Z/ { [RubyLexer::EOF, RubyLexer::EOF] } 182 | 183 | /./ { rb_compile_error "Invalid char #{text.inspect} in expression" } 184 | 185 | end 186 | -------------------------------------------------------------------------------- /lib/ruby_lexer_strings.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | class RubyLexer 4 | def eat_whitespace 5 | r = scan(/\s+/) 6 | self.lineno += r.count("\n") if r 7 | 8 | r += eat_whitespace if eos? && in_heredoc? # forces heredoc pop 9 | 10 | r 11 | end 12 | 13 | def heredoc here # ../compare/parse30.y:7678 14 | _, term, func, _indent_max, _lineno, range = here 15 | 16 | start_line = lineno 17 | eos = term # HACK 18 | indent = func =~ STR_FUNC_INDENT 19 | 20 | self.string_buffer = [] 21 | 22 | last_line = self.ss_string[range] if range 23 | eol = last_line && last_line.end_with?("\r\n") ? "\r\n" : "\n" # HACK 24 | 25 | expand = func =~ STR_FUNC_EXPAND 26 | 27 | # TODO? p->heredoc_line_indent == -1 28 | 29 | indent_re = indent ?
"[ \t]*" : nil 30 | eos_re = /#{indent_re}#{Regexp.escape eos}(?=\r?\n|\z)/ 31 | err_msg = "can't match #{eos_re.inspect} anywhere in " 32 | 33 | maybe_pop_stack 34 | rb_compile_error err_msg if end_of_stream? 35 | 36 | if beginning_of_line? && scan(eos_re) then 37 | scan(/\r?\n|\z/) 38 | self.lineno += 1 if matched =~ /\n/ 39 | 40 | heredoc_restore 41 | 42 | self.lex_strterm = nil 43 | self.lex_state = EXPR_END 44 | 45 | return :tSTRING_END, [term, func, range] 46 | end 47 | 48 | if expand then 49 | case 50 | when scan(/#(?=\$(-.|[a-zA-Z_0-9~\*\$\?!@\/\\;,\.=:<>\"\&\`\'+]))/) then 51 | # TODO: !ISASCII 52 | # ?! see parser_peek_variable_name 53 | return :tSTRING_DVAR, matched 54 | when scan(/#(?=\@\@?[a-zA-Z_])/) then 55 | # TODO: !ISASCII 56 | return :tSTRING_DVAR, matched 57 | when scan(/#[{]/) then 58 | self.command_start = true 59 | return :tSTRING_DBEG, [matched, lineno] 60 | when scan(/#/) then 61 | string_buffer << "#" 62 | end 63 | 64 | begin 65 | # NOTE: this visibly diverges from the C code but uses tokadd_string 66 | # to stay clean. 67 | 68 | str = tokadd_string func, eol, nil 69 | rb_compile_error err_msg if str == RubyLexer::EOF 70 | 71 | if str != eol then 72 | str = string_buffer.join 73 | string_buffer.clear 74 | return result nil, :tSTRING_CONTENT, str, start_line 75 | else 76 | string_buffer << scan(/\r?\n/) 77 | self.lineno += 1 # TODO: try to remove most scan(/\n/) and friends 78 | end 79 | end until check eos_re 80 | else 81 | until check(eos_re) do 82 | string_buffer << scan(/.*(\r?\n|\z)/) 83 | self.lineno += 1 84 | rb_compile_error err_msg if end_of_stream? 85 | end 86 | end 87 | 88 | string_content = begin 89 | s = string_buffer.join 90 | s.b.force_encoding Encoding::UTF_8 91 | s 92 | end 93 | string_buffer.clear 94 | 95 | result nil, :tSTRING_CONTENT, string_content, start_line 96 | end 97 | 98 | def heredoc_identifier # ../compare/parse30.y:7354 99 | token = :tSTRING_BEG 100 | func = STR_FUNC_BORING 101 | term = nil 102 | indent = nil 103 | quote = nil 104 | char_pos = nil 105 | byte_pos = nil 106 | 107 | heredoc_indent_mods = "-" 108 | heredoc_indent_mods += '\~' if ruby23plus? 109 | 110 | case 111 | when scan(/([#{heredoc_indent_mods}]?)([\'\"\`])(.*?)\2/) then 112 | mods, quote, term = match[1], match[2], match[3] 113 | char_pos = ss.charpos 114 | byte_pos = ss.pos 115 | 116 | func |= STR_FUNC_INDENT unless mods.empty? 117 | func |= STR_FUNC_DEDENT if mods == "~" 118 | func |= case quote 119 | when "\'" then 120 | STR_SQUOTE 121 | when '"' then 122 | STR_DQUOTE 123 | when "`" then 124 | token = :tXSTRING_BEG 125 | STR_XQUOTE 126 | else 127 | debug 1 128 | end 129 | when scan(/[#{heredoc_indent_mods}]?([\'\"\`])(?!\1*\Z)/) then 130 | rb_compile_error "unterminated here document identifier" 131 | when scan(/([#{heredoc_indent_mods}]?)(#{IDENT_CHAR}+)/) then 132 | mods, term = match[1], match[2] 133 | quote = '"' 134 | char_pos = ss.charpos 135 | byte_pos = ss.pos 136 | 137 | func |= STR_FUNC_INDENT unless mods.empty? 138 | func |= STR_FUNC_DEDENT if mods == "~" 139 | func |= STR_DQUOTE 140 | else 141 | return 142 | end 143 | 144 | old_lineno = self.lineno 145 | rest_of_line = scan(/.*(?:\r?\n|\z)/) 146 | self.lineno += rest_of_line.count "\n" 147 | 148 | char_pos_end = ss.charpos - 1 149 | 150 | range = nil 151 | range = char_pos..char_pos_end unless rest_of_line.empty? 
152 | 153 | self.lex_strterm = [:heredoc, term, func, indent, old_lineno, range, byte_pos] 154 | 155 | result nil, token, quote, old_lineno 156 | end 157 | 158 | def heredoc_restore # ../compare/parse30.y:7438 159 | _, _term, _func, _indent, lineno, range, bytepos = lex_strterm 160 | 161 | new_ss = ss.class.new self.ss_string[0..range.max] 162 | new_ss.pos = bytepos 163 | 164 | lineno_push lineno 165 | ss_push new_ss 166 | 167 | nil 168 | end 169 | 170 | def newtok 171 | string_buffer.clear 172 | end 173 | 174 | def nextc 175 | # TODO: 176 | # if (UNLIKELY((p->lex.pcur == p->lex.pend) || p->eofp || RTEST(p->lex.nextline))) { 177 | # if (nextline(p)) return -1; 178 | # } 179 | 180 | maybe_pop_stack 181 | 182 | c = ss.getch 183 | 184 | if c == "\n" then 185 | ss.unscan 186 | c = nil 187 | end 188 | 189 | c 190 | end 191 | 192 | def parse_string quote # ../compare/parse30.y:7273 193 | _, func, term, paren = quote 194 | 195 | qwords = func =~ STR_FUNC_QWORDS 196 | regexp = func =~ STR_FUNC_REGEXP 197 | expand = func =~ STR_FUNC_EXPAND 198 | list = func =~ STR_FUNC_LIST 199 | termx = func =~ STR_FUNC_TERM # TODO: document wtf this means 200 | 201 | space = false 202 | term_re = regexp_cache[term] 203 | 204 | if termx then 205 | # self.nextc if qwords # delayed term 206 | 207 | self.lex_strterm = nil 208 | 209 | return result EXPR_END, regexp ? :tREGEXP_END : :tSTRING_END, term 210 | end 211 | 212 | space = true if qwords and eat_whitespace 213 | 214 | if list then 215 | debug 4 216 | # quote[1] -= STR_FUNC_LIST 217 | # space = true 218 | end 219 | 220 | # TODO: move to quote.nest! 221 | if string_nest == 0 && scan(term_re) then 222 | if qwords then 223 | quote[1] |= STR_FUNC_TERM 224 | 225 | return :tSPACE, matched 226 | end 227 | 228 | return string_term func 229 | end 230 | 231 | return result nil, :tSPACE, " " if space 232 | 233 | newtok 234 | 235 | if expand && check(/#/) then 236 | t = self.scan_variable_name 237 | return t if t 238 | 239 | tokadd "#" 240 | end 241 | 242 | # TODO: add string_nest, enc, base_enc ? 243 | lineno = self.lineno 244 | if tokadd_string(func, term, paren) == RubyLexer::EOF then 245 | if qwords then 246 | rb_compile_error "unterminated list meets end of file" 247 | end 248 | 249 | if regexp then 250 | rb_compile_error "unterminated regexp meets end of file" 251 | else 252 | rb_compile_error "unterminated string meets end of file" 253 | end 254 | end 255 | 256 | result nil, :tSTRING_CONTENT, string_buffer.join, lineno 257 | end 258 | 259 | # called from process_percent 260 | def process_percent_quote # ../compare/parse30.y:8645 261 | c = getch # type %... 262 | 263 | long_hand = !!(c =~ /[QqWwIixrs]/) 264 | 265 | if end_of_stream? || c !~ /\p{Alnum}/ then 266 | term = c # TODO? PERCENT_END[c] || c 267 | 268 | debug 2 if c && c !~ /\p{ASCII}/ 269 | c = "Q" 270 | else 271 | term = getch 272 | 273 | debug 3 if term =~ /\p{Alnum}|\P{ASCII}/ 274 | end 275 | 276 | if end_of_stream? or c == RubyLexer::EOF or term == RubyLexer::EOF then 277 | rb_compile_error "unterminated quoted string meets end of file" 278 | end 279 | 280 | # "\0" is special to indicate beg=nnd and that no nesting? 281 | paren = term 282 | term = PERCENT_END[term] 283 | term, paren = paren, "\0" if term.nil? # TODO: "\0" -> nil 284 | 285 | text = long_hand ? 
"%#{c}#{paren}" : "%#{term}" 286 | 287 | current_line = self.lineno 288 | 289 | token_type, string_type = 290 | case c 291 | when "Q" then 292 | [:tSTRING_BEG, STR_DQUOTE] 293 | when "q" then 294 | [:tSTRING_BEG, STR_SQUOTE] 295 | when "W" then 296 | eat_whitespace 297 | [:tWORDS_BEG, STR_DQUOTE | STR_FUNC_QWORDS] 298 | when "w" then 299 | eat_whitespace 300 | [:tQWORDS_BEG, STR_SQUOTE | STR_FUNC_QWORDS] 301 | when "I" then 302 | eat_whitespace 303 | [:tSYMBOLS_BEG, STR_DQUOTE | STR_FUNC_QWORDS] 304 | when "i" then 305 | eat_whitespace 306 | [:tQSYMBOLS_BEG, STR_SQUOTE | STR_FUNC_QWORDS] 307 | when "x" then 308 | [:tXSTRING_BEG, STR_XQUOTE] 309 | when "r" then 310 | [:tREGEXP_BEG, STR_REGEXP] 311 | when "s" then 312 | self.lex_state = EXPR_FNAME 313 | [:tSYMBEG, STR_SSYM] 314 | else 315 | rb_compile_error "unknown type of %string. Expected [QqWwIixrs], found '#{c}'." 316 | end 317 | 318 | string string_type, term, paren 319 | 320 | result nil, token_type, text, current_line 321 | end 322 | 323 | def process_string_or_heredoc # ../compare/parse30.y:9075 324 | if lex_strterm[0] == :heredoc then 325 | self.heredoc lex_strterm 326 | else 327 | self.parse_string lex_strterm 328 | end 329 | end 330 | 331 | def read_escape flags = nil # ../compare/parse30.y:6712 332 | case 333 | when scan(/\\/) then # Backslash 334 | '\\' 335 | when scan(/n/) then # newline 336 | "\n" 337 | when scan(/t/) then # horizontal tab 338 | "\t" 339 | when scan(/r/) then # carriage-return 340 | "\r" 341 | when scan(/f/) then # form-feed 342 | "\f" 343 | when scan(/v/) then # vertical tab 344 | "\13" 345 | when scan(/a/) then # alarm(bell) 346 | "\007" 347 | when scan(/e/) then # escape 348 | "\033" 349 | when scan(/[0-7]{1,3}/) then # octal constant 350 | (matched.to_i(8) & 0xFF).chr.force_encoding Encoding::UTF_8 351 | when scan(/x([0-9a-fA-F]{1,2})/) then # hex constant 352 | # TODO: force encode everything to UTF-8? 353 | match[1].to_i(16).chr.force_encoding Encoding::UTF_8 354 | when scan(/b/) then # backspace 355 | "\010" 356 | when scan(/s/) then # space 357 | " " 358 | when check(/M-\\u/) then 359 | debug 5 360 | when scan(/M-\\(?=.)/) then 361 | c = read_escape 362 | c[0] = (c[0].ord | 0x80).chr 363 | c 364 | when scan(/M-(\p{ASCII})/) then 365 | # TODO: ISCNTRL(c) -> goto eof 366 | c = match[1] 367 | c[0] = (c[0].ord | 0x80).chr 368 | c 369 | when check(/(C-|c)\\u/) then 370 | debug 6 371 | when scan(/(C-|c)\\?\?/) then 372 | 127.chr 373 | when scan(/(C-|c)\\/) then 374 | c = read_escape 375 | c[0] = (c[0].ord & 0x9f).chr 376 | c 377 | when scan(/(?:C-|c)(.)/) then 378 | c = match[1] 379 | c[0] = (c[0].ord & 0x9f).chr 380 | c 381 | when scan(/^[89]/i) then # bad octal or hex... MRI ignores them :( 382 | matched 383 | when scan(/u(\h{4})/) then 384 | [match[1].to_i(16)].pack("U") 385 | when scan(/u(\h{1,3})/) then 386 | debug 7 387 | rb_compile_error "Invalid escape character syntax" 388 | when scan(/u\{(\h+(?: +\h+)*)\}/) then 389 | match[1].split.map { |s| s.to_i(16) }.pack("U*") 390 | when scan(/[McCx0-9]/) || end_of_stream? 
then 391 | rb_compile_error("Invalid escape character syntax") 392 | else 393 | getch 394 | end.dup 395 | end 396 | 397 | def regx_options # ../compare/parse30.y:6914 398 | newtok 399 | 400 | options = scan(/\p{Alpha}+/) || "" 401 | 402 | rb_compile_error("unknown regexp options: %s" % [options]) if 403 | options =~ /[^ixmonesu]/ 404 | 405 | options 406 | end 407 | 408 | def scan_variable_name # ../compare/parse30.y:7208 409 | case 410 | when scan(/#(?=\$(-.|[a-zA-Z_0-9~\*\$\?!@\/\\;,\.=:<>\"\&\`\'+]))/) then 411 | # TODO: !ISASCII 412 | return :tSTRING_DVAR, matched 413 | when scan(/#(?=\@\@?[a-zA-Z_])/) then 414 | # TODO: !ISASCII 415 | return :tSTRING_DVAR, matched 416 | when scan(/#[{]/) then 417 | self.command_start = true 418 | return :tSTRING_DBEG, [matched, lineno] 419 | when scan(/#/) then 420 | # do nothing but swallow 421 | end 422 | 423 | # if scan(/\P{ASCII}|_|\p{Alpha}/) then # TODO: fold into above DVAR cases 424 | # # if (!ISASCII(c) || c == '_' || ISALPHA(c)) 425 | # # return tSTRING_DVAR; 426 | # end 427 | 428 | nil 429 | end 430 | 431 | def string type, beg, nnd = nil 432 | # label = (IS_LABEL_POSSIBLE() ? str_label : 0); 433 | # p->lex.strterm = NEW_STRTERM(str_dquote | label, '"', 0); 434 | # p->lex.ptok = p->lex.pcur-1; 435 | 436 | type |= STR_FUNC_LABEL if is_label_possible? 437 | self.lex_strterm = [:strterm, type, beg, nnd || "\0"] 438 | end 439 | 440 | def string_term func # ../compare/parse30.y:7254 441 | self.lex_strterm = nil 442 | 443 | return result EXPR_END, :tREGEXP_END, self.regx_options if 444 | func =~ STR_FUNC_REGEXP 445 | 446 | if func =~ STR_FUNC_LABEL && is_label_suffix? then 447 | self.getch 448 | self.lex_state = EXPR_BEG|EXPR_LABEL 449 | 450 | return :tLABEL_END, string_buffer.join 451 | end 452 | 453 | self.lex_state = EXPR_END 454 | 455 | return :tSTRING_END, [self.matched, func] 456 | end 457 | 458 | def tokadd c # ../compare/parse30.y:6548 459 | string_buffer << c 460 | end 461 | 462 | def tokadd_escape # ../compare/parse30.y:6840 463 | case 464 | when scan(/\\\n/) then 465 | # just ignore 466 | when scan(/\\([0-7]{1,3}|x[0-9a-fA-F]{1,2})/) then 467 | tokadd matched 468 | when scan(/\\([MC]-|c)(?=\\)/) then 469 | tokadd matched 470 | self.tokadd_escape 471 | when scan(/\\([MC]-|c)(.)/) then 472 | tokadd matched 473 | 474 | self.tokadd_escape if check(/\\/) # recurse if continued! 475 | when scan(/\\[McCx]/) then # all unprocessed branches from above have failed 476 | rb_compile_error "Invalid escape character syntax" 477 | when scan(/\\(.)/m) then 478 | chr, = match[1] 479 | 480 | tokadd "\\" 481 | tokadd chr 482 | else 483 | rb_compile_error "Invalid escape character syntax: %p" % [self.rest.lines.first] 484 | end 485 | end 486 | 487 | def tokadd_string func, term, paren # ../compare/parse30.y:7020 488 | qwords = func =~ STR_FUNC_QWORDS 489 | escape = func =~ STR_FUNC_ESCAPE 490 | expand = func =~ STR_FUNC_EXPAND 491 | regexp = func =~ STR_FUNC_REGEXP 492 | 493 | paren_re = regexp_cache[paren] if paren != "\0" 494 | term_re = if term == "\n" 495 | /\r?\n/ 496 | else 497 | regexp_cache[term] 498 | end 499 | 500 | until end_of_stream? do 501 | case 502 | when paren_re && scan(paren_re) then 503 | self.string_nest += 1 504 | when scan(term_re) then 505 | if self.string_nest == 0 then 506 | self.pos -= 1 # TODO: ss.unscan 665 errors #$ HACK: why do we depend on this so hard? 
507 | break # leave eos loop, go parse term in caller (heredoc or parse_string) 508 | else 509 | self.lineno += matched.count("\n") 510 | self.string_nest -= 1 511 | end 512 | 513 | when expand && check(/#[\$\@\{]/) then 514 | # do nothing since we used `check` 515 | break # leave eos loop 516 | when check(/\\/) then 517 | case 518 | when scan(/\\\n/) then 519 | self.lineno += 1 520 | case 521 | when qwords then 522 | tokadd "\n" 523 | next 524 | when expand then 525 | next if func !~ STR_FUNC_INDENT 526 | 527 | if term == "\n" then 528 | unscan # rollback 529 | scan(/\\/) # and split 530 | scan(/\n/) # this is `matched` 531 | break 532 | end 533 | 534 | tokadd "\\" 535 | debug 9 536 | else 537 | unscan # rollback 538 | scan(/\\/) # this is `matched` 539 | end 540 | when check(/\\\\/) then 541 | tokadd '\\' if escape 542 | nextc # ignore 1st \\ 543 | nextc # for tokadd ss.matched, below 544 | when scan(/\\u/) then 545 | unless expand then 546 | tokadd "\\" 547 | next 548 | end 549 | 550 | tokadd_utf8 term, func, regexp 551 | 552 | next 553 | else 554 | scan(/\\/) # eat it, we know it's there 555 | 556 | return RubyLexer::EOF if end_of_stream? 557 | 558 | if scan(/\P{ASCII}/) then 559 | tokadd "\\" unless expand 560 | tokadd self.matched 561 | next 562 | end 563 | 564 | case 565 | when regexp then 566 | if term !~ SIMPLE_RE_META && scan(term_re) then 567 | tokadd matched 568 | next 569 | end 570 | 571 | self.pos -= 1 # TODO: ss.unscan 15 errors 572 | # HACK? decide whether to eat the \\ above 573 | if _esc = tokadd_escape && end_of_stream? then 574 | debug 10 575 | end 576 | 577 | next # C's continue = Ruby's next 578 | when expand then 579 | tokadd "\\" if escape 580 | tokadd read_escape 581 | next 582 | when qwords && scan(/\s/) then 583 | # ignore backslashed spaces in %w 584 | when !check(term_re) && !(paren_re && check(paren_re)) then 585 | tokadd "\\" 586 | next 587 | else 588 | getch # slurp it too for matched below 589 | end 590 | end # inner case for /\\/ 591 | 592 | when scan(/\P{ASCII}/) then 593 | # not currently checking encoding stuff -- drops to tokadd below 594 | when qwords && check(/\s/) then 595 | break # leave eos loop 596 | else 597 | t = Regexp.escape term == "\n" ? "\r\n" : term 598 | x = Regexp.escape paren if paren && paren != "\000" 599 | q = "\\s" if qwords 600 | re = /[^#{t}#{x}\#\\#{q}]+/ 601 | 602 | scan re or getch 603 | self.lineno += matched.count "\n" if matched 604 | end # big case 605 | 606 | tokadd self.matched 607 | end # until end_of_stream? 608 | 609 | if self.matched then 610 | self.matched 611 | elsif end_of_stream? then 612 | RubyLexer::EOF 613 | end 614 | end # tokadd_string 615 | 616 | def tokadd_utf8 term, func, regexp_literal # ../compare/parse30.y:6646 617 | tokadd "\\u" if regexp_literal 618 | 619 | case 620 | when scan(/\h{4}/) then 621 | codepoint = [matched.to_i(16)].pack("U") 622 | 623 | tokadd regexp_literal ? 
matched : codepoint 624 | when scan(/\{\s*(\h{1,6}(?:\s+\h{1,6})*)\s*\}/) then 625 | codepoints = match[1].split.map { |s| s.to_i 16 }.pack("U") 626 | 627 | if regexp_literal then 628 | tokadd "{" 629 | tokadd match[1].split.join(" ") 630 | tokadd "}" 631 | else 632 | tokadd codepoints 633 | end 634 | else 635 | rb_compile_error "unterminated Unicode escape" 636 | end 637 | end 638 | end 639 | -------------------------------------------------------------------------------- /lib/ruby_parser.rb: -------------------------------------------------------------------------------- 1 | require "ruby_parser_extras" 2 | require "racc/parser" 3 | 4 | ## 5 | # RubyParser is a compound parser that uses all known versions to 6 | # attempt to parse. 7 | 8 | class RubyParser 9 | 10 | VERSIONS = [] 11 | 12 | attr_accessor :current 13 | 14 | def self.for_current_ruby 15 | name = "V#{RUBY_VERSION[/^\d+\.\d+/].delete "."}" 16 | klass = if const_defined? name then 17 | const_get name 18 | else 19 | latest = VERSIONS.first 20 | warn "NOTE: RubyParser::#{name} undefined, using #{latest}." 21 | latest 22 | end 23 | 24 | klass.new 25 | end 26 | 27 | def self.latest 28 | VERSIONS.first.new 29 | end 30 | 31 | def process s, f = "(string)", t = 10 32 | e = nil 33 | VERSIONS.each do |klass| 34 | self.current = parser = klass.new 35 | begin 36 | return parser.process s, f, t 37 | rescue Racc::ParseError, RubyParser::SyntaxError => exc 38 | e ||= exc 39 | end 40 | end 41 | raise e 42 | end 43 | 44 | alias :parse :process 45 | 46 | def reset 47 | # do nothing 48 | end 49 | 50 | class Parser < Racc::Parser 51 | include RubyParserStuff 52 | 53 | def self.inherited x 54 | RubyParser::VERSIONS << x 55 | end 56 | 57 | def self.version= v 58 | @version = v 59 | end 60 | 61 | def self.version 62 | @version ||= Parser > self && self.name[/(?:V|Ruby)(\d+)/, 1].to_i 63 | end 64 | end 65 | 66 | class SyntaxError < RuntimeError; end 67 | end 68 | 69 | ## 70 | # Unfortunately a problem with racc is that it won't let me namespace 71 | # properly, so instead of RubyParser::V25, I still have to generate 72 | # the old RubyParser25 and shove it in as V25. 
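##
# Illustrative sketch (added annotation, not part of the upstream source)
# of how the pieces defined in this file fit together; the V## constants
# referenced here are set up at the bottom of the file.
#
#   RubyParser.new.parse "a = 1"      # compound parser: tries each entry in
#                                     # VERSIONS (newest grammar first) and
#                                     # re-raises the first error if all fail
#   RubyParser::V27.new.parse "a = 1" # pins a single grammar version instead
#   RubyParser.for_current_ruby       # picks the V## matching RUBY_VERSION,
#                                     # or warns and falls back to the newest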
73 | 74 | require "ruby_parser20" 75 | require "ruby_parser21" 76 | require "ruby_parser22" 77 | require "ruby_parser23" 78 | require "ruby_parser24" 79 | require "ruby_parser25" 80 | require "ruby_parser26" 81 | require "ruby_parser27" 82 | require "ruby_parser30" 83 | require "ruby_parser31" 84 | require "ruby_parser32" 85 | require "ruby_parser33" 86 | require "ruby_parser34" 87 | 88 | class RubyParser # HACK 89 | VERSIONS.clear # also a HACK caused by racc namespace issues 90 | 91 | class V34 < ::Ruby33Parser; end 92 | class V33 < ::Ruby33Parser; end 93 | class V32 < ::Ruby32Parser; end 94 | class V31 < ::Ruby31Parser; end 95 | class V30 < ::Ruby30Parser; end 96 | class V27 < ::Ruby27Parser; end 97 | class V26 < ::Ruby26Parser; end 98 | class V25 < ::Ruby25Parser; end 99 | class V24 < ::Ruby24Parser; end 100 | class V23 < ::Ruby23Parser; end 101 | class V22 < ::Ruby22Parser; end 102 | class V21 < ::Ruby21Parser; end 103 | class V20 < ::Ruby20Parser; end 104 | end 105 | -------------------------------------------------------------------------------- /lib/ruby_parser_extras.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "sexp" 4 | require "ruby_lexer" 5 | require "timeout" 6 | require "rp_extensions" 7 | require "rp_stringscanner" 8 | 9 | class Sexp 10 | def check_line_numbers 11 | raise "bad nil line for:\n%s" % [self.pretty_inspect] if nil_line? 12 | raise "bad line number for:\n%s" % [self.pretty_inspect] unless 13 | Integer === self.line && 14 | self.line >= 1 && 15 | self.line <= self.line_min 16 | end 17 | 18 | ## 19 | # Returns the minimum line number of the children of self. 20 | 21 | def line_min 22 | @line_min ||= [self.deep_each.map(&:line).min, self.line].compact.min 23 | end 24 | 25 | def nil_line? 26 | self.deep_each.map(&:line).any?(&:nil?) 27 | end 28 | end 29 | 30 | module RubyParserStuff 31 | VERSION = "3.21.1" 32 | 33 | attr_accessor :lexer, :in_def, :in_single, :file, :in_argdef 34 | attr_accessor :in_kwarg 35 | attr_reader :env 36 | 37 | ## 38 | # Canonicalize conditionals. Eg: 39 | # 40 | # not x ? a : b 41 | # 42 | # becomes: 43 | # 44 | # x ? b : a 45 | 46 | attr_accessor :canonicalize_conditions 47 | 48 | ## 49 | # The last token type returned from #next_token 50 | 51 | attr_accessor :last_token_type 52 | 53 | $good20 = [] 54 | 55 | %w[ 56 | ].map(&:to_i).each do |n| 57 | $good20[n] = n 58 | end 59 | 60 | def debug20 n, v = nil, r = nil 61 | raise "not yet #{n} #{v.inspect} => #{r.inspect}" unless $good20[n] 62 | end 63 | 64 | def self.deprecate old, new 65 | define_method old do |*args| 66 | warn "DEPRECATED: #{old} -> #{new} from #{caller.first}" 67 | send new, *args 68 | end 69 | end 70 | 71 | ## 72 | # for pure ruby systems only 73 | 74 | def do_parse 75 | _racc_do_parse_rb(_racc_setup, false) 76 | end if ENV["PURE_RUBY"] || ENV["CHECK_LINE_NUMS"] 77 | 78 | if ENV["CHECK_LINE_NUMS"] then 79 | def _racc_do_reduce arg, act 80 | x = super 81 | 82 | @racc_vstack.grep(Sexp).each do |sexp| 83 | sexp.check_line_numbers 84 | end 85 | x 86 | end 87 | end 88 | 89 | ARG_TYPES = [:arglist, :call_args, :array, :args].map { |k| 90 | [k, true] 91 | }.to_h 92 | 93 | # This is in sorted order of occurrence according to 94 | # charlock_holmes against 500k files, with UTF_8 forced 95 | # to the top. 96 | # 97 | # Overwrite this contstant if you need something different. 
98 | ENCODING_ORDER = [ 99 | Encoding::UTF_8, # moved to top to reflect default in 2.0 100 | Encoding::ISO_8859_1, 101 | Encoding::ISO_8859_2, 102 | Encoding::ISO_8859_9, 103 | Encoding::SHIFT_JIS, 104 | Encoding::WINDOWS_1252, 105 | Encoding::EUC_JP 106 | ] 107 | 108 | JUMP_TYPE = [:return, :next, :break, :yield].map { |k| [k, true] }.to_h 109 | 110 | TAB_WIDTH = 8 111 | 112 | def initialize(options = {}) 113 | super() 114 | 115 | v = self.class.name[/[23]\d/] 116 | raise "Bad Class name #{self.class}" unless v 117 | 118 | self.lexer = RubyLexer.new v && v.to_i 119 | self.lexer.parser = self 120 | self.in_kwarg = false 121 | self.in_argdef = false 122 | 123 | @env = RubyParserStuff::Environment.new 124 | 125 | @canonicalize_conditions = true 126 | 127 | self.reset 128 | end 129 | 130 | def arg_concat node1, node2 # TODO: nuke 131 | raise "huh" unless node2 132 | 133 | splat = s(:splat, node2) 134 | splat.line node2.line 135 | 136 | node1 << splat 137 | end 138 | 139 | def argl x 140 | x = s(:arglist, x) if x and x.sexp_type == :array 141 | x 142 | end 143 | 144 | def args args 145 | result = s(:args) 146 | 147 | ss = args.grep Sexp 148 | if ss.empty? then 149 | result.line lexer.lineno 150 | else 151 | result.line ss.first.line 152 | result.line_max = ss.first.line_max 153 | end 154 | 155 | args.each do |arg| 156 | if arg.instance_of? Array and arg.size == 2 and arg.last.is_a? Numeric then 157 | arg = arg.first 158 | end 159 | 160 | case arg 161 | when Sexp then 162 | case arg.sexp_type 163 | when :args, :block, :array, :call_args then # HACK call_args mismatch 164 | rest = arg.sexp_body 165 | 166 | rest.map! { |x| 167 | if x.instance_of? Array and x.size == 2 and Numeric === x.last then 168 | x.first 169 | else 170 | x 171 | end 172 | } 173 | 174 | result.concat rest 175 | when :forward_args then 176 | self.env[:*] = :lvar # TODO: arg_var(p, idFWD_REST) ? 177 | self.env[:**] = :lvar 178 | self.env[:&] = :lvar 179 | 180 | result << arg 181 | when :block_arg then 182 | result << :"&#{arg.last}" 183 | when :shadow then 184 | name = arg.last 185 | self.env[name] = :lvar 186 | if Sexp === result.last and result.last.sexp_type == :shadow then 187 | result.last << name 188 | else 189 | result << arg 190 | end 191 | when :masgn, :block_pass, :hash then # HACK: remove. prolly call_args 192 | result << arg 193 | else 194 | raise "unhandled: #{arg.sexp_type} in #{args.inspect}" 195 | end 196 | when Symbol then 197 | name = arg.to_s.delete("&*") 198 | self.env[name.to_sym] = :lvar unless name.empty? 199 | result << arg 200 | when true, false then 201 | self.in_kwarg = arg 202 | when ",", "|", ";", "(", ")", nil then 203 | # ignore 204 | else 205 | raise "unhandled: #{arg.inspect} in #{args.inspect}" 206 | end 207 | end 208 | 209 | result 210 | end 211 | 212 | def end_args args 213 | lexer.lex_state = RubyLexer::State::Values::EXPR_BEG 214 | lexer.command_start = true 215 | self.args args 216 | end 217 | 218 | def attrset_id? id 219 | id =~ /^\[\]=$|^\w+=$/ 220 | end 221 | 222 | def endless_method_name defn_or_defs 223 | _, name, maybe_name, * = defn_or_defs 224 | name = maybe_name unless Symbol === name 225 | 226 | if attrset_id? name then 227 | yyerror "setter method cannot be defined in an endless method definition" 228 | end 229 | 230 | # TODO? 
token_info_drop(p, "def", loc->beg_pos); 231 | end 232 | 233 | def array_to_hash array 234 | case array.sexp_type 235 | when :kwsplat then 236 | array 237 | else 238 | s(:hash, *array.sexp_body).line array.line 239 | end 240 | end 241 | 242 | def aryset receiver, index 243 | index ||= s() 244 | l = receiver.line 245 | result = s(:attrasgn, receiver, :"[]=", 246 | *index.sexp_body).compact # [].sexp_body => nil 247 | result.line = l 248 | result 249 | end 250 | 251 | def assignable(lhs, value = nil) 252 | id, line = lhs 253 | id = id.to_sym 254 | 255 | result = case id 256 | when /^@@/ then 257 | asgn = in_def || in_single > 0 258 | s((asgn ? :cvasgn : :cvdecl), id) 259 | when /^@/ then 260 | s(:iasgn, id) 261 | when /^\$/ then 262 | s(:gasgn, id) 263 | when /^[A-Z]/ then 264 | s(:cdecl, id) 265 | else 266 | case self.env[id] 267 | when :lvar, :dvar, nil then 268 | s(:lasgn, id) 269 | else 270 | raise "wtf? unknown type: #{self.env[id]}" 271 | end 272 | end 273 | 274 | self.env[id] ||= :lvar if result.sexp_type == :lasgn 275 | 276 | result << value if value 277 | result.line line 278 | result 279 | end 280 | 281 | def backref_assign_error ref 282 | # TODO: need a test for this... obviously 283 | case ref.sexp_type 284 | when :nth_ref then 285 | raise "write a test 2" 286 | raise SyntaxError, "Can't set variable %p" % ref.last 287 | when :back_ref then 288 | raise "write a test 3" 289 | raise SyntaxError, "Can't set back reference %p" % ref.last 290 | else 291 | raise "Unknown backref type: #{ref.inspect}" 292 | end 293 | end 294 | 295 | def block_append(head, tail) 296 | return head if tail.nil? 297 | return tail if head.nil? 298 | 299 | line = [head.line, tail.line].compact.min 300 | 301 | head = remove_begin(head) 302 | head = s(:block, head).line(line) unless head.sexp_type == :block 303 | 304 | # head.line = line 305 | head << tail 306 | end 307 | 308 | def block_dup_check call_or_args, block 309 | syntax_error "Both block arg and actual block given." if 310 | block and call_or_args.block_pass? 311 | end 312 | 313 | def block_var *args 314 | result = self.args args 315 | result.sexp_type = :masgn 316 | result 317 | end 318 | 319 | def call_args args 320 | result = s(:call_args) 321 | 322 | a = args.grep(Sexp).first 323 | if a then 324 | result.line a.line 325 | else 326 | result.line lexer.lineno 327 | end 328 | 329 | args.each do |arg| 330 | # ruby 3.0+ TODO: next if arg in [String, Integer] # eg ["(", 1] 331 | next if arg.class == Array && arg.map(&:class) == [String, Integer] 332 | 333 | case arg 334 | when Sexp then 335 | case arg.sexp_type 336 | when :array, :args, :call_args then # HACK? 
remove array at some point 337 | result.sexp_body += arg.sexp_body 338 | else 339 | result << arg 340 | end 341 | when Symbol then 342 | result << arg 343 | when Array then 344 | id, _line = arg 345 | result << id 346 | when ",", nil, "(" then 347 | # ignore 348 | else 349 | raise "unhandled: #{arg.inspect} in #{args.inspect}" 350 | end 351 | end 352 | 353 | result 354 | end 355 | 356 | def clean_mlhs sexp 357 | case sexp.sexp_type 358 | when :masgn then 359 | if sexp.size == 2 and sexp[1].sexp_type == :array then 360 | s(:masgn, *sexp[1].sexp_body.map { |sub| clean_mlhs sub }) 361 | else 362 | debug20 5 363 | sexp 364 | end 365 | when :gasgn, :iasgn, :lasgn, :cvasgn then 366 | if sexp.size == 2 then 367 | sexp.last 368 | else 369 | debug20 7 370 | sexp # optional value 371 | end 372 | else 373 | raise "unsupported type: #{sexp.inspect}" 374 | end 375 | end 376 | 377 | def cond node 378 | return nil if node.nil? 379 | node = value_expr node 380 | 381 | case node.sexp_type 382 | when :lit then 383 | if Regexp === node.last then 384 | s(:match, node) 385 | else 386 | node 387 | end 388 | when :and then 389 | _, lhs, rhs = node 390 | s(:and, cond(lhs), cond(rhs)) 391 | when :or then 392 | _, lhs, rhs = node 393 | s(:or, cond(lhs), cond(rhs)) 394 | when :dot2 then 395 | label = "flip#{node.hash}" 396 | env[label] = :lvar 397 | _, lhs, rhs = node 398 | s(:flip2, lhs, rhs) # TODO: recurse? 399 | when :dot3 then 400 | label = "flip#{node.hash}" 401 | env[label] = :lvar 402 | _, lhs, rhs = node 403 | s(:flip3, lhs, rhs) 404 | else 405 | node 406 | end.line node.line 407 | end 408 | 409 | def dedent sexp 410 | dedent_count = dedent_size sexp 411 | 412 | skip_one = false 413 | sexp.map { |obj| 414 | case obj 415 | when Symbol then 416 | obj 417 | when String then 418 | obj.lines.map { |l| remove_whitespace_width l, dedent_count }.join 419 | when Sexp then 420 | case obj.sexp_type 421 | when :evstr then 422 | skip_one = true 423 | obj 424 | when :str then 425 | _, str = obj 426 | str = if skip_one then 427 | skip_one = false 428 | s1, *rest = str.lines 429 | s1 + rest.map { |l| remove_whitespace_width l, dedent_count }.join 430 | else 431 | str.lines.map { |l| remove_whitespace_width l, dedent_count }.join 432 | end 433 | 434 | s(:str, str).line obj.line 435 | else 436 | warn "unprocessed sexp %p" % [obj] 437 | end 438 | else 439 | warn "unprocessed: %p" % [obj] 440 | end 441 | } 442 | end 443 | 444 | def dedent_size sexp 445 | skip_one = false 446 | sexp.flat_map { |s| 447 | case s 448 | when Symbol then 449 | next 450 | when String then 451 | s.lines 452 | when Sexp then 453 | case s.sexp_type 454 | when :evstr then 455 | skip_one = true 456 | next 457 | when :str then 458 | _, str = s 459 | lines = str.lines 460 | if skip_one then 461 | skip_one = false 462 | lines.shift 463 | end 464 | lines 465 | else 466 | warn "unprocessed sexp %p" % [s] 467 | end 468 | else 469 | warn "unprocessed: %p" % [s] 470 | end.map { |l| whitespace_width l } 471 | }.compact.min 472 | end 473 | 474 | def dedent_string string, width 475 | characters_skipped = 0 476 | indentation_skipped = 0 477 | 478 | string.chars.each do |char| 479 | break if indentation_skipped >= width 480 | if char == " " 481 | characters_skipped += 1 482 | indentation_skipped += 1 483 | elsif char == "\t" 484 | proposed = TAB_WIDTH * (indentation_skipped / TAB_WIDTH + 1) 485 | break if proposed > width 486 | characters_skipped += 1 487 | indentation_skipped = proposed 488 | end 489 | end 490 | string[characters_skipped..-1] 491 | end 492 | 493 | 
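# Annotation (added for clarity, not part of the upstream source): a worked
# example of dedent_string above, which strips up to `width` columns of
# leading whitespace, expanding tabs to TAB_WIDTH (8) column stops:
#
#   dedent_string "      foo", 4   # => "  foo"  (four leading spaces removed)
#   dedent_string "\tfoo", 4       # => "\tfoo"  (the tab would jump to column
#                                  #    8, past width 4, so it is kept intact)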
def gettable(id) 494 | id = id.to_sym if String === id 495 | 496 | result = case id.to_s 497 | when /^@@/ then 498 | s(:cvar, id) 499 | when /^@/ then 500 | s(:ivar, id) 501 | when /^\$/ then 502 | s(:gvar, id) 503 | when /^[A-Z]/ then 504 | s(:const, id) 505 | else 506 | type = env[id] 507 | if type then 508 | s(type, id) 509 | else 510 | new_call(nil, id) 511 | end 512 | end 513 | 514 | raise "identifier #{id.inspect} is not valid" unless result 515 | 516 | result 517 | end 518 | 519 | def hack_encoding str, extra = nil 520 | encodings = ENCODING_ORDER.dup 521 | encodings.unshift(extra) unless extra.nil? 522 | 523 | # terrible, horrible, no good, very bad, last ditch effort. 524 | encodings.each do |enc| 525 | begin 526 | str.force_encoding enc 527 | if str.valid_encoding? then 528 | str.encode! Encoding::UTF_8 529 | break 530 | end 531 | rescue ArgumentError # unknown encoding name 532 | # do nothing 533 | rescue Encoding::InvalidByteSequenceError 534 | # do nothing 535 | rescue Encoding::UndefinedConversionError 536 | # do nothing 537 | end 538 | end 539 | 540 | # no amount of pain is enough for you. 541 | raise "Bad encoding. Need a magic encoding comment." unless 542 | str.encoding.name == "UTF-8" 543 | end 544 | 545 | ## 546 | # Returns a UTF-8 encoded string after processing BOMs and magic 547 | # encoding comments. 548 | # 549 | # Holy crap... ok. Here goes: 550 | # 551 | # Ruby's file handling and encoding support is insane. We need to be 552 | # able to lex a file. The lexer file is explicitly UTF-8 to make 553 | # things cleaner. This allows us to deal with extended chars in 554 | # class and method names. In order to do this, we need to encode all 555 | # input source files as UTF-8. First, we look for a UTF-8 BOM by 556 | # looking at the first line while forcing its encoding to 557 | # ASCII-8BIT. If we find a BOM, we strip it and set the expected 558 | # encoding to UTF-8. Then, we search for a magic encoding comment. 559 | # If found, it overrides the BOM. Finally, we force the encoding of 560 | # the input string to whatever was found, and then encode that to 561 | # UTF-8 for compatibility with the lexer. 562 | 563 | def handle_encoding str 564 | str = str.dup 565 | encoding = nil 566 | 567 | header = str.each_line.first(2) 568 | header.map! { |s| s.force_encoding "ASCII-8BIT" } 569 | 570 | first = header.first || "" 571 | encoding, str = +"utf-8", str.b[3..-1] if first =~ /\A\xEF\xBB\xBF/n 572 | 573 | encoding = $1.strip if header.find { |s| 574 | s[/^#.*?-\*-.*?coding:\s*([^ ;]+).*?-\*-/, 1] || 575 | s[/^#.*(?:en)?coding(?:\s*[:=])\s*([\w-]+)/, 1] 576 | } 577 | 578 | if encoding then 579 | encoding.sub!(/utf-8-.+$/, "utf-8") # HACK for stupid emacs formats 580 | hack_encoding str, encoding 581 | else 582 | # nothing specified... ugh. try to encode as utf-8 583 | hack_encoding str 584 | end 585 | 586 | str 587 | end 588 | 589 | def invert_block_call val 590 | ret, iter = val 591 | type, call = ret 592 | 593 | iter.insert 1, call 594 | 595 | ret = s(type).line ret.line 596 | 597 | [iter, ret] 598 | end 599 | 600 | def inverted? 
val 601 | JUMP_TYPE[val[0].sexp_type] 602 | end 603 | 604 | def list_append list, item # TODO: nuke me *sigh* 605 | return s(:array, item) unless list 606 | list = s(:array, list) unless Sexp === list && list.sexp_type == :array 607 | list << item 608 | end 609 | 610 | def list_prepend item, list # TODO: nuke me *sigh* 611 | list = s(:array, list) unless Sexp === list && list.sexp_type == :array 612 | list.insert 1, item 613 | list 614 | end 615 | 616 | def literal_concat head, tail # TODO: ugh. rewrite 617 | return tail unless head 618 | return head unless tail 619 | 620 | htype, ttype = head.sexp_type, tail.sexp_type 621 | 622 | head = s(:dstr, "", head).line head.line if htype == :evstr 623 | 624 | case ttype 625 | when :str then 626 | if htype == :str 627 | a, b = head.last, tail.last 628 | b = b.dup.force_encoding a.encoding unless Encoding.compatible?(a, b) 629 | a << b 630 | elsif htype == :dstr and head.size == 2 then 631 | head.last << tail.last 632 | else 633 | head << tail 634 | end 635 | when :dstr then 636 | if htype == :str then 637 | lineno = head.line 638 | _, h1 = head 639 | _, t1, *rest = tail 640 | tail.sexp_body = [h1 + t1, *rest] 641 | 642 | head = tail 643 | head.line = lineno 644 | else 645 | tail.sexp_type = :array 646 | _, tail_s, *tail_r = tail 647 | if tail_s == "" then 648 | tail.sexp_body = tail_r 649 | else 650 | tail.sexp_body = [s(:str, tail_s).line(tail.line), *tail_r] 651 | end 652 | 653 | head.push(*tail.sexp_body) 654 | end 655 | when :evstr then 656 | if htype == :str then 657 | f, l = head.file, head.line 658 | head = s(:dstr, *head.sexp_body) 659 | head.file = f 660 | head.line = l 661 | end 662 | 663 | _, t1, * = tail 664 | if head.size == 2 and tail.size > 1 and t1.sexp_type == :str then 665 | _, h1 = head 666 | head.sexp_body = [h1.dup] if h1.frozen? # this is dumb 667 | head.last << t1.last 668 | head.sexp_type = :str if head.size == 2 # HACK ? 
669 | else 670 | head.push(tail) 671 | end 672 | else 673 | x = [head, tail] 674 | raise "unknown type: #{x.inspect}" 675 | end 676 | 677 | return head 678 | end 679 | 680 | def local_pop in_def 681 | lexer.cond.pop # group = local_pop 682 | lexer.cmdarg.pop 683 | self.env.unextend 684 | self.in_def = in_def 685 | end 686 | 687 | def logical_op type, left, right 688 | left = value_expr left 689 | 690 | if left and left.sexp_type == type and not left.paren then 691 | node, rhs = left, nil 692 | 693 | loop do 694 | _, _lhs, rhs = node 695 | break unless rhs && rhs.sexp_type == type and not rhs.paren 696 | node = rhs 697 | end 698 | 699 | node.pop 700 | node << s(type, rhs, right).line(rhs.line) 701 | 702 | return left 703 | end 704 | 705 | result = s(type, left, right) 706 | result.line left.line if left.line 707 | result 708 | end 709 | 710 | def new_aref val 711 | val[2] ||= s(:arglist) 712 | val[2].sexp_type = :arglist if val[2].sexp_type == :array # REFACTOR 713 | new_call val[0], :"[]", val[2] 714 | end 715 | 716 | def new_arg val 717 | arg, = val 718 | 719 | case arg 720 | when Symbol then 721 | result = s(:args, arg).line line 722 | when Sexp then 723 | result = arg 724 | when Array then 725 | (arg, line), = val 726 | result = s(:args, arg).line line 727 | else 728 | debug20 32 729 | raise "Unknown f_arg type: #{val.inspect}" 730 | end 731 | 732 | result 733 | end 734 | 735 | def ary_to_pat ary 736 | pat = ary.dup 737 | pat.sexp_type = :array_TAIL 738 | 739 | new_array_pattern nil, nil, pat, ary.line 740 | end 741 | 742 | def new_array_pattern const, pre_arg, arypat, loc 743 | result = s(:array_pat, const).line loc 744 | result << pre_arg if pre_arg 745 | 746 | if arypat && arypat.sexp_type == :array_TAIL then 747 | result.concat arypat.sexp_body 748 | else 749 | raise "NO?: %p" % [arypat] 750 | end 751 | 752 | result 753 | end 754 | 755 | def array_pat_concat lhs, rhs 756 | case lhs.sexp_type 757 | when :PATTERN then 758 | lhs.sexp_type = :array_pat 759 | end 760 | 761 | if rhs then 762 | case rhs.sexp_type 763 | when :array_pat, :array_TAIL, :PATTERN then 764 | lhs.concat rhs.sexp_body 765 | else 766 | lhs << rhs 767 | end 768 | end 769 | end 770 | 771 | def new_array_pattern_tail pre_args, has_rest, rest_arg, post_args 772 | # TODO: remove has_rest once all tests pass !!! 773 | rest_arg = if has_rest then 774 | :"*#{rest_arg}" 775 | else 776 | nil 777 | end 778 | 779 | result = s(:array_TAIL).line 666 780 | 781 | array_pat_concat result, pre_args 782 | 783 | result << rest_arg if rest_arg 784 | 785 | array_pat_concat result, post_args 786 | 787 | result 788 | end 789 | 790 | def new_assign lhs, rhs 791 | return nil unless lhs 792 | 793 | rhs = value_expr rhs 794 | 795 | case lhs.sexp_type 796 | when :lasgn, :iasgn, :cdecl, :cvdecl, :gasgn, :cvasgn, :attrasgn, :safe_attrasgn then 797 | lhs << rhs 798 | lhs.line_max = rhs.line_max 799 | when :const then 800 | lhs.sexp_type = :cdecl 801 | lhs << rhs 802 | else 803 | raise "unknown lhs #{lhs.inspect} w/ #{rhs.inspect}" 804 | end 805 | 806 | lhs 807 | end 808 | 809 | def new_attrasgn recv, meth, call_op = :"." 810 | call_op = call_op.first if Array === call_op 811 | 812 | meth = :"#{meth}=" 813 | 814 | result = case call_op.to_sym 815 | when :"." 816 | s(:attrasgn, recv, meth) 817 | when :"&." 
818 | s(:safe_attrasgn, recv, meth) 819 | else 820 | raise "unknown call operator: `#{type.inspect}`" 821 | end 822 | 823 | result.line = recv.line 824 | result 825 | end 826 | 827 | def new_begin val 828 | (_, line), _, body, _ = val 829 | 830 | result = body ? s(:begin, body) : s(:nil) 831 | result.line line 832 | 833 | result 834 | end 835 | 836 | def new_body val 837 | body, resbody, elsebody, ensurebody = val 838 | 839 | result = body 840 | 841 | if resbody then 842 | result = s(:rescue) 843 | result << body if body 844 | 845 | res = resbody 846 | 847 | while res do 848 | result << res 849 | res = res.find_node :resbody, :delete 850 | end 851 | 852 | result << elsebody if elsebody 853 | 854 | result.line = (body || resbody).line 855 | end 856 | 857 | if elsebody and not resbody then 858 | warning("else without rescue is useless") 859 | result = s(:begin, result).line result.line if result 860 | result = block_append(result, elsebody) 861 | end 862 | 863 | if ensurebody 864 | lineno = (result || ensurebody).line 865 | result = s(:ensure, result, ensurebody).compact.line lineno 866 | end 867 | 868 | result 869 | end 870 | 871 | def new_brace_body args, body, lineno 872 | new_iter(nil, args, body).line lineno 873 | end 874 | 875 | def new_call recv, meth, args = nil, call_op = :"." 876 | call_op = call_op.first if Array === call_op 877 | 878 | result = case call_op.to_sym 879 | when :"." 880 | s(:call, recv, meth) 881 | when :"&." 882 | s(:safe_call, recv, meth) 883 | else 884 | raise "unknown call operator: `#{type.inspect}`" 885 | end 886 | 887 | # TODO: need a test with f(&b) to produce block_pass 888 | # TODO: need a test with f(&b) { } to produce warning 889 | 890 | if args then 891 | if ARG_TYPES[args.sexp_type] then 892 | result.concat args.sexp_body 893 | else 894 | result << args 895 | end 896 | result.line_max = args.line_max 897 | end 898 | 899 | # line = result.grep(Sexp).map(&:line).compact.min 900 | result.line = recv.line if recv 901 | result.line ||= lexer.lineno 902 | 903 | result 904 | end 905 | 906 | def new_in pat, body, cases, line 907 | s(:in, pat, body, cases).line line 908 | end 909 | 910 | def new_case expr, body, line 911 | result = s(:case, expr) 912 | 913 | while body and [:when, :in].include? body.sexp_type 914 | result << body 915 | body = body.delete_at 3 916 | end 917 | 918 | _, _expr, *cases = result 919 | cases.each do |node| 920 | block = node.find_node :block, :delete 921 | node.concat block.sexp_body if block 922 | end 923 | 924 | # else 925 | body = nil if body == s(:block) 926 | result << body 927 | 928 | result.line = line 929 | result 930 | end 931 | 932 | def new_class val 933 | (_, line, comment), path, superclass, _, body, (_, line_max) = val 934 | 935 | path = path.first if path.instance_of? 
Array 936 | 937 | result = s(:class, path, superclass) 938 | 939 | if body then 940 | if body.sexp_type == :block then 941 | result.push(*body.sexp_body) 942 | else 943 | result.push body 944 | end 945 | end 946 | 947 | result.line = line 948 | result.line_max = line_max 949 | result.comments = comment if comment 950 | result 951 | end 952 | 953 | def new_compstmt val 954 | result = void_stmts(val.grep(Sexp)[0]) 955 | result = remove_begin(result) if result 956 | result 957 | end 958 | 959 | def new_const_op_asgn val 960 | lhs, (asgn_op, _), rhs = val 961 | asgn_op = asgn_op.to_sym 962 | 963 | result = case asgn_op 964 | when :"||" then 965 | s(:op_asgn_or, lhs, rhs) 966 | when :"&&" then 967 | s(:op_asgn_and, lhs, rhs) 968 | else 969 | s(:op_asgn, lhs, asgn_op, rhs) 970 | end 971 | 972 | result.line = lhs.line 973 | result 974 | end 975 | 976 | def new_defn val 977 | if val.size == 4 then 978 | ((_, line, comment), (name, _line, in_def)), args, body, (_, line_max) = val 979 | else 980 | (_, line, comment), (name, line), in_def, args, body, (_, line_max) = val 981 | end 982 | 983 | body ||= s(:nil).line line 984 | 985 | args.line line 986 | 987 | result = s(:defn, name.to_sym, args).line line 988 | result.line_max = line_max 989 | 990 | if body.sexp_type == :block then 991 | result.push(*body.sexp_body) 992 | else 993 | result.push body 994 | end 995 | 996 | result.comments = comment if comment 997 | 998 | [result, in_def] 999 | end 1000 | 1001 | def new_endless_defn val 1002 | # not available in 2.x so we don't need to check size 1003 | ((_, line, comment), (name, _, in_def)), args, _, body, _, resbody = val 1004 | 1005 | result = 1006 | if resbody then 1007 | s(:defn, name, args, 1008 | new_rescue(body, 1009 | new_resbody(s(:array).line(line), 1010 | resbody))).line line 1011 | else 1012 | s(:defn, name, args, body).line line 1013 | end 1014 | 1015 | local_pop in_def 1016 | endless_method_name result 1017 | 1018 | result.comments = comment if comment 1019 | 1020 | result 1021 | end 1022 | 1023 | def new_endless_defs val 1024 | # not available in 2.x so we don't need to check size 1025 | ((_, line, comment), recv, _, _, (name, line, in_def)), \ 1026 | args, _, body, _, resbody = val 1027 | 1028 | result = 1029 | if resbody then 1030 | s(:defs, recv, name, args, 1031 | new_rescue(body, 1032 | new_resbody(s(:array).line(line), 1033 | resbody))).line line 1034 | else 1035 | s(:defs, recv, name, args, body).line(line) 1036 | end 1037 | 1038 | self.in_single -= 1 1039 | local_pop in_def 1040 | endless_method_name result 1041 | 1042 | result.comments = comment if comment 1043 | 1044 | result 1045 | end 1046 | 1047 | def new_defs val 1048 | if val.size == 4 then 1049 | ((_, line, comment), recv, _, _, (name, line, in_def)), \ 1050 | args, body, (_, line_max) = val 1051 | else 1052 | (_, line, comment), recv, (name, _), in_def, \ 1053 | args, body, (_, line_max) = val 1054 | end 1055 | 1056 | body ||= s(:nil).line line 1057 | 1058 | args.line line 1059 | 1060 | result = s(:defs, recv, name.to_sym, args).line line 1061 | result.line_max = line_max 1062 | 1063 | # TODO: remove_begin 1064 | # TODO: reduce_nodes 1065 | 1066 | if body.sexp_type == :block then 1067 | result.push(*body.sexp_body) 1068 | else 1069 | result.push body 1070 | end 1071 | 1072 | result.comments = comment if comment 1073 | 1074 | [result, in_def] 1075 | end 1076 | 1077 | def new_do_body args, body, lineno 1078 | new_iter(nil, args, body).line(lineno) 1079 | end 1080 | 1081 | def new_find_pattern const, pat 1082 | pat.sexp_type 
= :find_pat 1083 | pat.insert 1, const 1084 | end 1085 | 1086 | def new_find_pattern_tail lhs, mid, rhs 1087 | lhs_id, line = lhs 1088 | rhs_id, _line = rhs 1089 | 1090 | # TODO: fpinfo->pre_rest_arg = pre_rest_arg ? assignable(p, pre_rest_arg, 0, loc) : NODE_SPECIAL_NO_NAME_REST; 1091 | 1092 | lhs_id = "*#{lhs_id}".to_sym 1093 | rhs_id = "*#{rhs_id}".to_sym 1094 | 1095 | raise "BAD?" unless mid.sexp_type == :array_TAIL 1096 | 1097 | s(:find_pat_TAIL, lhs_id, *mid.sexp_body, rhs_id).line line 1098 | end 1099 | 1100 | def new_for expr, var, body 1101 | result = s(:for, expr, var).line(var.line) 1102 | result << body if body 1103 | result 1104 | end 1105 | 1106 | def new_hash val 1107 | _, line, assocs = val 1108 | 1109 | s(:hash).line(line).concat assocs.sexp_body 1110 | end 1111 | 1112 | def new_hash_pattern const, hash_pat, loc 1113 | _, pat, kw_args, kw_rest_arg = hash_pat 1114 | 1115 | line = (const||hash_pat).line 1116 | 1117 | result = s(:hash_pat, const).line line 1118 | result.concat pat.sexp_body if pat 1119 | result << kw_args if kw_args 1120 | result << kw_rest_arg if kw_rest_arg 1121 | result 1122 | end 1123 | 1124 | def new_hash_pattern_tail kw_args, kw_rest_arg, line # TODO: remove line arg 1125 | # kw_rest_arg = assignable(kw_rest_arg, nil).line line if kw_rest_arg 1126 | 1127 | result = s(:hash_pat).line line 1128 | result << kw_args 1129 | 1130 | if kw_rest_arg then 1131 | name = kw_rest_arg.value 1132 | # TODO: I _hate_ this: 1133 | assignable [name, kw_rest_arg.line] if name != :** 1134 | result << kw_rest_arg 1135 | end 1136 | 1137 | result 1138 | end 1139 | 1140 | def push_pktbl 1141 | end 1142 | 1143 | def pop_pktbl 1144 | end 1145 | 1146 | def push_pvtbl 1147 | end 1148 | 1149 | def pop_pvtbl 1150 | end 1151 | 1152 | def new_if c, t, f 1153 | l = [c.line, t && t.line, f && f.line].compact.min 1154 | c = cond c 1155 | c, t, f = c.last, f, t if c.sexp_type == :not and canonicalize_conditions 1156 | s(:if, c, t, f).line(l) 1157 | end 1158 | 1159 | def new_iter call, args, body 1160 | body ||= nil 1161 | 1162 | args ||= s(:args) 1163 | args = s(:args, args) if Symbol === args 1164 | 1165 | result = s(:iter) 1166 | result << call if call 1167 | result << args 1168 | result << body if body 1169 | 1170 | result.line call.line if call 1171 | 1172 | unless args == 0 then 1173 | args.line call.line if call 1174 | args.sexp_type = :args 1175 | end 1176 | 1177 | result 1178 | end 1179 | 1180 | def new_masgn lhs, rhs, wrap = false 1181 | _, ary = lhs 1182 | 1183 | line = rhs.line 1184 | rhs = value_expr(rhs) 1185 | rhs = ary ? s(:to_ary, rhs) : s(:array, rhs) if wrap 1186 | rhs.line line if wrap 1187 | 1188 | lhs.delete_at 1 if ary.nil? 
1189 | lhs << rhs 1190 | 1191 | lhs 1192 | end 1193 | 1194 | def new_masgn_arg rhs, wrap = false 1195 | rhs = value_expr(rhs) 1196 | # HACK: could be array if lhs isn't right 1197 | rhs = s(:to_ary, rhs).line rhs.line if wrap 1198 | rhs 1199 | end 1200 | 1201 | def new_match lhs, rhs 1202 | if lhs then 1203 | case lhs.sexp_type 1204 | when :dregx, :dregx_once then 1205 | # TODO: no test coverage 1206 | return s(:match2, lhs, rhs).line(lhs.line) 1207 | when :lit then 1208 | return s(:match2, lhs, rhs).line(lhs.line) if Regexp === lhs.last 1209 | end 1210 | end 1211 | 1212 | if rhs then 1213 | case rhs.sexp_type 1214 | when :dregx, :dregx_once then 1215 | # TODO: no test coverage 1216 | return s(:match3, rhs, lhs).line(lhs.line) 1217 | when :lit then 1218 | return s(:match3, rhs, lhs).line(lhs.line) if Regexp === rhs.last 1219 | end 1220 | end 1221 | 1222 | new_call(lhs, :"=~", argl(rhs)).line lhs.line 1223 | end 1224 | 1225 | def new_module val 1226 | (_, line_min, comment), path, _, body, (_, line_max) = val 1227 | 1228 | path = path.first if path.instance_of? Array 1229 | 1230 | result = s(:module, path).line line_min 1231 | result.line_max = line_max 1232 | 1233 | if body then # REFACTOR? 1234 | if body.sexp_type == :block then 1235 | result.push(*body.sexp_body) 1236 | else 1237 | result.push body 1238 | end 1239 | end 1240 | 1241 | result.comments = comment if comment 1242 | result 1243 | end 1244 | 1245 | def new_op_asgn val 1246 | lhs, (op, _line), rhs = val 1247 | op = op.to_sym 1248 | 1249 | name = gettable(lhs.last).line lhs.line 1250 | arg = remove_begin rhs 1251 | result = case op # REFACTOR 1252 | when :"||" then 1253 | lhs << arg 1254 | s(:op_asgn_or, name, lhs).line lhs.line 1255 | when :"&&" then 1256 | lhs << arg 1257 | s(:op_asgn_and, name, lhs).line lhs.line 1258 | else 1259 | lhs << new_call(name, op, argl(arg)) 1260 | lhs 1261 | end 1262 | 1263 | result 1264 | end 1265 | 1266 | def new_op_asgn1 val 1267 | lhs, _, args, _, (op, _), rhs = val 1268 | 1269 | args.sexp_type = :arglist if args 1270 | 1271 | result = s(:op_asgn1, lhs, args, op.to_sym, rhs) 1272 | result.line lhs.line 1273 | result 1274 | end 1275 | 1276 | def new_op_asgn2 val 1277 | recv, (call_op, _), (meth, _), (op, _), arg = val 1278 | meth = :"#{meth}=" 1279 | 1280 | result = case call_op.to_sym 1281 | when :"." 1282 | s(:op_asgn2, recv, meth, op.to_sym, arg) 1283 | when :"&." 1284 | s(:safe_op_asgn2, recv, meth, op.to_sym, arg) 1285 | else 1286 | raise "unknown call operator: `#{type.inspect}`" 1287 | end 1288 | 1289 | result.line = recv.line 1290 | result 1291 | end 1292 | 1293 | def new_qsym_list 1294 | s(:array).line lexer.lineno 1295 | end 1296 | 1297 | def new_qsym_list_entry val 1298 | _, (str, line), _ = val 1299 | s(:lit, str.to_sym).line line 1300 | end 1301 | 1302 | def new_qword_list 1303 | s(:array).line lexer.lineno 1304 | end 1305 | 1306 | def new_qword_list_entry val 1307 | _, (str, line), _ = val 1308 | str.force_encoding("ASCII-8BIT") unless str.valid_encoding? 
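    # each %w entry becomes a plain :str node (invalid bytes were kept as binary above)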
1309 | s(:str, str).line line 1310 | end 1311 | 1312 | def new_regexp val 1313 | (_, line), node, (options, line_max) = val 1314 | 1315 | node ||= s(:str, "").line line 1316 | node.line_max = line_max 1317 | 1318 | o, k = 0, nil 1319 | options.split(//).uniq.each do |c| # FIX: this has a better home 1320 | v = { 1321 | "x" => Regexp::EXTENDED, 1322 | "i" => Regexp::IGNORECASE, 1323 | "m" => Regexp::MULTILINE, 1324 | "o" => Regexp::ONCE, 1325 | "n" => Regexp::ENC_NONE, 1326 | "e" => Regexp::ENC_EUC, 1327 | "s" => Regexp::ENC_SJIS, 1328 | "u" => Regexp::ENC_UTF8, 1329 | }[c] 1330 | raise "unknown regexp option: #{c}" unless v 1331 | o += v 1332 | end 1333 | 1334 | case node.sexp_type 1335 | when :str then 1336 | _, str = node 1337 | node.sexp_type = :lit 1338 | val = if k then 1339 | Regexp.new(str, o, k) 1340 | else 1341 | begin 1342 | Regexp.new(str, o) 1343 | rescue RegexpError => e 1344 | warn "WARNING: #{e.message} for #{str.inspect} #{options.inspect}" 1345 | begin 1346 | warn "WARNING: trying to recover with ENC_UTF8" 1347 | Regexp.new(str, Regexp::ENC_UTF8) 1348 | rescue RegexpError => e 1349 | warn "WARNING: trying to recover with ENC_NONE" 1350 | Regexp.new(str, Regexp::ENC_NONE) 1351 | end 1352 | end 1353 | end 1354 | node.sexp_body = [val] 1355 | when :dstr then 1356 | if options =~ /o/ then 1357 | node.sexp_type = :dregx_once 1358 | else 1359 | node.sexp_type = :dregx 1360 | end 1361 | node << o if o and o != 0 1362 | else 1363 | node = s(:dregx, "", node).line line 1364 | node.sexp_type = :dregx_once if options =~ /o/ 1365 | node << o if o and o != 0 1366 | end 1367 | 1368 | node 1369 | end 1370 | 1371 | def new_resbody cond, body 1372 | if body && body.sexp_type == :block then 1373 | body.shift # remove block and splat it in directly 1374 | else 1375 | body = [body] 1376 | end 1377 | 1378 | s(:resbody, cond, *body).line cond.line 1379 | end 1380 | 1381 | def new_rescue body, resbody 1382 | s(:rescue, body, resbody).line body.line 1383 | end 1384 | 1385 | def new_sclass val 1386 | (_, line), _, recv, in_def, _, in_single, body, _ = val 1387 | 1388 | result = s(:sclass, recv) 1389 | 1390 | if body then 1391 | if body.sexp_type == :block then 1392 | result.push(*body.sexp_body) 1393 | else 1394 | result.push body 1395 | end 1396 | end 1397 | 1398 | result.line = line 1399 | self.in_def = in_def 1400 | self.in_single = in_single 1401 | result 1402 | end 1403 | 1404 | def new_string val 1405 | (str, line), = val 1406 | 1407 | str.force_encoding("UTF-8") 1408 | # TODO: remove: 1409 | str.force_encoding("ASCII-8BIT") unless str.valid_encoding? 1410 | s(:str, str).line line 1411 | end 1412 | 1413 | def new_super args 1414 | if args && args.sexp_type == :block_pass then 1415 | s(:super, args).line args.line 1416 | else 1417 | args ||= s(:arglist).line lexer.lineno 1418 | s(:super, *args.sexp_body).line args.line 1419 | end 1420 | end 1421 | 1422 | def new_symbol val 1423 | name = val.last 1424 | s(:lit, name.to_sym).line lexer.lineno 1425 | end 1426 | 1427 | def new_symbol_list 1428 | # TODO: hunt down and try to remove ALL lexer.lineno usage! 
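    # interpolated symbol lists (%I) start out as an empty :array; the
    # grammar actions append each entry to it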
1429 | s(:array).line lexer.lineno 1430 | end 1431 | 1432 | def new_symbol_list_entry val 1433 | _, sym, _ = val 1434 | 1435 | sym ||= s(:str, "").line lexer.lineno 1436 | 1437 | case sym.sexp_type 1438 | when :dstr then 1439 | sym.sexp_type = :dsym 1440 | when :str then 1441 | sym = s(:lit, sym.last.to_sym).line sym.line 1442 | else 1443 | sym = s(:dsym, "", sym).line sym.line 1444 | end 1445 | 1446 | sym 1447 | end 1448 | 1449 | def new_undef n, m = nil 1450 | if m then 1451 | block_append(n, s(:undef, m).line(m.line)) 1452 | else 1453 | s(:undef, n).line n.line 1454 | end 1455 | end 1456 | 1457 | def new_until block, expr, pre 1458 | new_until_or_while :until, block, expr, pre 1459 | end 1460 | 1461 | def new_until_or_while type, block, expr, pre 1462 | other = type == :until ? :while : :until 1463 | line = [block && block.line, expr.line].compact.min 1464 | block, pre = block.last, false if block && block.sexp_type == :begin 1465 | 1466 | expr = cond expr 1467 | 1468 | result = unless expr.sexp_type == :not and canonicalize_conditions then 1469 | s(type, expr, block, pre) 1470 | else 1471 | s(other, expr.last, block, pre) 1472 | end 1473 | 1474 | result.line = line 1475 | result 1476 | end 1477 | 1478 | def new_when cond, body 1479 | s(:when, cond, body) 1480 | end 1481 | 1482 | def new_while block, expr, pre 1483 | new_until_or_while :while, block, expr, pre 1484 | end 1485 | 1486 | def new_word_list 1487 | s(:array).line lexer.lineno 1488 | end 1489 | 1490 | def new_word_list_entry val 1491 | _, word, _ = val 1492 | word.sexp_type == :evstr ? s(:dstr, "", word).line(word.line) : word 1493 | end 1494 | 1495 | def new_xstring val 1496 | _, node = val 1497 | 1498 | node ||= s(:str, "").line lexer.lineno 1499 | 1500 | if node then 1501 | case node.sexp_type 1502 | when :str 1503 | node.sexp_type = :xstr 1504 | when :dstr 1505 | node.sexp_type = :dxstr 1506 | else 1507 | node = s(:dxstr, "", node).line node.line 1508 | end 1509 | end 1510 | 1511 | node 1512 | end 1513 | 1514 | def new_yield args = nil 1515 | # TODO: raise args.inspect unless [:arglist].include? args.first # HACK 1516 | raise "write a test 4" if args && args.sexp_type == :block_pass 1517 | raise SyntaxError, "Block argument should not be given." if 1518 | args && args.sexp_type == :block_pass 1519 | 1520 | args ||= s(:arglist).line lexer.lineno 1521 | 1522 | args.sexp_type = :arglist if [:call_args, :array].include? args.sexp_type 1523 | args = s(:arglist, args).line args.line unless args.sexp_type == :arglist 1524 | 1525 | s(:yield, *args.sexp_body).line args.line 1526 | end 1527 | 1528 | def prev_value_to_lineno v 1529 | s, n = v 1530 | if String === s then 1531 | n 1532 | else 1533 | lexer.lineno 1534 | end 1535 | end 1536 | 1537 | KEEP_COMMENT_TOKENS = [:kCLASS, :kMODULE, :kDEF, :tNL] 1538 | 1539 | def next_token 1540 | token = self.lexer.next_token 1541 | 1542 | if token and token.first != RubyLexer::EOF then 1543 | self.last_token_type = token 1544 | 1545 | self.lexer.comment = nil unless KEEP_COMMENT_TOKENS.include? 
token.first 1546 | 1547 | return token 1548 | elsif !token 1549 | return self.lexer.next_token 1550 | else 1551 | return [false, false] 1552 | end 1553 | end 1554 | 1555 | def on_error(et, ev, values) 1556 | ev = ev.first if ev.instance_of?(Array) && ev.size == 2 && ev.last.is_a?(Integer) 1557 | super 1558 | rescue Racc::ParseError => e 1559 | # I don't like how the exception obscures the error message 1560 | e.message.replace "%s:%p :: %s" % [self.file, lexer.lineno, e.message.strip] 1561 | warn e.message if $DEBUG 1562 | raise 1563 | end 1564 | 1565 | ## 1566 | # Parse +str+ at path +file+ and return a sexp. Raises 1567 | # Timeout::Error if it runs for more than +time+ seconds. 1568 | 1569 | def process(str, file = "(string)", time = 10) 1570 | str.freeze 1571 | 1572 | Timeout.timeout time do 1573 | raise "bad val: #{str.inspect}" unless String === str 1574 | 1575 | self.lexer.string = handle_encoding str 1576 | 1577 | self.file = file 1578 | 1579 | @yydebug = ENV.has_key? "DEBUG" 1580 | 1581 | do_parse 1582 | end 1583 | end 1584 | 1585 | alias parse process 1586 | 1587 | def remove_begin node 1588 | line = node.line 1589 | 1590 | node = node.last while node and node.sexp_type == :begin and node.size == 2 1591 | 1592 | node = s(:nil) if node == s(:begin) 1593 | 1594 | node.line ||= line 1595 | 1596 | node 1597 | end 1598 | 1599 | alias value_expr remove_begin # TODO: for now..? could check the tree, but meh? 1600 | 1601 | def reset 1602 | lexer.reset 1603 | self.in_def = false 1604 | self.in_single = 0 1605 | self.env.reset 1606 | self.last_token_type = nil 1607 | end 1608 | 1609 | def ret_args node 1610 | if node then 1611 | raise "write a test 5" if node.sexp_type == :block_pass 1612 | 1613 | raise SyntaxError, "block argument should not be given" if 1614 | node.sexp_type == :block_pass 1615 | 1616 | node.sexp_type = :array if node.sexp_type == :call_args 1617 | node = node.last if node.sexp_type == :array && node.size == 2 1618 | 1619 | # HACK matz wraps ONE of the FOUR splats in a newline to 1620 | # distinguish. I use paren for now. ugh 1621 | node = s(:svalue, node).line node.line if node.sexp_type == :splat and not node.paren 1622 | node.sexp_type = :svalue if node.sexp_type == :arglist && node[1].sexp_type == :splat 1623 | end 1624 | 1625 | node 1626 | end 1627 | 1628 | def s(*args) 1629 | result = Sexp.new(*args) 1630 | # result.line ||= lexer.lineno if lexer.ss unless ENV["CHECK_LINE_NUMS"] # otherwise... 1631 | result.file = self.file 1632 | result 1633 | end 1634 | 1635 | def debug n 1636 | if ENV["PRY"] then 1637 | require "pry"; binding.pry 1638 | end 1639 | 1640 | raise RubyParser::SyntaxError, "debug #{n}" 1641 | end 1642 | 1643 | def syntax_error msg 1644 | raise RubyParser::SyntaxError, msg 1645 | end 1646 | 1647 | alias yyerror syntax_error 1648 | 1649 | def void_stmts node 1650 | return nil unless node 1651 | return node unless node.sexp_type == :block 1652 | 1653 | if node.respond_to? 
:sexp_body= then 1654 | node.sexp_body = node.sexp_body.map { |n| remove_begin n } 1655 | else 1656 | node[1..-1] = node[1..-1].map { |n| remove_begin(n) } 1657 | end 1658 | 1659 | node 1660 | end 1661 | 1662 | def warning s 1663 | # do nothing for now 1664 | end 1665 | 1666 | def whitespace_width line, remove_width = nil 1667 | col = 0 1668 | idx = 0 1669 | 1670 | line.chars.each do |c| 1671 | break if remove_width && col >= remove_width 1672 | case c 1673 | when " " then 1674 | col += 1 1675 | when "\t" then 1676 | n = TAB_WIDTH * (col / TAB_WIDTH + 1) 1677 | break if remove_width && n > remove_width 1678 | col = n 1679 | else 1680 | break 1681 | end 1682 | idx += 1 1683 | end 1684 | 1685 | if remove_width then 1686 | line[idx..-1] 1687 | elsif line[idx] == "\n" 1688 | nil 1689 | else 1690 | col 1691 | end 1692 | end 1693 | 1694 | alias remove_whitespace_width whitespace_width 1695 | 1696 | def wrap type, node 1697 | value, line = node 1698 | value = value.to_sym if value.respond_to? :to_sym 1699 | s(type, value).line line 1700 | end 1701 | 1702 | class Keyword 1703 | include RubyLexer::State::Values 1704 | 1705 | class KWtable 1706 | attr_accessor :name, :state, :id0, :id1 1707 | def initialize(name, id=[], state=nil) 1708 | @name = name 1709 | @id0, @id1 = id 1710 | @state = state 1711 | end 1712 | end 1713 | 1714 | ## 1715 | # :stopdoc: 1716 | # 1717 | # :expr_beg = ignore newline, +/- is a sign. 1718 | # :expr_end = newline significant, +/- is an operator. 1719 | # :expr_endarg = ditto, and unbound braces. 1720 | # :expr_endfn = ditto, and unbound braces. 1721 | # :expr_arg = newline significant, +/- is an operator. 1722 | # :expr_cmdarg = ditto 1723 | # :expr_mid = ditto 1724 | # :expr_fname = ignore newline, no reserved words. 1725 | # :expr_dot = right after . or ::, no reserved words. 1726 | # :expr_class = immediate after class, no here document. 1727 | # :expr_label = flag bit, label is allowed. 1728 | # :expr_labeled = flag bit, just after a label. 1729 | # :expr_fitem = symbol literal as FNAME. 1730 | # :expr_value = :expr_beg -- work to remove. Need multi-state support. 
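  # For example: after `def` the lexer is in :expr_fname, so in `def +(o)`
  # the `+` is read as a method name; after a local variable it is in
  # :expr_end, so in `a + b` the `+` is a binary operator; in command-arg
  # position, `foo +b` treats the `+` as a sign, i.e. `foo(+b)`.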
1731 | 1732 | expr_woot = EXPR_FNAME|EXPR_FITEM 1733 | 1734 | wordlist = [ 1735 | ["alias", [:kALIAS, :kALIAS ], expr_woot ], 1736 | ["and", [:kAND, :kAND ], EXPR_BEG ], 1737 | ["begin", [:kBEGIN, :kBEGIN ], EXPR_BEG ], 1738 | ["break", [:kBREAK, :kBREAK ], EXPR_MID ], 1739 | ["case", [:kCASE, :kCASE ], EXPR_BEG ], 1740 | ["class", [:kCLASS, :kCLASS ], EXPR_CLASS ], 1741 | ["def", [:kDEF, :kDEF ], EXPR_FNAME ], 1742 | ["defined?", [:kDEFINED, :kDEFINED ], EXPR_ARG ], 1743 | ["do", [:kDO, :kDO ], EXPR_BEG ], 1744 | ["else", [:kELSE, :kELSE ], EXPR_BEG ], 1745 | ["elsif", [:kELSIF, :kELSIF ], EXPR_BEG ], 1746 | ["end", [:kEND, :kEND ], EXPR_END ], 1747 | ["ensure", [:kENSURE, :kENSURE ], EXPR_BEG ], 1748 | ["false", [:kFALSE, :kFALSE ], EXPR_END ], 1749 | ["for", [:kFOR, :kFOR ], EXPR_BEG ], 1750 | ["if", [:kIF, :kIF_MOD ], EXPR_BEG ], 1751 | ["in", [:kIN, :kIN ], EXPR_BEG ], 1752 | ["module", [:kMODULE, :kMODULE ], EXPR_BEG ], 1753 | ["next", [:kNEXT, :kNEXT ], EXPR_MID ], 1754 | ["nil", [:kNIL, :kNIL ], EXPR_END ], 1755 | ["not", [:kNOT, :kNOT ], EXPR_ARG ], 1756 | ["or", [:kOR, :kOR ], EXPR_BEG ], 1757 | ["redo", [:kREDO, :kREDO ], EXPR_END ], 1758 | ["rescue", [:kRESCUE, :kRESCUE_MOD ], EXPR_MID ], 1759 | ["retry", [:kRETRY, :kRETRY ], EXPR_END ], 1760 | ["return", [:kRETURN, :kRETURN ], EXPR_MID ], 1761 | ["self", [:kSELF, :kSELF ], EXPR_END ], 1762 | ["super", [:kSUPER, :kSUPER ], EXPR_ARG ], 1763 | ["then", [:kTHEN, :kTHEN ], EXPR_BEG ], 1764 | ["true", [:kTRUE, :kTRUE ], EXPR_END ], 1765 | ["undef", [:kUNDEF, :kUNDEF ], expr_woot ], 1766 | ["unless", [:kUNLESS, :kUNLESS_MOD ], EXPR_BEG ], 1767 | ["until", [:kUNTIL, :kUNTIL_MOD ], EXPR_BEG ], 1768 | ["when", [:kWHEN, :kWHEN ], EXPR_BEG ], 1769 | ["while", [:kWHILE, :kWHILE_MOD ], EXPR_BEG ], 1770 | ["yield", [:kYIELD, :kYIELD ], EXPR_ARG ], 1771 | ["BEGIN", [:klBEGIN, :klBEGIN ], EXPR_END ], 1772 | ["END", [:klEND, :klEND ], EXPR_END ], 1773 | ["__FILE__", [:k__FILE__, :k__FILE__ ], EXPR_END ], 1774 | ["__LINE__", [:k__LINE__, :k__LINE__ ], EXPR_END ], 1775 | ["__ENCODING__", [:k__ENCODING__, :k__ENCODING__], EXPR_END], 1776 | ].map { |args| 1777 | KWtable.new(*args) 1778 | } 1779 | 1780 | # :startdoc: 1781 | 1782 | WORDLIST = Hash[*wordlist.map { |o| [o.name, o] }.flatten] 1783 | 1784 | def self.keyword str 1785 | WORDLIST[str] 1786 | end 1787 | end 1788 | 1789 | class Environment 1790 | attr_reader :env, :dyn 1791 | 1792 | def [] k 1793 | self.all[k] 1794 | end 1795 | 1796 | def []= k, v 1797 | raise "no" if v == true 1798 | self.current[k] = v 1799 | end 1800 | 1801 | def all 1802 | idx = @dyn.index(false) || 0 1803 | @env[0..idx].reverse.inject { |env, scope| env.merge scope } 1804 | end 1805 | 1806 | def current 1807 | @env.first 1808 | end 1809 | 1810 | def extend dyn = false 1811 | @dyn.unshift dyn 1812 | @env.unshift({}) 1813 | end 1814 | 1815 | def initialize dyn = false 1816 | @dyn = [] 1817 | @env = [] 1818 | self.reset 1819 | end 1820 | 1821 | def reset 1822 | @dyn.clear 1823 | @env.clear 1824 | self.extend 1825 | end 1826 | 1827 | def unextend 1828 | @dyn.shift 1829 | @env.shift 1830 | raise "You went too far unextending env" if @env.empty? 
1831 | end 1832 | end 1833 | 1834 | class StackState 1835 | attr_reader :name 1836 | attr_reader :stack 1837 | attr_accessor :debug 1838 | 1839 | def initialize name, debug=false 1840 | @name = name 1841 | @stack = [false] 1842 | @debug = debug 1843 | end 1844 | 1845 | def inspect 1846 | "StackState(#{@name}, #{@stack.inspect})" 1847 | end 1848 | 1849 | def is_in_state 1850 | log :is_in_state if debug 1851 | @stack.last 1852 | end 1853 | 1854 | def lexpop 1855 | raise if @stack.size == 0 1856 | a = @stack.pop 1857 | b = @stack.pop 1858 | @stack.push(a || b) 1859 | log :lexpop if debug 1860 | end 1861 | 1862 | def log action 1863 | c = caller[1] 1864 | c = caller[2] if c =~ /expr_result/ 1865 | warn "%s_stack.%s: %p at %s" % [name, action, @stack, c.clean_caller] 1866 | nil 1867 | end 1868 | 1869 | def pop 1870 | r = @stack.pop 1871 | @stack.push false if @stack.empty? 1872 | log :pop if debug 1873 | r 1874 | end 1875 | 1876 | def push val 1877 | @stack.push val 1878 | log :push if debug 1879 | end 1880 | 1881 | def reset 1882 | @stack = [false] 1883 | log :reset if debug 1884 | end 1885 | 1886 | def restore oldstate 1887 | @stack.replace oldstate 1888 | log :restore if debug 1889 | end 1890 | 1891 | def store base = false 1892 | result = @stack.dup 1893 | @stack.replace [base] 1894 | log :store if debug 1895 | result 1896 | end 1897 | end 1898 | end 1899 | -------------------------------------------------------------------------------- /test/test_ruby_parser_extras.rb: -------------------------------------------------------------------------------- 1 | # encoding: US-ASCII 2 | 3 | require "minitest/autorun" 4 | require "ruby_parser_extras" 5 | require "ruby_parser" 6 | 7 | class TestStackState < Minitest::Test 8 | attr_reader :s 9 | 10 | def setup 11 | @s = RubyParserStuff::StackState.new :test 12 | end 13 | 14 | def assert_encoding str, default = false 15 | orig_str = str.dup 16 | p = RubyParser.latest 17 | s = nil 18 | 19 | out, err = capture_io { 20 | s = p.handle_encoding str 21 | } 22 | 23 | assert_equal orig_str.sub(/\357\273\277/, ""), s 24 | 25 | exp_err = "" 26 | 27 | if defined?(Encoding) then 28 | assert_equal "UTF-8", s.encoding.to_s, str.inspect 29 | else 30 | exp_err = "Skipping magic encoding comment\n" unless default 31 | end 32 | 33 | assert_equal "", out, str.inspect 34 | assert_equal exp_err, err, str.inspect # HACK 35 | end 36 | 37 | def test_handle_encoding_bom 38 | # bom support, default to utf-8 39 | assert_encoding "\xEF\xBB\xBF# blah" 40 | # we force_encode to US-ASCII, then encode to UTF-8 so our lexer will work 41 | assert_encoding "\xEF\xBB\xBF# encoding: US-ASCII" 42 | end 43 | 44 | def test_handle_encoding_default 45 | assert_encoding "blah", :default 46 | end 47 | 48 | def test_handle_encoding_emacs 49 | # Q: how many different ways can we screw these up? 
A: ALL OF THEM 50 | 51 | assert_encoding "# - encoding: utf-8 -" 52 | assert_encoding "# - encoding:utf-8" 53 | assert_encoding "# -* coding: UTF-8 -*-" 54 | assert_encoding "# -*- coding: UTF-8 -*-" 55 | assert_encoding "# -*- coding: utf-8 -*" 56 | assert_encoding "# -*- coding: utf-8 -*-" 57 | assert_encoding "# -*- coding: utf-8; mode: ruby -*-" 58 | assert_encoding "# -*- coding: utf-8; mode: ruby; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2" 59 | assert_encoding "# -*- coding:utf-8; mode:ruby; -*-" 60 | assert_encoding "# -*- encoding: UTF-8 -*-" 61 | assert_encoding "# -*- encoding: utf-8 -*" 62 | assert_encoding "# -*- encoding: utf-8 -*-" 63 | assert_encoding "# -*- mode:ruby; coding:utf-8 -*-" 64 | assert_encoding "# -*- ruby encoding: utf-8 -*-" 65 | assert_encoding "# -- encoding: utf-8 --" 66 | assert_encoding "# ~*~ encoding: utf-8 ~*~" 67 | assert_encoding "#-*- coding: utf-8 -*-" 68 | assert_encoding "#-*- coding:utf-8" 69 | assert_encoding "#-- -*- mode: ruby; encoding: utf-8 -*-\n" 70 | end 71 | 72 | def test_handle_encoding_wtf 73 | assert_encoding "# coding : utf-8" 74 | assert_encoding "# Ruby 1.9: encoding: utf-8" 75 | assert_encoding "# Encoding: UTF-8 <-- required, please leave this in." 76 | assert_encoding "# Encoding: UTF-8" 77 | assert_encoding "# coding: utf-8" 78 | assert_encoding "# coding:utf-8" 79 | assert_encoding "# coding=utf-8" 80 | assert_encoding "# encoding: ASCII" 81 | assert_encoding "# encoding: ASCII-8BIT" 82 | assert_encoding "# encoding: ISO-8859-1" 83 | assert_encoding "# encoding: UTF-8" 84 | assert_encoding "# encoding: ascii-8bit" 85 | assert_encoding "# encoding: cp1252" 86 | assert_encoding "# encoding: euc-jp -*-" 87 | assert_encoding "# encoding: utf-8 # -*- ruby -*-" 88 | assert_encoding "# encoding: utf-8 require 'github_api/utils/url'" 89 | assert_encoding "# encoding: utf-8!" 90 | assert_encoding "# encoding: utf-8" 91 | assert_encoding "#" 92 | assert_encoding "#Encoding: UTF-8" 93 | assert_encoding "#coding:utf-8" 94 | assert_encoding "#encoding: UTF-8!" 
95 | assert_encoding "#encoding: UTF-8" 96 | assert_encoding "#encoding: cp1252" 97 | assert_encoding "#encoding: sjis" 98 | assert_encoding "#encoding: utf-8" 99 | end 100 | 101 | def test_handle_encoding_normal 102 | assert_encoding "# encoding: UTF-8" 103 | assert_encoding "# encoding: UTF-8\r\n" # UGH I hate windoze 104 | assert_encoding "# coding: UTF-8" 105 | assert_encoding "# encoding = UTF-8" 106 | assert_encoding "# coding = UTF-8" 107 | end 108 | 109 | def test_handle_encoding_vim 110 | assert_encoding "# vim: set fileencoding=utf-8 filetype=ruby ts=2 : " 111 | assert_encoding "# vim: fileencoding=UTF-8 ft=ruby syn=ruby ts=2 sw=2 ai eol et si" 112 | assert_encoding "# vim: fileencoding=UTF-8 nobomb sw=2 ts=2 et" 113 | assert_encoding "# vim: filetype=ruby, fileencoding=UTF-8, tabsize=2, shiftwidth=2" 114 | assert_encoding "# vim: set fileencoding=utf-8" 115 | assert_encoding "# vim:encoding=UTF-8:" 116 | assert_encoding "# vim:fileencoding=UTF-8:" 117 | assert_encoding "# vim:set fileencoding=utf-8 filetype=ruby" 118 | assert_encoding "# vim:set fileencoding=utf-8:" 119 | end 120 | 121 | def test_stack_state 122 | s.push true 123 | s.push false 124 | s.lexpop 125 | assert_equal [false, true], s.stack 126 | end 127 | 128 | def test_is_in_state 129 | assert_equal false, s.is_in_state 130 | s.push false 131 | assert_equal false, s.is_in_state 132 | s.push true 133 | assert_equal true, s.is_in_state 134 | s.push false 135 | assert_equal false, s.is_in_state 136 | end 137 | 138 | def test_lexpop 139 | assert_equal [false], s.stack 140 | s.push true 141 | s.push false 142 | assert_equal [false, true, false], s.stack 143 | s.lexpop 144 | assert_equal [false, true], s.stack 145 | end 146 | 147 | def test_pop 148 | assert_equal [false], s.stack 149 | s.push true 150 | assert_equal [false, true], s.stack 151 | assert_equal true, s.pop 152 | assert_equal [false], s.stack 153 | end 154 | 155 | def test_push 156 | assert_equal [false], s.stack 157 | s.push true 158 | s.push false 159 | assert_equal [false, true, false], s.stack 160 | end 161 | end 162 | 163 | class TestEnvironment < Minitest::Test 164 | def deny t 165 | assert !t 166 | end 167 | 168 | def setup 169 | @env = RubyParserStuff::Environment.new 170 | @env[:blah] = 42 171 | assert_equal 42, @env[:blah] 172 | end 173 | 174 | def test_var_scope_dynamic 175 | @env.extend :dynamic 176 | assert_equal 42, @env[:blah] 177 | @env.unextend 178 | assert_equal 42, @env[:blah] 179 | end 180 | 181 | def test_var_scope_static 182 | @env.extend 183 | assert_nil @env[:blah] 184 | @env.unextend 185 | assert_equal 42, @env[:blah] 186 | end 187 | 188 | def test_all_dynamic 189 | expected = { :blah => 42 } 190 | 191 | @env.extend :dynamic 192 | assert_equal expected, @env.all 193 | @env.unextend 194 | assert_equal expected, @env.all 195 | end 196 | 197 | def test_all_static 198 | @env.extend 199 | expected = { } 200 | assert_equal expected, @env.all 201 | 202 | @env.unextend 203 | expected = { :blah => 42 } 204 | assert_equal expected, @env.all 205 | end 206 | 207 | def test_all_static_deeper 208 | expected0 = { :blah => 42 } 209 | expected1 = { :blah => 42, :blah2 => 24 } 210 | expected2 = { :blah => 27 } 211 | 212 | @env.extend :dynamic 213 | @env[:blah2] = 24 214 | assert_equal expected1, @env.all 215 | 216 | @env.extend 217 | @env[:blah] = 27 218 | assert_equal expected2, @env.all 219 | 220 | @env.unextend 221 | assert_equal expected1, @env.all 222 | 223 | @env.unextend 224 | assert_equal expected0, @env.all 225 | end 226 | end 227 | 228 | class 
Fake20 229 | include RubyParserStuff 230 | 231 | def initialize 232 | end 233 | 234 | def s(*a) # bypass lexer/lineno stuff that RP overrides in 235 | Kernel.send :s, *a 236 | end 237 | end 238 | 239 | class TestValueExpr < Minitest::Test 240 | def assert_value_expr exp, input 241 | assert_equal exp, Fake20.new.value_expr(input.line(1)) 242 | end 243 | 244 | def assert_remove_begin exp, input 245 | assert_equal exp, Fake20.new.remove_begin(input.line(1)) 246 | end 247 | 248 | def test_value_expr 249 | assert_value_expr s(:nil), s(:begin) 250 | assert_value_expr s(:nil), s(:begin, s(:nil)) 251 | assert_value_expr s(:nil), s(:begin, s(:begin, s(:nil))) 252 | assert_value_expr s(:begin, s(:nil), s(:nil)), s(:begin, s(:nil), s(:nil)) 253 | end 254 | 255 | def test_remove_begin 256 | assert_remove_begin s(:nil), s(:begin) 257 | assert_remove_begin s(:nil), s(:begin, s(:nil)) 258 | assert_remove_begin s(:nil), s(:begin, s(:begin, s(:nil))) 259 | assert_remove_begin s(:begin, s(:nil), s(:nil)), s(:begin, s(:nil), s(:nil)) 260 | end 261 | end 262 | -------------------------------------------------------------------------------- /tools/munge.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby -ws 2 | 3 | $v ||= false 4 | 5 | stack = [] 6 | last_token = nil 7 | reduce_line = nil 8 | 9 | def munge s 10 | renames = [ 11 | "'='", "tEQL", 12 | "'!'", "tBANG", 13 | "'%'", "tPERCENT", 14 | "'&'", "tAMPER2", 15 | "'('", "tLPAREN2", 16 | "')'", "tRPAREN", 17 | "'*'", "tSTAR2", 18 | "'+'", "tPLUS", 19 | "','", "tCOMMA", 20 | "'-'", "tMINUS", 21 | "'.'", "tDOT", 22 | "'/'", "tDIVIDE", 23 | "';'", "tSEMI", 24 | "':'", "tCOLON", 25 | "'<'", "tLT", 26 | "'>'", "tGT", 27 | "'?'", "tEH", 28 | "'['", "tLBRACK", 29 | "'\\n'", "tNL", 30 | "']'", "tRBRACK", 31 | "'^'", "tCARET", 32 | "'`'", "tBACK_REF2", 33 | "'{'", "tLCURLY", 34 | "'|'", "tPIPE", 35 | "'}'", "tRCURLY", 36 | "'~'", "tTILDE", 37 | '"["', "tLBRACK", 38 | 39 | # 2.0 changes? 40 | '"<=>"', "tCMP", 41 | '"=="', "tEQ", 42 | '"==="', "tEQQ", 43 | '"!~"', "tNMATCH", 44 | '"=~"', "tMATCH", 45 | '">="', "tGEQ", 46 | '"<="', "tLEQ", 47 | '"!="', "tNEQ", 48 | '"<<"', "tLSHFT", 49 | '">>"', "tRSHFT", 50 | '"*"', "tSTAR", 51 | 52 | '".."', "tDOT2", 53 | 54 | '"&"', "tAMPER", 55 | '"&&"', "tANDOP", 56 | '"&."', "tLONELY", 57 | '"||"', "tOROP", 58 | 59 | '"..."', "tDOT3", 60 | '"**"', "tPOW", 61 | '"unary+"', "tUPLUS", 62 | '"unary-"', "tUMINUS", 63 | '"[]"', "tAREF", 64 | '"[]="', "tASET", 65 | '"::"', "tCOLON2", 66 | '"{ arg"', "tLBRACE_ARG", 67 | '"( arg"', "tLPAREN_ARG", 68 | '"("', "tLPAREN", 69 | 'rparen', "tRPAREN", 70 | '"{"', "tLBRACE", 71 | '"=>"', "tASSOC", 72 | '"->"', "tLAMBDA", 73 | '":: at EXPR_BEG"', "tCOLON3", 74 | '"**arg"', "tDSTAR", 75 | '","', "tCOMMA", 76 | 77 | # other 78 | 79 | 'kTERMINATOR', "tSTRING_END", 80 | '"kTERMINATOR"', "tSTRING_END", 81 | 'kTRCURLY', "tSTRING_DEND", 82 | 83 | '"symbol literal"', "tSYMBEG", 84 | '"string literal"', "tSTRING_BEG", 85 | '"backtick literal"', "tXSTRING_BEG", 86 | '"regexp literal"', "tREGEXP_BEG", 87 | '"word list"', "tWORDS_BEG", 88 | '"verbatim word list"', "tQWORDS_BEG", 89 | '"symbol list"', "tSYMBOLS_BEG", 90 | '"verbatim symbol list"', "tQSYMBOLS_BEG", 91 | '"terminator"', "tSTRING_END", 92 | '"\'}\'"', "tSTRING_DEND", 93 | 94 | '"string literal"',"tSTRING_BEG", 95 | '"literal content"', "tSTRING_CONTENT", 96 | /\$/, "", # try to remove these lumps? 
97 | 98 | 'tLBRACK2', "tLBRACK", # HACK 99 | 100 | "' '", "tSPACE", # needs to be later to avoid bad hits 101 | 102 | "/* empty */", "none", 103 | /^\s*$/, "", 104 | 105 | "keyword_BEGIN", "klBEGIN", 106 | "keyword_END", "klEND", 107 | /keyword_(\w+)/, proc { "k#{$1.upcase}" }, 108 | /\bk_([a-z_]+)/, proc { "k#{$1.upcase}" }, 109 | /modifier_(\w+)/, proc { "k#{$1.upcase}_MOD" }, 110 | "kVARIABLE", "keyword_variable", # ugh 111 | "tCONST", "kCONST", 112 | 113 | # 2.6 collapses klBEGIN to kBEGIN 114 | "klBEGIN", "kBEGIN", 115 | "klEND", "kEND", 116 | 117 | /keyword_(\w+)/, proc { "k#{$1.upcase}" }, 118 | /\bk_([^_][a-z_]+)/, proc { "k#{$1.upcase}" }, 119 | /modifier_(\w+)/, proc { "k#{$1.upcase}_MOD" }, 120 | 121 | "kVARIABLE", "keyword_variable", # ugh: this is a rule name 122 | 123 | # UGH 124 | "k_LINE__", "k__LINE__", 125 | "k_FILE__", "k__FILE__", 126 | "k_ENCODING__", "k__ENCODING__", 127 | 128 | '"defined?"', "kDEFINED", 129 | 130 | "", "none", 131 | 132 | '"do (for condition)"', "kDO_COND", 133 | '"do (for lambda)"', "kDO_LAMBDA", 134 | '"do (for block)"', "kDO_BLOCK", 135 | '"local variable or method"', "tIDENTIFIER", 136 | 137 | /\"(\w+) \(modifier\)\"/, proc { |x| "k#{$1.upcase}_MOD" }, 138 | /\"(\w+)\"/, proc { |x| "k#{$1.upcase}" }, 139 | /\"`(\w+)'\"/, proc { |x| "k#{$1.upcase}" }, 140 | 141 | /@(\d+)(\s+|$)/, "", 142 | /\$?@(\d+) */, "", # TODO: remove? 143 | 144 | /_EXPR/, "", 145 | ] 146 | 147 | renames.each_slice(2) do |(a, b)| 148 | if Proc === b then 149 | s.gsub!(a, &b) 150 | else 151 | s.gsub!(a, b) 152 | end 153 | end 154 | 155 | if s.empty? then 156 | nil 157 | else 158 | s.strip.squeeze " " 159 | end 160 | end 161 | 162 | ARGF.each_line do |line| 163 | case line 164 | when /^(Stack now|Entering state|Shifting|Cleanup|Starting)/ then 165 | # do nothing 166 | when /^vtable_/ then 167 | # do nothing 168 | when /Gem::MissingSpecError/ then 169 | # do nothing -- ruby 2.5 is being bitchy? 170 | when /^Reading a token: Next token is token (.*?) \(\)/ then 171 | token = munge $1 172 | next if last_token == token 173 | puts "next token is %p" % [token] 174 | last_token = token 175 | when /^Reading a token: / then 176 | next # skip 177 | when /^Reading a token$/ then # wtf? 178 | next # skip 179 | when /^(?:add_delayed_token|parser_dispatch)/ then # dunno what this is yet 180 | next # skip 181 | when /^read\s+:(\w+)/ then # read :tNL(tNL) nil 182 | token = munge $1 183 | next if last_token == token 184 | puts "next token is %p" % [token] 185 | last_token = token 186 | when /^Next token is token ("[^"]+"|\S+)/ then 187 | token = munge $1 188 | next if last_token == token 189 | puts "next token is %p" % [token] 190 | last_token = token 191 | when /^read\s+false/ then # read false($end) "$end" 192 | puts "next token is EOF" 193 | when /^Now at end of input./ then 194 | # do nothing 195 | when /^.:scan=>\["([^"]+)"/ then 196 | puts "scan = %p" % [$1] 197 | when /^.:getch=>\["([^"]+)/ then 198 | puts "SCAN = %p" % [$1] 199 | when /^Reducing stack by rule (\d+) \(line (\d+)\):/ then 200 | reduce_line = $2.to_i 201 | when /^ \$\d+ = (?:token|nterm) (.+) \(.*\)/ then 202 | item = $1 203 | stack << munge(item) 204 | when /^-> \$\$ = (?:token|nterm) (.+) \(.*\)/ then 205 | stack << "none" if stack.empty? 
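    # an empty stack means the rule had an empty right-hand side, so "none"
    # stands in for it (matching the "/* empty */" rename above)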
206 | item = munge $1 207 | x = stack.compact.map { |s| munge s.strip }.compact.join " " 208 | if x != item then # prevent kdef -> kdef 209 | if $v && reduce_line then 210 | puts "reduce #{x} --> #{item} at #{reduce_line}".squeeze " " 211 | else 212 | puts "reduce #{x} --> #{item}".squeeze " " 213 | end 214 | puts 215 | end 216 | reduce_line = nil 217 | stack.clear 218 | when /^reduce/ then # ruby_parser side 219 | s = munge line.chomp 220 | next if s =~ /reduce\s+(\w+) --> \1/ 221 | puts s 222 | puts 223 | when /^(\w+_stack)\.(\w+)/ then 224 | # TODO: make pretty, but still informative w/ line numbers etc 225 | puts line.gsub("true", "1").gsub("false", "0") 226 | # puts "#{$1}(#{$2})" 227 | when /^(\w+_stack(\(\w+\))?: \S+)/ then 228 | # _data = $v ? line.chomp : $1 229 | # puts line 230 | # TODO: make pretty, but still informative w/ line numbers etc 231 | puts line.gsub("true", "1").gsub("false", "0") 232 | when /^lex_state: :?([\w|()]+) -> :?([\w|]+)(?: (?:at|from) (.*))?/ then 233 | a, b, c = $1.upcase, $2.upcase, $3 234 | a.gsub!(/EXPR_/, "") 235 | b.gsub!(/EXPR_/, "") 236 | if c && $v then 237 | puts "lex_state: #{a} -> #{b} at #{c}" 238 | else 239 | puts "lex_state: #{a} -> #{b}" 240 | end 241 | when /debug|FUCK/ then 242 | puts line.chomp 243 | when /^(#.*parse error|on )/ then 244 | puts line.chomp 245 | when /^(goto|shift| +\[|$)/ then # racc 246 | # do nothing 247 | # when /^Reading a token: Now at end of input./ then 248 | # # puts "EOF" 249 | # when /^Reading a token: Next token is token (.+)/ then 250 | # puts "READ: #{$1.inspect}" 251 | when /^accept/ then 252 | puts "DONE" 253 | else 254 | puts "unparsed: #{line.chomp}" 255 | end 256 | end 257 | -------------------------------------------------------------------------------- /tools/ripper.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby -ws 2 | 3 | $b ||= false # bug mode -- ripper is buggy, use Ripper.sexp 4 | $d ||= false # debug -- turn on yydebug 5 | $p ||= false # Use pp 6 | 7 | require "ripper/sexp" 8 | require "pp" if $p 9 | 10 | if ARGV.empty? then 11 | warn "reading from stdin" 12 | ARGV << "-" 13 | end 14 | 15 | class MySexpBuilder < Ripper::SexpBuilderPP 16 | def on_parse_error msg 17 | Kernel.warn msg 18 | end 19 | end 20 | 21 | ARGV.each do |path| 22 | src = path == "-" ? $stdin.read : File.read(path) 23 | 24 | sexp = nil 25 | 26 | if $b then 27 | sexp = Ripper.sexp src 28 | else 29 | rip = MySexpBuilder.new src 30 | rip.yydebug = $d 31 | sexp = rip.parse 32 | 33 | if rip.error? then 34 | warn "skipping" 35 | next 36 | end 37 | end 38 | 39 | puts "accept" 40 | 41 | if $p then 42 | pp sexp 43 | else 44 | p sexp 45 | end 46 | end 47 | --------------------------------------------------------------------------------