├── .autotest ├── History.rdoc ├── Manifest.txt ├── README.rdoc ├── Rakefile ├── bin ├── ruby_parse └── ruby_parse_extract_error ├── compare └── normalize.rb ├── debugging.md ├── gauntlet.md ├── lib ├── .document ├── rp_extensions.rb ├── rp_stringscanner.rb ├── ruby_lexer.rb ├── ruby_lexer.rex ├── ruby_lexer_strings.rb ├── ruby_parser.rb ├── ruby_parser2.yy ├── ruby_parser3.yy └── ruby_parser_extras.rb ├── test ├── test_ruby_lexer.rb ├── test_ruby_parser.rb └── test_ruby_parser_extras.rb └── tools ├── munge.rb └── ripper.rb /.autotest: -------------------------------------------------------------------------------- 1 | # -*- ruby -*- 2 | 3 | require 'autotest/restart' 4 | # require 'autotest/isolate' 5 | require 'autotest/rcov' if ENV['RCOV'] 6 | 7 | Autotest.add_hook :initialize do |at| 8 | at.extra_files << "../../sexp_processor/dev/lib/pt_testcase.rb" 9 | at.libs << ":../../sexp_processor/dev/lib" 10 | at.add_exception ".diff" 11 | at.add_exception ".rdoc" 12 | at.add_exception ".yml" 13 | at.add_exception ".txt" 14 | at.add_exception ".output" 15 | at.add_exception "bin" 16 | at.add_exception "compare" 17 | at.add_exception "coverage" 18 | at.add_exception "coverage.info" 19 | at.add_exception "misc" 20 | 21 | Dir["lib/ruby??_parser.{rb,y}"].each do |f| 22 | at.add_exception f 23 | end 24 | 25 | Dir["gauntlet*"].each do |f| 26 | at.add_exception f 27 | end 28 | 29 | at.libs << ':../../minitest/dev/lib' 30 | at.testlib = "minitest/autorun" 31 | 32 | at.add_mapping(/^lib\/.*\.y$/) do |f, _| 33 | g = File.basename(f, ".y").gsub("_", "_?").gsub(/2\d/, '') 34 | at.files_matching %r%^test/.*#{g}.rb$% 35 | end 36 | 37 | at.add_mapping(/^lib\/.*\.yy$/) do |f, _| 38 | g = File.basename(f, ".yy").gsub("_", "_?") 39 | at.files_matching %r%^test/.*#{g}.rb$% 40 | end 41 | 42 | at.add_mapping(/^lib\/ruby_lexer\.rex\.rb$/) do |f, _| 43 | at.files_matching %r%^test/.*ruby_lexer\.rb$% 44 | end 45 | 46 | at.add_mapping(/^lib\/.*\.rex$/) do |f, _| 47 | g = File.basename(f, ".rex").gsub("_", "_?") 48 | at.files_matching %r%^test/.*#{g}.rb$% 49 | end 50 | 51 | at.add_mapping(/pt_testcase.rb/) do |f, _| 52 | at.files_matching(/test_.*rb$/) 53 | end 54 | 55 | %w(TestEnvironment TestStackState TestValueExpr).each do |klass| 56 | at.extra_class_map[klass] = "test/test_ruby_parser_extras.rb" 57 | end 58 | 59 | Dir["lib/ruby??_parser.rb"].each do |s| 60 | n = s[/\d+/] 61 | at.extra_class_map["TestRubyParserV#{n}"] = "test/test_ruby_parser.rb" 62 | end 63 | end 64 | 65 | Autotest.add_hook :run_command do |at, _| 66 | system "rake parser lexer DEBUG=1" 67 | end 68 | -------------------------------------------------------------------------------- /Manifest.txt: -------------------------------------------------------------------------------- 1 | .autotest 2 | History.rdoc 3 | Manifest.txt 4 | README.rdoc 5 | Rakefile 6 | bin/ruby_parse 7 | bin/ruby_parse_extract_error 8 | compare/normalize.rb 9 | debugging.md 10 | gauntlet.md 11 | lib/.document 12 | lib/rp_extensions.rb 13 | lib/rp_stringscanner.rb 14 | lib/ruby_lexer.rb 15 | lib/ruby_lexer.rex 16 | lib/ruby_lexer.rex.rb 17 | lib/ruby_lexer_strings.rb 18 | lib/ruby_parser.rb 19 | lib/ruby_parser2.yy 20 | lib/ruby_parser20.rb 21 | lib/ruby_parser21.rb 22 | lib/ruby_parser22.rb 23 | lib/ruby_parser23.rb 24 | lib/ruby_parser24.rb 25 | lib/ruby_parser25.rb 26 | lib/ruby_parser26.rb 27 | lib/ruby_parser27.rb 28 | lib/ruby_parser3.yy 29 | lib/ruby_parser30.rb 30 | lib/ruby_parser31.rb 31 | lib/ruby_parser32.rb 32 | lib/ruby_parser33.rb 33 | lib/ruby_parser34.rb 34 | 
lib/ruby_parser_extras.rb 35 | test/test_ruby_lexer.rb 36 | test/test_ruby_parser.rb 37 | test/test_ruby_parser_extras.rb 38 | tools/munge.rb 39 | tools/ripper.rb 40 | -------------------------------------------------------------------------------- /README.rdoc: -------------------------------------------------------------------------------- 1 | = ruby_parser 2 | 3 | home :: https://github.com/seattlerb/ruby_parser 4 | bugs :: https://github.com/seattlerb/ruby_parser/issues 5 | rdoc :: http://docs.seattlerb.org/ruby_parser 6 | 7 | == DESCRIPTION: 8 | 9 | ruby_parser (RP) is a ruby parser written in pure ruby (utilizing 10 | racc--which does by default use a C extension). It outputs 11 | s-expressions which can be manipulated and converted back to ruby via 12 | the ruby2ruby gem. 13 | 14 | As an example: 15 | 16 | def conditional1 arg1 17 | return 1 if arg1 == 0 18 | return 0 19 | end 20 | 21 | becomes: 22 | 23 | s(:defn, :conditional1, s(:args, :arg1), 24 | s(:if, 25 | s(:call, s(:lvar, :arg1), :==, s(:lit, 0)), 26 | s(:return, s(:lit, 1)), 27 | nil), 28 | s(:return, s(:lit, 0))) 29 | 30 | Tested against 801,039 files from the latest of all rubygems (as of 2013-05): 31 | 32 | * 1.8 parser is at 99.9739% accuracy, 3.651 sigma 33 | * 1.9 parser is at 99.9940% accuracy, 4.013 sigma 34 | * 2.0 parser is at 99.9939% accuracy, 4.008 sigma 35 | * 2.6 parser is at 99.9972% accuracy, 4.191 sigma 36 | * 3.0 parser has a 100% parse rate. 37 | * Tested against 2,672,412 unique ruby files across 167k gems. 38 | * As do all the others now, basically. 39 | 40 | == FEATURES/PROBLEMS: 41 | 42 | * Pure ruby, no compiles. 43 | * Includes preceding comment data for defn/defs/class/module nodes! 44 | * Incredibly simple interface. 45 | * Output is 100% equivalent to ParseTree. 46 | * Can utilize PT's SexpProcessor and UnifiedRuby for language processing. 47 | * Known Issue: Speed is now pretty good, but can always improve: 48 | * RP parses a corpus of 3702 files in 125s (avg 108 Kb/s) 49 | * MRI+PT parsed the same in 67.38s (avg 200.89 Kb/s) 50 | * Known Issue: Code is much better, but still has a long way to go. 51 | * Known Issue: Totally awesome. 52 | * Known Issue: line number values can be slightly off. Parsing LR sucks. 53 | 54 | == SYNOPSIS: 55 | 56 | RubyParser.new.parse "1+1" 57 | # => s(:call, s(:lit, 1), :+, s(:lit, 1)) 58 | 59 | You can also use Ruby19Parser, Ruby18Parser, or RubyParser.for_current_ruby: 60 | 61 | RubyParser.for_current_ruby.parse "1+1" 62 | # => s(:call, s(:lit, 1), :+, s(:lit, 1)) 63 | 64 | == DEVELOPER NOTES: 65 | 66 | To add a new version: 67 | 68 | * New parser should be generated from lib/ruby_parser[23].yy. 69 | * Extend lib/ruby_parser[23].yy with new class name. 70 | * Add new version number to V2/V3 in Rakefile for rule creation. 71 | * Add new `ruby_parse "x.y.z"` line to Rakefile for rake compare (line ~300). 72 | * Require generated parser in lib/ruby_parser.rb. 73 | * Add new V## = ::Ruby##Parser; end to ruby_parser.rb (bottom of file). 74 | * Add empty TestRubyParserShared##Plus module and TestRubyParserV## to test/test_ruby_parser.rb. 75 | * Extend Manifest.txt with generated file names. 76 | * Add new version number to sexp_processor's pt_testcase.rb in all_versions. 77 | 78 | Until all of these are done, you won't have a clean test run. 79 | 80 | == REQUIREMENTS: 81 | 82 | * ruby. woot. 83 | * sexp_processor for Sexp and SexpProcessor classes, and testing. 84 | * racc full package for parser development (compiling .y to .rb). 
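The Sexp objects the parser returns come from sexp_processor. As a minimal sketch of poking at a parse result (only +parse+ from the synopsis above plus plain Sexp accessors provided by sexp_processor are used here):

  require "ruby_parser"

  sexp = RubyParser.new.parse "def add a, b\n  a + b\nend"

  sexp.sexp_type # => :defn
  sexp[1]        # => :add
  sexp.line      # => 1
  sexp.last      # => s(:call, s(:lvar, :a), :+, s(:lvar, :b))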
85 | 86 | == INSTALL: 87 | 88 | * sudo gem install ruby_parser 89 | 90 | == LICENSE: 91 | 92 | (The MIT License) 93 | 94 | Copyright (c) Ryan Davis, seattle.rb 95 | 96 | Permission is hereby granted, free of charge, to any person obtaining 97 | a copy of this software and associated documentation files (the 98 | 'Software'), to deal in the Software without restriction, including 99 | without limitation the rights to use, copy, modify, merge, publish, 100 | distribute, sublicense, and/or sell copies of the Software, and to 101 | permit persons to whom the Software is furnished to do so, subject to 102 | the following conditions: 103 | 104 | The above copyright notice and this permission notice shall be 105 | included in all copies or substantial portions of the Software. 106 | 107 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, 108 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 109 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 110 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 111 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 112 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 113 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 114 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # -*- ruby -*- 2 | 3 | require "hoe" 4 | 5 | Hoe.plugin :seattlerb 6 | Hoe.plugin :racc 7 | Hoe.plugin :isolate 8 | Hoe.plugin :rdoc 9 | 10 | Hoe.add_include_dirs "lib" 11 | Hoe.add_include_dirs "../../sexp_processor/dev/lib" 12 | Hoe.add_include_dirs "../../minitest/dev/lib" 13 | Hoe.add_include_dirs "../../oedipus_lex/dev/lib" 14 | Hoe.add_include_dirs "../../ruby2ruby/dev/lib" 15 | 16 | V2 = %w[20 21 22 23 24 25 26 27] 17 | V3 = %w[30 31 32 33 34] 18 | 19 | VERS = V2 + V3 20 | 21 | ENV["FAST"] = VERS.last if ENV["FAST"] && !VERS.include?(ENV["FAST"]) 22 | VERS.replace [ENV["FAST"]] if ENV["FAST"] 23 | 24 | racc_flags = nil 25 | 26 | Hoe.spec "ruby_parser" do 27 | developer "Ryan Davis", "ryand-ruby@zenspider.com" 28 | 29 | license "MIT" 30 | 31 | dependency "sexp_processor", "~> 4.16" 32 | dependency "racc", "~> 1.5" 33 | dependency "rake", [">= 10", "< 15"], :developer 34 | dependency "oedipus_lex", "~> 2.6", :developer 35 | 36 | require_ruby_version [">= 2.6", "< 4"] 37 | 38 | if plugin? :perforce then # generated files 39 | VERS.each do |n| 40 | self.perforce_ignore << "lib/ruby_parser#{n}.rb" 41 | end 42 | 43 | VERS.each do |n| 44 | self.perforce_ignore << "lib/ruby_parser#{n}.y" 45 | end 46 | 47 | self.perforce_ignore << "lib/ruby_lexer.rex.rb" 48 | end 49 | 50 | if plugin?(:racc) 51 | self.racc_flags << " -t" if ENV["DEBUG"] 52 | self.racc_flags << " --superclass RubyParser::Parser" 53 | racc_flags = self.racc_flags 54 | end 55 | end 56 | 57 | def maybe_add_to_top path, string 58 | file = File.read path 59 | 60 | return if file.start_with? string 61 | 62 | warn "Altering top of #{path}" 63 | tmp_path = "#{path}.tmp" 64 | File.open(tmp_path, "w") do |f| 65 | f.puts string 66 | f.puts 67 | 68 | f.write file 69 | # TODO: make this deal with encoding comments properly? 70 | end 71 | File.rename tmp_path, path 72 | end 73 | 74 | def unifdef? 75 | @unifdef ||= system("which unifdef") or abort <<~EOM 76 | unifdef not found! 77 | 78 | Please install 'unifdef' package on your system or `rake generate` on a mac. 
79 | EOM 80 | end 81 | 82 | def racc? 83 | @racc ||= system("which racc") or abort <<~EOM 84 | racc not found! `gem install racc` 85 | EOM 86 | end 87 | 88 | generate_parser = proc do |t| 89 | unifdef? 90 | racc? 91 | n = t.name[/\d+/] 92 | sh "unifdef -tk -DV=%s %s | racc %s /dev/stdin -o %s" % [n, t.source, racc_flags, t.name] 93 | maybe_add_to_top t.name, "# frozen_string_literal: true" 94 | end 95 | 96 | V2.each do |n| 97 | file "lib/ruby_parser#{n}.rb" => "lib/ruby_parser2.yy", &generate_parser 98 | end 99 | 100 | V3.each do |n| 101 | file "lib/ruby_parser#{n}.rb" => "lib/ruby_parser3.yy", &generate_parser 102 | end 103 | 104 | file "lib/ruby_lexer.rex.rb" => "lib/ruby_lexer.rex" 105 | 106 | task :generate => [:lexer, :parser] 107 | 108 | task :clean do 109 | rm_rf(Dir["**/*~"] + 110 | Dir["diff.diff"] + # not all diffs. bit me too many times 111 | Dir["coverage.info"] + 112 | Dir["coverage"] + 113 | Dir["lib/ruby_parser2*.y"] + 114 | Dir["lib/ruby_parser3*.y"] + 115 | Dir["lib/*.output"]) 116 | end 117 | 118 | task :sort do 119 | sh "grepsort '^ +def' lib/ruby_lexer.rb" 120 | sh "grepsort '^ +def (test|util)' test/test_ruby_lexer.rb" 121 | end 122 | 123 | desc "what was that command again?" 124 | task :huh? do 125 | puts "ruby #{Hoe::RUBY_FLAGS} bin/ruby_parse -q -g ..." 126 | end 127 | 128 | def (task(:phony)).timestamp 129 | Time.at 0 130 | end 131 | 132 | task :isolate => :phony 133 | 134 | def dl v, f 135 | dir = v[/^\d+\.\d+/] 136 | url = "https://cache.ruby-lang.org/pub/ruby/#{dir}/ruby-#{v}.tar.xz" 137 | 138 | warn "Downloading ruby #{v}" 139 | system "curl -s -o #{f} #{url}" 140 | end 141 | 142 | task :parser => :isolate 143 | 144 | multitask :compare_build 145 | task :compare_build => :generate 146 | task :compare => :compare_build 147 | 148 | def ruby_parse version 149 | v = version[/^\d+\.\d+/].delete "." 150 | diff = "compare/diff#{v}.diff" 151 | rp_txt = "compare/rp#{v}.txt" 152 | mri_txt = "compare/mri#{v}.txt" 153 | parse_y = "compare/parse#{v}.y" 154 | tarball = "compare/ruby-#{version}.tar.xz" 155 | ruby_dir = "compare/ruby-#{version}" 156 | rp_out = "lib/ruby_parser#{v}.output" 157 | rp_y_rb = "lib/ruby_parser#{v}.rb" 158 | normalize = "compare/normalize.rb" 159 | 160 | file tarball do 161 | dl version, tarball 162 | end 163 | 164 | desc "fetch all tarballs" 165 | task :fetch => tarball 166 | 167 | file ruby_dir => tarball do 168 | extract_glob = case 169 | when version > "3.3" then 170 | "{id.h,parse.y,tool/{id2token.rb,lrama},defs/id.def}" 171 | when version > "3.2" then 172 | "{id.h,parse.y,tool/id2token.rb,defs/id.def}" 173 | when version > "2.7" then 174 | "{id.h,parse.y,tool/{id2token.rb,lib/vpath.rb}}" 175 | else 176 | "{id.h,parse.y,tool/{id2token.rb,vpath.rb}}" 177 | end 178 | system "tar xf #{tarball} -C compare #{File.basename ruby_dir}/#{extract_glob}" 179 | end 180 | 181 | file parse_y => ruby_dir do 182 | # env -u RUBYOPT rake compare/parse33.y 183 | warn "Warning: RUBYOPT is set! Use 'env -u RUBYOPT rake'" if ENV["RUBYOPT"] 184 | 185 | # Debugging a new parse build system: 186 | # 187 | # Unpack the ruby tarball in question, configure, and run the following: 188 | # 189 | # % [ -e Makefile ] || ./configure ; make -n -W parse.y parse.c 190 | # ... 
191 | # echo generating parse.c 192 | # ruby --disable=gems ./tool/id2token.rb parse.y | \ 193 | # ruby ./tool/lrama/exe/lrama -oparse.c -Hparse.h - parse.y 194 | # 195 | # Then integrate these commands into the mess below: 196 | 197 | d = ruby_dir 198 | cmd = if version > "3.2" then 199 | "ruby #{d}/tool/id2token.rb #{d}/parse.y | expand > #{parse_y}" 200 | else 201 | "ruby #{d}/tool/id2token.rb --path-separator=.:./ #{d}/id.h #{d}/parse.y | expand | ruby -pe 'gsub(/^%pure-parser/, \"%define api.pure\")' > #{parse_y}" 202 | end 203 | 204 | sh cmd 205 | end 206 | 207 | bison = Dir["/opt/homebrew/opt/bison/bin/bison", 208 | "/usr/local/opt/bison/bin/bison", 209 | `which bison`.chomp, 210 | ].first 211 | 212 | file mri_txt => [parse_y, normalize] do 213 | d = ruby_dir 214 | if version > "3.3" then 215 | sh "./#{d}/tool/lrama/exe/lrama -r states --report-file=compare/parse#{v}.output -ocompare/parse#{v}.tab.c #{parse_y}" 216 | else 217 | sh "#{bison} -r all #{parse_y}" 218 | mv Dir["parse#{v}.*"], "compare" 219 | end 220 | 221 | sh "#{normalize} compare/parse#{v}.output > #{mri_txt}" 222 | rm ["compare/parse#{v}.output", "compare/parse#{v}.tab.c"] 223 | end 224 | 225 | file rp_out => rp_y_rb 226 | 227 | file rp_txt => [rp_out, normalize] do 228 | sh "#{normalize} #{rp_out} > #{rp_txt}" 229 | end 230 | 231 | compare = "compare#{v}" 232 | compare_build = "compare_build#{v}" 233 | 234 | desc "Compare all grammars to MRI" 235 | task :compare => compare 236 | task :compare_build => compare_build 237 | 238 | task compare_build => diff 239 | 240 | file diff => [mri_txt, rp_txt] do 241 | sh "diff -du #{mri_txt} #{rp_txt} > #{diff}; true" 242 | end 243 | 244 | desc "Compare #{v} grammar to MRI #{version}" 245 | task compare => diff do 246 | system "wc -l #{diff}" 247 | end 248 | 249 | task :clean do 250 | rm_f Dir[mri_txt, rp_txt, ruby_dir] 251 | end 252 | 253 | task :realclean do 254 | rm_f Dir[parse_y, tarball] 255 | end 256 | end 257 | 258 | task :versions do 259 | require "open-uri" 260 | require "net/http" # avoid require issues in threads 261 | require "net/https" 262 | 263 | versions = VERS.map { |s| s.split(//).join "." 
} 264 | 265 | base_url = "https://cache.ruby-lang.org/pub/ruby" 266 | 267 | class Array 268 | def human_sort 269 | sort_by { |item| item.to_s.split(/(\d+)/).map { |e| [e.to_i, e] } } 270 | end 271 | end 272 | 273 | versions = versions.map { |ver| 274 | Thread.new { 275 | URI 276 | .parse("#{base_url}/#{ver}/") 277 | .read 278 | .scan(/ruby-\d+\.\d+\.\d+[-\w.]*?.tar.gz/) 279 | .reject { |s| s =~ /-(?:rc|preview)\d/ } 280 | .human_sort 281 | .last 282 | .delete_prefix("ruby-") 283 | .delete_suffix ".tar.gz" 284 | } 285 | }.map(&:value).sort 286 | 287 | puts versions.map { |v| "ruby_parse %p" % [v] } 288 | end 289 | 290 | ruby_parse "2.0.0-p648" 291 | ruby_parse "2.1.10" 292 | ruby_parse "2.2.10" 293 | ruby_parse "2.3.8" 294 | ruby_parse "2.4.10" 295 | ruby_parse "2.5.9" 296 | ruby_parse "2.6.10" 297 | ruby_parse "2.7.8" 298 | ruby_parse "3.0.6" 299 | ruby_parse "3.1.7" 300 | ruby_parse "3.2.8" 301 | ruby_parse "3.3.7" 302 | ruby_parse "3.4.2" 303 | 304 | task :debug => :isolate do 305 | ENV["V"] ||= VERS.last 306 | Rake.application[:parser].invoke # this way we can have DEBUG set 307 | Rake.application[:lexer].invoke # this way we can have DEBUG set 308 | 309 | $:.unshift "lib" 310 | require "ruby_parser" 311 | require "pp" 312 | 313 | klass = Object.const_get("Ruby#{ENV["V"]}Parser") rescue nil 314 | raise "Unsupported version #{ENV["V"]}" unless klass 315 | parser = klass.new 316 | 317 | time = (ENV["RP_TIMEOUT"] || 10).to_i 318 | 319 | n = ENV["BUG"] 320 | file = (n && "bug#{n}.rb") || ENV["F"] || ENV["FILE"] || "debug.rb" 321 | ruby = ENV["R"] || ENV["RUBY"] 322 | 323 | if ruby then 324 | file = "env" 325 | else 326 | ruby = File.read file 327 | end 328 | 329 | 330 | begin 331 | pp parser.process(ruby, file, time) 332 | rescue ArgumentError, Racc::ParseError => e 333 | p e 334 | puts e.backtrace.join "\n " 335 | ss = parser.lexer.ss 336 | src = ss.string 337 | lines = src[0..ss.pos].split(/\n/) 338 | abort "on #{file}:#{lines.size}" 339 | end 340 | end 341 | 342 | task :debug3 do 343 | file = ENV["F"] || "debug.rb" 344 | version = ENV["V"] || "" 345 | verbose = ENV["VERBOSE"] ? 
"-v" : "" 346 | munge = "./tools/munge.rb #{verbose}" 347 | 348 | abort "Need a file to parse, via: F=path.rb" unless file 349 | 350 | ENV.delete "V" 351 | 352 | ruby = "ruby#{version}" 353 | 354 | sh "#{ruby} -v" 355 | sh "#{ruby} -y #{file} 2>&1 | #{munge} > tmp/ruby" 356 | sh "#{ruby} ./tools/ripper.rb -d #{file} | #{munge} > tmp/rip" 357 | sh "rake debug F=#{file} DEBUG=1 2>&1 | #{munge} > tmp/rp" 358 | sh "diff -U 999 -d tmp/{ruby,rp}" 359 | end 360 | 361 | task :cmp do 362 | sh %(emacsclient --eval '(ediff-files "tmp/ruby" "tmp/rp")') 363 | end 364 | 365 | task :cmp3 do 366 | sh %(emacsclient --eval '(ediff-files3 "tmp/ruby" "tmp/rip" "tmp/rp")') 367 | end 368 | 369 | task :extract => :isolate do 370 | ENV["V"] ||= VERS.last 371 | Rake.application[:parser].invoke # this way we can have DEBUG set 372 | 373 | file = ENV["F"] || ENV["FILE"] || abort("Need to provide F=") 374 | 375 | ruby "-Ilib", "bin/ruby_parse_extract_error", file 376 | end 377 | 378 | task :parse => :isolate do 379 | ENV["V"] ||= VERS.last 380 | Rake.application[:parser].invoke # this way we can have DEBUG set 381 | 382 | file = ENV["F"] || ENV["FILE"] || abort("Need to provide F=") 383 | 384 | ruby "-Ilib", "bin/ruby_parse", file 385 | end 386 | 387 | task :bugs do 388 | sh "for f in bug*.rb bad*.rb ; do #{Gem.ruby} -S rake debug F=$f && rm $f ; done" 389 | end 390 | 391 | # vim: syntax=Ruby 392 | -------------------------------------------------------------------------------- /bin/ruby_parse: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby -s 2 | 3 | $q ||= false 4 | $g ||= false 5 | 6 | require 'rubygems' 7 | require 'ruby_parser' 8 | require 'pp' 9 | 10 | good = bad = 0 11 | 12 | multi = ARGV.size != 1 13 | total_time = 0 14 | total_loc = 0 15 | total_kbytes = 0 16 | times = {} 17 | locs = {} 18 | kbytes = {} 19 | 20 | begin 21 | time = (ENV["RP_TIMEOUT"] || 10).to_i 22 | 23 | ARGV.each do |file| 24 | rp = RubyParser.new 25 | loc = `wc -l #{file}`.strip.to_i 26 | size = `wc -c #{file}`.strip.to_i / 1024.0 27 | locs[file] = loc 28 | kbytes[file] = size 29 | total_loc += loc 30 | total_kbytes += size 31 | if $q then 32 | $stderr.print "." 
33 | else 34 | warn "# file = #{file} loc = #{loc}" 35 | end 36 | GC.start if $g 37 | 38 | t = Time.now 39 | begin 40 | begin 41 | rp.reset 42 | r = rp.process(File.binread(file), file, time) 43 | pp r unless $q 44 | good += 1 45 | rescue SyntaxError => e 46 | warn "SyntaxError for #{file}: #{e.message}" 47 | bad += 1 48 | end 49 | rescue => e 50 | warn "#{e.backtrace.first} #{e.inspect.gsub(/\n/, ' ')} for #{file}" 51 | warn " #{e.backtrace.join("\n ")}" 52 | bad += 1 53 | end 54 | 55 | t = Time.now - t 56 | times[file] = t 57 | total_time += t 58 | end 59 | rescue Interrupt 60 | # do nothing 61 | end 62 | 63 | warn "done" 64 | 65 | total = 0 66 | times.values.each do |t| 67 | total += t 68 | end 69 | 70 | puts 71 | puts "good = #{good} bad = #{bad}" if multi 72 | puts 73 | 74 | format = "%5.2fs:%9.2f l/s:%8.2f Kb/s:%5d Kb:%5d loc:%s" 75 | 76 | times.sort_by { |f, t| -t }.each do |f, t| 77 | next if t < 0.005 78 | loc = locs[f] 79 | size = kbytes[f] 80 | puts format % [t, loc / t, size / t, size, loc, f] 81 | end 82 | 83 | puts 84 | 85 | puts format % [total_time, 86 | total_loc / total_time, 87 | total_kbytes / total_time, 88 | total_kbytes, 89 | total_loc, 90 | "TOTAL"] unless total_time == 0 91 | -------------------------------------------------------------------------------- /bin/ruby_parse_extract_error: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby -ws 2 | 3 | $d ||= ENV["DELETE"] || false 4 | $t ||= ENV["DELETE_TIMEOUT"] || false 5 | $m ||= ENV["MOVE_TIMEOUT"] || false 6 | $q ||= ENV["QUIET"] || false 7 | $s ||= ENV["SPEED"] || false 8 | 9 | require 'rubygems' 10 | require 'ruby_parser' 11 | require 'fileutils' 12 | 13 | ARGV.push "-" if ARGV.empty? 14 | 15 | class RubyParser 16 | def extract_defs 17 | ss = current.lexer.ss 18 | 19 | raise "can't access source. possible encoding issue" unless ss 20 | 21 | src = ss.string 22 | pre_error = src[0...ss.pos] 23 | 24 | defs = pre_error.lines.grep(/^ *(?:def|it)/) 25 | 26 | raise "can't figure out where the bad code starts" unless defs.last 27 | 28 | last_def_indent = defs.last[/^ */] 29 | 30 | post_error = src[ss.pos..-1] 31 | idx = post_error =~ /^#{last_def_indent}end.*/ 32 | 33 | raise "can't figure out where the bad code ends" unless idx 34 | 35 | src = pre_error + post_error[0..idx+$&.length] 36 | 37 | src.scan(/^(( *)(?:def|it) .*?^\2end)/m) 38 | end 39 | 40 | def retest_for_errors defs 41 | parser = self.class.new 42 | 43 | parser.process(defs.join("\n\n")) 44 | rescue SyntaxError, StandardError 45 | nil 46 | end 47 | end 48 | 49 | def expand path 50 | if File.directory? path then 51 | require 'find' 52 | 53 | files = [] 54 | 55 | Find.find(*Dir[path]) do |f| 56 | files << f if File.file? f 57 | end 58 | 59 | files.sort 60 | else 61 | Dir.glob path 62 | end 63 | end 64 | 65 | def process_error parser 66 | defs = parser.extract_defs 67 | 68 | if parser.retest_for_errors defs then 69 | warn "Can't reproduce error with just methods, punting..." 70 | return 71 | end 72 | 73 | catch :extract_done do 74 | (1..defs.size).each do |perm_size| 75 | defs.combination(perm_size).each do |trial| 76 | unless parser.retest_for_errors trial then 77 | puts trial.join "\n" 78 | throw :extract_done 79 | end 80 | end 81 | end 82 | end 83 | rescue RuntimeError, Racc::ParseError => e 84 | warn "# process error: #{e.message.strip}" 85 | warn "# #{e.backtrace.first}" 86 | end 87 | 88 | def process file 89 | ruby = file == "-" ? 
$stdin.binread : File.binread(file) 90 | time = (ENV["RP_TIMEOUT"] || 10).to_i 91 | 92 | $stderr.print "# Validating #{file}: " 93 | parser = RubyParser.new 94 | t0 = Time.now if $s 95 | parser.process(ruby, file, time) 96 | if $s then 97 | warn "good: #{Time.now - t0}" 98 | else 99 | warn "good" 100 | end 101 | File.unlink file if $d 102 | rescue Timeout::Error 103 | $exit = 1 104 | warn "TIMEOUT parsing #{file}. Skipping." 105 | 106 | if $m then 107 | base_dir, *rest = file.split("/") 108 | base_dir.sub!(/\.slow\.?.*/, "") 109 | base_dir += ".slow.#{time}" 110 | 111 | new_file = File.join(base_dir, *rest) 112 | 113 | FileUtils.mkdir_p File.dirname(new_file) 114 | FileUtils.move file, new_file, verbose:true 115 | elsif $t then 116 | File.unlink file 117 | end 118 | rescue StandardError, SyntaxError, Racc::ParseError => e 119 | $exit = 1 120 | warn "" 121 | warn "# error: #{e.message.strip}" unless $q 122 | warn "# #{e.backtrace.first}" 123 | warn "" 124 | return if $q 125 | 126 | process_error parser 127 | end 128 | 129 | $exit = 0 130 | $stdout.sync = true 131 | 132 | ARGV.each do |path| 133 | expand(path).each do |file| 134 | next unless File.file? file # omg... why would you name a dir support.rb? 135 | process file 136 | end 137 | end 138 | 139 | exit $exit 140 | -------------------------------------------------------------------------------- /compare/normalize.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby -w 2 | 3 | good = false 4 | 5 | rules = Hash.new { |h,k| h[k] = [] } 6 | rule = nil 7 | order = [] 8 | 9 | def munge s 10 | renames = [ 11 | # unquote... wtf? 12 | /`(.+?)'/, proc { $1 }, 13 | /"'(.+?)'"/, proc { "\"#{$1}\"" }, 14 | 15 | "'='", "tEQL", 16 | "'!'", "tBANG", 17 | "'%'", "tPERCENT", 18 | "'&'", "tAMPER2", 19 | "'('", "tLPAREN2", 20 | "')'", "tRPAREN", 21 | "'*'", "tSTAR2", 22 | "'+'", "tPLUS", 23 | "','", "tCOMMA", 24 | "'-'", "tMINUS", 25 | "'.'", "tDOT", 26 | "'/'", "tDIVIDE", 27 | "';'", "tSEMI", 28 | "':'", "tCOLON", 29 | "'<'", "tLT", 30 | "'>'", "tGT", 31 | "'?'", "tEH", 32 | "'['", "tLBRACK", 33 | "'\\n'", "tNL", 34 | "']'", "tRBRACK", 35 | "'^'", "tCARET", 36 | "'`'", "tBACK_REF2", 37 | "'{'", "tLCURLY", 38 | "'|'", "tPIPE", 39 | "'}'", "tRCURLY", 40 | "'~'", "tTILDE", 41 | '"["', "tLBRACK", 42 | 43 | # 2.0 changes? 
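    # newer MRI grammar listings show operator tokens by their quoted
    # aliases ("<=>", "==", ...); map them back to ruby_parser's t* names
    # so the two .output files diff cleanly.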
44 | '"<=>"', "tCMP", 45 | '"=="', "tEQ", 46 | '"==="', "tEQQ", 47 | '"!~"', "tNMATCH", 48 | '"=~"', "tMATCH", 49 | '">="', "tGEQ", 50 | '"<="', "tLEQ", 51 | '"!="', "tNEQ", 52 | '"<<"', "tLSHFT", 53 | '">>"', "tRSHFT", 54 | '"*"', "tSTAR", 55 | 56 | '".."', "tDOT2", 57 | 58 | '"&"', "tAMPER", 59 | '"&&"', "tANDOP", 60 | '"&."', "tLONELY", 61 | '"||"', "tOROP", 62 | 63 | '"..."', "tDOT3", 64 | '"**"', "tPOW", 65 | '"unary+"', "tUPLUS", 66 | '"unary-"', "tUMINUS", 67 | '"[]"', "tAREF", 68 | '"[]="', "tASET", 69 | '"::"', "tCOLON2", 70 | '"{ arg"', "tLBRACE_ARG", 71 | '"( arg"', "tLPAREN_ARG", 72 | '"("', "tLPAREN", 73 | 'rparen', "tRPAREN", 74 | '"{"', "tLBRACE", 75 | '"=>"', "tASSOC", 76 | '"->"', "tLAMBDA", 77 | '":: at EXPR_BEG"', "tCOLON3", 78 | '"**arg"', "tDSTAR", 79 | '","', "tCOMMA", 80 | 81 | # other 82 | 83 | 'tLBRACK2', "tLBRACK", # HACK 84 | 85 | "' '", "tSPACE", # needs to be later to avoid bad hits 86 | 87 | "ε", "none", # bison 3+ 88 | "%empty", "none", # newer bison 89 | "/* empty */", "none", 90 | /^\s*$/, "none", 91 | 92 | "keyword_BEGIN", "klBEGIN", 93 | "keyword_END", "klEND", 94 | /keyword_(\w+)/, proc { "k#{$1.upcase}" }, 95 | /\bk_([a-z_]+)/, proc { "k#{$1.upcase}" }, 96 | /modifier_(\w+)/, proc { "k#{$1.upcase}_MOD" }, 97 | "kVARIABLE", "keyword_variable", # ugh 98 | 99 | # 2.6 collapses klBEGIN to kBEGIN 100 | "klBEGIN", "kBEGIN", 101 | "klEND", "kEND", 102 | 103 | /keyword_(\w+)/, proc { "k#{$1.upcase}" }, 104 | /\bk_([^_][a-z_]+)/, proc { "k#{$1.upcase}" }, 105 | /modifier_(\w+)/, proc { "k#{$1.upcase}_MOD" }, 106 | 107 | "kVARIABLE", "keyword_variable", # ugh: this is a rule name 108 | 109 | # 2.7 changes: 110 | 111 | '"global variable"', "tGVAR", 112 | '"operator-assignment"', "tOP_ASGN", 113 | '"back reference"', "tBACK_REF", 114 | '"numbered reference"', "tNTH_REF", 115 | '"local variable or method"', "tIDENTIFIER", 116 | '"constant"', "tCONSTANT", 117 | 118 | '"(.."', "tBDOT2", 119 | '"(..."', "tBDOT3", 120 | '"char literal"', "tCHAR", 121 | '"literal content"', "tSTRING_CONTENT", 122 | '"string literal"', "tSTRING_BEG", 123 | '"symbol literal"', "tSYMBEG", 124 | '"backtick literal"', "tXSTRING_BEG", 125 | '"regexp literal"', "tREGEXP_BEG", 126 | '"word list"', "tWORDS_BEG", 127 | '"verbatim word list"', "tQWORDS_BEG", 128 | '"symbol list"', "tSYMBOLS_BEG", 129 | '"verbatim symbol list"', "tQSYMBOLS_BEG", 130 | 131 | '"float literal"', "tFLOAT", 132 | '"imaginary literal"', "tIMAGINARY", 133 | '"integer literal"', "tINTEGER", 134 | '"rational literal"', "tRATIONAL", 135 | 136 | '"instance variable"', "tIVAR", 137 | '"class variable"', "tCVAR", 138 | '"terminator"', "tSTRING_END", # TODO: switch this? 
139 | '"method"', "tFID", 140 | '"}"', "tSTRING_DEND", 141 | 142 | '"do for block"', "kDO_BLOCK", 143 | '"do for condition"', "kDO_COND", 144 | '"do for lambda"', "kDO_LAMBDA", 145 | "tLABEL", "kLABEL", 146 | 147 | # UGH 148 | "k_LINE__", "k__LINE__", 149 | "k_FILE__", "k__FILE__", 150 | "k_ENCODING__", "k__ENCODING__", 151 | 152 | '"defined?"', "kDEFINED", 153 | 154 | '"do (for condition)"', "kDO_COND", 155 | '"do (for lambda)"', "kDO_LAMBDA", 156 | %("'do' for block"), "kDO_BLOCK", # 3.4 157 | %("'do' for lambda"), "kDO_LAMBDA", # 3.4 158 | %("'do' for condition"),"kDO_COND", # 3.4 159 | %q("#{"), "tSTRING_DBEG", # 3.4 160 | '"do (for block)"', "kDO_BLOCK", # 3.4 161 | 162 | /\"'(\w+)' \(?modifier\)?\"/, proc { |x| "k#{$1.upcase}_MOD" }, # 3.4 163 | /\"(\w+) \(?modifier\)?\"/, proc { |x| "k#{$1.upcase}_MOD" }, 164 | /\"((?!k)\w+)\"/, proc { |x| "k#{$1.upcase}" }, 165 | 166 | /\$?@(\d+)(\s+|$)/, "", # newer bison 167 | 168 | # 3.4(ish?) changes: 169 | "option_tNL", "opt_nl", # ruby 3.4 170 | 171 | # TODO: remove for 3.0 work: 172 | "lex_ctxt ", "" # 3.0 production that's mostly noise right now 173 | ] 174 | 175 | renames.each_slice(2) do |(a, b)| 176 | if Proc === b then 177 | s.gsub!(a, &b) 178 | else 179 | s.gsub!(a, b) 180 | end 181 | end 182 | 183 | s.strip 184 | end 185 | 186 | ARGF.each_line do |line| 187 | next unless good or line =~ /^-* ?Grammar|\$accept : / 188 | 189 | case line.strip # TODO: .delete %q["'()] 190 | when /^$/ then 191 | when /^(\d+) (\$?[@\w]+): (.*)/ then # yacc 192 | rule = $2 193 | order << rule unless rules.has_key? rule 194 | rules[rule] << munge($3) 195 | when /^(\d+) (\$?[@\w]+'(?: |\\n)'): (.*)/ then # munges both sides 196 | rule = $2 197 | order << rule unless rules.has_key? rule 198 | rules[munge(rule)] << munge($3) 199 | when /^(\d+) \s+\| (.*)/ then # yacc 200 | rules[rule] << munge($2) 201 | when /^(\d+) (@\d+): (.*)/ then # yacc 202 | rule = $2 203 | order << rule unless rules.has_key? rule 204 | rules[rule] << munge($3) 205 | when /^rule (\d+) (@?\w+):(.*)/ then # racc 206 | rule = $2 207 | order << rule unless rules.has_key? rule 208 | rules[rule] << munge($3) 209 | when /\$accept/ then # byacc? 210 | good = true 211 | when /Grammar/ then # both 212 | good = true 213 | when /^-+ Symbols/ then # racc 214 | break 215 | when /^Terminals/ then # yacc 216 | break 217 | when /^State \d/ then # lrama 218 | break 219 | when /^\cL/ then # byacc 220 | break 221 | else 222 | warn "unparsed: #{$.}: #{line.strip.inspect}" 223 | end 224 | end 225 | 226 | require 'yaml' 227 | 228 | order.each do |k| 229 | next if k =~ /@/ 230 | puts 231 | puts "#{k}:" 232 | puts rules[k].map { |r| " #{r}" }.join "\n" 233 | end 234 | -------------------------------------------------------------------------------- /debugging.md: -------------------------------------------------------------------------------- 1 | # Quick Notes to Help with Debugging 2 | 3 | ## Reducing 4 | 5 | One of the most important steps is reducing the code sample to a 6 | minimal reproduction. For example, one thing I'm debugging right now 7 | was reported as: 8 | 9 | ```ruby 10 | a, b, c, d, e, f, g, h, i, j = 1, *[p1, p2, p3], *[p1, p2, p3], *[p4, p5, p6] 11 | ``` 12 | 13 | This original sample has 10 items on the left-hand-side (LHS) and 1 + 14 | 3 groups of 3 (calls) on the RHS + 3 arrays + 3 splats. That's a lot. 15 | 16 | It's already been reported (perhaps incorrectly) that this has to do 17 | with multiple splats on the RHS, so let's focus on that. 
At a minimum 18 | the code can be reduced to 2 splats on the RHS and some 19 | experimentation shows that it needs a non-splat item to fail: 20 | 21 | ``` 22 | _, _, _ = 1, *[2], *[3] 23 | ``` 24 | 25 | and some intuition further removed the arrays: 26 | 27 | ``` 28 | _, _, _ = 1, *2, *3 29 | ``` 30 | 31 | the difference is huge and will make a ton of difference when 32 | debugging. 33 | 34 | ## Getting something to compare 35 | 36 | ``` 37 | % rake debug3 F=file.rb 38 | ``` 39 | 40 | TODO 41 | 42 | ## Comparing against ruby / ripper: 43 | 44 | ``` 45 | % rake cmp3 F=file.rb 46 | ``` 47 | 48 | This compiles the parser & lexer and then parses file.rb using both 49 | ruby, ripper, and ruby_parser in debug modes. The output is munged to 50 | be as uniform as possible and diffable. I'm using emacs' 51 | `ediff-files3` to compare these files (via `rake cmp3`) all at once, 52 | but regular `diff -u tmp/{ruby,rp}` will suffice for most tasks. 53 | 54 | From there? Good luck. I'm currently trying to backtrack from rule 55 | reductions to state change differences. I'd like to figure out a way 56 | to go from this sort of diff to a reasonable test that checks state 57 | changes but I don't have that set up at this point. 58 | 59 | ## Adding New Grammar Productions 60 | 61 | Ruby adds stuff to the parser ALL THE TIME. It's actually hard to keep 62 | up with, but I've added some tools and shown what a typical workflow 63 | looks like. Let's say you want to add ruby 2.7's "beginless range" (eg 64 | `..42`). 65 | 66 | Whenever there's a language feature missing, I start with comparing 67 | the parse trees between MRI and RP: 68 | 69 | ### Structural Comparing 70 | 71 | There's a bunch of rake tasks `compare27`, `compare26`, etc that try 72 | to normalize and diff MRI's parse.y parse tree (just the structure of 73 | the tree in yacc) to ruby\_parser's parse tree (racc). It's the first 74 | thing I do when I'm adding a new version. Stub out all the version 75 | differences, and then start to diff the structure and move 76 | ruby\_parser towards the new changes. 77 | 78 | Some differences are just gonna be there... but here's an example of a 79 | real diff between MRI 2.7 and ruby_parser as of today: 80 | 81 | ```diff 82 | arg tDOT3 arg 83 | arg tDOT2 84 | arg tDOT3 85 | - tBDOT2 arg 86 | - tBDOT3 arg 87 | arg tPLUS arg 88 | arg tMINUS arg 89 | arg tSTAR2 arg 90 | ``` 91 | 92 | This is a new language feature that ruby_parser doesn't handle yet. 93 | It's in MRI (the left hand side of the diff) but not ruby\_parser (the 94 | right hand side) so it is a `-` or missing line. 95 | 96 | Some other diffs will have both `+` and `-` lines. That usually 97 | happens when MRI has been refactoring the grammar. Sometimes I choose 98 | to adapt those refactorings and sometimes it starts to get too 99 | difficult to maintain multiple versions of ruby parsing in a single 100 | file. 101 | 102 | But! This structural comparing is always a place you should look when 103 | ruby_parser is failing to parse something. Maybe it just hasn't been 104 | implemented yet and the easiest place to look is the diff. 105 | 106 | ### Starting Test First 107 | 108 | The next thing I do is to add a parser test to cover that feature. I 109 | usually start with the parser and work backwards towards the lexer as 110 | needed, as I find it structures things properly and keeps things goal 111 | oriented. 112 | 113 | So, make a new parser test, usually in the versioned section of the 114 | parser tests. 
115 | 116 | ``` 117 | def test_beginless2 118 | rb = "..10\n; ..a\n; c" 119 | pt = s(:block, 120 | s(:dot2, nil, s(:lit, 0).line(1)).line(1), 121 | s(:dot2, nil, s(:call, nil, :a).line(2)).line(2), 122 | s(:call, nil, :c).line(3)).line(1) 123 | 124 | assert_parse_line rb, pt, 1 125 | 126 | flunk "not done yet" 127 | end 128 | ``` 129 | 130 | (In this case copied and modified the tests for open ranges from 2.6) 131 | and run it to get my first error: 132 | 133 | ``` 134 | % rake N=/beginless/ 135 | 136 | ... 137 | 138 | E 139 | 140 | Finished in 0.021814s, 45.8421 runs/s, 0.0000 assertions/s. 141 | 142 | 1) Error: 143 | TestRubyParserV27#test_whatevs: 144 | Racc::ParseError: (string):1 :: parse error on value ".." (tDOT2) 145 | GEMS/2.7.0/gems/racc-1.5.0/lib/racc/parser.rb:538:in `on_error' 146 | WORK/ruby_parser/dev/lib/ruby_parser_extras.rb:1304:in `on_error' 147 | (eval):3:in `_racc_do_parse_c' 148 | (eval):3:in `do_parse' 149 | WORK/ruby_parser/dev/lib/ruby_parser_extras.rb:1329:in `block in process' 150 | RUBY/lib/ruby/2.7.0/timeout.rb:95:in `block in timeout' 151 | RUBY/lib/ruby/2.7.0/timeout.rb:33:in `block in catch' 152 | RUBY/lib/ruby/2.7.0/timeout.rb:33:in `catch' 153 | RUBY/lib/ruby/2.7.0/timeout.rb:33:in `catch' 154 | RUBY/lib/ruby/2.7.0/timeout.rb:110:in `timeout' 155 | WORK/ruby_parser/dev/lib/ruby_parser_extras.rb:1317:in `process' 156 | WORK/ruby_parser/dev/test/test_ruby_parser.rb:4198:in `assert_parse' 157 | WORK/ruby_parser/dev/test/test_ruby_parser.rb:4221:in `assert_parse_line' 158 | WORK/ruby_parser/dev/test/test_ruby_parser.rb:4451:in `test_whatevs' 159 | ``` 160 | 161 | For starters, we know the missing production is for `tBDOT2 arg`. It 162 | is currently blowing up because it is getting `tDOT2` and simply 163 | doesn't know what to do with it, so it raises the error. As the diff 164 | suggests, that's the wrong token to begin with, so it is probably time 165 | to also create a lexer test: 166 | 167 | ``` 168 | def test_yylex_bdot2 169 | assert_lex3("..42", 170 | s(:dot2, nil, s(:lit, 42)), 171 | 172 | :tBDOT2, "..", EXPR_BEG, 173 | :tINTEGER, "42", EXPR_NUM) 174 | 175 | flunk "not done yet" 176 | end 177 | ``` 178 | 179 | This one is mostly speculative at this point. It says "if we're lexing 180 | this string, we should get this sexp if we fully parse it, and the 181 | lexical stream should look like this"... That last bit is mostly made 182 | up at this point. Sometimes I don't know exactly what expression state 183 | things should be in until I start really digging in. 184 | 185 | At this point, I have 2 failing tests that are directing me in the 186 | right direction. It's now a matter of digging through 187 | `compare/parse26.y` to see how the lexer differs and implementing 188 | it... 189 | 190 | But this is a good start to the doco for now. I'll add more later. 191 | -------------------------------------------------------------------------------- /gauntlet.md: -------------------------------------------------------------------------------- 1 | # Running the Gauntlet 2 | 3 | ## Maintaining a Gem Mirror 4 | 5 | I use rubygems-mirror to keep an archive of all the latest rubygems on 6 | an external disk. 
Here is the config: 7 | 8 | ``` 9 | --- 10 | - from: https://rubygems.org 11 | to: /Volumes/StuffA/gauntlet/mirror 12 | parallelism: 10 13 | retries: 3 14 | delete: true 15 | skiperror: true 16 | hashdir: true 17 | ``` 18 | 19 | And I update using rake: 20 | 21 | ``` 22 | % cd GIT/rubygems/rubygems-mirror 23 | % git down 24 | % rake mirror:latest 25 | % /Volumes/StuffA/gauntlet/bin/cleanup.rb -y -v 26 | ``` 27 | 28 | This rather quickly updates my mirror to the latest versions of 29 | everything and then deletes all old versions. I then run a cleanup 30 | script that fixes the file dates to their publication date and deletes 31 | any gems that have invalid specs. This can argue with the mirror a 32 | bit, but it is pretty minimal (currently ~20 bad gems). 33 | 34 | ## Curating an Archive of Ruby Files 35 | 36 | Next, I process the gem mirror into a much more digestable structure 37 | using `unpack_gems.rb`. 38 | 39 | ``` 40 | % cd RP/gauntlet 41 | % time caffeinate /Volumes/StuffA/gauntlet/bin/unpack_gems.rb -v [-a] ; say done 42 | ... waaaait ... 43 | % DIR=gauntlet.$(today).(all|new).noindex 44 | % mv hashed.noindex $DIR 45 | % tar vc -T <(fd -tf . $DIR | sort) | zstd -5 -T0 --long > archives/$DIR.tar.zst ; say done 46 | % ./bin/sync.sh 47 | ``` 48 | 49 | This script filters all the newer (< 1 year old) gems (unless `-a` is 50 | used), unpacks them, finds all the files that look like they're valid 51 | ruby, ensures they're valid ruby (using the current version of ruby to 52 | compile them), and then moves them into a SHA dir structure that looks 53 | something like this: 54 | 55 | ``` 56 | hashed.noindex/a/b/c/.rb 57 | ``` 58 | 59 | This removes all duplicates and puts everything in a fairly even, 60 | wide, flat directory layout. 61 | 62 | This process takes a very long time, even with a lot of 63 | parallelization. There are currently about 160k gems in the mirror. 64 | Unpacking, validating, SHA'ing everything is disk and CPU intensive. 65 | The `.noindex` extension stops spotlight from indexing the continous 66 | churn of files being unpacked and moved and saves time. 67 | 68 | Finally, I rename and archive it all up (currently using zstd to 69 | compress). 70 | 71 | ### Stats 72 | 73 | ``` 74 | 9696 % find gauntlet.$(today).noindex -type f | lc 75 | 561270 76 | 3.5G gauntlet.2021-08-06.noindex 77 | 239M gauntlet.2021-08-06.noindex.tar.zst 78 | ``` 79 | 80 | So I wind up with a little over half a million unique ruby files to 81 | parse. It's about 3.5g but compresses very nicely down to 240m 82 | 83 | ## Running the Gauntlet 84 | 85 | Assuming you're starting from scratch, unpack the archive once: 86 | 87 | ``` 88 | % zstdcat gauntlet.$(today).noindex.tar.zst | tar x 89 | ``` 90 | 91 | Then, either run a single process (easier to read): 92 | 93 | ``` 94 | % ./gauntlet/bin/gauntlet.rb gauntlet/*.noindex/? 95 | ``` 96 | 97 | Or max out your machine using xargs (note the `-P 16` and choose accordingly): 98 | 99 | ``` 100 | % ls -d gauntlet/*.noindex/?/? | time xargs -n 1 -P 16 ./gauntlet/bin/gauntlet.rb 101 | ``` 102 | 103 | In another terminal I usually monitor the progress like so: 104 | 105 | ``` 106 | % while true ; do clear; fd . -t d -t e gauntlet/*.noindex -X rmdir -p 2> /dev/null ; for D in gauntlet/*.noindex/? 
; do echo -n "$D: "; fd .rb $D | wc -l ; done ; echo ; sleep 30 ; done 107 | ``` 108 | -------------------------------------------------------------------------------- /lib/.document: -------------------------------------------------------------------------------- 1 | *.rb 2 | -------------------------------------------------------------------------------- /lib/rp_extensions.rb: -------------------------------------------------------------------------------- 1 | # :stopdoc: 2 | # WHY do I have to do this?!? 3 | class Regexp 4 | ONCE = 0 unless defined? ONCE # FIX: remove this - it makes no sense 5 | 6 | unless defined? ENC_NONE then 7 | ENC_NONE = /x/n.options 8 | ENC_EUC = /x/e.options 9 | ENC_SJIS = /x/s.options 10 | ENC_UTF8 = /x/u.options 11 | end 12 | end 13 | # :startdoc: 14 | 15 | class Array 16 | def prepend *vals 17 | self[0,0] = vals 18 | end 19 | end unless [].respond_to?(:prepend) 20 | 21 | # :stopdoc: 22 | class Symbol 23 | def end_with? o 24 | self.to_s.end_with? o 25 | end 26 | end unless :woot.respond_to?(:end_with?) 27 | # :startdoc: 28 | 29 | ############################################################ 30 | # HACK HACK HACK HACK HACK HACK HACK HACK HACK HACK HACK HACK 31 | 32 | class String 33 | def clean_caller 34 | self.sub(File.dirname(__FILE__), "./lib").sub(/:in.*/, "") 35 | end if $DEBUG 36 | end 37 | 38 | require "sexp" 39 | 40 | class Sexp 41 | attr_writer :paren # TODO: retire 42 | 43 | def paren 44 | @paren ||= false 45 | end 46 | 47 | def block_pass? 48 | any? { |s| Sexp === s && s.sexp_type == :block_pass } 49 | end 50 | end 51 | 52 | # END HACK 53 | ############################################################ 54 | -------------------------------------------------------------------------------- /lib/rp_stringscanner.rb: -------------------------------------------------------------------------------- 1 | require "strscan" 2 | 3 | class RPStringScanner < StringScanner 4 | if ENV["DEBUG"] || ENV["TALLY"] then 5 | def getch 6 | c = super 7 | where = caller.drop_while { |s| s =~ /(getch|nextc).$/ }.first 8 | where = where.split(/:/).first(2).join(":") 9 | if ENV["TALLY"] then 10 | d getch:where 11 | else 12 | d getch:[c, where] 13 | end 14 | c 15 | end 16 | 17 | def scan re 18 | s = super 19 | where = caller.drop_while { |x| x =~ /scan.$/ }.first 20 | where = where.split(/:/).first(2).join(":") 21 | if ENV["TALLY"] then 22 | d scan:[where] 23 | else 24 | d scan:[s, where] if s 25 | end 26 | s 27 | end 28 | 29 | def d o 30 | STDERR.puts o.inspect 31 | end 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /lib/ruby_lexer.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | # encoding: UTF-8 3 | 4 | $DEBUG = true if ENV["DEBUG"] 5 | 6 | class RubyLexer 7 | # :stopdoc: 8 | EOF = :eof_haha! 9 | 10 | ESCAPES = { 11 | "a" => "\007", 12 | "b" => "\010", 13 | "e" => "\033", 14 | "f" => "\f", 15 | "n" => "\n", 16 | "r" => "\r", 17 | "s" => " ", 18 | "t" => "\t", 19 | "v" => "\13", 20 | "\\" => '\\', 21 | "\n" => "", 22 | "C-\?" => 127.chr, 23 | "c\?" => 127.chr, 24 | } 25 | 26 | BTOKENS = { 27 | ".." => :tBDOT2, 28 | "..." => :tBDOT3, 29 | } 30 | 31 | TOKENS = { 32 | "!" => :tBANG, 33 | "!=" => :tNEQ, 34 | "!@" => :tBANG, 35 | "!~" => :tNMATCH, 36 | "," => :tCOMMA, 37 | ".." => :tDOT2, 38 | "..." 
=> :tDOT3, 39 | "=" => :tEQL, 40 | "==" => :tEQ, 41 | "===" => :tEQQ, 42 | "=>" => :tASSOC, 43 | "=~" => :tMATCH, 44 | "->" => :tLAMBDA, 45 | } 46 | 47 | PERCENT_END = { 48 | "(" => ")", 49 | "[" => "]", 50 | "{" => "}", 51 | "<" => ">", 52 | } 53 | 54 | SIMPLE_RE_META = /[\$\*\+\.\?\^\|\)\]\}\>]/ 55 | 56 | @@regexp_cache = Hash.new { |h, k| h[k] = Regexp.new(Regexp.escape(k)) } 57 | @@regexp_cache[nil] = nil 58 | 59 | def regexp_cache 60 | @@regexp_cache 61 | end 62 | 63 | if $DEBUG then 64 | attr_reader :lex_state 65 | 66 | def lex_state= o 67 | return if @lex_state == o 68 | 69 | from = "" 70 | if ENV["VERBOSE"] 71 | path = caller[0] 72 | path = caller[1] if path =~ /result/ 73 | path, line, *_ = path.split(/:/) 74 | path.delete_prefix! File.dirname File.dirname __FILE__ 75 | from = " at .%s:%s" % [path, line] 76 | end 77 | 78 | warn "lex_state: %p -> %p%s" % [lex_state, o, from] 79 | 80 | @lex_state = o 81 | end 82 | end 83 | 84 | # :startdoc: 85 | 86 | attr_accessor :lex_state unless $DEBUG 87 | 88 | attr_accessor :brace_nest 89 | attr_accessor :cmdarg 90 | attr_accessor :command_start 91 | attr_accessor :cmd_state # temporary--ivar to avoid passing everywhere 92 | attr_accessor :last_state 93 | attr_accessor :cond 94 | attr_accessor :old_ss 95 | attr_accessor :old_lineno 96 | 97 | # these are generated via ruby_lexer.rex: ss, lineno 98 | 99 | ## 100 | # Additional context surrounding tokens that both the lexer and 101 | # grammar use. 102 | 103 | attr_accessor :lex_strterm 104 | attr_accessor :lpar_beg 105 | attr_accessor :paren_nest 106 | attr_accessor :parser # HACK for very end of lexer... *sigh* 107 | attr_accessor :space_seen 108 | attr_accessor :string_buffer 109 | attr_accessor :string_nest 110 | 111 | # Last token read via next_token. 112 | attr_accessor :token 113 | 114 | # Last comment lexed, or nil 115 | attr_accessor :comment 116 | 117 | def initialize _ = nil 118 | @lex_state = nil # remove one warning under $DEBUG 119 | @lex_state = EXPR_NONE 120 | 121 | self.cond = RubyParserStuff::StackState.new(:cond, $DEBUG) 122 | self.cmdarg = RubyParserStuff::StackState.new(:cmdarg, $DEBUG) 123 | self.ss = RPStringScanner.new "" 124 | 125 | reset 126 | end 127 | 128 | def arg_ambiguous 129 | self.warning "Ambiguous first argument. make sure." 130 | end 131 | 132 | def arg_state 133 | is_after_operator? ? EXPR_ARG : EXPR_BEG 134 | end 135 | 136 | def debug n 137 | raise "debug #{n}" 138 | end 139 | 140 | def expr_dot? 141 | lex_state =~ EXPR_DOT 142 | end 143 | 144 | def expr_fname? # REFACTOR 145 | lex_state =~ EXPR_FNAME 146 | end 147 | 148 | def expr_result token, text 149 | cond.push false 150 | cmdarg.push false 151 | result EXPR_BEG, token, text 152 | end 153 | 154 | def in_fname? # REFACTOR 155 | lex_state =~ EXPR_FNAME 156 | end 157 | 158 | def int_with_base base 159 | rb_compile_error "Invalid numeric format" if matched =~ /__/ 160 | 161 | text = matched 162 | case 163 | when text.end_with?("ri") 164 | result EXPR_NUM, :tIMAGINARY, Complex(0, Rational(text.chop.chop.to_i(base))) 165 | when text.end_with?("r") 166 | result EXPR_NUM, :tRATIONAL, Rational(text.chop.to_i(base)) 167 | when text.end_with?("i") 168 | result EXPR_NUM, :tIMAGINARY, Complex(0, text.chop.to_i(base)) 169 | else 170 | result EXPR_NUM, :tINTEGER, text.to_i(base) 171 | end 172 | end 173 | 174 | def is_after_operator? 175 | lex_state =~ EXPR_FNAME|EXPR_DOT 176 | end 177 | 178 | def is_arg? 179 | lex_state =~ EXPR_ARG_ANY 180 | end 181 | 182 | def is_beg? 
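    # true when the lexer is at a spot where an expression can begin
    # (EXPR_LAB counts as such a spot, too)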
183 | lex_state =~ EXPR_BEG_ANY || lex_state == EXPR_LAB # yes, == EXPR_LAB 184 | end 185 | 186 | def is_end? 187 | lex_state =~ EXPR_END_ANY 188 | end 189 | 190 | def is_label_possible? 191 | (lex_state =~ EXPR_LABEL|EXPR_ENDFN && !cmd_state) || is_arg? 192 | end 193 | 194 | def is_label_suffix? 195 | check(/:(?!:)/) 196 | end 197 | 198 | def is_space_arg? c = "x" 199 | is_arg? and space_seen and c !~ /\s/ 200 | end 201 | 202 | def lambda_beginning? 203 | lpar_beg && lpar_beg == paren_nest 204 | end 205 | 206 | def is_local_id id 207 | # maybe just make this false for now 208 | self.parser.env[id.to_sym] == :lvar # HACK: this isn't remotely right 209 | end 210 | 211 | def lvar_defined? id 212 | # TODO: (dyna_in_block? && dvar_defined?(id)) || local_id?(id) 213 | self.parser.env[id.to_sym] == :lvar 214 | end 215 | 216 | def not_end? 217 | not is_end? 218 | end 219 | 220 | def possibly_escape_string text, check 221 | content = match[1] 222 | 223 | if text =~ check then 224 | unescape_string content 225 | else 226 | content.gsub(/\\\\/, "\\").gsub(/\\\'/, "'") 227 | end 228 | end 229 | 230 | def process_amper text 231 | token = if is_arg? && space_seen && !check(/\s/) then 232 | warning("`&' interpreted as argument prefix") 233 | :tAMPER 234 | elsif lex_state =~ EXPR_BEG|EXPR_MID then 235 | :tAMPER 236 | else 237 | :tAMPER2 238 | end 239 | 240 | result :arg_state, token, "&" 241 | end 242 | 243 | def process_backref text 244 | token = match[1].to_sym 245 | # TODO: can't do lineno hack w/ symbol 246 | result EXPR_END, :tBACK_REF, token 247 | end 248 | 249 | def process_begin text 250 | self.comment ||= +"" 251 | self.comment << matched 252 | 253 | unless scan(/.*?\n=end( |\t|\f)*[^\n]*(\n|\z)/m) then 254 | self.comment = nil 255 | rb_compile_error("embedded document meets end of file") 256 | end 257 | 258 | self.comment << matched 259 | self.lineno += matched.count("\n") # HACK? 260 | 261 | nil # TODO 262 | end 263 | 264 | # TODO: make all tXXXX terminals include lexer.lineno ... enforce it somehow? 265 | 266 | def process_brace_close text 267 | case matched 268 | when "}" then 269 | self.brace_nest -= 1 270 | return :tSTRING_DEND, matched if brace_nest < 0 271 | end 272 | 273 | # matching compare/parse26.y:8099 274 | cond.pop 275 | cmdarg.pop 276 | 277 | case matched 278 | when "}" then 279 | self.lex_state = ruby24minus? ? EXPR_ENDARG : EXPR_END 280 | return :tRCURLY, matched 281 | when "]" then 282 | self.paren_nest -= 1 283 | self.lex_state = ruby24minus? ? EXPR_ENDARG : EXPR_END 284 | return :tRBRACK, matched 285 | when ")" then 286 | self.paren_nest -= 1 287 | self.lex_state = EXPR_ENDFN 288 | return :tRPAREN, matched 289 | else 290 | raise "Unknown bracing: #{matched.inspect}" 291 | end 292 | end 293 | 294 | def process_brace_open text 295 | # matching compare/parse23.y:8694 296 | self.brace_nest += 1 297 | 298 | if lambda_beginning? then 299 | self.lpar_beg = nil 300 | self.paren_nest -= 1 # close arg list when lambda opens body 301 | 302 | return expr_result(:tLAMBEG, "{") 303 | end 304 | 305 | token = case 306 | when lex_state =~ EXPR_LABELED then 307 | :tLBRACE # hash 308 | when lex_state =~ EXPR_ARG_ANY|EXPR_END|EXPR_ENDFN then 309 | :tLCURLY # block (primary) "{" in parse.y 310 | when lex_state =~ EXPR_ENDARG then 311 | :tLBRACE_ARG # block (expr) 312 | else 313 | :tLBRACE # hash 314 | end 315 | 316 | state = token == :tLBRACE_ARG ? 
EXPR_BEG : EXPR_PAR 317 | self.command_start = true if token != :tLBRACE 318 | 319 | cond.push false 320 | cmdarg.push false 321 | result state, token, text 322 | end 323 | 324 | def process_colon1 text 325 | # ?: / then / when 326 | if is_end? || check(/\s/) then 327 | return result EXPR_BEG, :tCOLON, text 328 | end 329 | 330 | case 331 | when scan(/\'/) then 332 | string STR_SSYM, matched 333 | when scan(/\"/) then 334 | string STR_DSYM, matched 335 | end 336 | 337 | result EXPR_FNAME, :tSYMBEG, text 338 | end 339 | 340 | def process_colon2 text 341 | if is_beg? || lex_state =~ EXPR_CLASS || is_space_arg? then 342 | result EXPR_BEG, :tCOLON3, text 343 | else 344 | result EXPR_DOT, :tCOLON2, text 345 | end 346 | end 347 | 348 | def process_dots text # parse32.y:10216 349 | is_beg = self.is_beg? 350 | self.lex_state = EXPR_BEG 351 | 352 | return result EXPR_ENDARG, :tBDOT3, text if 353 | parser.in_argdef && text == "..." # TODO: version check? 354 | 355 | tokens = ruby27plus? && is_beg ? BTOKENS : TOKENS 356 | 357 | result EXPR_BEG, tokens[text], text 358 | end 359 | 360 | def process_float text 361 | rb_compile_error "Invalid numeric format" if text =~ /__/ 362 | 363 | case 364 | when text.end_with?("ri") 365 | result EXPR_NUM, :tIMAGINARY, Complex(0, Rational(text.chop.chop)) 366 | when text.end_with?("i") 367 | result EXPR_NUM, :tIMAGINARY, Complex(0, text.chop.to_f) 368 | when text.end_with?("r") 369 | result EXPR_NUM, :tRATIONAL, Rational(text.chop) 370 | else 371 | result EXPR_NUM, :tFLOAT, text.to_f 372 | end 373 | end 374 | 375 | def process_gvar text 376 | if parser.class.version > 20 && text == "$-" then 377 | rb_compile_error "unexpected $undefined" 378 | end 379 | 380 | result EXPR_END, :tGVAR, text 381 | end 382 | 383 | def process_gvar_oddity text 384 | rb_compile_error "#{text.inspect} is not allowed as a global variable name" 385 | end 386 | 387 | def process_ivar text 388 | tok_id = text =~ /^@@/ ? :tCVAR : :tIVAR 389 | result EXPR_END, tok_id, text 390 | end 391 | 392 | def process_label text 393 | symbol = possibly_escape_string text, /^\"/ 394 | 395 | result EXPR_LAB, :tLABEL, symbol 396 | end 397 | 398 | def process_label_or_string text 399 | if @was_label && text =~ /:\Z/ then 400 | @was_label = nil 401 | return process_label text 402 | elsif text =~ /:\Z/ then 403 | self.pos -= 1 # put back ":" 404 | text = text[0..-2] 405 | end 406 | 407 | orig_line = lineno 408 | str = text[1..-2].gsub(/\\\\/, "\\").gsub(/\\\'/, "\'") 409 | self.lineno += str.count("\n") 410 | 411 | result EXPR_END, :tSTRING, str, orig_line 412 | end 413 | 414 | def process_lchevron text 415 | if (lex_state !~ EXPR_DOT|EXPR_CLASS && 416 | !is_end? && 417 | (!is_arg? || lex_state =~ EXPR_LABELED || space_seen)) then 418 | tok = self.heredoc_identifier 419 | return tok if tok 420 | end 421 | 422 | if is_after_operator? then 423 | self.lex_state = EXPR_ARG 424 | else 425 | self.command_start = true if lex_state =~ EXPR_CLASS 426 | self.lex_state = EXPR_BEG 427 | end 428 | 429 | result lex_state, :tLSHFT, "\<\<" 430 | end 431 | 432 | def process_newline_or_comment text # ../compare/parse30.y:9126 ish 433 | c = matched 434 | 435 | if c == "#" then 436 | self.pos -= 1 437 | 438 | while scan(/\s*\#.*(\n+|\z)/) do 439 | self.lineno += matched.count "\n" 440 | self.comment ||= +"" 441 | self.comment << matched.gsub(/^ +#/, "#").gsub(/^ +$/, "") 442 | end 443 | 444 | return nil if end_of_stream? 
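      # comment line(s) consumed without hitting EOF; fall through to the
      # regular newline handling below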
445 | end 446 | 447 | c = (lex_state =~ EXPR_BEG|EXPR_CLASS|EXPR_FNAME|EXPR_DOT && 448 | lex_state !~ EXPR_LABELED) 449 | if c || self.lex_state == EXPR_LAB then # yes, == EXPR_LAB 450 | # ignore if !fallthrough? 451 | if !c && parser.in_kwarg then 452 | # normal newline 453 | self.command_start = true 454 | return result EXPR_BEG, :tNL, nil 455 | else 456 | maybe_pop_stack 457 | return # goto retry 458 | end 459 | end 460 | 461 | if scan(/[\ \t\r\f\v]+/) then 462 | self.space_seen = true 463 | end 464 | 465 | if check(/#/) then 466 | return # goto retry 467 | elsif check(/&\.|\.(?!\.)/) then # C version is a hellish obfuscated xnor 468 | return # goto retry 469 | end 470 | 471 | self.command_start = true 472 | 473 | result EXPR_BEG, :tNL, nil 474 | end 475 | 476 | def process_nthref text 477 | # TODO: can't do lineno hack w/ number 478 | result EXPR_END, :tNTH_REF, match[1].to_i 479 | end 480 | 481 | def process_paren text 482 | token = if is_beg? then 483 | :tLPAREN 484 | elsif !space_seen then 485 | # foo( ... ) => method call, no ambiguity 486 | :tLPAREN2 487 | elsif is_space_arg? then 488 | :tLPAREN_ARG 489 | elsif lex_state =~ EXPR_ENDFN && !lambda_beginning? then 490 | # TODO: 491 | # warn("parentheses after method name is interpreted as " \ 492 | # "an argument list, not a decomposed argument") 493 | :tLPAREN2 494 | else 495 | :tLPAREN2 # plain "(" in parse.y 496 | end 497 | 498 | self.paren_nest += 1 499 | 500 | cond.push false 501 | cmdarg.push false 502 | result EXPR_PAR, token, text 503 | end 504 | 505 | def process_percent text 506 | case 507 | when is_beg? then 508 | process_percent_quote 509 | when scan(/\=/) 510 | result EXPR_BEG, :tOP_ASGN, "%" 511 | when is_space_arg?(check(/\s/)) || (lex_state =~ EXPR_FITEM && check(/s/)) 512 | process_percent_quote 513 | else 514 | result :arg_state, :tPERCENT, "%" 515 | end 516 | end 517 | 518 | def process_plus_minus text 519 | sign = matched 520 | utype, type = if sign == "+" then 521 | [:tUPLUS, :tPLUS] 522 | else 523 | [:tUMINUS, :tMINUS] 524 | end 525 | 526 | if is_after_operator? then 527 | if scan(/@/) then 528 | return result(EXPR_ARG, utype, "#{sign}@") 529 | else 530 | return result(EXPR_ARG, type, sign) 531 | end 532 | end 533 | 534 | return result(EXPR_BEG, :tOP_ASGN, sign) if scan(/\=/) 535 | 536 | if is_beg? || (is_arg? && space_seen && !check(/\s/)) then 537 | arg_ambiguous if is_arg? 538 | 539 | if check(/\d/) then 540 | return nil if utype == :tUPLUS 541 | return result EXPR_BEG, :tUMINUS_NUM, sign 542 | end 543 | 544 | return result EXPR_BEG, utype, sign 545 | end 546 | 547 | result EXPR_BEG, type, sign 548 | end 549 | 550 | def process_questionmark text 551 | if is_end? then 552 | return result EXPR_BEG, :tEH, "?" 553 | end 554 | 555 | if end_of_stream? then 556 | rb_compile_error "incomplete character syntax: parsed #{text.inspect}" 557 | end 558 | 559 | if check(/\s|\v/) then 560 | unless is_arg? then 561 | c2 = { " " => "s", 562 | "\n" => "n", 563 | "\t" => "t", 564 | "\v" => "v", 565 | "\r" => "r", 566 | "\f" => "f" }[matched] 567 | 568 | if c2 then 569 | warning("invalid character syntax; use ?\\" + c2) 570 | end 571 | end 572 | 573 | # ternary 574 | return result EXPR_BEG, :tEH, "?" 575 | elsif check(/\w(?=\w)/) then # ternary, also 576 | return result EXPR_BEG, :tEH, "?" 
577 | end 578 | 579 | c = if scan(/\\/) then 580 | self.read_escape 581 | else 582 | getch 583 | end 584 | 585 | result EXPR_END, :tSTRING, c 586 | end 587 | 588 | def process_simple_string text 589 | orig_line = lineno 590 | self.lineno += text.count("\n") 591 | 592 | str = unescape_string text[1..-2] 593 | 594 | result EXPR_END, :tSTRING, str, orig_line 595 | end 596 | 597 | def process_slash text 598 | if is_beg? then 599 | string STR_REGEXP, matched 600 | 601 | return result nil, :tREGEXP_BEG, "/" 602 | end 603 | 604 | if scan(/\=/) then 605 | return result(EXPR_BEG, :tOP_ASGN, "/") 606 | end 607 | 608 | if is_arg? && space_seen then 609 | unless scan(/\s/) then 610 | arg_ambiguous 611 | string STR_REGEXP, "/" 612 | return result(nil, :tREGEXP_BEG, "/") 613 | end 614 | end 615 | 616 | result :arg_state, :tDIVIDE, "/" 617 | end 618 | 619 | def process_square_bracket text 620 | self.paren_nest += 1 621 | 622 | token = nil 623 | 624 | if is_after_operator? then 625 | case 626 | when scan(/\]\=/) then 627 | self.paren_nest -= 1 # HACK? I dunno, or bug in MRI 628 | return result EXPR_ARG, :tASET, "[]=" 629 | when scan(/\]/) then 630 | self.paren_nest -= 1 # HACK? I dunno, or bug in MRI 631 | return result EXPR_ARG, :tAREF, "[]" 632 | else 633 | rb_compile_error "unexpected '['" 634 | end 635 | elsif is_beg? then 636 | token = :tLBRACK 637 | elsif is_arg? && (space_seen || lex_state =~ EXPR_LABELED) then 638 | token = :tLBRACK 639 | else 640 | token = :tLBRACK2 641 | end 642 | 643 | cond.push false 644 | cmdarg.push false 645 | result EXPR_PAR, token, text 646 | end 647 | 648 | def process_symbol text 649 | symbol = possibly_escape_string text, /^:\"/ # stupid emacs 650 | 651 | result EXPR_LIT, :tSYMBOL, symbol 652 | end 653 | 654 | def process_token text 655 | # matching: parse_ident in compare/parse23.y:7989 656 | # FIX: remove: self.last_state = lex_state 657 | 658 | token = self.token = text 659 | token << matched if scan(/[\!\?](?!=)/) 660 | 661 | tok_id = 662 | case 663 | when token =~ /[!?]$/ then 664 | :tFID 665 | when lex_state =~ EXPR_FNAME && scan(/=(?:(?![~>=])|(?==>))/) then 666 | # ident=, not =~ => == or followed by => 667 | # TODO test lexing of a=>b vs a==>b 668 | token << matched 669 | :tIDENTIFIER 670 | when token =~ /^[A-Z]/ then 671 | :tCONSTANT 672 | else 673 | :tIDENTIFIER 674 | end 675 | 676 | if is_label_possible? and is_label_suffix? then 677 | scan(/:/) 678 | return result EXPR_LAB, :tLABEL, token 679 | end 680 | 681 | # TODO: mb == ENC_CODERANGE_7BIT && lex_state !~ EXPR_DOT 682 | if lex_state !~ EXPR_DOT then 683 | # See if it is a reserved word. 684 | keyword = RubyParserStuff::Keyword.keyword token 685 | 686 | return process_token_keyword keyword if keyword 687 | end 688 | 689 | # matching: compare/parse32.y:9031 690 | state = if lex_state =~ EXPR_BEG_ANY|EXPR_ARG_ANY|EXPR_DOT then 691 | cmd_state ? 
EXPR_CMDARG : EXPR_ARG 692 | elsif lex_state =~ EXPR_FNAME then 693 | EXPR_ENDFN 694 | else 695 | EXPR_END 696 | end 697 | self.lex_state = state 698 | 699 | tok_id = :tIDENTIFIER if tok_id == :tCONSTANT && is_local_id(token) 700 | 701 | if last_state !~ EXPR_DOT|EXPR_FNAME and 702 | (tok_id == :tIDENTIFIER) and # not EXPR_FNAME, not attrasgn 703 | lvar_defined?(token) then 704 | state = EXPR_END|EXPR_LABEL 705 | end 706 | 707 | result state, tok_id, token 708 | end 709 | 710 | def process_token_keyword keyword 711 | # matching MIDDLE of parse_ident in compare/parse32.y:9695 712 | state = lex_state 713 | 714 | return result(EXPR_ENDFN, keyword.id0, token) if lex_state =~ EXPR_FNAME 715 | 716 | self.lex_state = keyword.state 717 | self.command_start = true if lex_state =~ EXPR_BEG 718 | 719 | case 720 | when keyword.id0 == :kDO then # parse32.y line 9712 721 | case 722 | when lambda_beginning? then 723 | self.lpar_beg = nil # lambda_beginning? == FALSE in the body of "-> do ... end" 724 | self.paren_nest -= 1 # TODO: question this? 725 | result lex_state, :kDO_LAMBDA, token 726 | when cond.is_in_state then 727 | result lex_state, :kDO_COND, token 728 | when cmdarg.is_in_state && state != EXPR_CMDARG then 729 | result lex_state, :kDO_BLOCK, token 730 | else 731 | result lex_state, :kDO, token 732 | end 733 | when state =~ EXPR_PAD then 734 | result lex_state, keyword.id0, token 735 | when keyword.id0 != keyword.id1 then 736 | result EXPR_PAR, keyword.id1, token 737 | else 738 | result lex_state, keyword.id1, token 739 | end 740 | end 741 | 742 | def process_underscore text 743 | self.unscan # put back "_" 744 | 745 | if beginning_of_line? && scan(/\__END__(\r?\n|\Z)/) then 746 | ss.terminate 747 | [RubyLexer::EOF, RubyLexer::EOF] 748 | elsif scan(/#{IDENT_CHAR}+/) then 749 | process_token matched 750 | end 751 | end 752 | 753 | def rb_compile_error msg 754 | msg += ". near line #{self.lineno}: #{self.rest[/^.*/].inspect}" 755 | raise RubyParser::SyntaxError, msg 756 | end 757 | 758 | def reset 759 | self.lineno = 1 760 | self.brace_nest = 0 761 | self.command_start = true 762 | self.comment = nil 763 | self.lex_state = EXPR_NONE 764 | self.lex_strterm = nil 765 | self.lpar_beg = nil 766 | self.paren_nest = 0 767 | self.space_seen = false 768 | self.string_nest = 0 769 | self.token = nil 770 | self.string_buffer = [] 771 | self.old_ss = nil 772 | self.old_lineno = nil 773 | 774 | self.cond.reset 775 | self.cmdarg.reset 776 | end 777 | 778 | def result new_state, token, text, line = self.lineno # :nodoc: 779 | new_state = self.arg_state if new_state == :arg_state 780 | self.lex_state = new_state if new_state 781 | 782 | [token, [text, line]] 783 | end 784 | 785 | def ruby22_label? 786 | ruby22plus? and is_label_possible? 787 | end 788 | 789 | def ruby22plus? 790 | parser.class.version >= 22 791 | end 792 | 793 | def ruby23plus? 794 | parser.class.version >= 23 795 | end 796 | 797 | def ruby24minus? 798 | parser.class.version <= 24 799 | end 800 | 801 | def ruby27plus? 802 | parser.class.version >= 27 803 | end 804 | 805 | def space_vs_beginning space_type, beg_type, fallback 806 | if is_space_arg? check(/./m) then 807 | warning "`**' interpreted as argument prefix" 808 | space_type 809 | elsif is_beg? then 810 | beg_type 811 | else 812 | # TODO: warn_balanced("**", "argument prefix"); 813 | fallback 814 | end 815 | end 816 | 817 | def unescape_string str 818 | str = str.gsub(ESC) { unescape($1).b.force_encoding Encoding::UTF_8 } 819 | if str.valid_encoding? 
820 | str 821 | else 822 | str.b 823 | end 824 | end 825 | 826 | def unescape s 827 | r = ESCAPES[s] 828 | 829 | return r if r 830 | 831 | x = case s 832 | when /^[0-7]{1,3}/ then 833 | ($&.to_i(8) & 0xFF).chr 834 | when /^x([0-9a-fA-F]{1,2})/ then 835 | $1.to_i(16).chr 836 | when /^M-(.)/ then 837 | ($1[0].ord | 0x80).chr 838 | when /^(C-|c)(.)/ then 839 | ($2[0].ord & 0x9f).chr 840 | when /^[89a-f]/i then # bad octal or hex... ignore? that's what MRI does :( 841 | s 842 | when /^[McCx0-9]/ then 843 | rb_compile_error("Invalid escape character syntax") 844 | when /u(\h{4})/ then 845 | [$1.delete("{}").to_i(16)].pack("U") 846 | when /u(\h{1,3})/ then 847 | rb_compile_error("Invalid escape character syntax") 848 | when /u\{(\h+(?:\s+\h+)*)\}/ then 849 | $1.split.map { |cp| cp.to_i(16) }.pack("U*") 850 | else 851 | s 852 | end 853 | x 854 | end 855 | 856 | def warning s 857 | # do nothing for now 858 | end 859 | 860 | def was_label? 861 | @was_label = ruby22_label? 862 | true 863 | end 864 | 865 | class State 866 | attr_accessor :n 867 | attr_accessor :names 868 | 869 | # TODO: take a shared hash of strings for inspect/to_s 870 | def initialize o, names 871 | raise ArgumentError, "bad state: %p" % [o] unless Integer === o # TODO: remove 872 | 873 | self.n = o 874 | self.names = names 875 | end 876 | 877 | def == o 878 | self.equal?(o) || (o.class == self.class && o.n == self.n) 879 | end 880 | 881 | def =~ v 882 | (self.n & v.n) != 0 883 | end 884 | 885 | def | v 886 | raise ArgumentError, "Incompatible State: %p vs %p" % [self, v] unless 887 | self.names == v.names 888 | self.class.new(self.n | v.n, self.names) 889 | end 890 | 891 | def inspect 892 | return "EXPR_NONE" if n.zero? # HACK? 893 | 894 | names.map { |v, k| k if self =~ v }. 895 | compact. 896 | join("|"). 
897 | gsub(/(?:EXPR_|STR_(?:FUNC_)?)/, "") 898 | end 899 | 900 | alias to_s inspect 901 | 902 | module Values 903 | expr_names = {} 904 | 905 | EXPR_NONE = State.new 0x0, expr_names 906 | EXPR_BEG = State.new 0x1, expr_names 907 | EXPR_END = State.new 0x2, expr_names 908 | EXPR_ENDARG = State.new 0x4, expr_names 909 | EXPR_ENDFN = State.new 0x8, expr_names 910 | EXPR_ARG = State.new 0x10, expr_names 911 | EXPR_CMDARG = State.new 0x20, expr_names 912 | EXPR_MID = State.new 0x40, expr_names 913 | EXPR_FNAME = State.new 0x80, expr_names 914 | EXPR_DOT = State.new 0x100, expr_names 915 | EXPR_CLASS = State.new 0x200, expr_names 916 | EXPR_LABEL = State.new 0x400, expr_names 917 | EXPR_LABELED = State.new 0x800, expr_names 918 | EXPR_FITEM = State.new 0x1000, expr_names 919 | 920 | EXPR_BEG_ANY = EXPR_BEG | EXPR_MID | EXPR_CLASS 921 | EXPR_ARG_ANY = EXPR_ARG | EXPR_CMDARG 922 | EXPR_END_ANY = EXPR_END | EXPR_ENDARG | EXPR_ENDFN 923 | 924 | # extra fake lex_state names to make things a bit cleaner 925 | 926 | EXPR_LAB = EXPR_ARG|EXPR_LABELED 927 | EXPR_LIT = EXPR_END|EXPR_ENDARG 928 | EXPR_PAR = EXPR_BEG|EXPR_LABEL 929 | EXPR_PAD = EXPR_BEG|EXPR_LABELED 930 | 931 | EXPR_NUM = EXPR_LIT 932 | 933 | expr_names.merge!(EXPR_NONE => "EXPR_NONE", 934 | EXPR_BEG => "EXPR_BEG", 935 | EXPR_END => "EXPR_END", 936 | EXPR_ENDARG => "EXPR_ENDARG", 937 | EXPR_ENDFN => "EXPR_ENDFN", 938 | EXPR_ARG => "EXPR_ARG", 939 | EXPR_CMDARG => "EXPR_CMDARG", 940 | EXPR_MID => "EXPR_MID", 941 | EXPR_FNAME => "EXPR_FNAME", 942 | EXPR_DOT => "EXPR_DOT", 943 | EXPR_CLASS => "EXPR_CLASS", 944 | EXPR_LABEL => "EXPR_LABEL", 945 | EXPR_LABELED => "EXPR_LABELED", 946 | EXPR_FITEM => "EXPR_FITEM") 947 | 948 | # ruby constants for strings 949 | 950 | str_func_names = {} 951 | 952 | STR_FUNC_BORING = State.new 0x00, str_func_names 953 | STR_FUNC_ESCAPE = State.new 0x01, str_func_names 954 | STR_FUNC_EXPAND = State.new 0x02, str_func_names 955 | STR_FUNC_REGEXP = State.new 0x04, str_func_names 956 | STR_FUNC_QWORDS = State.new 0x08, str_func_names 957 | STR_FUNC_SYMBOL = State.new 0x10, str_func_names 958 | STR_FUNC_INDENT = State.new 0x20, str_func_names # <<-HEREDOC 959 | STR_FUNC_LABEL = State.new 0x40, str_func_names 960 | STR_FUNC_LIST = State.new 0x4000, str_func_names 961 | STR_FUNC_TERM = State.new 0x8000, str_func_names 962 | STR_FUNC_DEDENT = State.new 0x10000, str_func_names # <<~HEREDOC 963 | 964 | # TODO: check parser25.y on how they do STR_FUNC_INDENT 965 | 966 | STR_SQUOTE = STR_FUNC_BORING 967 | STR_DQUOTE = STR_FUNC_EXPAND 968 | STR_XQUOTE = STR_FUNC_EXPAND 969 | STR_REGEXP = STR_FUNC_REGEXP | STR_FUNC_ESCAPE | STR_FUNC_EXPAND 970 | STR_SWORD = STR_FUNC_QWORDS | STR_FUNC_LIST 971 | STR_DWORD = STR_FUNC_QWORDS | STR_FUNC_EXPAND | STR_FUNC_LIST 972 | STR_SSYM = STR_FUNC_SYMBOL 973 | STR_DSYM = STR_FUNC_SYMBOL | STR_FUNC_EXPAND 974 | STR_LABEL = STR_FUNC_LABEL 975 | 976 | str_func_names.merge!(STR_FUNC_ESCAPE => "STR_FUNC_ESCAPE", 977 | STR_FUNC_EXPAND => "STR_FUNC_EXPAND", 978 | STR_FUNC_REGEXP => "STR_FUNC_REGEXP", 979 | STR_FUNC_QWORDS => "STR_FUNC_QWORDS", 980 | STR_FUNC_SYMBOL => "STR_FUNC_SYMBOL", 981 | STR_FUNC_INDENT => "STR_FUNC_INDENT", 982 | STR_FUNC_LABEL => "STR_FUNC_LABEL", 983 | STR_FUNC_LIST => "STR_FUNC_LIST", 984 | STR_FUNC_TERM => "STR_FUNC_TERM", 985 | STR_FUNC_DEDENT => "STR_FUNC_DEDENT", 986 | STR_SQUOTE => "STR_SQUOTE") 987 | end 988 | 989 | include Values 990 | end 991 | 992 | include State::Values 993 | end 994 | 995 | class RubyLexer 996 | module SSWrapper 997 | def string= s 998 | ss.string= s 
999 | end 1000 | 1001 | def beginning_of_line? 1002 | ss.bol? 1003 | end 1004 | 1005 | alias bol? beginning_of_line? # to make .rex file more readable 1006 | 1007 | def check re 1008 | maybe_pop_stack 1009 | 1010 | ss.check re 1011 | end 1012 | 1013 | def end_of_stream? 1014 | ss.eos? 1015 | end 1016 | 1017 | alias eos? end_of_stream? 1018 | 1019 | def getch 1020 | c = ss.getch 1021 | c = ss.getch if c == "\r" && ss.peek(1) == "\n" 1022 | c 1023 | end 1024 | 1025 | def match 1026 | ss 1027 | end 1028 | 1029 | def matched 1030 | ss.matched 1031 | end 1032 | 1033 | def in_heredoc? 1034 | !!self.old_ss 1035 | end 1036 | 1037 | def maybe_pop_stack 1038 | if ss.eos? && in_heredoc? then 1039 | self.ss_pop 1040 | self.lineno_pop 1041 | end 1042 | end 1043 | 1044 | def pos 1045 | ss.pos 1046 | end 1047 | 1048 | def pos= n 1049 | ss.pos = n 1050 | end 1051 | 1052 | def rest 1053 | ss.rest 1054 | end 1055 | 1056 | def scan re 1057 | maybe_pop_stack 1058 | 1059 | ss.scan re 1060 | end 1061 | 1062 | def scanner_class # TODO: design this out of oedipus_lex. or something. 1063 | RPStringScanner 1064 | end 1065 | 1066 | def ss_string 1067 | ss.string 1068 | end 1069 | 1070 | def ss_string= s 1071 | raise "Probably not" 1072 | ss.string = s 1073 | end 1074 | 1075 | def unscan 1076 | ss.unscan 1077 | end 1078 | end 1079 | 1080 | include SSWrapper 1081 | end 1082 | 1083 | class RubyLexer 1084 | module SSStackish 1085 | def lineno_push new_lineno 1086 | self.old_lineno = self.lineno 1087 | self.lineno = new_lineno 1088 | end 1089 | 1090 | def lineno_pop 1091 | self.lineno = self.old_lineno 1092 | self.old_lineno = nil 1093 | end 1094 | 1095 | def ss= o 1096 | raise "Clearing ss while in heredoc!?!" if in_heredoc? 1097 | @old_ss = nil 1098 | super 1099 | end 1100 | 1101 | def ss_push new_ss 1102 | @old_ss = self.ss 1103 | @ss = new_ss 1104 | end 1105 | 1106 | def ss_pop 1107 | @ss = self.old_ss 1108 | @old_ss = nil 1109 | end 1110 | end 1111 | 1112 | prepend SSStackish 1113 | end 1114 | 1115 | if ENV["RP_STRTERM_DEBUG"] then 1116 | class RubyLexer 1117 | def d o 1118 | $stderr.puts o.inspect 1119 | end 1120 | 1121 | alias old_lex_strterm= lex_strterm= 1122 | 1123 | def lex_strterm= o 1124 | self.old_lex_strterm= o 1125 | where = caller.first.split(/:/).first(2).join(":") 1126 | $stderr.puts 1127 | d :lex_strterm => [o, where] 1128 | end 1129 | end 1130 | end 1131 | 1132 | require_relative "./ruby_lexer.rex.rb" 1133 | require_relative "./ruby_lexer_strings.rb" 1134 | 1135 | if ENV["RP_LINENO_DEBUG"] then 1136 | class RubyLexer 1137 | def d o 1138 | $stderr.puts o.inspect 1139 | end 1140 | 1141 | alias old_lineno= lineno= 1142 | 1143 | def lineno= n 1144 | self.old_lineno= n 1145 | where = caller.first.split(/:/).first(2).join(":") 1146 | $stderr.puts 1147 | d :lineno => [n, where] 1148 | end 1149 | end 1150 | end 1151 | -------------------------------------------------------------------------------- /lib/ruby_lexer.rex: -------------------------------------------------------------------------------- 1 | # encoding: UTF-8 2 | # 3 | # lexical scanner definition for ruby 4 | 5 | class RubyLexer 6 | 7 | option 8 | 9 | lineno 10 | column 11 | 12 | macro 13 | 14 | IDENT_CHAR /[a-zA-Z0-9_[:^ascii:]]/ 15 | 16 | ESC /\\((?>[0-7]{1,3}|x\h{1,2}|M-[^\\]|(C-|c)[^\\]|u\h{1,4}|u\{\h+(?:\s+\h+)*\}|[^0-7xMCc]))/ 17 | SIMPLE_STRING /((#{ESC}|\#(#{ESC}|[^\{\#\@\$\"\\])|[^\"\\\#])*)/o 18 | SSTRING /((\\.|[^\'])*)/ 19 | 20 | INT_DEC /[+]?(?:(?:[1-9][\d_]*|0)(?!\.\d)(ri|r|i)?\b|0d[0-9_]+)(ri|r|i)?/i 21 | INT_HEX 
/[+]?0x[a-f0-9_]+(ri|r|i)?/i 22 | INT_BIN /[+]?0b[01_]+(ri|r|i)?/i 23 | INT_OCT /[+]?0o?[0-7_]+(ri|r|i)?|0o(ri|r|i)?/i 24 | FLOAT /[+]?\d[\d_]*\.[\d_]+(e[+-]?[\d_]+)?(?:(ri|r|i)\b)?|[+]?[\d_]+e[+-]?[\d_]+(?:(ri|r|i)\b)?/i 25 | INT_DEC2 /[+]?\d[0-9_]*(?![e])((ri|r|i)\b)?/i 26 | 27 | NUM_BAD /[+]?0[xbd]\b/i 28 | INT_OCT_BAD /[+]?0o?[0-7_]*[89]/i 29 | FLOAT_BAD /[+]?\d[\d_]*_(e|\.)/i 30 | 31 | start 32 | 33 | maybe_pop_stack 34 | return process_string_or_heredoc if lex_strterm 35 | 36 | self.cmd_state = self.command_start 37 | self.command_start = false 38 | self.space_seen = false # TODO: rename token_seen? 39 | self.last_state = lex_state 40 | 41 | rule 42 | 43 | # [:state] pattern [actions] 44 | 45 | # \s - \n + \v 46 | /[\ \t\r\f\v]+/ { self.space_seen = true; next } 47 | 48 | /\n|\#/ process_newline_or_comment 49 | 50 | /[\]\)\}]/ process_brace_close 51 | 52 | : /\!/ 53 | | is_after_operator? /\!\@/ { result EXPR_ARG, TOKENS[text], text } 54 | | /\![=~]?/ { result :arg_state, TOKENS[text], text } 55 | 56 | : /\./ 57 | | /\.\.\.?/ process_dots 58 | | /\.\d/ { rb_compile_error "no . floating literal anymore put 0 before dot" } 59 | | /\./ { self.lex_state = EXPR_BEG; result EXPR_DOT, :tDOT, "." } 60 | 61 | /\(/ process_paren 62 | 63 | /\,/ { result EXPR_PAR, TOKENS[text], text } 64 | 65 | : /=/ 66 | | /\=\=\=|\=\=|\=~|\=>|\=(?!begin\b)/ { result arg_state, TOKENS[text], text } 67 | | bol? /\=begin(?=\s)/ process_begin 68 | | /\=(?=begin\b)/ { result arg_state, TOKENS[text], text } 69 | 70 | ruby22_label? /\"#{SIMPLE_STRING}\":/o process_label 71 | /\"(#{SIMPLE_STRING})\"/o process_simple_string 72 | /\"/ { string STR_DQUOTE, '"'; result nil, :tSTRING_BEG, text } 73 | 74 | /\@\@?\d/ { rb_compile_error "`#{text}` is not allowed as a variable name" } 75 | /\@\@?#{IDENT_CHAR}+/o process_ivar 76 | 77 | : /:/ 78 | | not_end? /:([a-zA-Z_]#{IDENT_CHAR}*(?:[?]|[!](?!=)|=(?==>)|=(?![=>]))?)/o process_symbol 79 | | not_end? /\:\"(#{SIMPLE_STRING})\"/o process_symbol 80 | | not_end? /\:\'(#{SSTRING})\'/o process_symbol 81 | | /\:\:/ process_colon2 82 | | /\:/ process_colon1 83 | 84 | /->/ { result EXPR_ENDFN, :tLAMBDA, text } 85 | 86 | /[+-]/ process_plus_minus 87 | 88 | : /[+\d]/ 89 | | /#{NUM_BAD}/o { rb_compile_error "Invalid numeric format" } 90 | | /#{INT_DEC}/o { int_with_base 10 } 91 | | /#{INT_HEX}/o { int_with_base 16 } 92 | | /#{INT_BIN}/o { int_with_base 2 } 93 | | /#{INT_OCT_BAD}/o { rb_compile_error "Illegal octal digit." } 94 | | /#{INT_OCT}/o { int_with_base 8 } 95 | | /#{FLOAT_BAD}/o { rb_compile_error "Trailing '_' in number." } 96 | | /#{FLOAT}/o process_float 97 | | /#{INT_DEC2}/o { int_with_base 10 } 98 | | /[0-9]/ { rb_compile_error "Bad number format" } 99 | 100 | /\[/ process_square_bracket 101 | 102 | was_label? /\'#{SSTRING}\':?/o process_label_or_string 103 | /\'/ { string STR_SQUOTE, "'"; result nil, :tSTRING_BEG, text } 104 | 105 | : /\|/ 106 | | /\|\|\=/ { result EXPR_BEG, :tOP_ASGN, "||" } 107 | | /\|\|/ { result EXPR_BEG, :tOROP, "||" } 108 | | /\|\=/ { result EXPR_BEG, :tOP_ASGN, "|" } 109 | | /\|/ { state = is_after_operator? ? 
EXPR_ARG : EXPR_PAR; result state, :tPIPE, "|" } 110 | 111 | /\{/ process_brace_open 112 | 113 | : /\*/ 114 | | /\*\*=/ { result EXPR_BEG, :tOP_ASGN, "**" } 115 | | /\*\*/ { result :arg_state, space_vs_beginning(:tDSTAR, :tDSTAR, :tPOW), "**" } 116 | | /\*\=/ { result EXPR_BEG, :tOP_ASGN, "*" } 117 | | /\*/ { result :arg_state, space_vs_beginning(:tSTAR, :tSTAR, :tSTAR2), "*" } 118 | 119 | # TODO: fix result+process_lchevron to set command_start = true 120 | : /</ 121 | | /\<\=\>/ { result :arg_state, :tCMP, "<=>" } 122 | | /\<\=/ { result :arg_state, :tLEQ, "<=" } 123 | | /\<\<\=/ { result EXPR_BEG, :tOP_ASGN, "<<" } 124 | | /\<\</ process_lchevron 125 | | /\</ { result :arg_state, :tLT, "<" } 126 | 127 | : />/ 128 | | /\>\=/ { result :arg_state, :tGEQ, ">=" } 129 | | /\>\>=/ { result EXPR_BEG, :tOP_ASGN, ">>" } 130 | | /\>\>/ { result :arg_state, :tRSHFT, ">>" } 131 | | /\>/ { result :arg_state, :tGT, ">" } 132 | 133 | : /\`/ 134 | | expr_fname? /\`/ { result EXPR_END, :tBACK_REF2, "`" } 135 | | expr_dot? /\`/ { result((cmd_state ? EXPR_CMDARG : EXPR_ARG), :tBACK_REF2, "`") } 136 | | /\`/ { string STR_XQUOTE, '`'; result nil, :tXSTRING_BEG, "`" } 137 | 138 | /\?/ process_questionmark 139 | 140 | : /&/ 141 | | /\&\&\=/ { result EXPR_BEG, :tOP_ASGN, "&&" } 142 | | /\&\&/ { result EXPR_BEG, :tANDOP, "&&" } 143 | | /\&\=/ { result EXPR_BEG, :tOP_ASGN, "&" } 144 | | /\&\./ { result EXPR_DOT, :tLONELY, "&." } 145 | | /\&/ process_amper 146 | 147 | /\// process_slash 148 | 149 | : /\^/ 150 | | /\^=/ { result EXPR_BEG, :tOP_ASGN, "^" } 151 | | /\^/ { result :arg_state, :tCARET, "^" } 152 | 153 | /\;/ { self.command_start = true; result EXPR_BEG, :tSEMI, ";" } 154 | 155 | : /~/ 156 | | is_after_operator? /\~@/ { result :arg_state, :tTILDE, "~" } 157 | | /\~/ { result :arg_state, :tTILDE, "~" } 158 | 159 | : /\\/ 160 | | /\\\r?\n/ { self.lineno += 1; self.space_seen = true; next } 161 | | /\\/ { rb_compile_error "bare backslash only allowed before newline" } 162 | 163 | /\%/ process_percent 164 | 165 | : /\$/ 166 | | /\$_\w+/ process_gvar 167 | | /\$_/ process_gvar 168 | | /\$[~*$?!@\/\\;,.=:<>\"]|\$-\w?/ process_gvar 169 | | in_fname? /\$([\&\`\'\+])/ process_gvar 170 | | /\$([\&\`\'\+])/ process_backref 171 | | in_fname? /\$([1-9]\d*)/ process_gvar 172 | | /\$([1-9]\d*)/ process_nthref 173 | | /\$0/ process_gvar 174 | | /\$#{IDENT_CHAR}+/ process_gvar 175 | | /\$\W/ process_gvar_oddity 176 | 177 | /\_/ process_underscore 178 | 179 | /#{IDENT_CHAR}+/o process_token 180 | 181 | /\004|\032|\000|\Z/ { [RubyLexer::EOF, RubyLexer::EOF] } 182 | 183 | /./ { rb_compile_error "Invalid char #{text.inspect} in expression" } 184 | 185 | end 186 | -------------------------------------------------------------------------------- /lib/ruby_lexer_strings.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | class RubyLexer 4 | def eat_whitespace 5 | r = scan(/\s+/) 6 | self.lineno += r.count("\n") if r 7 | 8 | r += eat_whitespace if eos? && in_heredoc? # forces heredoc pop 9 | 10 | r 11 | end 12 | 13 | def heredoc here # ../compare/parse30.y:7678 14 | _, term, func, _indent_max, _lineno, range = here 15 | 16 | start_line = lineno 17 | eos = term # HACK 18 | indent = func =~ STR_FUNC_INDENT 19 | 20 | self.string_buffer = [] 21 | 22 | last_line = self.ss_string[range] if range 23 | eol = last_line && last_line.end_with?("\r\n") ? "\r\n" : "\n" # HACK 24 | 25 | expand = func =~ STR_FUNC_EXPAND 26 | 27 | # TODO? p->heredoc_line_indent == -1 28 | 29 | indent_re = indent ?
"[ \t]*" : nil 30 | eos_re = /#{indent_re}#{Regexp.escape eos}(?=\r?\n|\z)/ 31 | err_msg = "can't match #{eos_re.inspect} anywhere in " 32 | 33 | maybe_pop_stack 34 | rb_compile_error err_msg if end_of_stream? 35 | 36 | if beginning_of_line? && scan(eos_re) then 37 | scan(/\r?\n|\z/) 38 | self.lineno += 1 if matched =~ /\n/ 39 | 40 | heredoc_restore 41 | 42 | self.lex_strterm = nil 43 | self.lex_state = EXPR_END 44 | 45 | return :tSTRING_END, [term, func, range] 46 | end 47 | 48 | if expand then 49 | case 50 | when scan(/#(?=\$(-.|[a-zA-Z_0-9~\*\$\?!@\/\\;,\.=:<>\"\&\`\'+]))/) then 51 | # TODO: !ISASCII 52 | # ?! see parser_peek_variable_name 53 | return :tSTRING_DVAR, matched 54 | when scan(/#(?=\@\@?[a-zA-Z_])/) then 55 | # TODO: !ISASCII 56 | return :tSTRING_DVAR, matched 57 | when scan(/#[{]/) then 58 | self.command_start = true 59 | return :tSTRING_DBEG, [matched, lineno] 60 | when scan(/#/) then 61 | string_buffer << "#" 62 | end 63 | 64 | begin 65 | # NOTE: this visibly diverges from the C code but uses tokadd_string 66 | # to stay clean. 67 | 68 | str = tokadd_string func, eol, nil 69 | rb_compile_error err_msg if str == RubyLexer::EOF 70 | 71 | if str != eol then 72 | str = string_buffer.join 73 | string_buffer.clear 74 | return result nil, :tSTRING_CONTENT, str, start_line 75 | else 76 | string_buffer << scan(/\r?\n/) 77 | self.lineno += 1 # TODO: try to remove most scan(/\n/) and friends 78 | end 79 | end until check eos_re 80 | else 81 | until check(eos_re) do 82 | string_buffer << scan(/.*(\r?\n|\z)/) 83 | self.lineno += 1 84 | rb_compile_error err_msg if end_of_stream? 85 | end 86 | end 87 | 88 | string_content = begin 89 | s = string_buffer.join 90 | s.b.force_encoding Encoding::UTF_8 91 | s 92 | end 93 | string_buffer.clear 94 | 95 | result nil, :tSTRING_CONTENT, string_content, start_line 96 | end 97 | 98 | def heredoc_identifier # ../compare/parse30.y:7354 99 | token = :tSTRING_BEG 100 | func = STR_FUNC_BORING 101 | term = nil 102 | indent = nil 103 | quote = nil 104 | char_pos = nil 105 | byte_pos = nil 106 | 107 | heredoc_indent_mods = "-" 108 | heredoc_indent_mods += '\~' if ruby23plus? 109 | 110 | case 111 | when scan(/([#{heredoc_indent_mods}]?)([\'\"\`])(.*?)\2/) then 112 | mods, quote, term = match[1], match[2], match[3] 113 | char_pos = ss.charpos 114 | byte_pos = ss.pos 115 | 116 | func |= STR_FUNC_INDENT unless mods.empty? 117 | func |= STR_FUNC_DEDENT if mods == "~" 118 | func |= case quote 119 | when "\'" then 120 | STR_SQUOTE 121 | when '"' then 122 | STR_DQUOTE 123 | when "`" then 124 | token = :tXSTRING_BEG 125 | STR_XQUOTE 126 | else 127 | debug 1 128 | end 129 | when scan(/[#{heredoc_indent_mods}]?([\'\"\`])(?!\1*\Z)/) then 130 | rb_compile_error "unterminated here document identifier" 131 | when scan(/([#{heredoc_indent_mods}]?)(#{IDENT_CHAR}+)/) then 132 | mods, term = match[1], match[2] 133 | quote = '"' 134 | char_pos = ss.charpos 135 | byte_pos = ss.pos 136 | 137 | func |= STR_FUNC_INDENT unless mods.empty? 138 | func |= STR_FUNC_DEDENT if mods == "~" 139 | func |= STR_DQUOTE 140 | else 141 | return 142 | end 143 | 144 | old_lineno = self.lineno 145 | rest_of_line = scan(/.*(?:\r?\n|\z)/) 146 | self.lineno += rest_of_line.count "\n" 147 | 148 | char_pos_end = ss.charpos - 1 149 | 150 | range = nil 151 | range = char_pos..char_pos_end unless rest_of_line.empty? 
152 | 153 | self.lex_strterm = [:heredoc, term, func, indent, old_lineno, range, byte_pos] 154 | 155 | result nil, token, quote, old_lineno 156 | end 157 | 158 | def heredoc_restore # ../compare/parse30.y:7438 159 | _, _term, _func, _indent, lineno, range, bytepos = lex_strterm 160 | 161 | new_ss = ss.class.new self.ss_string[0..range.max] 162 | new_ss.pos = bytepos 163 | 164 | lineno_push lineno 165 | ss_push new_ss 166 | 167 | nil 168 | end 169 | 170 | def newtok 171 | string_buffer.clear 172 | end 173 | 174 | def nextc 175 | # TODO: 176 | # if (UNLIKELY((p->lex.pcur == p->lex.pend) || p->eofp || RTEST(p->lex.nextline))) { 177 | # if (nextline(p)) return -1; 178 | # } 179 | 180 | maybe_pop_stack 181 | 182 | c = ss.getch 183 | 184 | if c == "\n" then 185 | ss.unscan 186 | c = nil 187 | end 188 | 189 | c 190 | end 191 | 192 | def parse_string quote # ../compare/parse30.y:7273 193 | _, func, term, paren = quote 194 | 195 | qwords = func =~ STR_FUNC_QWORDS 196 | regexp = func =~ STR_FUNC_REGEXP 197 | expand = func =~ STR_FUNC_EXPAND 198 | list = func =~ STR_FUNC_LIST 199 | termx = func =~ STR_FUNC_TERM # TODO: document wtf this means 200 | 201 | space = false 202 | term_re = regexp_cache[term] 203 | 204 | if termx then 205 | # self.nextc if qwords # delayed term 206 | 207 | self.lex_strterm = nil 208 | 209 | return result EXPR_END, regexp ? :tREGEXP_END : :tSTRING_END, term 210 | end 211 | 212 | space = true if qwords and eat_whitespace 213 | 214 | if list then 215 | debug 4 216 | # quote[1] -= STR_FUNC_LIST 217 | # space = true 218 | end 219 | 220 | # TODO: move to quote.nest! 221 | if string_nest == 0 && scan(term_re) then 222 | if qwords then 223 | quote[1] |= STR_FUNC_TERM 224 | 225 | return :tSPACE, matched 226 | end 227 | 228 | return string_term func 229 | end 230 | 231 | return result nil, :tSPACE, " " if space 232 | 233 | newtok 234 | 235 | if expand && check(/#/) then 236 | t = self.scan_variable_name 237 | return t if t 238 | 239 | tokadd "#" 240 | end 241 | 242 | # TODO: add string_nest, enc, base_enc ? 243 | lineno = self.lineno 244 | if tokadd_string(func, term, paren) == RubyLexer::EOF then 245 | if qwords then 246 | rb_compile_error "unterminated list meets end of file" 247 | end 248 | 249 | if regexp then 250 | rb_compile_error "unterminated regexp meets end of file" 251 | else 252 | rb_compile_error "unterminated string meets end of file" 253 | end 254 | end 255 | 256 | result nil, :tSTRING_CONTENT, string_buffer.join, lineno 257 | end 258 | 259 | # called from process_percent 260 | def process_percent_quote # ../compare/parse30.y:8645 261 | c = getch # type %... 262 | 263 | long_hand = !!(c =~ /[QqWwIixrs]/) 264 | 265 | if end_of_stream? || c !~ /\p{Alnum}/ then 266 | term = c # TODO? PERCENT_END[c] || c 267 | 268 | debug 2 if c && c !~ /\p{ASCII}/ 269 | c = "Q" 270 | else 271 | term = getch 272 | 273 | debug 3 if term =~ /\p{Alnum}|\P{ASCII}/ 274 | end 275 | 276 | if end_of_stream? or c == RubyLexer::EOF or term == RubyLexer::EOF then 277 | rb_compile_error "unterminated quoted string meets end of file" 278 | end 279 | 280 | # "\0" is special to indicate beg=nnd and that no nesting? 281 | paren = term 282 | term = PERCENT_END[term] 283 | term, paren = paren, "\0" if term.nil? # TODO: "\0" -> nil 284 | 285 | text = long_hand ? 
"%#{c}#{paren}" : "%#{term}" 286 | 287 | current_line = self.lineno 288 | 289 | token_type, string_type = 290 | case c 291 | when "Q" then 292 | [:tSTRING_BEG, STR_DQUOTE] 293 | when "q" then 294 | [:tSTRING_BEG, STR_SQUOTE] 295 | when "W" then 296 | eat_whitespace 297 | [:tWORDS_BEG, STR_DQUOTE | STR_FUNC_QWORDS] 298 | when "w" then 299 | eat_whitespace 300 | [:tQWORDS_BEG, STR_SQUOTE | STR_FUNC_QWORDS] 301 | when "I" then 302 | eat_whitespace 303 | [:tSYMBOLS_BEG, STR_DQUOTE | STR_FUNC_QWORDS] 304 | when "i" then 305 | eat_whitespace 306 | [:tQSYMBOLS_BEG, STR_SQUOTE | STR_FUNC_QWORDS] 307 | when "x" then 308 | [:tXSTRING_BEG, STR_XQUOTE] 309 | when "r" then 310 | [:tREGEXP_BEG, STR_REGEXP] 311 | when "s" then 312 | self.lex_state = EXPR_FNAME 313 | [:tSYMBEG, STR_SSYM] 314 | else 315 | rb_compile_error "unknown type of %string. Expected [QqWwIixrs], found '#{c}'." 316 | end 317 | 318 | string string_type, term, paren 319 | 320 | result nil, token_type, text, current_line 321 | end 322 | 323 | def process_string_or_heredoc # ../compare/parse30.y:9075 324 | if lex_strterm[0] == :heredoc then 325 | self.heredoc lex_strterm 326 | else 327 | self.parse_string lex_strterm 328 | end 329 | end 330 | 331 | def read_escape flags = nil # ../compare/parse30.y:6712 332 | case 333 | when scan(/\\/) then # Backslash 334 | '\\' 335 | when scan(/n/) then # newline 336 | "\n" 337 | when scan(/t/) then # horizontal tab 338 | "\t" 339 | when scan(/r/) then # carriage-return 340 | "\r" 341 | when scan(/f/) then # form-feed 342 | "\f" 343 | when scan(/v/) then # vertical tab 344 | "\13" 345 | when scan(/a/) then # alarm(bell) 346 | "\007" 347 | when scan(/e/) then # escape 348 | "\033" 349 | when scan(/[0-7]{1,3}/) then # octal constant 350 | (matched.to_i(8) & 0xFF).chr.force_encoding Encoding::UTF_8 351 | when scan(/x([0-9a-fA-F]{1,2})/) then # hex constant 352 | # TODO: force encode everything to UTF-8? 353 | match[1].to_i(16).chr.force_encoding Encoding::UTF_8 354 | when scan(/b/) then # backspace 355 | "\010" 356 | when scan(/s/) then # space 357 | " " 358 | when check(/M-\\u/) then 359 | debug 5 360 | when scan(/M-\\(?=.)/) then 361 | c = read_escape 362 | c[0] = (c[0].ord | 0x80).chr 363 | c 364 | when scan(/M-(\p{ASCII})/) then 365 | # TODO: ISCNTRL(c) -> goto eof 366 | c = match[1] 367 | c[0] = (c[0].ord | 0x80).chr 368 | c 369 | when check(/(C-|c)\\u/) then 370 | debug 6 371 | when scan(/(C-|c)\\?\?/) then 372 | 127.chr 373 | when scan(/(C-|c)\\/) then 374 | c = read_escape 375 | c[0] = (c[0].ord & 0x9f).chr 376 | c 377 | when scan(/(?:C-|c)(.)/) then 378 | c = match[1] 379 | c[0] = (c[0].ord & 0x9f).chr 380 | c 381 | when scan(/^[89]/i) then # bad octal or hex... MRI ignores them :( 382 | matched 383 | when scan(/u(\h{4})/) then 384 | [match[1].to_i(16)].pack("U") 385 | when scan(/u(\h{1,3})/) then 386 | debug 7 387 | rb_compile_error "Invalid escape character syntax" 388 | when scan(/u\{(\h+(?: +\h+)*)\}/) then 389 | match[1].split.map { |s| s.to_i(16) }.pack("U*") 390 | when scan(/[McCx0-9]/) || end_of_stream? 
then 391 | rb_compile_error("Invalid escape character syntax") 392 | else 393 | getch 394 | end.dup 395 | end 396 | 397 | def regx_options # ../compare/parse30.y:6914 398 | newtok 399 | 400 | options = scan(/\p{Alpha}+/) || "" 401 | 402 | rb_compile_error("unknown regexp options: %s" % [options]) if 403 | options =~ /[^ixmonesu]/ 404 | 405 | options 406 | end 407 | 408 | def scan_variable_name # ../compare/parse30.y:7208 409 | case 410 | when scan(/#(?=\$(-.|[a-zA-Z_0-9~\*\$\?!@\/\\;,\.=:<>\"\&\`\'+]))/) then 411 | # TODO: !ISASCII 412 | return :tSTRING_DVAR, matched 413 | when scan(/#(?=\@\@?[a-zA-Z_])/) then 414 | # TODO: !ISASCII 415 | return :tSTRING_DVAR, matched 416 | when scan(/#[{]/) then 417 | self.command_start = true 418 | return :tSTRING_DBEG, [matched, lineno] 419 | when scan(/#/) then 420 | # do nothing but swallow 421 | end 422 | 423 | # if scan(/\P{ASCII}|_|\p{Alpha}/) then # TODO: fold into above DVAR cases 424 | # # if (!ISASCII(c) || c == '_' || ISALPHA(c)) 425 | # # return tSTRING_DVAR; 426 | # end 427 | 428 | nil 429 | end 430 | 431 | def string type, beg, nnd = nil 432 | # label = (IS_LABEL_POSSIBLE() ? str_label : 0); 433 | # p->lex.strterm = NEW_STRTERM(str_dquote | label, '"', 0); 434 | # p->lex.ptok = p->lex.pcur-1; 435 | 436 | type |= STR_FUNC_LABEL if is_label_possible? 437 | self.lex_strterm = [:strterm, type, beg, nnd || "\0"] 438 | end 439 | 440 | def string_term func # ../compare/parse30.y:7254 441 | self.lex_strterm = nil 442 | 443 | return result EXPR_END, :tREGEXP_END, self.regx_options if 444 | func =~ STR_FUNC_REGEXP 445 | 446 | if func =~ STR_FUNC_LABEL && is_label_suffix? then 447 | self.getch 448 | self.lex_state = EXPR_BEG|EXPR_LABEL 449 | 450 | return :tLABEL_END, string_buffer.join 451 | end 452 | 453 | self.lex_state = EXPR_END 454 | 455 | return :tSTRING_END, [self.matched, func] 456 | end 457 | 458 | def tokadd c # ../compare/parse30.y:6548 459 | string_buffer << c 460 | end 461 | 462 | def tokadd_escape # ../compare/parse30.y:6840 463 | case 464 | when scan(/\\\n/) then 465 | # just ignore 466 | when scan(/\\([0-7]{1,3}|x[0-9a-fA-F]{1,2})/) then 467 | tokadd matched 468 | when scan(/\\([MC]-|c)(?=\\)/) then 469 | tokadd matched 470 | self.tokadd_escape 471 | when scan(/\\([MC]-|c)(.)/) then 472 | tokadd matched 473 | 474 | self.tokadd_escape if check(/\\/) # recurse if continued! 475 | when scan(/\\[McCx]/) then # all unprocessed branches from above have failed 476 | rb_compile_error "Invalid escape character syntax" 477 | when scan(/\\(.)/m) then 478 | chr, = match[1] 479 | 480 | tokadd "\\" 481 | tokadd chr 482 | else 483 | rb_compile_error "Invalid escape character syntax: %p" % [self.rest.lines.first] 484 | end 485 | end 486 | 487 | def tokadd_string func, term, paren # ../compare/parse30.y:7020 488 | qwords = func =~ STR_FUNC_QWORDS 489 | escape = func =~ STR_FUNC_ESCAPE 490 | expand = func =~ STR_FUNC_EXPAND 491 | regexp = func =~ STR_FUNC_REGEXP 492 | 493 | paren_re = regexp_cache[paren] if paren != "\0" 494 | term_re = if term == "\n" 495 | /\r?\n/ 496 | else 497 | regexp_cache[term] 498 | end 499 | 500 | until end_of_stream? do 501 | case 502 | when paren_re && scan(paren_re) then 503 | self.string_nest += 1 504 | when scan(term_re) then 505 | if self.string_nest == 0 then 506 | self.pos -= 1 # TODO: ss.unscan 665 errors #$ HACK: why do we depend on this so hard? 
507 | break # leave eos loop, go parse term in caller (heredoc or parse_string) 508 | else 509 | self.lineno += matched.count("\n") 510 | self.string_nest -= 1 511 | end 512 | 513 | when expand && check(/#[\$\@\{]/) then 514 | # do nothing since we used `check` 515 | break # leave eos loop 516 | when check(/\\/) then 517 | case 518 | when scan(/\\\n/) then 519 | self.lineno += 1 520 | case 521 | when qwords then 522 | tokadd "\n" 523 | next 524 | when expand then 525 | next if func !~ STR_FUNC_INDENT 526 | 527 | if term == "\n" then 528 | unscan # rollback 529 | scan(/\\/) # and split 530 | scan(/\n/) # this is `matched` 531 | break 532 | end 533 | 534 | tokadd "\\" 535 | debug 9 536 | else 537 | unscan # rollback 538 | scan(/\\/) # this is `matched` 539 | end 540 | when check(/\\\\/) then 541 | tokadd '\\' if escape 542 | nextc # ignore 1st \\ 543 | nextc # for tokadd ss.matched, below 544 | when scan(/\\u/) then 545 | unless expand then 546 | tokadd "\\" 547 | next 548 | end 549 | 550 | tokadd_utf8 term, func, regexp 551 | 552 | next 553 | else 554 | scan(/\\/) # eat it, we know it's there 555 | 556 | return RubyLexer::EOF if end_of_stream? 557 | 558 | if scan(/\P{ASCII}/) then 559 | tokadd "\\" unless expand 560 | tokadd self.matched 561 | next 562 | end 563 | 564 | case 565 | when regexp then 566 | if term !~ SIMPLE_RE_META && scan(term_re) then 567 | tokadd matched 568 | next 569 | end 570 | 571 | self.pos -= 1 # TODO: ss.unscan 15 errors 572 | # HACK? decide whether to eat the \\ above 573 | if _esc = tokadd_escape && end_of_stream? then 574 | debug 10 575 | end 576 | 577 | next # C's continue = Ruby's next 578 | when expand then 579 | tokadd "\\" if escape 580 | tokadd read_escape 581 | next 582 | when qwords && scan(/\s/) then 583 | # ignore backslashed spaces in %w 584 | when !check(term_re) && !(paren_re && check(paren_re)) then 585 | tokadd "\\" 586 | next 587 | else 588 | getch # slurp it too for matched below 589 | end 590 | end # inner case for /\\/ 591 | 592 | when scan(/\P{ASCII}/) then 593 | # not currently checking encoding stuff -- drops to tokadd below 594 | when qwords && check(/\s/) then 595 | break # leave eos loop 596 | else 597 | t = Regexp.escape term == "\n" ? "\r\n" : term 598 | x = Regexp.escape paren if paren && paren != "\000" 599 | q = "\\s" if qwords 600 | re = /[^#{t}#{x}\#\\#{q}]+/ 601 | 602 | scan re or getch 603 | self.lineno += matched.count "\n" if matched 604 | end # big case 605 | 606 | tokadd self.matched 607 | end # until end_of_stream? 608 | 609 | if self.matched then 610 | self.matched 611 | elsif end_of_stream? then 612 | RubyLexer::EOF 613 | end 614 | end # tokadd_string 615 | 616 | def tokadd_utf8 term, func, regexp_literal # ../compare/parse30.y:6646 617 | tokadd "\\u" if regexp_literal 618 | 619 | case 620 | when scan(/\h{4}/) then 621 | codepoint = [matched.to_i(16)].pack("U") 622 | 623 | tokadd regexp_literal ? 
matched : codepoint 624 | when scan(/\{\s*(\h{1,6}(?:\s+\h{1,6})*)\s*\}/) then 625 | codepoints = match[1].split.map { |s| s.to_i 16 }.pack("U") 626 | 627 | if regexp_literal then 628 | tokadd "{" 629 | tokadd match[1].split.join(" ") 630 | tokadd "}" 631 | else 632 | tokadd codepoints 633 | end 634 | else 635 | rb_compile_error "unterminated Unicode escape" 636 | end 637 | end 638 | end 639 | -------------------------------------------------------------------------------- /lib/ruby_parser.rb: -------------------------------------------------------------------------------- 1 | require "ruby_parser_extras" 2 | require "racc/parser" 3 | 4 | ## 5 | # RubyParser is a compound parser that uses all known versions to 6 | # attempt to parse. 7 | 8 | class RubyParser 9 | 10 | VERSIONS = [] 11 | 12 | attr_accessor :current 13 | 14 | def self.for_current_ruby 15 | name = "V#{RUBY_VERSION[/^\d+\.\d+/].delete "."}" 16 | klass = if const_defined? name then 17 | const_get name 18 | else 19 | latest = VERSIONS.first 20 | warn "NOTE: RubyParser::#{name} undefined, using #{latest}." 21 | latest 22 | end 23 | 24 | klass.new 25 | end 26 | 27 | def self.latest 28 | VERSIONS.first.new 29 | end 30 | 31 | def process s, f = "(string)", t = 10 32 | e = nil 33 | VERSIONS.each do |klass| 34 | self.current = parser = klass.new 35 | begin 36 | return parser.process s, f, t 37 | rescue Racc::ParseError, RubyParser::SyntaxError => exc 38 | e ||= exc 39 | end 40 | end 41 | raise e 42 | end 43 | 44 | alias :parse :process 45 | 46 | def reset 47 | # do nothing 48 | end 49 | 50 | class Parser < Racc::Parser 51 | include RubyParserStuff 52 | 53 | def self.inherited x 54 | RubyParser::VERSIONS << x 55 | end 56 | 57 | def self.version= v 58 | @version = v 59 | end 60 | 61 | def self.version 62 | @version ||= Parser > self && self.name[/(?:V|Ruby)(\d+)/, 1].to_i 63 | end 64 | end 65 | 66 | class SyntaxError < RuntimeError; end 67 | end 68 | 69 | ## 70 | # Unfortunately a problem with racc is that it won't let me namespace 71 | # properly, so instead of RubyParser::V25, I still have to generate 72 | # the old RubyParser25 and shove it in as V25. 
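##
# Illustrative sketch (added annotation, not part of the upstream source)
# of how the pieces defined in this file fit together; the V## constants
# referenced here are set up at the bottom of the file.
#
#   RubyParser.new.parse "a = 1"      # compound parser: tries each entry in
#                                     # VERSIONS (newest grammar first) and
#                                     # re-raises the first error if all fail
#   RubyParser::V27.new.parse "a = 1" # pins a single grammar version instead
#   RubyParser.for_current_ruby       # picks the V## matching RUBY_VERSION,
#                                     # or warns and falls back to the newest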
73 | 74 | require "ruby_parser20" 75 | require "ruby_parser21" 76 | require "ruby_parser22" 77 | require "ruby_parser23" 78 | require "ruby_parser24" 79 | require "ruby_parser25" 80 | require "ruby_parser26" 81 | require "ruby_parser27" 82 | require "ruby_parser30" 83 | require "ruby_parser31" 84 | require "ruby_parser32" 85 | require "ruby_parser33" 86 | require "ruby_parser34" 87 | 88 | class RubyParser # HACK 89 | VERSIONS.clear # also a HACK caused by racc namespace issues 90 | 91 | class V34 < ::Ruby33Parser; end 92 | class V33 < ::Ruby33Parser; end 93 | class V32 < ::Ruby32Parser; end 94 | class V31 < ::Ruby31Parser; end 95 | class V30 < ::Ruby30Parser; end 96 | class V27 < ::Ruby27Parser; end 97 | class V26 < ::Ruby26Parser; end 98 | class V25 < ::Ruby25Parser; end 99 | class V24 < ::Ruby24Parser; end 100 | class V23 < ::Ruby23Parser; end 101 | class V22 < ::Ruby22Parser; end 102 | class V21 < ::Ruby21Parser; end 103 | class V20 < ::Ruby20Parser; end 104 | end 105 | -------------------------------------------------------------------------------- /lib/ruby_parser_extras.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "sexp" 4 | require "ruby_lexer" 5 | require "timeout" 6 | require "rp_extensions" 7 | require "rp_stringscanner" 8 | 9 | class Sexp 10 | def check_line_numbers 11 | raise "bad nil line for:\n%s" % [self.pretty_inspect] if nil_line? 12 | raise "bad line number for:\n%s" % [self.pretty_inspect] unless 13 | Integer === self.line && 14 | self.line >= 1 && 15 | self.line <= self.line_min 16 | end 17 | 18 | ## 19 | # Returns the minimum line number of the children of self. 20 | 21 | def line_min 22 | @line_min ||= [self.deep_each.map(&:line).min, self.line].compact.min 23 | end 24 | 25 | def nil_line? 26 | self.deep_each.map(&:line).any?(&:nil?) 27 | end 28 | end 29 | 30 | module RubyParserStuff 31 | VERSION = "3.21.1" 32 | 33 | attr_accessor :lexer, :in_def, :in_single, :file, :in_argdef 34 | attr_accessor :in_kwarg 35 | attr_reader :env 36 | 37 | ## 38 | # Canonicalize conditionals. Eg: 39 | # 40 | # not x ? a : b 41 | # 42 | # becomes: 43 | # 44 | # x ? b : a 45 | 46 | attr_accessor :canonicalize_conditions 47 | 48 | ## 49 | # The last token type returned from #next_token 50 | 51 | attr_accessor :last_token_type 52 | 53 | $good20 = [] 54 | 55 | %w[ 56 | ].map(&:to_i).each do |n| 57 | $good20[n] = n 58 | end 59 | 60 | def debug20 n, v = nil, r = nil 61 | raise "not yet #{n} #{v.inspect} => #{r.inspect}" unless $good20[n] 62 | end 63 | 64 | def self.deprecate old, new 65 | define_method old do |*args| 66 | warn "DEPRECATED: #{old} -> #{new} from #{caller.first}" 67 | send new, *args 68 | end 69 | end 70 | 71 | ## 72 | # for pure ruby systems only 73 | 74 | def do_parse 75 | _racc_do_parse_rb(_racc_setup, false) 76 | end if ENV["PURE_RUBY"] || ENV["CHECK_LINE_NUMS"] 77 | 78 | if ENV["CHECK_LINE_NUMS"] then 79 | def _racc_do_reduce arg, act 80 | x = super 81 | 82 | @racc_vstack.grep(Sexp).each do |sexp| 83 | sexp.check_line_numbers 84 | end 85 | x 86 | end 87 | end 88 | 89 | ARG_TYPES = [:arglist, :call_args, :array, :args].map { |k| 90 | [k, true] 91 | }.to_h 92 | 93 | # This is in sorted order of occurrence according to 94 | # charlock_holmes against 500k files, with UTF_8 forced 95 | # to the top. 96 | # 97 | # Overwrite this contstant if you need something different. 
98 | ENCODING_ORDER = [ 99 | Encoding::UTF_8, # moved to top to reflect default in 2.0 100 | Encoding::ISO_8859_1, 101 | Encoding::ISO_8859_2, 102 | Encoding::ISO_8859_9, 103 | Encoding::SHIFT_JIS, 104 | Encoding::WINDOWS_1252, 105 | Encoding::EUC_JP 106 | ] 107 | 108 | JUMP_TYPE = [:return, :next, :break, :yield].map { |k| [k, true] }.to_h 109 | 110 | TAB_WIDTH = 8 111 | 112 | def initialize(options = {}) 113 | super() 114 | 115 | v = self.class.name[/[23]\d/] 116 | raise "Bad Class name #{self.class}" unless v 117 | 118 | self.lexer = RubyLexer.new v && v.to_i 119 | self.lexer.parser = self 120 | self.in_kwarg = false 121 | self.in_argdef = false 122 | 123 | @env = RubyParserStuff::Environment.new 124 | 125 | @canonicalize_conditions = true 126 | 127 | self.reset 128 | end 129 | 130 | def arg_concat node1, node2 # TODO: nuke 131 | raise "huh" unless node2 132 | 133 | splat = s(:splat, node2) 134 | splat.line node2.line 135 | 136 | node1 << splat 137 | end 138 | 139 | def argl x 140 | x = s(:arglist, x) if x and x.sexp_type == :array 141 | x 142 | end 143 | 144 | def args args 145 | result = s(:args) 146 | 147 | ss = args.grep Sexp 148 | if ss.empty? then 149 | result.line lexer.lineno 150 | else 151 | result.line ss.first.line 152 | result.line_max = ss.first.line_max 153 | end 154 | 155 | args.each do |arg| 156 | if arg.instance_of? Array and arg.size == 2 and arg.last.is_a? Numeric then 157 | arg = arg.first 158 | end 159 | 160 | case arg 161 | when Sexp then 162 | case arg.sexp_type 163 | when :args, :block, :array, :call_args then # HACK call_args mismatch 164 | rest = arg.sexp_body 165 | 166 | rest.map! { |x| 167 | if x.instance_of? Array and x.size == 2 and Numeric === x.last then 168 | x.first 169 | else 170 | x 171 | end 172 | } 173 | 174 | result.concat rest 175 | when :forward_args then 176 | self.env[:*] = :lvar # TODO: arg_var(p, idFWD_REST) ? 177 | self.env[:**] = :lvar 178 | self.env[:&] = :lvar 179 | 180 | result << arg 181 | when :block_arg then 182 | result << :"&#{arg.last}" 183 | when :shadow then 184 | name = arg.last 185 | self.env[name] = :lvar 186 | if Sexp === result.last and result.last.sexp_type == :shadow then 187 | result.last << name 188 | else 189 | result << arg 190 | end 191 | when :masgn, :block_pass, :hash then # HACK: remove. prolly call_args 192 | result << arg 193 | else 194 | raise "unhandled: #{arg.sexp_type} in #{args.inspect}" 195 | end 196 | when Symbol then 197 | name = arg.to_s.delete("&*") 198 | self.env[name.to_sym] = :lvar unless name.empty? 199 | result << arg 200 | when true, false then 201 | self.in_kwarg = arg 202 | when ",", "|", ";", "(", ")", nil then 203 | # ignore 204 | else 205 | raise "unhandled: #{arg.inspect} in #{args.inspect}" 206 | end 207 | end 208 | 209 | result 210 | end 211 | 212 | def end_args args 213 | lexer.lex_state = RubyLexer::State::Values::EXPR_BEG 214 | lexer.command_start = true 215 | self.args args 216 | end 217 | 218 | def attrset_id? id 219 | id =~ /^\[\]=$|^\w+=$/ 220 | end 221 | 222 | def endless_method_name defn_or_defs 223 | _, name, maybe_name, * = defn_or_defs 224 | name = maybe_name unless Symbol === name 225 | 226 | if attrset_id? name then 227 | yyerror "setter method cannot be defined in an endless method definition" 228 | end 229 | 230 | # TODO? 
token_info_drop(p, "def", loc->beg_pos); 231 | end 232 | 233 | def array_to_hash array 234 | case array.sexp_type 235 | when :kwsplat then 236 | array 237 | else 238 | s(:hash, *array.sexp_body).line array.line 239 | end 240 | end 241 | 242 | def aryset receiver, index 243 | index ||= s() 244 | l = receiver.line 245 | result = s(:attrasgn, receiver, :"[]=", 246 | *index.sexp_body).compact # [].sexp_body => nil 247 | result.line = l 248 | result 249 | end 250 | 251 | def assignable(lhs, value = nil) 252 | id, line = lhs 253 | id = id.to_sym 254 | 255 | result = case id 256 | when /^@@/ then 257 | asgn = in_def || in_single > 0 258 | s((asgn ? :cvasgn : :cvdecl), id) 259 | when /^@/ then 260 | s(:iasgn, id) 261 | when /^\$/ then 262 | s(:gasgn, id) 263 | when /^[A-Z]/ then 264 | s(:cdecl, id) 265 | else 266 | case self.env[id] 267 | when :lvar, :dvar, nil then 268 | s(:lasgn, id) 269 | else 270 | raise "wtf? unknown type: #{self.env[id]}" 271 | end 272 | end 273 | 274 | self.env[id] ||= :lvar if result.sexp_type == :lasgn 275 | 276 | result << value if value 277 | result.line line 278 | result 279 | end 280 | 281 | def backref_assign_error ref 282 | # TODO: need a test for this... obviously 283 | case ref.sexp_type 284 | when :nth_ref then 285 | raise "write a test 2" 286 | raise SyntaxError, "Can't set variable %p" % ref.last 287 | when :back_ref then 288 | raise "write a test 3" 289 | raise SyntaxError, "Can't set back reference %p" % ref.last 290 | else 291 | raise "Unknown backref type: #{ref.inspect}" 292 | end 293 | end 294 | 295 | def block_append(head, tail) 296 | return head if tail.nil? 297 | return tail if head.nil? 298 | 299 | line = [head.line, tail.line].compact.min 300 | 301 | head = remove_begin(head) 302 | head = s(:block, head).line(line) unless head.sexp_type == :block 303 | 304 | # head.line = line 305 | head << tail 306 | end 307 | 308 | def block_dup_check call_or_args, block 309 | syntax_error "Both block arg and actual block given." if 310 | block and call_or_args.block_pass? 311 | end 312 | 313 | def block_var *args 314 | result = self.args args 315 | result.sexp_type = :masgn 316 | result 317 | end 318 | 319 | def call_args args 320 | result = s(:call_args) 321 | 322 | a = args.grep(Sexp).first 323 | if a then 324 | result.line a.line 325 | else 326 | result.line lexer.lineno 327 | end 328 | 329 | args.each do |arg| 330 | # ruby 3.0+ TODO: next if arg in [String, Integer] # eg ["(", 1] 331 | next if arg.class == Array && arg.map(&:class) == [String, Integer] 332 | 333 | case arg 334 | when Sexp then 335 | case arg.sexp_type 336 | when :array, :args, :call_args then # HACK? 
remove array at some point 337 | result.sexp_body += arg.sexp_body 338 | else 339 | result << arg 340 | end 341 | when Symbol then 342 | result << arg 343 | when Array then 344 | id, _line = arg 345 | result << id 346 | when ",", nil, "(" then 347 | # ignore 348 | else 349 | raise "unhandled: #{arg.inspect} in #{args.inspect}" 350 | end 351 | end 352 | 353 | result 354 | end 355 | 356 | def clean_mlhs sexp 357 | case sexp.sexp_type 358 | when :masgn then 359 | if sexp.size == 2 and sexp[1].sexp_type == :array then 360 | s(:masgn, *sexp[1].sexp_body.map { |sub| clean_mlhs sub }) 361 | else 362 | debug20 5 363 | sexp 364 | end 365 | when :gasgn, :iasgn, :lasgn, :cvasgn then 366 | if sexp.size == 2 then 367 | sexp.last 368 | else 369 | debug20 7 370 | sexp # optional value 371 | end 372 | else 373 | raise "unsupported type: #{sexp.inspect}" 374 | end 375 | end 376 | 377 | def cond node 378 | return nil if node.nil? 379 | node = value_expr node 380 | 381 | case node.sexp_type 382 | when :lit then 383 | if Regexp === node.last then 384 | s(:match, node) 385 | else 386 | node 387 | end 388 | when :and then 389 | _, lhs, rhs = node 390 | s(:and, cond(lhs), cond(rhs)) 391 | when :or then 392 | _, lhs, rhs = node 393 | s(:or, cond(lhs), cond(rhs)) 394 | when :dot2 then 395 | label = "flip#{node.hash}" 396 | env[label] = :lvar 397 | _, lhs, rhs = node 398 | s(:flip2, lhs, rhs) # TODO: recurse? 399 | when :dot3 then 400 | label = "flip#{node.hash}" 401 | env[label] = :lvar 402 | _, lhs, rhs = node 403 | s(:flip3, lhs, rhs) 404 | else 405 | node 406 | end.line node.line 407 | end 408 | 409 | def dedent sexp 410 | dedent_count = dedent_size sexp 411 | 412 | skip_one = false 413 | sexp.map { |obj| 414 | case obj 415 | when Symbol then 416 | obj 417 | when String then 418 | obj.lines.map { |l| remove_whitespace_width l, dedent_count }.join 419 | when Sexp then 420 | case obj.sexp_type 421 | when :evstr then 422 | skip_one = true 423 | obj 424 | when :str then 425 | _, str = obj 426 | str = if skip_one then 427 | skip_one = false 428 | s1, *rest = str.lines 429 | s1 + rest.map { |l| remove_whitespace_width l, dedent_count }.join 430 | else 431 | str.lines.map { |l| remove_whitespace_width l, dedent_count }.join 432 | end 433 | 434 | s(:str, str).line obj.line 435 | else 436 | warn "unprocessed sexp %p" % [obj] 437 | end 438 | else 439 | warn "unprocessed: %p" % [obj] 440 | end 441 | } 442 | end 443 | 444 | def dedent_size sexp 445 | skip_one = false 446 | sexp.flat_map { |s| 447 | case s 448 | when Symbol then 449 | next 450 | when String then 451 | s.lines 452 | when Sexp then 453 | case s.sexp_type 454 | when :evstr then 455 | skip_one = true 456 | next 457 | when :str then 458 | _, str = s 459 | lines = str.lines 460 | if skip_one then 461 | skip_one = false 462 | lines.shift 463 | end 464 | lines 465 | else 466 | warn "unprocessed sexp %p" % [s] 467 | end 468 | else 469 | warn "unprocessed: %p" % [s] 470 | end.map { |l| whitespace_width l } 471 | }.compact.min 472 | end 473 | 474 | def dedent_string string, width 475 | characters_skipped = 0 476 | indentation_skipped = 0 477 | 478 | string.chars.each do |char| 479 | break if indentation_skipped >= width 480 | if char == " " 481 | characters_skipped += 1 482 | indentation_skipped += 1 483 | elsif char == "\t" 484 | proposed = TAB_WIDTH * (indentation_skipped / TAB_WIDTH + 1) 485 | break if proposed > width 486 | characters_skipped += 1 487 | indentation_skipped = proposed 488 | end 489 | end 490 | string[characters_skipped..-1] 491 | end 492 | 493 | 
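# Annotation (added for clarity, not part of the upstream source): a worked
# example of dedent_string above, which strips up to `width` columns of
# leading whitespace, expanding tabs to TAB_WIDTH (8) column stops:
#
#   dedent_string "      foo", 4   # => "  foo"  (four leading spaces removed)
#   dedent_string "\tfoo", 4       # => "\tfoo"  (the tab would jump to column
#                                  #    8, past width 4, so it is kept intact)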
def gettable(id) 494 | id = id.to_sym if String === id 495 | 496 | result = case id.to_s 497 | when /^@@/ then 498 | s(:cvar, id) 499 | when /^@/ then 500 | s(:ivar, id) 501 | when /^\$/ then 502 | s(:gvar, id) 503 | when /^[A-Z]/ then 504 | s(:const, id) 505 | else 506 | type = env[id] 507 | if type then 508 | s(type, id) 509 | else 510 | new_call(nil, id) 511 | end 512 | end 513 | 514 | raise "identifier #{id.inspect} is not valid" unless result 515 | 516 | result 517 | end 518 | 519 | def hack_encoding str, extra = nil 520 | encodings = ENCODING_ORDER.dup 521 | encodings.unshift(extra) unless extra.nil? 522 | 523 | # terrible, horrible, no good, very bad, last ditch effort. 524 | encodings.each do |enc| 525 | begin 526 | str.force_encoding enc 527 | if str.valid_encoding? then 528 | str.encode! Encoding::UTF_8 529 | break 530 | end 531 | rescue ArgumentError # unknown encoding name 532 | # do nothing 533 | rescue Encoding::InvalidByteSequenceError 534 | # do nothing 535 | rescue Encoding::UndefinedConversionError 536 | # do nothing 537 | end 538 | end 539 | 540 | # no amount of pain is enough for you. 541 | raise "Bad encoding. Need a magic encoding comment." unless 542 | str.encoding.name == "UTF-8" 543 | end 544 | 545 | ## 546 | # Returns a UTF-8 encoded string after processing BOMs and magic 547 | # encoding comments. 548 | # 549 | # Holy crap... ok. Here goes: 550 | # 551 | # Ruby's file handling and encoding support is insane. We need to be 552 | # able to lex a file. The lexer file is explicitly UTF-8 to make 553 | # things cleaner. This allows us to deal with extended chars in 554 | # class and method names. In order to do this, we need to encode all 555 | # input source files as UTF-8. First, we look for a UTF-8 BOM by 556 | # looking at the first line while forcing its encoding to 557 | # ASCII-8BIT. If we find a BOM, we strip it and set the expected 558 | # encoding to UTF-8. Then, we search for a magic encoding comment. 559 | # If found, it overrides the BOM. Finally, we force the encoding of 560 | # the input string to whatever was found, and then encode that to 561 | # UTF-8 for compatibility with the lexer. 562 | 563 | def handle_encoding str 564 | str = str.dup 565 | encoding = nil 566 | 567 | header = str.each_line.first(2) 568 | header.map! { |s| s.force_encoding "ASCII-8BIT" } 569 | 570 | first = header.first || "" 571 | encoding, str = +"utf-8", str.b[3..-1] if first =~ /\A\xEF\xBB\xBF/n 572 | 573 | encoding = $1.strip if header.find { |s| 574 | s[/^#.*?-\*-.*?coding:\s*([^ ;]+).*?-\*-/, 1] || 575 | s[/^#.*(?:en)?coding(?:\s*[:=])\s*([\w-]+)/, 1] 576 | } 577 | 578 | if encoding then 579 | encoding.sub!(/utf-8-.+$/, "utf-8") # HACK for stupid emacs formats 580 | hack_encoding str, encoding 581 | else 582 | # nothing specified... ugh. try to encode as utf-8 583 | hack_encoding str 584 | end 585 | 586 | str 587 | end 588 | 589 | def invert_block_call val 590 | ret, iter = val 591 | type, call = ret 592 | 593 | iter.insert 1, call 594 | 595 | ret = s(type).line ret.line 596 | 597 | [iter, ret] 598 | end 599 | 600 | def inverted? 
val 601 | JUMP_TYPE[val[0].sexp_type] 602 | end 603 | 604 | def list_append list, item # TODO: nuke me *sigh* 605 | return s(:array, item) unless list 606 | list = s(:array, list) unless Sexp === list && list.sexp_type == :array 607 | list << item 608 | end 609 | 610 | def list_prepend item, list # TODO: nuke me *sigh* 611 | list = s(:array, list) unless Sexp === list && list.sexp_type == :array 612 | list.insert 1, item 613 | list 614 | end 615 | 616 | def literal_concat head, tail # TODO: ugh. rewrite 617 | return tail unless head 618 | return head unless tail 619 | 620 | htype, ttype = head.sexp_type, tail.sexp_type 621 | 622 | head = s(:dstr, "", head).line head.line if htype == :evstr 623 | 624 | case ttype 625 | when :str then 626 | if htype == :str 627 | a, b = head.last, tail.last 628 | b = b.dup.force_encoding a.encoding unless Encoding.compatible?(a, b) 629 | a << b 630 | elsif htype == :dstr and head.size == 2 then 631 | head.last << tail.last 632 | else 633 | head << tail 634 | end 635 | when :dstr then 636 | if htype == :str then 637 | lineno = head.line 638 | _, h1 = head 639 | _, t1, *rest = tail 640 | tail.sexp_body = [h1 + t1, *rest] 641 | 642 | head = tail 643 | head.line = lineno 644 | else 645 | tail.sexp_type = :array 646 | _, tail_s, *tail_r = tail 647 | if tail_s == "" then 648 | tail.sexp_body = tail_r 649 | else 650 | tail.sexp_body = [s(:str, tail_s).line(tail.line), *tail_r] 651 | end 652 | 653 | head.push(*tail.sexp_body) 654 | end 655 | when :evstr then 656 | if htype == :str then 657 | f, l = head.file, head.line 658 | head = s(:dstr, *head.sexp_body) 659 | head.file = f 660 | head.line = l 661 | end 662 | 663 | _, t1, * = tail 664 | if head.size == 2 and tail.size > 1 and t1.sexp_type == :str then 665 | _, h1 = head 666 | head.sexp_body = [h1.dup] if h1.frozen? # this is dumb 667 | head.last << t1.last 668 | head.sexp_type = :str if head.size == 2 # HACK ? 
669 | else 670 | head.push(tail) 671 | end 672 | else 673 | x = [head, tail] 674 | raise "unknown type: #{x.inspect}" 675 | end 676 | 677 | return head 678 | end 679 | 680 | def local_pop in_def 681 | lexer.cond.pop # group = local_pop 682 | lexer.cmdarg.pop 683 | self.env.unextend 684 | self.in_def = in_def 685 | end 686 | 687 | def logical_op type, left, right 688 | left = value_expr left 689 | 690 | if left and left.sexp_type == type and not left.paren then 691 | node, rhs = left, nil 692 | 693 | loop do 694 | _, _lhs, rhs = node 695 | break unless rhs && rhs.sexp_type == type and not rhs.paren 696 | node = rhs 697 | end 698 | 699 | node.pop 700 | node << s(type, rhs, right).line(rhs.line) 701 | 702 | return left 703 | end 704 | 705 | result = s(type, left, right) 706 | result.line left.line if left.line 707 | result 708 | end 709 | 710 | def new_aref val 711 | val[2] ||= s(:arglist) 712 | val[2].sexp_type = :arglist if val[2].sexp_type == :array # REFACTOR 713 | new_call val[0], :"[]", val[2] 714 | end 715 | 716 | def new_arg val 717 | arg, = val 718 | 719 | case arg 720 | when Symbol then 721 | result = s(:args, arg).line line 722 | when Sexp then 723 | result = arg 724 | when Array then 725 | (arg, line), = val 726 | result = s(:args, arg).line line 727 | else 728 | debug20 32 729 | raise "Unknown f_arg type: #{val.inspect}" 730 | end 731 | 732 | result 733 | end 734 | 735 | def ary_to_pat ary 736 | pat = ary.dup 737 | pat.sexp_type = :array_TAIL 738 | 739 | new_array_pattern nil, nil, pat, ary.line 740 | end 741 | 742 | def new_array_pattern const, pre_arg, arypat, loc 743 | result = s(:array_pat, const).line loc 744 | result << pre_arg if pre_arg 745 | 746 | if arypat && arypat.sexp_type == :array_TAIL then 747 | result.concat arypat.sexp_body 748 | else 749 | raise "NO?: %p" % [arypat] 750 | end 751 | 752 | result 753 | end 754 | 755 | def array_pat_concat lhs, rhs 756 | case lhs.sexp_type 757 | when :PATTERN then 758 | lhs.sexp_type = :array_pat 759 | end 760 | 761 | if rhs then 762 | case rhs.sexp_type 763 | when :array_pat, :array_TAIL, :PATTERN then 764 | lhs.concat rhs.sexp_body 765 | else 766 | lhs << rhs 767 | end 768 | end 769 | end 770 | 771 | def new_array_pattern_tail pre_args, has_rest, rest_arg, post_args 772 | # TODO: remove has_rest once all tests pass !!! 773 | rest_arg = if has_rest then 774 | :"*#{rest_arg}" 775 | else 776 | nil 777 | end 778 | 779 | result = s(:array_TAIL).line 666 780 | 781 | array_pat_concat result, pre_args 782 | 783 | result << rest_arg if rest_arg 784 | 785 | array_pat_concat result, post_args 786 | 787 | result 788 | end 789 | 790 | def new_assign lhs, rhs 791 | return nil unless lhs 792 | 793 | rhs = value_expr rhs 794 | 795 | case lhs.sexp_type 796 | when :lasgn, :iasgn, :cdecl, :cvdecl, :gasgn, :cvasgn, :attrasgn, :safe_attrasgn then 797 | lhs << rhs 798 | lhs.line_max = rhs.line_max 799 | when :const then 800 | lhs.sexp_type = :cdecl 801 | lhs << rhs 802 | else 803 | raise "unknown lhs #{lhs.inspect} w/ #{rhs.inspect}" 804 | end 805 | 806 | lhs 807 | end 808 | 809 | def new_attrasgn recv, meth, call_op = :"." 810 | call_op = call_op.first if Array === call_op 811 | 812 | meth = :"#{meth}=" 813 | 814 | result = case call_op.to_sym 815 | when :"." 816 | s(:attrasgn, recv, meth) 817 | when :"&." 
818 | s(:safe_attrasgn, recv, meth) 819 | else 820 | raise "unknown call operator: `#{type.inspect}`" 821 | end 822 | 823 | result.line = recv.line 824 | result 825 | end 826 | 827 | def new_begin val 828 | (_, line), _, body, _ = val 829 | 830 | result = body ? s(:begin, body) : s(:nil) 831 | result.line line 832 | 833 | result 834 | end 835 | 836 | def new_body val 837 | body, resbody, elsebody, ensurebody = val 838 | 839 | result = body 840 | 841 | if resbody then 842 | result = s(:rescue) 843 | result << body if body 844 | 845 | res = resbody 846 | 847 | while res do 848 | result << res 849 | res = res.find_node :resbody, :delete 850 | end 851 | 852 | result << elsebody if elsebody 853 | 854 | result.line = (body || resbody).line 855 | end 856 | 857 | if elsebody and not resbody then 858 | warning("else without rescue is useless") 859 | result = s(:begin, result).line result.line if result 860 | result = block_append(result, elsebody) 861 | end 862 | 863 | if ensurebody 864 | lineno = (result || ensurebody).line 865 | result = s(:ensure, result, ensurebody).compact.line lineno 866 | end 867 | 868 | result 869 | end 870 | 871 | def new_brace_body args, body, lineno 872 | new_iter(nil, args, body).line lineno 873 | end 874 | 875 | def new_call recv, meth, args = nil, call_op = :"." 876 | call_op = call_op.first if Array === call_op 877 | 878 | result = case call_op.to_sym 879 | when :"." 880 | s(:call, recv, meth) 881 | when :"&." 882 | s(:safe_call, recv, meth) 883 | else 884 | raise "unknown call operator: `#{type.inspect}`" 885 | end 886 | 887 | # TODO: need a test with f(&b) to produce block_pass 888 | # TODO: need a test with f(&b) { } to produce warning 889 | 890 | if args then 891 | if ARG_TYPES[args.sexp_type] then 892 | result.concat args.sexp_body 893 | else 894 | result << args 895 | end 896 | result.line_max = args.line_max 897 | end 898 | 899 | # line = result.grep(Sexp).map(&:line).compact.min 900 | result.line = recv.line if recv 901 | result.line ||= lexer.lineno 902 | 903 | result 904 | end 905 | 906 | def new_in pat, body, cases, line 907 | s(:in, pat, body, cases).line line 908 | end 909 | 910 | def new_case expr, body, line 911 | result = s(:case, expr) 912 | 913 | while body and [:when, :in].include? body.sexp_type 914 | result << body 915 | body = body.delete_at 3 916 | end 917 | 918 | _, _expr, *cases = result 919 | cases.each do |node| 920 | block = node.find_node :block, :delete 921 | node.concat block.sexp_body if block 922 | end 923 | 924 | # else 925 | body = nil if body == s(:block) 926 | result << body 927 | 928 | result.line = line 929 | result 930 | end 931 | 932 | def new_class val 933 | (_, line, comment), path, superclass, _, body, (_, line_max) = val 934 | 935 | path = path.first if path.instance_of? 
Array 936 | 937 | result = s(:class, path, superclass) 938 | 939 | if body then 940 | if body.sexp_type == :block then 941 | result.push(*body.sexp_body) 942 | else 943 | result.push body 944 | end 945 | end 946 | 947 | result.line = line 948 | result.line_max = line_max 949 | result.comments = comment if comment 950 | result 951 | end 952 | 953 | def new_compstmt val 954 | result = void_stmts(val.grep(Sexp)[0]) 955 | result = remove_begin(result) if result 956 | result 957 | end 958 | 959 | def new_const_op_asgn val 960 | lhs, (asgn_op, _), rhs = val 961 | asgn_op = asgn_op.to_sym 962 | 963 | result = case asgn_op 964 | when :"||" then 965 | s(:op_asgn_or, lhs, rhs) 966 | when :"&&" then 967 | s(:op_asgn_and, lhs, rhs) 968 | else 969 | s(:op_asgn, lhs, asgn_op, rhs) 970 | end 971 | 972 | result.line = lhs.line 973 | result 974 | end 975 | 976 | def new_defn val 977 | if val.size == 4 then 978 | ((_, line, comment), (name, _line, in_def)), args, body, (_, line_max) = val 979 | else 980 | (_, line, comment), (name, line), in_def, args, body, (_, line_max) = val 981 | end 982 | 983 | body ||= s(:nil).line line 984 | 985 | args.line line 986 | 987 | result = s(:defn, name.to_sym, args).line line 988 | result.line_max = line_max 989 | 990 | if body.sexp_type == :block then 991 | result.push(*body.sexp_body) 992 | else 993 | result.push body 994 | end 995 | 996 | result.comments = comment if comment 997 | 998 | [result, in_def] 999 | end 1000 | 1001 | def new_endless_defn val 1002 | # not available in 2.x so we don't need to check size 1003 | ((_, line, comment), (name, _, in_def)), args, _, body, _, resbody = val 1004 | 1005 | result = 1006 | if resbody then 1007 | s(:defn, name, args, 1008 | new_rescue(body, 1009 | new_resbody(s(:array).line(line), 1010 | resbody))).line line 1011 | else 1012 | s(:defn, name, args, body).line line 1013 | end 1014 | 1015 | local_pop in_def 1016 | endless_method_name result 1017 | 1018 | result.comments = comment if comment 1019 | 1020 | result 1021 | end 1022 | 1023 | def new_endless_defs val 1024 | # not available in 2.x so we don't need to check size 1025 | ((_, line, comment), recv, _, _, (name, line, in_def)), \ 1026 | args, _, body, _, resbody = val 1027 | 1028 | result = 1029 | if resbody then 1030 | s(:defs, recv, name, args, 1031 | new_rescue(body, 1032 | new_resbody(s(:array).line(line), 1033 | resbody))).line line 1034 | else 1035 | s(:defs, recv, name, args, body).line(line) 1036 | end 1037 | 1038 | self.in_single -= 1 1039 | local_pop in_def 1040 | endless_method_name result 1041 | 1042 | result.comments = comment if comment 1043 | 1044 | result 1045 | end 1046 | 1047 | def new_defs val 1048 | if val.size == 4 then 1049 | ((_, line, comment), recv, _, _, (name, line, in_def)), \ 1050 | args, body, (_, line_max) = val 1051 | else 1052 | (_, line, comment), recv, (name, _), in_def, \ 1053 | args, body, (_, line_max) = val 1054 | end 1055 | 1056 | body ||= s(:nil).line line 1057 | 1058 | args.line line 1059 | 1060 | result = s(:defs, recv, name.to_sym, args).line line 1061 | result.line_max = line_max 1062 | 1063 | # TODO: remove_begin 1064 | # TODO: reduce_nodes 1065 | 1066 | if body.sexp_type == :block then 1067 | result.push(*body.sexp_body) 1068 | else 1069 | result.push body 1070 | end 1071 | 1072 | result.comments = comment if comment 1073 | 1074 | [result, in_def] 1075 | end 1076 | 1077 | def new_do_body args, body, lineno 1078 | new_iter(nil, args, body).line(lineno) 1079 | end 1080 | 1081 | def new_find_pattern const, pat 1082 | pat.sexp_type 
= :find_pat 1083 | pat.insert 1, const 1084 | end 1085 | 1086 | def new_find_pattern_tail lhs, mid, rhs 1087 | lhs_id, line = lhs 1088 | rhs_id, _line = rhs 1089 | 1090 | # TODO: fpinfo->pre_rest_arg = pre_rest_arg ? assignable(p, pre_rest_arg, 0, loc) : NODE_SPECIAL_NO_NAME_REST; 1091 | 1092 | lhs_id = "*#{lhs_id}".to_sym 1093 | rhs_id = "*#{rhs_id}".to_sym 1094 | 1095 | raise "BAD?" unless mid.sexp_type == :array_TAIL 1096 | 1097 | s(:find_pat_TAIL, lhs_id, *mid.sexp_body, rhs_id).line line 1098 | end 1099 | 1100 | def new_for expr, var, body 1101 | result = s(:for, expr, var).line(var.line) 1102 | result << body if body 1103 | result 1104 | end 1105 | 1106 | def new_hash val 1107 | _, line, assocs = val 1108 | 1109 | s(:hash).line(line).concat assocs.sexp_body 1110 | end 1111 | 1112 | def new_hash_pattern const, hash_pat, loc 1113 | _, pat, kw_args, kw_rest_arg = hash_pat 1114 | 1115 | line = (const||hash_pat).line 1116 | 1117 | result = s(:hash_pat, const).line line 1118 | result.concat pat.sexp_body if pat 1119 | result << kw_args if kw_args 1120 | result << kw_rest_arg if kw_rest_arg 1121 | result 1122 | end 1123 | 1124 | def new_hash_pattern_tail kw_args, kw_rest_arg, line # TODO: remove line arg 1125 | # kw_rest_arg = assignable(kw_rest_arg, nil).line line if kw_rest_arg 1126 | 1127 | result = s(:hash_pat).line line 1128 | result << kw_args 1129 | 1130 | if kw_rest_arg then 1131 | name = kw_rest_arg.value 1132 | # TODO: I _hate_ this: 1133 | assignable [name, kw_rest_arg.line] if name != :** 1134 | result << kw_rest_arg 1135 | end 1136 | 1137 | result 1138 | end 1139 | 1140 | def push_pktbl 1141 | end 1142 | 1143 | def pop_pktbl 1144 | end 1145 | 1146 | def push_pvtbl 1147 | end 1148 | 1149 | def pop_pvtbl 1150 | end 1151 | 1152 | def new_if c, t, f 1153 | l = [c.line, t && t.line, f && f.line].compact.min 1154 | c = cond c 1155 | c, t, f = c.last, f, t if c.sexp_type == :not and canonicalize_conditions 1156 | s(:if, c, t, f).line(l) 1157 | end 1158 | 1159 | def new_iter call, args, body 1160 | body ||= nil 1161 | 1162 | args ||= s(:args) 1163 | args = s(:args, args) if Symbol === args 1164 | 1165 | result = s(:iter) 1166 | result << call if call 1167 | result << args 1168 | result << body if body 1169 | 1170 | result.line call.line if call 1171 | 1172 | unless args == 0 then 1173 | args.line call.line if call 1174 | args.sexp_type = :args 1175 | end 1176 | 1177 | result 1178 | end 1179 | 1180 | def new_masgn lhs, rhs, wrap = false 1181 | _, ary = lhs 1182 | 1183 | line = rhs.line 1184 | rhs = value_expr(rhs) 1185 | rhs = ary ? s(:to_ary, rhs) : s(:array, rhs) if wrap 1186 | rhs.line line if wrap 1187 | 1188 | lhs.delete_at 1 if ary.nil? 
1189 | lhs << rhs 1190 | 1191 | lhs 1192 | end 1193 | 1194 | def new_masgn_arg rhs, wrap = false 1195 | rhs = value_expr(rhs) 1196 | # HACK: could be array if lhs isn't right 1197 | rhs = s(:to_ary, rhs).line rhs.line if wrap 1198 | rhs 1199 | end 1200 | 1201 | def new_match lhs, rhs 1202 | if lhs then 1203 | case lhs.sexp_type 1204 | when :dregx, :dregx_once then 1205 | # TODO: no test coverage 1206 | return s(:match2, lhs, rhs).line(lhs.line) 1207 | when :lit then 1208 | return s(:match2, lhs, rhs).line(lhs.line) if Regexp === lhs.last 1209 | end 1210 | end 1211 | 1212 | if rhs then 1213 | case rhs.sexp_type 1214 | when :dregx, :dregx_once then 1215 | # TODO: no test coverage 1216 | return s(:match3, rhs, lhs).line(lhs.line) 1217 | when :lit then 1218 | return s(:match3, rhs, lhs).line(lhs.line) if Regexp === rhs.last 1219 | end 1220 | end 1221 | 1222 | new_call(lhs, :"=~", argl(rhs)).line lhs.line 1223 | end 1224 | 1225 | def new_module val 1226 | (_, line_min, comment), path, _, body, (_, line_max) = val 1227 | 1228 | path = path.first if path.instance_of? Array 1229 | 1230 | result = s(:module, path).line line_min 1231 | result.line_max = line_max 1232 | 1233 | if body then # REFACTOR? 1234 | if body.sexp_type == :block then 1235 | result.push(*body.sexp_body) 1236 | else 1237 | result.push body 1238 | end 1239 | end 1240 | 1241 | result.comments = comment if comment 1242 | result 1243 | end 1244 | 1245 | def new_op_asgn val 1246 | lhs, (op, _line), rhs = val 1247 | op = op.to_sym 1248 | 1249 | name = gettable(lhs.last).line lhs.line 1250 | arg = remove_begin rhs 1251 | result = case op # REFACTOR 1252 | when :"||" then 1253 | lhs << arg 1254 | s(:op_asgn_or, name, lhs).line lhs.line 1255 | when :"&&" then 1256 | lhs << arg 1257 | s(:op_asgn_and, name, lhs).line lhs.line 1258 | else 1259 | lhs << new_call(name, op, argl(arg)) 1260 | lhs 1261 | end 1262 | 1263 | result 1264 | end 1265 | 1266 | def new_op_asgn1 val 1267 | lhs, _, args, _, (op, _), rhs = val 1268 | 1269 | args.sexp_type = :arglist if args 1270 | 1271 | result = s(:op_asgn1, lhs, args, op.to_sym, rhs) 1272 | result.line lhs.line 1273 | result 1274 | end 1275 | 1276 | def new_op_asgn2 val 1277 | recv, (call_op, _), (meth, _), (op, _), arg = val 1278 | meth = :"#{meth}=" 1279 | 1280 | result = case call_op.to_sym 1281 | when :"." 1282 | s(:op_asgn2, recv, meth, op.to_sym, arg) 1283 | when :"&." 1284 | s(:safe_op_asgn2, recv, meth, op.to_sym, arg) 1285 | else 1286 | raise "unknown call operator: `#{type.inspect}`" 1287 | end 1288 | 1289 | result.line = recv.line 1290 | result 1291 | end 1292 | 1293 | def new_qsym_list 1294 | s(:array).line lexer.lineno 1295 | end 1296 | 1297 | def new_qsym_list_entry val 1298 | _, (str, line), _ = val 1299 | s(:lit, str.to_sym).line line 1300 | end 1301 | 1302 | def new_qword_list 1303 | s(:array).line lexer.lineno 1304 | end 1305 | 1306 | def new_qword_list_entry val 1307 | _, (str, line), _ = val 1308 | str.force_encoding("ASCII-8BIT") unless str.valid_encoding? 
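    # each %w entry becomes a plain :str node (invalid bytes were kept as binary above)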
1309 | s(:str, str).line line 1310 | end 1311 | 1312 | def new_regexp val 1313 | (_, line), node, (options, line_max) = val 1314 | 1315 | node ||= s(:str, "").line line 1316 | node.line_max = line_max 1317 | 1318 | o, k = 0, nil 1319 | options.split(//).uniq.each do |c| # FIX: this has a better home 1320 | v = { 1321 | "x" => Regexp::EXTENDED, 1322 | "i" => Regexp::IGNORECASE, 1323 | "m" => Regexp::MULTILINE, 1324 | "o" => Regexp::ONCE, 1325 | "n" => Regexp::ENC_NONE, 1326 | "e" => Regexp::ENC_EUC, 1327 | "s" => Regexp::ENC_SJIS, 1328 | "u" => Regexp::ENC_UTF8, 1329 | }[c] 1330 | raise "unknown regexp option: #{c}" unless v 1331 | o += v 1332 | end 1333 | 1334 | case node.sexp_type 1335 | when :str then 1336 | _, str = node 1337 | node.sexp_type = :lit 1338 | val = if k then 1339 | Regexp.new(str, o, k) 1340 | else 1341 | begin 1342 | Regexp.new(str, o) 1343 | rescue RegexpError => e 1344 | warn "WARNING: #{e.message} for #{str.inspect} #{options.inspect}" 1345 | begin 1346 | warn "WARNING: trying to recover with ENC_UTF8" 1347 | Regexp.new(str, Regexp::ENC_UTF8) 1348 | rescue RegexpError => e 1349 | warn "WARNING: trying to recover with ENC_NONE" 1350 | Regexp.new(str, Regexp::ENC_NONE) 1351 | end 1352 | end 1353 | end 1354 | node.sexp_body = [val] 1355 | when :dstr then 1356 | if options =~ /o/ then 1357 | node.sexp_type = :dregx_once 1358 | else 1359 | node.sexp_type = :dregx 1360 | end 1361 | node << o if o and o != 0 1362 | else 1363 | node = s(:dregx, "", node).line line 1364 | node.sexp_type = :dregx_once if options =~ /o/ 1365 | node << o if o and o != 0 1366 | end 1367 | 1368 | node 1369 | end 1370 | 1371 | def new_resbody cond, body 1372 | if body && body.sexp_type == :block then 1373 | body.shift # remove block and splat it in directly 1374 | else 1375 | body = [body] 1376 | end 1377 | 1378 | s(:resbody, cond, *body).line cond.line 1379 | end 1380 | 1381 | def new_rescue body, resbody 1382 | s(:rescue, body, resbody).line body.line 1383 | end 1384 | 1385 | def new_sclass val 1386 | (_, line), _, recv, in_def, _, in_single, body, _ = val 1387 | 1388 | result = s(:sclass, recv) 1389 | 1390 | if body then 1391 | if body.sexp_type == :block then 1392 | result.push(*body.sexp_body) 1393 | else 1394 | result.push body 1395 | end 1396 | end 1397 | 1398 | result.line = line 1399 | self.in_def = in_def 1400 | self.in_single = in_single 1401 | result 1402 | end 1403 | 1404 | def new_string val 1405 | (str, line), = val 1406 | 1407 | str.force_encoding("UTF-8") 1408 | # TODO: remove: 1409 | str.force_encoding("ASCII-8BIT") unless str.valid_encoding? 1410 | s(:str, str).line line 1411 | end 1412 | 1413 | def new_super args 1414 | if args && args.sexp_type == :block_pass then 1415 | s(:super, args).line args.line 1416 | else 1417 | args ||= s(:arglist).line lexer.lineno 1418 | s(:super, *args.sexp_body).line args.line 1419 | end 1420 | end 1421 | 1422 | def new_symbol val 1423 | name = val.last 1424 | s(:lit, name.to_sym).line lexer.lineno 1425 | end 1426 | 1427 | def new_symbol_list 1428 | # TODO: hunt down and try to remove ALL lexer.lineno usage! 
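    # interpolated symbol lists (%I) start out as an empty :array; the
    # grammar actions append each entry to it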
1429 | s(:array).line lexer.lineno 1430 | end 1431 | 1432 | def new_symbol_list_entry val 1433 | _, sym, _ = val 1434 | 1435 | sym ||= s(:str, "").line lexer.lineno 1436 | 1437 | case sym.sexp_type 1438 | when :dstr then 1439 | sym.sexp_type = :dsym 1440 | when :str then 1441 | sym = s(:lit, sym.last.to_sym).line sym.line 1442 | else 1443 | sym = s(:dsym, "", sym).line sym.line 1444 | end 1445 | 1446 | sym 1447 | end 1448 | 1449 | def new_undef n, m = nil 1450 | if m then 1451 | block_append(n, s(:undef, m).line(m.line)) 1452 | else 1453 | s(:undef, n).line n.line 1454 | end 1455 | end 1456 | 1457 | def new_until block, expr, pre 1458 | new_until_or_while :until, block, expr, pre 1459 | end 1460 | 1461 | def new_until_or_while type, block, expr, pre 1462 | other = type == :until ? :while : :until 1463 | line = [block && block.line, expr.line].compact.min 1464 | block, pre = block.last, false if block && block.sexp_type == :begin 1465 | 1466 | expr = cond expr 1467 | 1468 | result = unless expr.sexp_type == :not and canonicalize_conditions then 1469 | s(type, expr, block, pre) 1470 | else 1471 | s(other, expr.last, block, pre) 1472 | end 1473 | 1474 | result.line = line 1475 | result 1476 | end 1477 | 1478 | def new_when cond, body 1479 | s(:when, cond, body) 1480 | end 1481 | 1482 | def new_while block, expr, pre 1483 | new_until_or_while :while, block, expr, pre 1484 | end 1485 | 1486 | def new_word_list 1487 | s(:array).line lexer.lineno 1488 | end 1489 | 1490 | def new_word_list_entry val 1491 | _, word, _ = val 1492 | word.sexp_type == :evstr ? s(:dstr, "", word).line(word.line) : word 1493 | end 1494 | 1495 | def new_xstring val 1496 | _, node = val 1497 | 1498 | node ||= s(:str, "").line lexer.lineno 1499 | 1500 | if node then 1501 | case node.sexp_type 1502 | when :str 1503 | node.sexp_type = :xstr 1504 | when :dstr 1505 | node.sexp_type = :dxstr 1506 | else 1507 | node = s(:dxstr, "", node).line node.line 1508 | end 1509 | end 1510 | 1511 | node 1512 | end 1513 | 1514 | def new_yield args = nil 1515 | # TODO: raise args.inspect unless [:arglist].include? args.first # HACK 1516 | raise "write a test 4" if args && args.sexp_type == :block_pass 1517 | raise SyntaxError, "Block argument should not be given." if 1518 | args && args.sexp_type == :block_pass 1519 | 1520 | args ||= s(:arglist).line lexer.lineno 1521 | 1522 | args.sexp_type = :arglist if [:call_args, :array].include? args.sexp_type 1523 | args = s(:arglist, args).line args.line unless args.sexp_type == :arglist 1524 | 1525 | s(:yield, *args.sexp_body).line args.line 1526 | end 1527 | 1528 | def prev_value_to_lineno v 1529 | s, n = v 1530 | if String === s then 1531 | n 1532 | else 1533 | lexer.lineno 1534 | end 1535 | end 1536 | 1537 | KEEP_COMMENT_TOKENS = [:kCLASS, :kMODULE, :kDEF, :tNL] 1538 | 1539 | def next_token 1540 | token = self.lexer.next_token 1541 | 1542 | if token and token.first != RubyLexer::EOF then 1543 | self.last_token_type = token 1544 | 1545 | self.lexer.comment = nil unless KEEP_COMMENT_TOKENS.include? 
token.first 1546 | 1547 | return token 1548 | elsif !token 1549 | return self.lexer.next_token 1550 | else 1551 | return [false, false] 1552 | end 1553 | end 1554 | 1555 | def on_error(et, ev, values) 1556 | ev = ev.first if ev.instance_of?(Array) && ev.size == 2 && ev.last.is_a?(Integer) 1557 | super 1558 | rescue Racc::ParseError => e 1559 | # I don't like how the exception obscures the error message 1560 | e.message.replace "%s:%p :: %s" % [self.file, lexer.lineno, e.message.strip] 1561 | warn e.message if $DEBUG 1562 | raise 1563 | end 1564 | 1565 | ## 1566 | # Parse +str+ at path +file+ and return a sexp. Raises 1567 | # Timeout::Error if it runs for more than +time+ seconds. 1568 | 1569 | def process(str, file = "(string)", time = 10) 1570 | str.freeze 1571 | 1572 | Timeout.timeout time do 1573 | raise "bad val: #{str.inspect}" unless String === str 1574 | 1575 | self.lexer.string = handle_encoding str 1576 | 1577 | self.file = file 1578 | 1579 | @yydebug = ENV.has_key? "DEBUG" 1580 | 1581 | do_parse 1582 | end 1583 | end 1584 | 1585 | alias parse process 1586 | 1587 | def remove_begin node 1588 | line = node.line 1589 | 1590 | node = node.last while node and node.sexp_type == :begin and node.size == 2 1591 | 1592 | node = s(:nil) if node == s(:begin) 1593 | 1594 | node.line ||= line 1595 | 1596 | node 1597 | end 1598 | 1599 | alias value_expr remove_begin # TODO: for now..? could check the tree, but meh? 1600 | 1601 | def reset 1602 | lexer.reset 1603 | self.in_def = false 1604 | self.in_single = 0 1605 | self.env.reset 1606 | self.last_token_type = nil 1607 | end 1608 | 1609 | def ret_args node 1610 | if node then 1611 | raise "write a test 5" if node.sexp_type == :block_pass 1612 | 1613 | raise SyntaxError, "block argument should not be given" if 1614 | node.sexp_type == :block_pass 1615 | 1616 | node.sexp_type = :array if node.sexp_type == :call_args 1617 | node = node.last if node.sexp_type == :array && node.size == 2 1618 | 1619 | # HACK matz wraps ONE of the FOUR splats in a newline to 1620 | # distinguish. I use paren for now. ugh 1621 | node = s(:svalue, node).line node.line if node.sexp_type == :splat and not node.paren 1622 | node.sexp_type = :svalue if node.sexp_type == :arglist && node[1].sexp_type == :splat 1623 | end 1624 | 1625 | node 1626 | end 1627 | 1628 | def s(*args) 1629 | result = Sexp.new(*args) 1630 | # result.line ||= lexer.lineno if lexer.ss unless ENV["CHECK_LINE_NUMS"] # otherwise... 1631 | result.file = self.file 1632 | result 1633 | end 1634 | 1635 | def debug n 1636 | if ENV["PRY"] then 1637 | require "pry"; binding.pry 1638 | end 1639 | 1640 | raise RubyParser::SyntaxError, "debug #{n}" 1641 | end 1642 | 1643 | def syntax_error msg 1644 | raise RubyParser::SyntaxError, msg 1645 | end 1646 | 1647 | alias yyerror syntax_error 1648 | 1649 | def void_stmts node 1650 | return nil unless node 1651 | return node unless node.sexp_type == :block 1652 | 1653 | if node.respond_to? 
:sexp_body= then 1654 | node.sexp_body = node.sexp_body.map { |n| remove_begin n } 1655 | else 1656 | node[1..-1] = node[1..-1].map { |n| remove_begin(n) } 1657 | end 1658 | 1659 | node 1660 | end 1661 | 1662 | def warning s 1663 | # do nothing for now 1664 | end 1665 | 1666 | def whitespace_width line, remove_width = nil 1667 | col = 0 1668 | idx = 0 1669 | 1670 | line.chars.each do |c| 1671 | break if remove_width && col >= remove_width 1672 | case c 1673 | when " " then 1674 | col += 1 1675 | when "\t" then 1676 | n = TAB_WIDTH * (col / TAB_WIDTH + 1) 1677 | break if remove_width && n > remove_width 1678 | col = n 1679 | else 1680 | break 1681 | end 1682 | idx += 1 1683 | end 1684 | 1685 | if remove_width then 1686 | line[idx..-1] 1687 | elsif line[idx] == "\n" 1688 | nil 1689 | else 1690 | col 1691 | end 1692 | end 1693 | 1694 | alias remove_whitespace_width whitespace_width 1695 | 1696 | def wrap type, node 1697 | value, line = node 1698 | value = value.to_sym if value.respond_to? :to_sym 1699 | s(type, value).line line 1700 | end 1701 | 1702 | class Keyword 1703 | include RubyLexer::State::Values 1704 | 1705 | class KWtable 1706 | attr_accessor :name, :state, :id0, :id1 1707 | def initialize(name, id=[], state=nil) 1708 | @name = name 1709 | @id0, @id1 = id 1710 | @state = state 1711 | end 1712 | end 1713 | 1714 | ## 1715 | # :stopdoc: 1716 | # 1717 | # :expr_beg = ignore newline, +/- is a sign. 1718 | # :expr_end = newline significant, +/- is an operator. 1719 | # :expr_endarg = ditto, and unbound braces. 1720 | # :expr_endfn = ditto, and unbound braces. 1721 | # :expr_arg = newline significant, +/- is an operator. 1722 | # :expr_cmdarg = ditto 1723 | # :expr_mid = ditto 1724 | # :expr_fname = ignore newline, no reserved words. 1725 | # :expr_dot = right after . or ::, no reserved words. 1726 | # :expr_class = immediate after class, no here document. 1727 | # :expr_label = flag bit, label is allowed. 1728 | # :expr_labeled = flag bit, just after a label. 1729 | # :expr_fitem = symbol literal as FNAME. 1730 | # :expr_value = :expr_beg -- work to remove. Need multi-state support. 
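  # For example: after `def` the lexer is in :expr_fname, so in `def +(o)`
  # the `+` is read as a method name; after a local variable it is in
  # :expr_end, so in `a + b` the `+` is a binary operator; in command-arg
  # position, `foo +b` treats the `+` as a sign, i.e. `foo(+b)`.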
1731 | 1732 | expr_woot = EXPR_FNAME|EXPR_FITEM 1733 | 1734 | wordlist = [ 1735 | ["alias", [:kALIAS, :kALIAS ], expr_woot ], 1736 | ["and", [:kAND, :kAND ], EXPR_BEG ], 1737 | ["begin", [:kBEGIN, :kBEGIN ], EXPR_BEG ], 1738 | ["break", [:kBREAK, :kBREAK ], EXPR_MID ], 1739 | ["case", [:kCASE, :kCASE ], EXPR_BEG ], 1740 | ["class", [:kCLASS, :kCLASS ], EXPR_CLASS ], 1741 | ["def", [:kDEF, :kDEF ], EXPR_FNAME ], 1742 | ["defined?", [:kDEFINED, :kDEFINED ], EXPR_ARG ], 1743 | ["do", [:kDO, :kDO ], EXPR_BEG ], 1744 | ["else", [:kELSE, :kELSE ], EXPR_BEG ], 1745 | ["elsif", [:kELSIF, :kELSIF ], EXPR_BEG ], 1746 | ["end", [:kEND, :kEND ], EXPR_END ], 1747 | ["ensure", [:kENSURE, :kENSURE ], EXPR_BEG ], 1748 | ["false", [:kFALSE, :kFALSE ], EXPR_END ], 1749 | ["for", [:kFOR, :kFOR ], EXPR_BEG ], 1750 | ["if", [:kIF, :kIF_MOD ], EXPR_BEG ], 1751 | ["in", [:kIN, :kIN ], EXPR_BEG ], 1752 | ["module", [:kMODULE, :kMODULE ], EXPR_BEG ], 1753 | ["next", [:kNEXT, :kNEXT ], EXPR_MID ], 1754 | ["nil", [:kNIL, :kNIL ], EXPR_END ], 1755 | ["not", [:kNOT, :kNOT ], EXPR_ARG ], 1756 | ["or", [:kOR, :kOR ], EXPR_BEG ], 1757 | ["redo", [:kREDO, :kREDO ], EXPR_END ], 1758 | ["rescue", [:kRESCUE, :kRESCUE_MOD ], EXPR_MID ], 1759 | ["retry", [:kRETRY, :kRETRY ], EXPR_END ], 1760 | ["return", [:kRETURN, :kRETURN ], EXPR_MID ], 1761 | ["self", [:kSELF, :kSELF ], EXPR_END ], 1762 | ["super", [:kSUPER, :kSUPER ], EXPR_ARG ], 1763 | ["then", [:kTHEN, :kTHEN ], EXPR_BEG ], 1764 | ["true", [:kTRUE, :kTRUE ], EXPR_END ], 1765 | ["undef", [:kUNDEF, :kUNDEF ], expr_woot ], 1766 | ["unless", [:kUNLESS, :kUNLESS_MOD ], EXPR_BEG ], 1767 | ["until", [:kUNTIL, :kUNTIL_MOD ], EXPR_BEG ], 1768 | ["when", [:kWHEN, :kWHEN ], EXPR_BEG ], 1769 | ["while", [:kWHILE, :kWHILE_MOD ], EXPR_BEG ], 1770 | ["yield", [:kYIELD, :kYIELD ], EXPR_ARG ], 1771 | ["BEGIN", [:klBEGIN, :klBEGIN ], EXPR_END ], 1772 | ["END", [:klEND, :klEND ], EXPR_END ], 1773 | ["__FILE__", [:k__FILE__, :k__FILE__ ], EXPR_END ], 1774 | ["__LINE__", [:k__LINE__, :k__LINE__ ], EXPR_END ], 1775 | ["__ENCODING__", [:k__ENCODING__, :k__ENCODING__], EXPR_END], 1776 | ].map { |args| 1777 | KWtable.new(*args) 1778 | } 1779 | 1780 | # :startdoc: 1781 | 1782 | WORDLIST = Hash[*wordlist.map { |o| [o.name, o] }.flatten] 1783 | 1784 | def self.keyword str 1785 | WORDLIST[str] 1786 | end 1787 | end 1788 | 1789 | class Environment 1790 | attr_reader :env, :dyn 1791 | 1792 | def [] k 1793 | self.all[k] 1794 | end 1795 | 1796 | def []= k, v 1797 | raise "no" if v == true 1798 | self.current[k] = v 1799 | end 1800 | 1801 | def all 1802 | idx = @dyn.index(false) || 0 1803 | @env[0..idx].reverse.inject { |env, scope| env.merge scope } 1804 | end 1805 | 1806 | def current 1807 | @env.first 1808 | end 1809 | 1810 | def extend dyn = false 1811 | @dyn.unshift dyn 1812 | @env.unshift({}) 1813 | end 1814 | 1815 | def initialize dyn = false 1816 | @dyn = [] 1817 | @env = [] 1818 | self.reset 1819 | end 1820 | 1821 | def reset 1822 | @dyn.clear 1823 | @env.clear 1824 | self.extend 1825 | end 1826 | 1827 | def unextend 1828 | @dyn.shift 1829 | @env.shift 1830 | raise "You went too far unextending env" if @env.empty? 
1831 | end 1832 | end 1833 | 1834 | class StackState 1835 | attr_reader :name 1836 | attr_reader :stack 1837 | attr_accessor :debug 1838 | 1839 | def initialize name, debug=false 1840 | @name = name 1841 | @stack = [false] 1842 | @debug = debug 1843 | end 1844 | 1845 | def inspect 1846 | "StackState(#{@name}, #{@stack.inspect})" 1847 | end 1848 | 1849 | def is_in_state 1850 | log :is_in_state if debug 1851 | @stack.last 1852 | end 1853 | 1854 | def lexpop 1855 | raise if @stack.size == 0 1856 | a = @stack.pop 1857 | b = @stack.pop 1858 | @stack.push(a || b) 1859 | log :lexpop if debug 1860 | end 1861 | 1862 | def log action 1863 | c = caller[1] 1864 | c = caller[2] if c =~ /expr_result/ 1865 | warn "%s_stack.%s: %p at %s" % [name, action, @stack, c.clean_caller] 1866 | nil 1867 | end 1868 | 1869 | def pop 1870 | r = @stack.pop 1871 | @stack.push false if @stack.empty? 1872 | log :pop if debug 1873 | r 1874 | end 1875 | 1876 | def push val 1877 | @stack.push val 1878 | log :push if debug 1879 | end 1880 | 1881 | def reset 1882 | @stack = [false] 1883 | log :reset if debug 1884 | end 1885 | 1886 | def restore oldstate 1887 | @stack.replace oldstate 1888 | log :restore if debug 1889 | end 1890 | 1891 | def store base = false 1892 | result = @stack.dup 1893 | @stack.replace [base] 1894 | log :store if debug 1895 | result 1896 | end 1897 | end 1898 | end 1899 | -------------------------------------------------------------------------------- /test/test_ruby_parser_extras.rb: -------------------------------------------------------------------------------- 1 | # encoding: US-ASCII 2 | 3 | require "minitest/autorun" 4 | require "ruby_parser_extras" 5 | require "ruby_parser" 6 | 7 | class TestStackState < Minitest::Test 8 | attr_reader :s 9 | 10 | def setup 11 | @s = RubyParserStuff::StackState.new :test 12 | end 13 | 14 | def assert_encoding str, default = false 15 | orig_str = str.dup 16 | p = RubyParser.latest 17 | s = nil 18 | 19 | out, err = capture_io { 20 | s = p.handle_encoding str 21 | } 22 | 23 | assert_equal orig_str.sub(/\357\273\277/, ""), s 24 | 25 | exp_err = "" 26 | 27 | if defined?(Encoding) then 28 | assert_equal "UTF-8", s.encoding.to_s, str.inspect 29 | else 30 | exp_err = "Skipping magic encoding comment\n" unless default 31 | end 32 | 33 | assert_equal "", out, str.inspect 34 | assert_equal exp_err, err, str.inspect # HACK 35 | end 36 | 37 | def test_handle_encoding_bom 38 | # bom support, default to utf-8 39 | assert_encoding "\xEF\xBB\xBF# blah" 40 | # we force_encode to US-ASCII, then encode to UTF-8 so our lexer will work 41 | assert_encoding "\xEF\xBB\xBF# encoding: US-ASCII" 42 | end 43 | 44 | def test_handle_encoding_default 45 | assert_encoding "blah", :default 46 | end 47 | 48 | def test_handle_encoding_emacs 49 | # Q: how many different ways can we screw these up? 
A: ALL OF THEM 50 | 51 | assert_encoding "# - encoding: utf-8 -" 52 | assert_encoding "# - encoding:utf-8" 53 | assert_encoding "# -* coding: UTF-8 -*-" 54 | assert_encoding "# -*- coding: UTF-8 -*-" 55 | assert_encoding "# -*- coding: utf-8 -*" 56 | assert_encoding "# -*- coding: utf-8 -*-" 57 | assert_encoding "# -*- coding: utf-8; mode: ruby -*-" 58 | assert_encoding "# -*- coding: utf-8; mode: ruby; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2" 59 | assert_encoding "# -*- coding:utf-8; mode:ruby; -*-" 60 | assert_encoding "# -*- encoding: UTF-8 -*-" 61 | assert_encoding "# -*- encoding: utf-8 -*" 62 | assert_encoding "# -*- encoding: utf-8 -*-" 63 | assert_encoding "# -*- mode:ruby; coding:utf-8 -*-" 64 | assert_encoding "# -*- ruby encoding: utf-8 -*-" 65 | assert_encoding "# -- encoding: utf-8 --" 66 | assert_encoding "# ~*~ encoding: utf-8 ~*~" 67 | assert_encoding "#-*- coding: utf-8 -*-" 68 | assert_encoding "#-*- coding:utf-8" 69 | assert_encoding "#-- -*- mode: ruby; encoding: utf-8 -*-\n" 70 | end 71 | 72 | def test_handle_encoding_wtf 73 | assert_encoding "# coding : utf-8" 74 | assert_encoding "# Ruby 1.9: encoding: utf-8" 75 | assert_encoding "# Encoding: UTF-8 <-- required, please leave this in." 76 | assert_encoding "# Encoding: UTF-8" 77 | assert_encoding "# coding: utf-8" 78 | assert_encoding "# coding:utf-8" 79 | assert_encoding "# coding=utf-8" 80 | assert_encoding "# encoding: ASCII" 81 | assert_encoding "# encoding: ASCII-8BIT" 82 | assert_encoding "# encoding: ISO-8859-1" 83 | assert_encoding "# encoding: UTF-8" 84 | assert_encoding "# encoding: ascii-8bit" 85 | assert_encoding "# encoding: cp1252" 86 | assert_encoding "# encoding: euc-jp -*-" 87 | assert_encoding "# encoding: utf-8 # -*- ruby -*-" 88 | assert_encoding "# encoding: utf-8 require 'github_api/utils/url'" 89 | assert_encoding "# encoding: utf-8!" 90 | assert_encoding "# encoding: utf-8" 91 | assert_encoding "#" 92 | assert_encoding "#Encoding: UTF-8" 93 | assert_encoding "#coding:utf-8" 94 | assert_encoding "#encoding: UTF-8!" 
95 | assert_encoding "#encoding: UTF-8" 96 | assert_encoding "#encoding: cp1252" 97 | assert_encoding "#encoding: sjis" 98 | assert_encoding "#encoding: utf-8" 99 | end 100 | 101 | def test_handle_encoding_normal 102 | assert_encoding "# encoding: UTF-8" 103 | assert_encoding "# encoding: UTF-8\r\n" # UGH I hate windoze 104 | assert_encoding "# coding: UTF-8" 105 | assert_encoding "# encoding = UTF-8" 106 | assert_encoding "# coding = UTF-8" 107 | end 108 | 109 | def test_handle_encoding_vim 110 | assert_encoding "# vim: set fileencoding=utf-8 filetype=ruby ts=2 : " 111 | assert_encoding "# vim: fileencoding=UTF-8 ft=ruby syn=ruby ts=2 sw=2 ai eol et si" 112 | assert_encoding "# vim: fileencoding=UTF-8 nobomb sw=2 ts=2 et" 113 | assert_encoding "# vim: filetype=ruby, fileencoding=UTF-8, tabsize=2, shiftwidth=2" 114 | assert_encoding "# vim: set fileencoding=utf-8" 115 | assert_encoding "# vim:encoding=UTF-8:" 116 | assert_encoding "# vim:fileencoding=UTF-8:" 117 | assert_encoding "# vim:set fileencoding=utf-8 filetype=ruby" 118 | assert_encoding "# vim:set fileencoding=utf-8:" 119 | end 120 | 121 | def test_stack_state 122 | s.push true 123 | s.push false 124 | s.lexpop 125 | assert_equal [false, true], s.stack 126 | end 127 | 128 | def test_is_in_state 129 | assert_equal false, s.is_in_state 130 | s.push false 131 | assert_equal false, s.is_in_state 132 | s.push true 133 | assert_equal true, s.is_in_state 134 | s.push false 135 | assert_equal false, s.is_in_state 136 | end 137 | 138 | def test_lexpop 139 | assert_equal [false], s.stack 140 | s.push true 141 | s.push false 142 | assert_equal [false, true, false], s.stack 143 | s.lexpop 144 | assert_equal [false, true], s.stack 145 | end 146 | 147 | def test_pop 148 | assert_equal [false], s.stack 149 | s.push true 150 | assert_equal [false, true], s.stack 151 | assert_equal true, s.pop 152 | assert_equal [false], s.stack 153 | end 154 | 155 | def test_push 156 | assert_equal [false], s.stack 157 | s.push true 158 | s.push false 159 | assert_equal [false, true, false], s.stack 160 | end 161 | end 162 | 163 | class TestEnvironment < Minitest::Test 164 | def deny t 165 | assert !t 166 | end 167 | 168 | def setup 169 | @env = RubyParserStuff::Environment.new 170 | @env[:blah] = 42 171 | assert_equal 42, @env[:blah] 172 | end 173 | 174 | def test_var_scope_dynamic 175 | @env.extend :dynamic 176 | assert_equal 42, @env[:blah] 177 | @env.unextend 178 | assert_equal 42, @env[:blah] 179 | end 180 | 181 | def test_var_scope_static 182 | @env.extend 183 | assert_nil @env[:blah] 184 | @env.unextend 185 | assert_equal 42, @env[:blah] 186 | end 187 | 188 | def test_all_dynamic 189 | expected = { :blah => 42 } 190 | 191 | @env.extend :dynamic 192 | assert_equal expected, @env.all 193 | @env.unextend 194 | assert_equal expected, @env.all 195 | end 196 | 197 | def test_all_static 198 | @env.extend 199 | expected = { } 200 | assert_equal expected, @env.all 201 | 202 | @env.unextend 203 | expected = { :blah => 42 } 204 | assert_equal expected, @env.all 205 | end 206 | 207 | def test_all_static_deeper 208 | expected0 = { :blah => 42 } 209 | expected1 = { :blah => 42, :blah2 => 24 } 210 | expected2 = { :blah => 27 } 211 | 212 | @env.extend :dynamic 213 | @env[:blah2] = 24 214 | assert_equal expected1, @env.all 215 | 216 | @env.extend 217 | @env[:blah] = 27 218 | assert_equal expected2, @env.all 219 | 220 | @env.unextend 221 | assert_equal expected1, @env.all 222 | 223 | @env.unextend 224 | assert_equal expected0, @env.all 225 | end 226 | end 227 | 228 | class 
Fake20 229 | include RubyParserStuff 230 | 231 | def initialize 232 | end 233 | 234 | def s(*a) # bypass lexer/lineno stuff that RP overrides in 235 | Kernel.send :s, *a 236 | end 237 | end 238 | 239 | class TestValueExpr < Minitest::Test 240 | def assert_value_expr exp, input 241 | assert_equal exp, Fake20.new.value_expr(input.line(1)) 242 | end 243 | 244 | def assert_remove_begin exp, input 245 | assert_equal exp, Fake20.new.remove_begin(input.line(1)) 246 | end 247 | 248 | def test_value_expr 249 | assert_value_expr s(:nil), s(:begin) 250 | assert_value_expr s(:nil), s(:begin, s(:nil)) 251 | assert_value_expr s(:nil), s(:begin, s(:begin, s(:nil))) 252 | assert_value_expr s(:begin, s(:nil), s(:nil)), s(:begin, s(:nil), s(:nil)) 253 | end 254 | 255 | def test_remove_begin 256 | assert_remove_begin s(:nil), s(:begin) 257 | assert_remove_begin s(:nil), s(:begin, s(:nil)) 258 | assert_remove_begin s(:nil), s(:begin, s(:begin, s(:nil))) 259 | assert_remove_begin s(:begin, s(:nil), s(:nil)), s(:begin, s(:nil), s(:nil)) 260 | end 261 | end 262 | -------------------------------------------------------------------------------- /tools/munge.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby -ws 2 | 3 | $v ||= false 4 | 5 | stack = [] 6 | last_token = nil 7 | reduce_line = nil 8 | 9 | def munge s 10 | renames = [ 11 | "'='", "tEQL", 12 | "'!'", "tBANG", 13 | "'%'", "tPERCENT", 14 | "'&'", "tAMPER2", 15 | "'('", "tLPAREN2", 16 | "')'", "tRPAREN", 17 | "'*'", "tSTAR2", 18 | "'+'", "tPLUS", 19 | "','", "tCOMMA", 20 | "'-'", "tMINUS", 21 | "'.'", "tDOT", 22 | "'/'", "tDIVIDE", 23 | "';'", "tSEMI", 24 | "':'", "tCOLON", 25 | "'<'", "tLT", 26 | "'>'", "tGT", 27 | "'?'", "tEH", 28 | "'['", "tLBRACK", 29 | "'\\n'", "tNL", 30 | "']'", "tRBRACK", 31 | "'^'", "tCARET", 32 | "'`'", "tBACK_REF2", 33 | "'{'", "tLCURLY", 34 | "'|'", "tPIPE", 35 | "'}'", "tRCURLY", 36 | "'~'", "tTILDE", 37 | '"["', "tLBRACK", 38 | 39 | # 2.0 changes? 40 | '"<=>"', "tCMP", 41 | '"=="', "tEQ", 42 | '"==="', "tEQQ", 43 | '"!~"', "tNMATCH", 44 | '"=~"', "tMATCH", 45 | '">="', "tGEQ", 46 | '"<="', "tLEQ", 47 | '"!="', "tNEQ", 48 | '"<<"', "tLSHFT", 49 | '">>"', "tRSHFT", 50 | '"*"', "tSTAR", 51 | 52 | '".."', "tDOT2", 53 | 54 | '"&"', "tAMPER", 55 | '"&&"', "tANDOP", 56 | '"&."', "tLONELY", 57 | '"||"', "tOROP", 58 | 59 | '"..."', "tDOT3", 60 | '"**"', "tPOW", 61 | '"unary+"', "tUPLUS", 62 | '"unary-"', "tUMINUS", 63 | '"[]"', "tAREF", 64 | '"[]="', "tASET", 65 | '"::"', "tCOLON2", 66 | '"{ arg"', "tLBRACE_ARG", 67 | '"( arg"', "tLPAREN_ARG", 68 | '"("', "tLPAREN", 69 | 'rparen', "tRPAREN", 70 | '"{"', "tLBRACE", 71 | '"=>"', "tASSOC", 72 | '"->"', "tLAMBDA", 73 | '":: at EXPR_BEG"', "tCOLON3", 74 | '"**arg"', "tDSTAR", 75 | '","', "tCOMMA", 76 | 77 | # other 78 | 79 | 'kTERMINATOR', "tSTRING_END", 80 | '"kTERMINATOR"', "tSTRING_END", 81 | 'kTRCURLY', "tSTRING_DEND", 82 | 83 | '"symbol literal"', "tSYMBEG", 84 | '"string literal"', "tSTRING_BEG", 85 | '"backtick literal"', "tXSTRING_BEG", 86 | '"regexp literal"', "tREGEXP_BEG", 87 | '"word list"', "tWORDS_BEG", 88 | '"verbatim word list"', "tQWORDS_BEG", 89 | '"symbol list"', "tSYMBOLS_BEG", 90 | '"verbatim symbol list"', "tQSYMBOLS_BEG", 91 | '"terminator"', "tSTRING_END", 92 | '"\'}\'"', "tSTRING_DEND", 93 | 94 | '"string literal"',"tSTRING_BEG", 95 | '"literal content"', "tSTRING_CONTENT", 96 | /\$/, "", # try to remove these lumps? 
97 | 98 | 'tLBRACK2', "tLBRACK", # HACK 99 | 100 | "' '", "tSPACE", # needs to be later to avoid bad hits 101 | 102 | "/* empty */", "none", 103 | /^\s*$/, "", 104 | 105 | "keyword_BEGIN", "klBEGIN", 106 | "keyword_END", "klEND", 107 | /keyword_(\w+)/, proc { "k#{$1.upcase}" }, 108 | /\bk_([a-z_]+)/, proc { "k#{$1.upcase}" }, 109 | /modifier_(\w+)/, proc { "k#{$1.upcase}_MOD" }, 110 | "kVARIABLE", "keyword_variable", # ugh 111 | "tCONST", "kCONST", 112 | 113 | # 2.6 collapses klBEGIN to kBEGIN 114 | "klBEGIN", "kBEGIN", 115 | "klEND", "kEND", 116 | 117 | /keyword_(\w+)/, proc { "k#{$1.upcase}" }, 118 | /\bk_([^_][a-z_]+)/, proc { "k#{$1.upcase}" }, 119 | /modifier_(\w+)/, proc { "k#{$1.upcase}_MOD" }, 120 | 121 | "kVARIABLE", "keyword_variable", # ugh: this is a rule name 122 | 123 | # UGH 124 | "k_LINE__", "k__LINE__", 125 | "k_FILE__", "k__FILE__", 126 | "k_ENCODING__", "k__ENCODING__", 127 | 128 | '"defined?"', "kDEFINED", 129 | 130 | "", "none", 131 | 132 | '"do (for condition)"', "kDO_COND", 133 | '"do (for lambda)"', "kDO_LAMBDA", 134 | '"do (for block)"', "kDO_BLOCK", 135 | '"local variable or method"', "tIDENTIFIER", 136 | 137 | /\"(\w+) \(modifier\)\"/, proc { |x| "k#{$1.upcase}_MOD" }, 138 | /\"(\w+)\"/, proc { |x| "k#{$1.upcase}" }, 139 | /\"`(\w+)'\"/, proc { |x| "k#{$1.upcase}" }, 140 | 141 | /@(\d+)(\s+|$)/, "", 142 | /\$?@(\d+) */, "", # TODO: remove? 143 | 144 | /_EXPR/, "", 145 | ] 146 | 147 | renames.each_slice(2) do |(a, b)| 148 | if Proc === b then 149 | s.gsub!(a, &b) 150 | else 151 | s.gsub!(a, b) 152 | end 153 | end 154 | 155 | if s.empty? then 156 | nil 157 | else 158 | s.strip.squeeze " " 159 | end 160 | end 161 | 162 | ARGF.each_line do |line| 163 | case line 164 | when /^(Stack now|Entering state|Shifting|Cleanup|Starting)/ then 165 | # do nothing 166 | when /^vtable_/ then 167 | # do nothing 168 | when /Gem::MissingSpecError/ then 169 | # do nothing -- ruby 2.5 is being bitchy? 170 | when /^Reading a token: Next token is token (.*?) \(\)/ then 171 | token = munge $1 172 | next if last_token == token 173 | puts "next token is %p" % [token] 174 | last_token = token 175 | when /^Reading a token: / then 176 | next # skip 177 | when /^Reading a token$/ then # wtf? 178 | next # skip 179 | when /^(?:add_delayed_token|parser_dispatch)/ then # dunno what this is yet 180 | next # skip 181 | when /^read\s+:(\w+)/ then # read :tNL(tNL) nil 182 | token = munge $1 183 | next if last_token == token 184 | puts "next token is %p" % [token] 185 | last_token = token 186 | when /^Next token is token ("[^"]+"|\S+)/ then 187 | token = munge $1 188 | next if last_token == token 189 | puts "next token is %p" % [token] 190 | last_token = token 191 | when /^read\s+false/ then # read false($end) "$end" 192 | puts "next token is EOF" 193 | when /^Now at end of input./ then 194 | # do nothing 195 | when /^.:scan=>\["([^"]+)"/ then 196 | puts "scan = %p" % [$1] 197 | when /^.:getch=>\["([^"]+)/ then 198 | puts "SCAN = %p" % [$1] 199 | when /^Reducing stack by rule (\d+) \(line (\d+)\):/ then 200 | reduce_line = $2.to_i 201 | when /^ \$\d+ = (?:token|nterm) (.+) \(.*\)/ then 202 | item = $1 203 | stack << munge(item) 204 | when /^-> \$\$ = (?:token|nterm) (.+) \(.*\)/ then 205 | stack << "none" if stack.empty? 
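    # an empty stack means the rule had an empty right-hand side, so "none"
    # stands in for it (matching the "/* empty */" rename above)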
206 | item = munge $1 207 | x = stack.compact.map { |s| munge s.strip }.compact.join " " 208 | if x != item then # prevent kdef -> kdef 209 | if $v && reduce_line then 210 | puts "reduce #{x} --> #{item} at #{reduce_line}".squeeze " " 211 | else 212 | puts "reduce #{x} --> #{item}".squeeze " " 213 | end 214 | puts 215 | end 216 | reduce_line = nil 217 | stack.clear 218 | when /^reduce/ then # ruby_parser side 219 | s = munge line.chomp 220 | next if s =~ /reduce\s+(\w+) --> \1/ 221 | puts s 222 | puts 223 | when /^(\w+_stack)\.(\w+)/ then 224 | # TODO: make pretty, but still informative w/ line numbers etc 225 | puts line.gsub("true", "1").gsub("false", "0") 226 | # puts "#{$1}(#{$2})" 227 | when /^(\w+_stack(\(\w+\))?: \S+)/ then 228 | # _data = $v ? line.chomp : $1 229 | # puts line 230 | # TODO: make pretty, but still informative w/ line numbers etc 231 | puts line.gsub("true", "1").gsub("false", "0") 232 | when /^lex_state: :?([\w|()]+) -> :?([\w|]+)(?: (?:at|from) (.*))?/ then 233 | a, b, c = $1.upcase, $2.upcase, $3 234 | a.gsub!(/EXPR_/, "") 235 | b.gsub!(/EXPR_/, "") 236 | if c && $v then 237 | puts "lex_state: #{a} -> #{b} at #{c}" 238 | else 239 | puts "lex_state: #{a} -> #{b}" 240 | end 241 | when /debug|FUCK/ then 242 | puts line.chomp 243 | when /^(#.*parse error|on )/ then 244 | puts line.chomp 245 | when /^(goto|shift| +\[|$)/ then # racc 246 | # do nothing 247 | # when /^Reading a token: Now at end of input./ then 248 | # # puts "EOF" 249 | # when /^Reading a token: Next token is token (.+)/ then 250 | # puts "READ: #{$1.inspect}" 251 | when /^accept/ then 252 | puts "DONE" 253 | else 254 | puts "unparsed: #{line.chomp}" 255 | end 256 | end 257 | -------------------------------------------------------------------------------- /tools/ripper.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby -ws 2 | 3 | $b ||= false # bug mode -- ripper is buggy, use Ripper.sexp 4 | $d ||= false # debug -- turn on yydebug 5 | $p ||= false # Use pp 6 | 7 | require "ripper/sexp" 8 | require "pp" if $p 9 | 10 | if ARGV.empty? then 11 | warn "reading from stdin" 12 | ARGV << "-" 13 | end 14 | 15 | class MySexpBuilder < Ripper::SexpBuilderPP 16 | def on_parse_error msg 17 | Kernel.warn msg 18 | end 19 | end 20 | 21 | ARGV.each do |path| 22 | src = path == "-" ? $stdin.read : File.read(path) 23 | 24 | sexp = nil 25 | 26 | if $b then 27 | sexp = Ripper.sexp src 28 | else 29 | rip = MySexpBuilder.new src 30 | rip.yydebug = $d 31 | sexp = rip.parse 32 | 33 | if rip.error? then 34 | warn "skipping" 35 | next 36 | end 37 | end 38 | 39 | puts "accept" 40 | 41 | if $p then 42 | pp sexp 43 | else 44 | p sexp 45 | end 46 | end 47 | --------------------------------------------------------------------------------