├── .github └── workflows │ ├── gouteur.yml │ ├── lint.yml │ └── tests.yml ├── .gitignore ├── .gouteur.yml ├── .rubocop.yml ├── CHANGELOG.md ├── Gemfile ├── LICENSE ├── README.md ├── Rakefile ├── bin ├── console └── setup ├── lib ├── regexp_parser.rb └── regexp_parser │ ├── error.rb │ ├── expression.rb │ ├── expression │ ├── base.rb │ ├── classes │ │ ├── alternation.rb │ │ ├── anchor.rb │ │ ├── backreference.rb │ │ ├── character_set.rb │ │ ├── character_set │ │ │ ├── intersection.rb │ │ │ └── range.rb │ │ ├── character_type.rb │ │ ├── conditional.rb │ │ ├── escape_sequence.rb │ │ ├── free_space.rb │ │ ├── group.rb │ │ ├── keep.rb │ │ ├── literal.rb │ │ ├── posix_class.rb │ │ ├── root.rb │ │ └── unicode_property.rb │ ├── methods │ │ ├── construct.rb │ │ ├── escape_sequence_char.rb │ │ ├── escape_sequence_codepoint.rb │ │ ├── human_name.rb │ │ ├── match.rb │ │ ├── match_length.rb │ │ ├── negative.rb │ │ ├── options.rb │ │ ├── parts.rb │ │ ├── printing.rb │ │ ├── referenced_expressions.rb │ │ ├── strfregexp.rb │ │ ├── tests.rb │ │ └── traverse.rb │ ├── quantifier.rb │ ├── sequence.rb │ ├── sequence_operation.rb │ ├── shared.rb │ └── subexpression.rb │ ├── lexer.rb │ ├── parser.rb │ ├── scanner │ ├── char_type.rl │ ├── errors │ │ ├── premature_end_error.rb │ │ ├── scanner_error.rb │ │ └── validation_error.rb │ ├── properties │ │ ├── long.csv │ │ └── short.csv │ ├── property.rl │ └── scanner.rl │ ├── syntax.rb │ ├── syntax │ ├── any.rb │ ├── base.rb │ ├── token.rb │ ├── token │ │ ├── anchor.rb │ │ ├── assertion.rb │ │ ├── backreference.rb │ │ ├── character_set.rb │ │ ├── character_type.rb │ │ ├── conditional.rb │ │ ├── escape.rb │ │ ├── group.rb │ │ ├── keep.rb │ │ ├── meta.rb │ │ ├── posix_class.rb │ │ ├── quantifier.rb │ │ ├── unicode_property.rb │ │ └── virtual.rb │ ├── version_lookup.rb │ ├── versions.rb │ └── versions │ │ ├── 1.8.6.rb │ │ ├── 1.9.1.rb │ │ ├── 1.9.3.rb │ │ ├── 2.0.0.rb │ │ ├── 2.2.0.rb │ │ ├── 2.3.0.rb │ │ ├── 2.4.0.rb │ │ ├── 2.4.1.rb │ │ ├── 
2.5.0.rb │ │ ├── 2.6.0.rb │ │ ├── 2.6.2.rb │ │ ├── 2.6.3.rb │ │ ├── 3.1.0.rb │ │ └── 3.2.0.rb │ ├── token.rb │ └── version.rb ├── regexp_parser.gemspec ├── spec ├── expression │ ├── base_spec.rb │ ├── clone_spec.rb │ ├── conditional_spec.rb │ ├── free_space_spec.rb │ ├── methods │ │ ├── construct_spec.rb │ │ ├── human_name_spec.rb │ │ ├── match_length_spec.rb │ │ ├── match_spec.rb │ │ ├── negative_spec.rb │ │ ├── parts_spec.rb │ │ ├── printing_spec.rb │ │ ├── strfregexp_spec.rb │ │ ├── tests_spec.rb │ │ └── traverse_spec.rb │ ├── options_spec.rb │ ├── subexpression_spec.rb │ ├── te_ts_spec.rb │ ├── to_h_spec.rb │ └── to_s_spec.rb ├── lexer │ ├── all_spec.rb │ ├── conditionals_spec.rb │ ├── delimiters_spec.rb │ ├── escapes_spec.rb │ ├── keep_spec.rb │ ├── literals_spec.rb │ ├── nesting_spec.rb │ └── refcalls_spec.rb ├── parser │ ├── all_spec.rb │ ├── alternation_spec.rb │ ├── anchors_spec.rb │ ├── conditionals_spec.rb │ ├── errors_spec.rb │ ├── escapes_spec.rb │ ├── free_space_spec.rb │ ├── groups_spec.rb │ ├── keep_spec.rb │ ├── options_spec.rb │ ├── posix_classes_spec.rb │ ├── properties_spec.rb │ ├── quantifiers_spec.rb │ ├── refcalls_spec.rb │ ├── set │ │ ├── intersections_spec.rb │ │ └── ranges_spec.rb │ ├── sets_spec.rb │ └── types_spec.rb ├── scanner │ ├── all_spec.rb │ ├── anchors_spec.rb │ ├── conditionals_spec.rb │ ├── delimiters_spec.rb │ ├── errors_spec.rb │ ├── escapes_spec.rb │ ├── free_space_spec.rb │ ├── groups_spec.rb │ ├── keep_spec.rb │ ├── literals_spec.rb │ ├── meta_spec.rb │ ├── options_spec.rb │ ├── properties_spec.rb │ ├── quantifiers_spec.rb │ ├── refcalls_spec.rb │ ├── sets_spec.rb │ └── types_spec.rb ├── spec_helper.rb ├── support │ ├── capturing_stderr.rb │ └── shared_examples.rb ├── syntax │ ├── syntax_spec.rb │ ├── syntax_token_map_spec.rb │ └── versions │ │ ├── 1.8.6_spec.rb │ │ ├── 1.9.1_spec.rb │ │ ├── 1.9.3_spec.rb │ │ ├── 2.0.0_spec.rb │ │ ├── 2.2.0_spec.rb │ │ └── 3.2.0_spec.rb └── token │ └── token_spec.rb └── tasks ├── 
benchmark.rake ├── benchmarks ├── log ├── minimal_regexp.rb └── uri_regexp.rb ├── props.rake └── ragel.rake /.github/workflows/gouteur.yml: -------------------------------------------------------------------------------- 1 | name: gouteur 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - uses: actions/checkout@v4 11 | - name: Set up Ruby 12 | uses: ruby/setup-ruby@v1 13 | with: 14 | ruby-version: 3.2 15 | bundler-cache: true 16 | - name: Install and run ragel 17 | run: | 18 | sudo apt-get install -yqq ragel 19 | bundle exec rake ragel:rb 20 | - name: Test 21 | run: bundle exec gouteur 22 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | # based on https://github.com/rails/rails/blob/4a78dcb/.github/workflows/rubocop.yml 2 | 3 | name: rubocop linting 4 | 5 | on: [push, pull_request] 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - uses: actions/checkout@v4 13 | - name: Set up Ruby 14 | uses: ruby/setup-ruby@v1 15 | with: 16 | ruby-version: 3.2 17 | bundler-cache: true 18 | - name: Install and run ragel 19 | run: | 20 | sudo apt-get install -yqq ragel 21 | bundle exec rake ragel:rb 22 | - name: Run rubocop 23 | run: bundle exec rubocop 24 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | push: 5 | pull_request: 6 | schedule: 7 | - cron: '11 11 14 * *' # at 11:11 am on the 14th of every month 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | 13 | strategy: 14 | matrix: 15 | ruby: [ '2.3', '2.4', '2.5', '2.6', '2.7', '3.0', '3.1', '3.2', '3.3', 'ruby-head' ] 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | - name: Set up Ruby ${{ matrix.ruby }} 20 | uses: 
ruby/setup-ruby@v1 21 | with: 22 | ruby-version: ${{ matrix.ruby }} 23 | bundler-cache: true 24 | - name: Install ragel 25 | run: sudo apt-get install -yqq ragel 26 | - name: Test with Rake 27 | run: bundle exec rake test:full 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | .*.swp 3 | .DS_Store 4 | .ruby-version 5 | .tags 6 | .tags1 7 | .tool-versions 8 | 9 | Gemfile.lock 10 | 11 | lib/regexp_parser/scanner.rb 12 | 13 | doc 14 | .yardoc 15 | 16 | .bundle/* 17 | pkg/* 18 | coverage/* 19 | tmp/* 20 | -------------------------------------------------------------------------------- /.gouteur.yml: -------------------------------------------------------------------------------- 1 | # Usage: https://github.com/jaynetics/gouteur/blob/main/README.md 2 | 3 | repos: 4 | - uri: https://github.com/jaynetics/js_regex 5 | 6 | - uri: https://github.com/jaynetics/repper 7 | 8 | - uri: https://github.com/rubocop-hq/rubocop 9 | tasks: rspec --pattern "**/{,*}regexp{,*,*/**/*}_spec.rb" 10 | 11 | - uri: https://github.com/mbj/mutant 12 | tasks: rspec --pattern "**/{,*}regexp{,*,*/**/*}_spec.rb" 13 | 14 | - uri: https://github.com/teamcapybara/capybara 15 | tasks: rspec spec/regexp_dissassembler_spec.rb 16 | -------------------------------------------------------------------------------- /.rubocop.yml: -------------------------------------------------------------------------------- 1 | AllCops: 2 | DisabledByDefault: true 3 | Exclude: 4 | - '{bin,pkg,tmp,vendor}/**/*' # vendored dependencies etc. 
5 | - 'lib/regexp_parser/scanner.rb' # Ragel-generated code 6 | NewCops: enable 7 | RubyInterpreters: 8 | - ruby 9 | - rake 10 | SuggestExtensions: false 11 | TargetRubyVersion: 2.6 # really 2.0, but 2.6 is lowest supported by rubocop 12 | 13 | Lint: 14 | Enabled: true 15 | 16 | # ignore weird looking regexps in specs, we have these on purpose 17 | Lint/DuplicateRegexpCharacterClassElement: 18 | Exclude: ['spec/**/*'] 19 | Lint/MixedRegexpCaptureTypes: 20 | Exclude: ['spec/**/*'] 21 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gemspec 4 | 5 | group :development, :test do 6 | gem 'leto', '~> 2.1' 7 | gem 'rake', '~> 13.1' 8 | gem 'regexp_property_values', '~> 1.5' 9 | gem 'rspec', '~> 3.10' 10 | if RUBY_VERSION.to_f >= 2.7 11 | gem 'benchmark-ips', '~> 2.1' 12 | gem 'gouteur', '~> 1.1' 13 | gem 'rubocop', '~> 1.59' 14 | end 15 | end 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010, 2012-2024, Ammar Ali 2 | 3 | Permission is hereby granted, free of charge, to any person 4 | obtaining a copy of this software and associated documentation 5 | files (the "Software"), to deal in the Software without 6 | restriction, including without limitation the rights to use, 7 | copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the 9 | Software is furnished to do so, subject to the following 10 | conditions: 11 | 12 | The above copyright notice and this permission notice shall be 13 | included in all copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 17 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 19 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 20 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'bundler' 2 | require 'rubygems' 3 | require 'rubygems/package_task' 4 | require 'rake' 5 | require 'rake/testtask' 6 | require 'rspec/core/rake_task' 7 | 8 | Dir['tasks/**/*.rake'].each { |file| load(file) } 9 | 10 | Bundler::GemHelper.install_tasks 11 | 12 | RSpec::Core::RakeTask.new(:spec) 13 | 14 | task :default => [:'test:full'] 15 | 16 | namespace :test do 17 | task full: [:'ragel:rb', :spec] 18 | end 19 | 20 | # Add ragel task as a prerequisite for building the gem to ensure that the 21 | # latest scanner code is generated and included in the build. 22 | desc "Runs ragel:rb before building the gem" 23 | task :build => ['ragel:rb'] 24 | -------------------------------------------------------------------------------- /bin/console: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'bundler/setup' 4 | require 'regexp_parser' 5 | require 'regexp_property_values' 6 | 7 | RL = Regexp::Lexer 8 | RP = Regexp::Parser 9 | RS = Regexp::Scanner 10 | PV = RegexpPropertyValues 11 | 12 | def lex(...); Regexp::Lexer.lex(...) end 13 | def parse(...); Regexp::Parser.parse(...) end 14 | def scan(...); Regexp::Scanner.scan(...) 
end 15 | 16 | require 'irb' 17 | IRB.start(__FILE__) 18 | -------------------------------------------------------------------------------- /bin/setup: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -euo pipefail 3 | 4 | # install gems 5 | bundle 6 | 7 | # install ragel 8 | rake ragel:install 9 | -------------------------------------------------------------------------------- /lib/regexp_parser.rb: -------------------------------------------------------------------------------- 1 | require_relative 'regexp_parser/version' 2 | require_relative 'regexp_parser/token' 3 | require_relative 'regexp_parser/scanner' 4 | require_relative 'regexp_parser/syntax' 5 | require_relative 'regexp_parser/lexer' 6 | require_relative 'regexp_parser/parser' 7 | -------------------------------------------------------------------------------- /lib/regexp_parser/error.rb: -------------------------------------------------------------------------------- 1 | class Regexp::Parser 2 | # base class for all gem-specific errors 3 | class Error < StandardError; end 4 | end 5 | -------------------------------------------------------------------------------- /lib/regexp_parser/expression.rb: -------------------------------------------------------------------------------- 1 | require_relative 'error' 2 | 3 | require_relative 'expression/shared' 4 | require_relative 'expression/base' 5 | require_relative 'expression/quantifier' 6 | require_relative 'expression/subexpression' 7 | require_relative 'expression/sequence' 8 | require_relative 'expression/sequence_operation' 9 | 10 | require_relative 'expression/classes/alternation' 11 | require_relative 'expression/classes/anchor' 12 | require_relative 'expression/classes/backreference' 13 | require_relative 'expression/classes/character_set' 14 | require_relative 'expression/classes/character_set/intersection' 15 | require_relative 'expression/classes/character_set/range' 16 | require_relative 
'expression/classes/character_type' 17 | require_relative 'expression/classes/conditional' 18 | require_relative 'expression/classes/escape_sequence' 19 | require_relative 'expression/classes/free_space' 20 | require_relative 'expression/classes/group' 21 | require_relative 'expression/classes/keep' 22 | require_relative 'expression/classes/literal' 23 | require_relative 'expression/classes/posix_class' 24 | require_relative 'expression/classes/root' 25 | require_relative 'expression/classes/unicode_property' 26 | 27 | require_relative 'expression/methods/construct' 28 | require_relative 'expression/methods/escape_sequence_char' 29 | require_relative 'expression/methods/escape_sequence_codepoint' 30 | require_relative 'expression/methods/human_name' 31 | require_relative 'expression/methods/match' 32 | require_relative 'expression/methods/match_length' 33 | require_relative 'expression/methods/negative' 34 | require_relative 'expression/methods/options' 35 | require_relative 'expression/methods/parts' 36 | require_relative 'expression/methods/printing' 37 | require_relative 'expression/methods/referenced_expressions' 38 | require_relative 'expression/methods/strfregexp' 39 | require_relative 'expression/methods/tests' 40 | require_relative 'expression/methods/traverse' 41 | -------------------------------------------------------------------------------- /lib/regexp_parser/expression/base.rb: -------------------------------------------------------------------------------- 1 | module Regexp::Expression 2 | class Base 3 | include Regexp::Expression::Shared 4 | 5 | def initialize(token, options = {}) 6 | init_from_token_and_options(token, options) 7 | end 8 | 9 | def to_re(format = :full) 10 | if set_level > 0 11 | warn "Calling #to_re on character set members is deprecated - "\ 12 | "their behavior might not be equivalent outside of the set." 
13 | end 14 | ::Regexp.new(to_s(format)) 15 | end 16 | 17 | def quantify(*args) 18 | self.quantifier = Quantifier.new(*args) 19 | end 20 | 21 | def unquantified_clone 22 | clone.tap { |exp| exp.quantifier = nil } 23 | end 24 | 25 | # Deprecated. Prefer `#repetitions` which has a more uniform interface. 26 | def quantity 27 | return [nil,nil] unless quantified? 28 | [quantifier.min, quantifier.max] 29 | end 30 | 31 | def repetitions 32 | @repetitions ||= 33 | if quantified? 34 | min = quantifier.min 35 | max = quantifier.max < 0 ? Float::INFINITY : quantifier.max 36 | range = min..max 37 | # fix Range#minmax on old Rubies - https://bugs.ruby-lang.org/issues/15807 38 | if RUBY_VERSION.to_f < 2.7 39 | range.define_singleton_method(:minmax) { [min, max] } 40 | end 41 | range 42 | else 43 | 1..1 44 | end 45 | end 46 | 47 | def greedy? 48 | quantified? and quantifier.greedy? 49 | end 50 | 51 | def reluctant? 52 | quantified? and quantifier.reluctant? 53 | end 54 | alias :lazy? :reluctant? 55 | 56 | def possessive? 57 | quantified? and quantifier.possessive? 58 | end 59 | 60 | def to_h 61 | { 62 | type: type, 63 | token: token, 64 | text: to_s(:base), 65 | starts_at: ts, 66 | length: full_length, 67 | level: level, 68 | set_level: set_level, 69 | conditional_level: conditional_level, 70 | options: options, 71 | quantifier: quantified? ? quantifier.to_h : nil, 72 | } 73 | end 74 | alias :attributes :to_h 75 | end 76 | end 77 | -------------------------------------------------------------------------------- /lib/regexp_parser/expression/classes/alternation.rb: -------------------------------------------------------------------------------- 1 | module Regexp::Expression 2 | # A sequence of expressions, used by Alternation as one of its alternatives. 
3 | class Alternative < Regexp::Expression::Sequence; end 4 | 5 | class Alternation < Regexp::Expression::SequenceOperation 6 | OPERAND = Alternative 7 | 8 | alias :alternatives :expressions 9 | end 10 | end 11 | -------------------------------------------------------------------------------- /lib/regexp_parser/expression/classes/anchor.rb: -------------------------------------------------------------------------------- 1 | module Regexp::Expression 2 | module Anchor 3 | class Base < Regexp::Expression::Base; end 4 | 5 | class BeginningOfLine < Anchor::Base; end 6 | class EndOfLine < Anchor::Base; end 7 | 8 | class BeginningOfString < Anchor::Base; end 9 | class EndOfString < Anchor::Base; end 10 | 11 | class EndOfStringOrBeforeEndOfLine < Anchor::Base; end 12 | 13 | class WordBoundary < Anchor::Base; end 14 | class NonWordBoundary < Anchor::Base; end 15 | 16 | class MatchStart < Anchor::Base; end 17 | 18 | BOL = BeginningOfLine 19 | EOL = EndOfLine 20 | BOS = BeginningOfString 21 | EOS = EndOfString 22 | EOSobEOL = EndOfStringOrBeforeEndOfLine 23 | end 24 | end 25 | -------------------------------------------------------------------------------- /lib/regexp_parser/expression/classes/backreference.rb: -------------------------------------------------------------------------------- 1 | module Regexp::Expression 2 | module Backreference 3 | class Base < Regexp::Expression::Base; end 4 | 5 | class Number < Backreference::Base 6 | attr_reader :number 7 | alias reference number 8 | 9 | def initialize(token, options = {}) 10 | @number = token.text[/-?\d+/].to_i 11 | super 12 | end 13 | end 14 | 15 | class Name < Backreference::Base 16 | attr_reader :name 17 | alias reference name 18 | 19 | def initialize(token, options = {}) 20 | @name = token.text[3..-2] 21 | super 22 | end 23 | end 24 | 25 | class NumberRelative < Backreference::Number 26 | attr_accessor :effective_number 27 | alias reference effective_number 28 | end 29 | 30 | class NumberCall < Backreference::Number; 
end 31 | class NameCall < Backreference::Name; end 32 | class NumberCallRelative < Backreference::NumberRelative; end 33 | 34 | class NumberRecursionLevel < Backreference::NumberRelative 35 | attr_reader :recursion_level 36 | 37 | def initialize(token, options = {}) 38 | super 39 | @number, @recursion_level = token.text[3..-2].split(/(?=[+-])/).map(&:to_i) 40 | end 41 | end 42 | 43 | class NameRecursionLevel < Backreference::Name 44 | attr_reader :recursion_level 45 | 46 | def initialize(token, options = {}) 47 | super 48 | @name, recursion_level = token.text[3..-2].split(/(?=[+-])/) 49 | @recursion_level = recursion_level.to_i 50 | end 51 | end 52 | end 53 | 54 | # alias for symmetry between token symbol and Expression class name 55 | Backref = Backreference 56 | end 57 | -------------------------------------------------------------------------------- /lib/regexp_parser/expression/classes/character_set.rb: -------------------------------------------------------------------------------- 1 | module Regexp::Expression 2 | class CharacterSet < Regexp::Expression::Subexpression 3 | attr_accessor :closed, :negative 4 | alias :closed? 
:closed 5 | 6 | def initialize(token, options = {}) 7 | self.negative = false 8 | self.closed = false 9 | super 10 | end 11 | 12 | def negate 13 | self.negative = true 14 | end 15 | 16 | def close 17 | self.closed = true 18 | end 19 | end 20 | 21 | # alias for symmetry between token symbol and Expression class name 22 | Set = CharacterSet 23 | end # module Regexp::Expression 24 | -------------------------------------------------------------------------------- /lib/regexp_parser/expression/classes/character_set/intersection.rb: -------------------------------------------------------------------------------- 1 | module Regexp::Expression 2 | class CharacterSet < Regexp::Expression::Subexpression 3 | class IntersectedSequence < Regexp::Expression::Sequence; end 4 | 5 | class Intersection < Regexp::Expression::SequenceOperation 6 | OPERAND = IntersectedSequence 7 | end 8 | end 9 | end 10 | -------------------------------------------------------------------------------- /lib/regexp_parser/expression/classes/character_set/range.rb: -------------------------------------------------------------------------------- 1 | module Regexp::Expression 2 | class CharacterSet < Regexp::Expression::Subexpression 3 | class Range < Regexp::Expression::Subexpression 4 | def ts 5 | (head = expressions.first) ? head.ts : @ts 6 | end 7 | 8 | def <<(exp) 9 | complete? and raise Regexp::Parser::Error, 10 | "Can't add more than 2 expressions to a Range" 11 | super 12 | end 13 | 14 | def complete? 
15 | count == 2 16 | end 17 | end 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /lib/regexp_parser/expression/classes/character_type.rb: -------------------------------------------------------------------------------- 1 | module Regexp::Expression 2 | module CharacterType 3 | class Base < Regexp::Expression::Base; end 4 | 5 | class Any < CharacterType::Base; end 6 | class Digit < CharacterType::Base; end 7 | class NonDigit < CharacterType::Base; end 8 | class Hex < CharacterType::Base; end 9 | class NonHex < CharacterType::Base; end 10 | class Word < CharacterType::Base; end 11 | class NonWord < CharacterType::Base; end 12 | class Space < CharacterType::Base; end 13 | class NonSpace < CharacterType::Base; end 14 | class Linebreak < CharacterType::Base; end 15 | class ExtendedGrapheme < CharacterType::Base; end 16 | end 17 | end 18 | -------------------------------------------------------------------------------- /lib/regexp_parser/expression/classes/conditional.rb: -------------------------------------------------------------------------------- 1 | module Regexp::Expression 2 | module Conditional 3 | class TooManyBranches < Regexp::Parser::Error 4 | def initialize 5 | super('The conditional expression has more than 2 branches') 6 | end 7 | end 8 | 9 | class Condition < Regexp::Expression::Base 10 | # Name or number of the referenced capturing group that determines state. 11 | # Returns a String if reference is by name, Integer if by number. 12 | def reference 13 | ref = text.tr("'<>()", "") 14 | ref =~ /\D/ ? 
ref : Integer(ref) 15 | end 16 | end 17 | 18 | class Branch < Regexp::Expression::Sequence; end 19 | 20 | class Expression < Regexp::Expression::Subexpression 21 | def <<(exp) 22 | expressions.last << exp 23 | end 24 | 25 | def add_sequence(active_opts = {}, params = { ts: 0 }) 26 | raise TooManyBranches.new if branches.length == 2 27 | params = params.merge({ conditional_level: conditional_level + 1 }) 28 | Branch.add_to(self, params, active_opts) 29 | end 30 | alias :branch :add_sequence 31 | 32 | def condition=(exp) 33 | expressions.delete(condition) 34 | expressions.unshift(exp) 35 | end 36 | 37 | def condition 38 | find { |subexp| subexp.is_a?(Condition) } 39 | end 40 | 41 | def branches 42 | select { |subexp| subexp.is_a?(Sequence) } 43 | end 44 | 45 | def reference 46 | condition.reference 47 | end 48 | end 49 | end 50 | end 51 | -------------------------------------------------------------------------------- /lib/regexp_parser/expression/classes/escape_sequence.rb: -------------------------------------------------------------------------------- 1 | module Regexp::Expression 2 | module EscapeSequence 3 | Base = Class.new(Regexp::Expression::Base) 4 | 5 | AsciiEscape = Class.new(Base) # \e 6 | Backspace = Class.new(Base) # \b 7 | Bell = Class.new(Base) # \a 8 | FormFeed = Class.new(Base) # \f 9 | Newline = Class.new(Base) # \n 10 | Return = Class.new(Base) # \r 11 | Tab = Class.new(Base) # \t 12 | VerticalTab = Class.new(Base) # \v 13 | 14 | Literal = Class.new(Base) # e.g. \j, \@, \😀 (ineffectual escapes) 15 | 16 | Octal = Class.new(Base) # e.g. \012 17 | Hex = Class.new(Base) # e.g. \x0A 18 | Codepoint = Class.new(Base) # e.g. \u000A 19 | 20 | CodepointList = Class.new(Base) # e.g. \u{A B} 21 | 22 | AbstractMetaControlSequence = Class.new(Base) 23 | Control = Class.new(AbstractMetaControlSequence) # e.g. \cB 24 | Meta = Class.new(AbstractMetaControlSequence) # e.g. \M-Z 25 | MetaControl = Class.new(AbstractMetaControlSequence) # e.g. 
\M-\cX 26 | end 27 | 28 | # alias for symmetry between Token::* and Expression::* 29 | Escape = EscapeSequence 30 | end 31 | -------------------------------------------------------------------------------- /lib/regexp_parser/expression/classes/free_space.rb: -------------------------------------------------------------------------------- 1 | module Regexp::Expression 2 | class FreeSpace < Regexp::Expression::Base 3 | def quantify(*_args) 4 | raise Regexp::Parser::Error, 'Can not quantify a free space object' 5 | end 6 | end 7 | 8 | class Comment < Regexp::Expression::FreeSpace 9 | end 10 | 11 | class WhiteSpace < Regexp::Expression::FreeSpace 12 | def merge(exp) 13 | warn("#{self.class}##{__method__} is deprecated and will be removed in v3.0.0.") 14 | text << exp.text 15 | end 16 | end 17 | end 18 | -------------------------------------------------------------------------------- /lib/regexp_parser/expression/classes/group.rb: -------------------------------------------------------------------------------- 1 | module Regexp::Expression 2 | module Group 3 | class Base < Regexp::Expression::Subexpression 4 | end 5 | 6 | class Passive < Group::Base 7 | attr_writer :implicit 8 | 9 | def initialize(*) 10 | @implicit = false 11 | super 12 | end 13 | 14 | def implicit? 15 | @implicit 16 | end 17 | end 18 | 19 | class Absence < Group::Base; end 20 | class Atomic < Group::Base; end 21 | # TODO: should split off OptionsSwitch in v3.0.0. Maybe even make it no 22 | # longer inherit from Group because it is effectively a terminal expression. 
23 | class Options < Group::Base 24 | attr_accessor :option_changes 25 | 26 | def initialize_copy(orig) 27 | self.option_changes = orig.option_changes.dup 28 | super 29 | end 30 | 31 | def quantify(*args) 32 | if token == :options_switch 33 | raise Regexp::Parser::Error, 'Can not quantify an option switch' 34 | else 35 | super 36 | end 37 | end 38 | end 39 | 40 | class Capture < Group::Base 41 | attr_accessor :number, :number_at_level 42 | alias identifier number 43 | end 44 | 45 | class Named < Group::Capture 46 | attr_reader :name 47 | alias identifier name 48 | 49 | def initialize(token, options = {}) 50 | @name = token.text[3..-2] 51 | super 52 | end 53 | 54 | def initialize_copy(orig) 55 | @name = orig.name.dup 56 | super 57 | end 58 | end 59 | 60 | class Comment < Group::Base 61 | end 62 | end 63 | 64 | module Assertion 65 | class Base < Regexp::Expression::Group::Base; end 66 | 67 | class Lookahead < Assertion::Base; end 68 | class NegativeLookahead < Assertion::Base; end 69 | 70 | class Lookbehind < Assertion::Base; end 71 | class NegativeLookbehind < Assertion::Base; end 72 | end 73 | end 74 | -------------------------------------------------------------------------------- /lib/regexp_parser/expression/classes/keep.rb: -------------------------------------------------------------------------------- 1 | module Regexp::Expression 2 | module Keep 3 | # TODO: in regexp_parser v3.0.0 this should possibly be a Subexpression 4 | # that contains all expressions to its left. 
5 | class Mark < Regexp::Expression::Base; end 6 | end 7 | end 8 | -------------------------------------------------------------------------------- /lib/regexp_parser/expression/classes/literal.rb: -------------------------------------------------------------------------------- 1 | module Regexp::Expression 2 | class Literal < Regexp::Expression::Base; end 3 | end 4 | -------------------------------------------------------------------------------- /lib/regexp_parser/expression/classes/posix_class.rb: -------------------------------------------------------------------------------- 1 | module Regexp::Expression 2 | class PosixClass < Regexp::Expression::Base 3 | def name 4 | text[/\w+/] 5 | end 6 | end 7 | 8 | # alias for symmetry between token symbol and Expression class name 9 | Posixclass = PosixClass 10 | Nonposixclass = PosixClass 11 | end 12 | -------------------------------------------------------------------------------- /lib/regexp_parser/expression/classes/root.rb: -------------------------------------------------------------------------------- 1 | module Regexp::Expression 2 | class Root < Regexp::Expression::Subexpression 3 | def self.build(options = {}) 4 | warn "`#{self.class}.build(options)` is deprecated and will raise in "\ 5 | "regexp_parser v3.0.0. Please use `.construct(options: options)`." 
6 | construct(options: options) 7 | end 8 | end 9 | end 10 | -------------------------------------------------------------------------------- /lib/regexp_parser/expression/classes/unicode_property.rb: -------------------------------------------------------------------------------- 1 | module Regexp::Expression 2 | module UnicodeProperty 3 | class Base < Regexp::Expression::Base 4 | def name 5 | text[/\A\\[pP]\{([^}]+)\}\z/, 1] 6 | end 7 | 8 | def shortcut 9 | Regexp::Scanner.short_prop_map.key(token.to_s) 10 | end 11 | end 12 | 13 | class Alnum < Base; end 14 | class Alpha < Base; end 15 | class Ascii < Base; end 16 | class Blank < Base; end 17 | class Cntrl < Base; end 18 | class Digit < Base; end 19 | class Graph < Base; end 20 | class Lower < Base; end 21 | class Print < Base; end 22 | class Punct < Base; end 23 | class Space < Base; end 24 | class Upper < Base; end 25 | class Word < Base; end 26 | class Xdigit < Base; end 27 | class XPosixPunct < Base; end 28 | 29 | class Newline < Base; end 30 | 31 | class Any < Base; end 32 | class Assigned < Base; end 33 | 34 | module Letter 35 | class Base < UnicodeProperty::Base; end 36 | 37 | class Any < Letter::Base; end 38 | class Cased < Letter::Base; end 39 | class Uppercase < Letter::Base; end 40 | class Lowercase < Letter::Base; end 41 | class Titlecase < Letter::Base; end 42 | class Modifier < Letter::Base; end 43 | class Other < Letter::Base; end 44 | end 45 | 46 | module Mark 47 | class Base < UnicodeProperty::Base; end 48 | 49 | class Any < Mark::Base; end 50 | class Combining < Mark::Base; end 51 | class Nonspacing < Mark::Base; end 52 | class Spacing < Mark::Base; end 53 | class Enclosing < Mark::Base; end 54 | end 55 | 56 | module Number 57 | class Base < UnicodeProperty::Base; end 58 | 59 | class Any < Number::Base; end 60 | class Decimal < Number::Base; end 61 | class Letter < Number::Base; end 62 | class Other < Number::Base; end 63 | end 64 | 65 | module Punctuation 66 | class Base < 
UnicodeProperty::Base; end 67 | 68 | class Any < Punctuation::Base; end 69 | class Connector < Punctuation::Base; end 70 | class Dash < Punctuation::Base; end 71 | class Open < Punctuation::Base; end 72 | class Close < Punctuation::Base; end 73 | class Initial < Punctuation::Base; end 74 | class Final < Punctuation::Base; end 75 | class Other < Punctuation::Base; end 76 | end 77 | 78 | module Separator 79 | class Base < UnicodeProperty::Base; end 80 | 81 | class Any < Separator::Base; end 82 | class Space < Separator::Base; end 83 | class Line < Separator::Base; end 84 | class Paragraph < Separator::Base; end 85 | end 86 | 87 | module Symbol 88 | class Base < UnicodeProperty::Base; end 89 | 90 | class Any < Symbol::Base; end 91 | class Math < Symbol::Base; end 92 | class Currency < Symbol::Base; end 93 | class Modifier < Symbol::Base; end 94 | class Other < Symbol::Base; end 95 | end 96 | 97 | module Codepoint 98 | class Base < UnicodeProperty::Base; end 99 | 100 | class Any < Codepoint::Base; end 101 | class Control < Codepoint::Base; end 102 | class Format < Codepoint::Base; end 103 | class Surrogate < Codepoint::Base; end 104 | class PrivateUse < Codepoint::Base; end 105 | class Unassigned < Codepoint::Base; end 106 | end 107 | 108 | class Age < UnicodeProperty::Base; end 109 | class Block < UnicodeProperty::Base; end 110 | class Derived < UnicodeProperty::Base; end 111 | class Emoji < UnicodeProperty::Base; end 112 | class Enumerated < UnicodeProperty::Base; end 113 | class Script < UnicodeProperty::Base; end 114 | end 115 | 116 | # alias for symmetry between token symbol and Expression class name 117 | Property = UnicodeProperty 118 | Nonproperty = UnicodeProperty 119 | end # module Regexp::Expression 120 | -------------------------------------------------------------------------------- /lib/regexp_parser/expression/methods/construct.rb: -------------------------------------------------------------------------------- 1 | module Regexp::Expression 2 | module 
module Regexp::Expression
  module Shared
    module ClassMethods
      # Convenience method to init a valid Expression without a Regexp::Token.
      #
      # +params+ may hold any of Regexp::TOKEN_KEYS plus :options. They are
      # merged over #construct_defaults, a Regexp::Token is built from the
      # token keys, and the result is passed to .new together with :options.
      #
      # Raises ArgumentError if +params+ contains any other key.
      def construct(params = {})
        attrs = construct_defaults.merge(params)
        options = attrs.delete(:options)
        token_args = Regexp::TOKEN_KEYS.map { |key| attrs.delete(key) }
        token = Regexp::Token.new(*token_args)
        raise ArgumentError, "unsupported attribute(s): #{attrs}" if attrs.any?

        new(token, options)
      end

      # Default token attributes used by #construct for the receiving class.
      def construct_defaults
        if self == Root
          { type: :expression, token: :root, ts: 0 }
        elsif self <= Sequence
          # `<=` rather than `<`: Sequence itself has no token class either,
          # so it must not fall through to the token_class::Type lookup.
          { type: :expression, token: :sequence }
        else
          { type: token_class::Type }
        end.merge(level: 0, set_level: 0, conditional_level: 0, text: '')
      end

      # Returns the Regexp::Syntax::Token module that belongs to the
      # receiving class, or nil for Root, Sequence and Sequence subclasses.
      def token_class
        if self == Root || self <= Sequence
          nil # no token class because these objects are Parser-generated
        # TODO: synch exp class, token class & type names for this in v3.0.0
        elsif self == CharacterType::Any
          Regexp::Syntax::Token::Meta
        else
          # third segment of the class name, e.g. "Anchor" for
          # "Regexp::Expression::Anchor::BOL"
          Regexp::Syntax::Token.const_get(name.split('::')[2])
        end
      end
    end

    # Instance-level shortcut for the class-level .token_class.
    def token_class
      self.class.token_class
    end
  end
end
module Regexp::Expression::EscapeSequence
  # Escape classes whose #codepoint is a fixed number.
  {
    AsciiEscape => 0x1B,
    Backspace   => 0x8,
    Bell        => 0x7,
    FormFeed    => 0xC,
    Newline     => 0xA,
    Return      => 0xD,
    Tab         => 0x9,
    VerticalTab => 0xB,
  }.each do |klass, fixed_codepoint|
    klass.class_eval { define_method(:codepoint) { fixed_codepoint } }
  end

  Literal.class_eval do
    # Codepoint of the second character of the text.
    def codepoint
      text[1].ord
    end
  end

  Octal.class_eval do
    # First run of digits in the text, read as a base 8 number.
    def codepoint
      text[/\d+/].to_i(8)
    end
  end

  [Hex, Codepoint].each do |klass|
    klass.class_eval do
      # First run of hex digits in the text, read as a hexadecimal number.
      def codepoint
        text[/\h+/].hex
      end
    end
  end

  CodepointList.class_eval do
    # Maybe this should be a unique top-level expression class?
    def char
      raise NoMethodError, 'CodepointList responds only to #chars'
    end

    def codepoint
      raise NoMethodError, 'CodepointList responds only to #codepoints'
    end

    # One UTF-8 string per entry of #codepoints.
    def chars
      codepoints.map { |number| number.chr('utf-8') }
    end

    # Every run of hex digits in the text, each read as a hexadecimal number.
    def codepoints
      text.scan(/\h+/).map(&:hex)
    end
  end

  AbstractMetaControlSequence.class_eval do
    private

    # Keeps the last five bits of the bit string of +control_sequence+ and
    # returns the one-byte string made of three zero bits plus those five.
    def control_sequence_to_s(control_sequence)
      bit_string = control_sequence.unpack('B*')[0]
      low_five = bit_string[-5, 5]
      ["000#{low_five}"].pack('B*')
    end

    # Returns the ordinal of +meta_char+, raised by 128 if it is below 128.
    def meta_char_to_codepoint(meta_char)
      ordinal = meta_char.ord
      if ordinal < 128
        ordinal + 128
      else
        ordinal
      end
    end
  end

  Control.class_eval do
    def codepoint
      control_sequence_to_s(text).ord
    end
  end

  Meta.class_eval do
    # Derived from the last character of the text.
    def codepoint
      meta_char_to_codepoint(text[-1])
    end
  end

  MetaControl.class_eval do
    def codepoint
      control_char = control_sequence_to_s(text)
      meta_char_to_codepoint(control_char)
    end
  end
end
"atomic group", "hex escape", "word type", .. 4 | def human_name 5 | [token, type].compact.join(' ').tr('_', ' ') 6 | end 7 | end 8 | 9 | Alternation.class_eval { def human_name; 'alternation' end } 10 | Alternative.class_eval { def human_name; 'alternative' end } 11 | Anchor::BOL.class_eval { def human_name; 'beginning of line' end } 12 | Anchor::BOS.class_eval { def human_name; 'beginning of string' end } 13 | Anchor::EOL.class_eval { def human_name; 'end of line' end } 14 | Anchor::EOS.class_eval { def human_name; 'end of string' end } 15 | Anchor::EOSobEOL.class_eval { def human_name; 'newline-ready end of string' end } 16 | Anchor::MatchStart.class_eval { def human_name; 'match start' end } 17 | Anchor::NonWordBoundary.class_eval { def human_name; 'no word boundary' end } 18 | Anchor::WordBoundary.class_eval { def human_name; 'word boundary' end } 19 | Assertion::Lookahead.class_eval { def human_name; 'lookahead' end } 20 | Assertion::Lookbehind.class_eval { def human_name; 'lookbehind' end } 21 | Assertion::NegativeLookahead.class_eval { def human_name; 'negative lookahead' end } 22 | Assertion::NegativeLookbehind.class_eval { def human_name; 'negative lookbehind' end } 23 | Backreference::Name.class_eval { def human_name; 'backreference by name' end } 24 | Backreference::NameCall.class_eval { def human_name; 'subexpression call by name' end } 25 | Backreference::Number.class_eval { def human_name; 'backreference' end } 26 | Backreference::NumberRelative.class_eval { def human_name; 'relative backreference' end } 27 | Backreference::NumberCall.class_eval { def human_name; 'subexpression call' end } 28 | Backreference::NumberCallRelative.class_eval { def human_name; 'relative subexpression call' end } 29 | CharacterSet::IntersectedSequence.class_eval { def human_name; 'intersected sequence' end } 30 | CharacterSet::Intersection.class_eval { def human_name; 'intersection' end } 31 | CharacterSet::Range.class_eval { def human_name; 'character range' end } 32 | 
module Regexp::Expression
  class Base
    # Builds a Regexp from this expression's #to_s and matches it against
    # +string+, starting at +offset+. Returns the MatchData or nil.
    def match(string, offset = 0)
      compiled = Regexp.new(to_s)
      compiled.match(string, offset)
    end
    alias_method :=~, :match

    # Returns true if #match finds something in +string+, false otherwise.
    def match?(string)
      match(string) ? true : false
    end
    alias_method :matches?, :match?
  end
end
# Describes which string lengths an expression can match. It is built from
# the expression's repetitions (min/max) and a base length per repetition.
class Regexp::MatchLength
  include Enumerable

  # Returns the match length of +obj+. A Regexp::Expression::Base is used as
  # is; anything else is passed through Regexp::Parser.parse first.
  def self.of(obj)
    exp = obj.is_a?(Regexp::Expression::Base) ? obj : Regexp::Parser.parse(obj)
    exp.match_length
  end

  # +exp+ must respond to #repetitions (an object with #min and #max).
  #
  # +opts+ is either { base: n } for a fixed length per repetition, or
  # { base_min:, base_max:, reify: } where +reify+ is a callable returning
  # regexp source that stands in for one repetition.
  def initialize(exp, opts = {})
    self.exp_class = exp.class
    self.min_rep = exp.repetitions.min
    self.max_rep = exp.repetitions.max
    if (base = opts[:base])
      self.base_min = base
      self.base_max = base
      self.reify = ->{ '.' * base }
    else
      self.base_min = opts.fetch(:base_min)
      self.base_max = opts.fetch(:base_max)
      self.reify = opts.fetch(:reify)
    end
  end

  # Yields each matchable length between #min and #max, stopping once
  # opts[:limit] (default 1000) lengths have been yielded.
  # Returns an Enumerator if no block is given.
  def each(opts = {})
    return enum_for(__method__, opts) unless block_given?
    limit = opts[:limit] || 1000
    yielded = 0
    (min..max).each do |num|
      next unless include?(num)
      yield(num)
      break if (yielded += 1) >= limit
    end
  end

  # Like #each, but without a limit.
  def endless_each
    return enum_for(__method__) unless block_given?
    (min..max).each { |num| yield(num) if include?(num) }
  end

  # True if a string of exactly +length+ characters fits #to_re.
  def include?(length)
    test_regexp.match?('X' * length)
  end

  def fixed?
    min == max
  end

  def min
    min_rep * base_min
  end

  def max
    # 0 * Float::INFINITY is NaN, which is neither a usable length nor a
    # valid Range end for #each. Zero repetitions, or repetitions of
    # something that has no length, can only ever match zero characters.
    return 0 if max_rep.zero? || base_max.zero?

    max_rep * base_max
  end

  def minmax
    [min, max]
  end

  def inspect
    type = exp_class.name.sub('Regexp::Expression::', '')
    "#<#{self.class}<#{type}> min=#{min} max=#{max}>"
  end

  # A Regexp that repeats the reified base between min_rep and max_rep
  # times; the upper bound is left open if max_rep is infinite.
  def to_re
    /(?:#{reify.call}){#{min_rep},#{max_rep unless max_rep == Float::INFINITY}}/
  end

  private

  attr_accessor :base_min, :base_max, :min_rep, :max_rep, :exp_class, :reify

  # \A and \z anchor the whole test string (^ and $ only anchor lines).
  if Regexp.method_defined?(:match?) # ruby >= 2.4
    def test_regexp
      @test_regexp ||= /\A#{to_re}\z/
    end
  else
    def test_regexp
      @test_regexp ||= /\A#{to_re}\z/.tap { |r| def r.match?(s); !!match(s) end }
    end
  end
end
module Regexp::Expression
  module Shared
    # Default: an expression is not negative.
    def negative?
      false
    end

    # not an alias so as to respect overrides of #negative?
    def negated?
      negative?
    end
  end

  # Classes that are negative whatever their content.
  [
    Anchor::NonWordBoundary,
    Assertion::NegativeLookahead,
    Assertion::NegativeLookbehind,
  ].each do |klass|
    klass.class_eval { def negative?; true end }
  end

  CharacterSet.class_eval do
    # Delegates to the set's own #negative attribute.
    def negative?
      negative
    end
  end

  CharacterType::Base.class_eval do
    # Negative if the token name starts with "non".
    def negative?
      token.to_s.start_with?('non')
    end
  end

  PosixClass.class_eval do
    def negative?
      type == :nonposixclass
    end
  end

  UnicodeProperty::Base.class_eval do
    def negative?
      type == :nonproperty
    end
  end
end
module Regexp::Expression
  class Base
    # Option predicates. Each entry maps a key of #options to the predicate
    # that reports it, followed by the aliases of that predicate. A predicate
    # returns true only if the option's value is equal to true.
    {
      m: %i[multiline? m?],
      i: %i[case_insensitive? i? ignore_case?],
      x: %i[free_spacing? x? extended?],
      d: %i[default_classes? d?],
      a: %i[ascii_classes? a?],
      u: %i[unicode_classes? u?],
    }.each do |flag, (predicate, *other_names)|
      define_method(predicate) { options[flag] == true }
      other_names.each { |other_name| alias_method other_name, predicate }
    end
  end
end
module Regexp::Expression
  module Shared
    # Short inspection string: the class name followed by the instance
    # variables selected by #pretty_print_instance_variables.
    def inspect
      shown = pretty_print_instance_variables.map do |ivar|
        " #{ivar}=#{instance_variable_get(ivar).inspect}"
      end
      "#<#{self.class}#{shown.join}>"
    end

    # Make pretty-print work despite #inspect implementation.
    def pretty_print(q)
      q.pp_object(self)
    end

    # Called by pretty_print (ruby/pp) and #inspect.
    # Lists only instance variables that carry information: a non-empty text,
    # a quantifier if quantified, non-empty options, and the expressions of
    # non-terminal expressions.
    def pretty_print_instance_variables
      ivars = []
      ivars << :@text unless text.to_s.empty?
      ivars << :@quantifier if quantified?
      ivars << :@options unless options.empty?
      ivars << :@expressions unless terminal?
      ivars
    end
  end
end
24 | end 25 | end 26 | 27 | Base.include ReferencedExpressions 28 | end 29 | -------------------------------------------------------------------------------- /lib/regexp_parser/expression/methods/strfregexp.rb: -------------------------------------------------------------------------------- 1 | module Regexp::Expression 2 | class Base 3 | 4 | # %l Level (depth) of the expression. Returns 'root' for the root 5 | # expression, returns zero or higher for all others. 6 | # 7 | # %> Indentation at expression's level. 8 | # 9 | # %x Index of the expression at its depth. Available when using 10 | # the sprintf_tree method only. 11 | # 12 | # %s Start offset within the whole expression. 13 | # %e End offset within the whole expression. 14 | # %S Length of expression. 15 | # 16 | # %o Coded offset and length, same as '@%s+%S' 17 | # 18 | # %y Type of expression. 19 | # %k Token of expression. 20 | # %i ID, same as '%y:%k' 21 | # %c Class name 22 | # 23 | # %q Quantifier info, as {m[,M]} 24 | # %Q Quantifier text 25 | # 26 | # %z Quantifier min 27 | # %Z Quantifier max 28 | # 29 | # %t Base text of the expression (excludes quantifier, if any) 30 | # %~t Full text if the expression is terminal, otherwise %i 31 | # %T Full text of the expression (includes quantifier, if any) 32 | # 33 | # %b Basic info, same as '%o %i' 34 | # %m Most info, same as '%b %q' 35 | # %a All info, same as '%m %t' 36 | # 37 | def strfregexp(format = '%a', indent_offset = 0, index = nil) 38 | have_index = index ? true : false 39 | 40 | part = {} 41 | 42 | print_level = nesting_level > 0 ? nesting_level - 1 : nil 43 | 44 | # Order is important! Fields that use other fields in their 45 | # definition must appear before the fields they use. 46 | part_keys = %w[a m b o i l x s e S y k c q Q z Z t ~t T >] 47 | part.keys.each {|k| part[k] = ""} 48 | 49 | part['>'] = print_level ? (' ' * (print_level + indent_offset)) : '' 50 | 51 | part['l'] = print_level ? 
"#{'%d' % print_level}" : 'root' 52 | part['x'] = "#{'%d' % index}" if have_index 53 | 54 | part['s'] = starts_at 55 | part['S'] = full_length 56 | part['e'] = starts_at + full_length 57 | part['o'] = coded_offset 58 | 59 | part['k'] = token 60 | part['y'] = type 61 | part['i'] = '%y:%k' 62 | part['c'] = self.class.name 63 | 64 | if quantified? 65 | if quantifier.max == -1 66 | part['q'] = "{#{quantifier.min}, or-more}" 67 | else 68 | part['q'] = "{#{quantifier.min}, #{quantifier.max}}" 69 | end 70 | 71 | part['Q'] = quantifier.text 72 | part['z'] = quantifier.min 73 | part['Z'] = quantifier.max 74 | else 75 | part['q'] = '{1}' 76 | part['Q'] = '' 77 | part['z'] = '1' 78 | part['Z'] = '1' 79 | end 80 | 81 | part['t'] = to_s(:base) 82 | part['~t'] = terminal? ? to_s : "#{type}:#{token}" 83 | part['T'] = to_s(:full) 84 | 85 | part['b'] = '%o %i' 86 | part['m'] = '%b %q' 87 | part['a'] = '%m %t' 88 | 89 | out = format.dup 90 | 91 | part_keys.each do |k| 92 | out.gsub!(/%#{k}/, part[k].to_s) 93 | end 94 | 95 | out 96 | end 97 | 98 | alias :strfre :strfregexp 99 | end 100 | 101 | class Subexpression < Regexp::Expression::Base 102 | def strfregexp_tree(format = '%a', include_self = true, separator = "\n") 103 | output = include_self ? [self.strfregexp(format)] : [] 104 | 105 | output += flat_map do |exp, index| 106 | exp.strfregexp(format, (include_self ? 1 : 0), index) 107 | end 108 | 109 | output.join(separator) 110 | end 111 | 112 | alias :strfre_tree :strfregexp_tree 113 | end 114 | end 115 | -------------------------------------------------------------------------------- /lib/regexp_parser/expression/methods/traverse.rb: -------------------------------------------------------------------------------- 1 | module Regexp::Expression 2 | class Subexpression < Regexp::Expression::Base 3 | 4 | # Traverses the expression, passing each recursive child to the 5 | # given block. 
# Walks the children of this subexpression in order, descending into each
# non-terminal child before moving on to its next sibling, and calls the
# given block with three arguments: the traversal event, the expression, and
# the index of the expression within its parent.
#
# Events:
#
#   - :visit for a terminal expression (called once).
#   - :enter before and :exit after the children of a non-terminal one.
#
# If +include_self+ is true, the receiver itself is reported with :enter and
# :exit at index 0. Without a block, an Enumerator is returned.
#
# Returns self.
def traverse(include_self = false, &visitor)
  return enum_for(__method__, include_self) unless visitor

  visitor.call(:enter, self, 0) if include_self

  each_with_index do |child, position|
    next visitor.call(:visit, child, position) if child.terminal?

    visitor.call(:enter, child, position)
    child.traverse(&visitor)
    visitor.call(:exit, child, position)
  end

  visitor.call(:exit, self, 0) if include_self

  self
end
alias :walk :traverse
# Returns a new array with the results of calling the given block once for
# everything yielded by #each_expression. A block of arity 2 receives all
# yielded values; any other block receives only the first one.
# If a block is not given, returns what #each_expression enumerates, as an
# array.
def flat_map(include_self = false, &mapper)
  return each_expression(include_self).to_a if mapper.nil?
  return each_expression(include_self).map(&mapper) if mapper.arity == 2

  each_expression(include_self).map { |exp| mapper.call(exp) }
end
# Derives :min, :max and :mode from the quantifier text and memoizes the
# result.
#
# '?', '*' and '+' have fixed bounds; any other text is read as an interval
# such as `{2,5}`. A max of -1 stands for "no upper bound". The mode depends
# on a '?' or '+' that follows another character of the text.
def derived_data
  return @derived_data if @derived_data

  first_char = text[0]
  if first_char == '?'
    lower, upper = 0, 1
  elsif first_char == '*'
    lower, upper = 0, -1
  elsif first_char == '+'
    lower, upper = 1, -1
  else
    raw_min = text[/\{(\d*)/, 1]
    raw_max = text[/,?(\d*)\}/, 1]
    lower = raw_min.to_i
    upper = raw_max.empty? ? -1 : raw_max.to_i
  end

  mode =
    case text[/.([?+])/, 1]
    when '?' then :reluctant
    when '+' then :possessive
    else :greedy
    end

  @derived_data = { min: lower, max: upper, mode: mode }
end
module Regexp::Expression
  # A sequence of expressions. Unlike a plain Subexpression, it does not
  # apply a quantifier to itself as a whole but hands it on to the target
  # found by #extract_quantifier_target.
  #
  # Used as the base class for the Alternation alternatives, Conditional
  # branches, and CharacterSet::Intersection intersected sequences.
  class Sequence < Regexp::Expression::Subexpression
    # Constructs a sequence that takes its levels from +exp+, gives it
    # +active_opts+ as options, appends it to the expressions of +exp+ and
    # returns it. +params+ may provide :ts and override :conditional_level.
    def self.add_to(exp, params = {}, active_opts = {})
      construct(
        level: exp.level,
        set_level: exp.set_level,
        conditional_level: params[:conditional_level] || exp.conditional_level,
        ts: params[:ts]
      ).tap do |sequence|
        sequence.options = active_opts
        exp.expressions << sequence
      end
    end

    # Start offset of the first expression, or the sequence's own @ts while
    # it has no expressions.
    def ts
      head = expressions.first
      head ? head.ts : @ts
    end

    def quantify(token, *args)
      target = extract_quantifier_target(token.text)
      target.quantify(token, *args)
    end
  end
end
module Regexp::Expression
  # Behavior shared by everything that includes this module. Including it
  # extends the includer with Shared::ClassMethods and defines the attribute
  # accessors listed in .included.
  module Shared
    module ClassMethods; end # filled in ./methods/*.rb

    def self.included(mod)
      mod.extend(Shared::ClassMethods)

      mod.send(
        :attr_accessor,
        :type, :token, :text, :ts, :te,
        :level, :set_level, :conditional_level,
        :options, :parent,
        :custom_to_s_handling, :pre_quantifier_decorations
      )

      mod.send(:attr_reader, :nesting_level, :quantifier)
    end

    # Copies the attributes of +token+ onto self, resets the nesting level
    # to 0 and stores +options+ (an empty Hash if nil is given).
    def init_from_token_and_options(token, options = {})
      self.type = token.type
      self.token = token.token
      self.text = token.text
      self.ts = token.ts
      self.te = token.te
      self.level = token.level
      self.set_level = token.set_level
      self.conditional_level = token.conditional_level
      self.nesting_level = 0
      self.options = options || {}
    end
    private :init_from_token_and_options

    # Gives the copy its own text, options, quantifier and decorations, and
    # detaches it from the original's parent.
    def initialize_copy(orig)
      if (source_text = orig.text)
        self.text = source_text.dup
      end
      if (source_options = orig.options)
        self.options = source_options.dup
      end
      if (source_quantifier = orig.quantifier)
        self.quantifier = source_quantifier.clone
      end
      self.parent = nil # updated by Subexpression#initialize_copy
      if (decorations = orig.pre_quantifier_decorations)
        self.pre_quantifier_decorations = decorations.map(&:dup)
      end
      super
    end

    def starts_at
      ts
    end

    def ends_at(include_quantifier = true)
      length = include_quantifier ? full_length : base_length
      ts + length
    end

    def base_length
      to_s(:base).length
    end

    def full_length
      to_s(:original).length
    end

    # #to_s reproduces the original source, as an unparser would.
    #
    # It takes an optional format argument.
    #
    # Example:
    #
    # lit = Regexp::Parser.parse(/a +/x)[0]
    #
    # lit.to_s            # => 'a+'  # default; with quantifier
    # lit.to_s(:full)     # => 'a+'  # default; with quantifier
    # lit.to_s(:base)     # => 'a'   # without quantifier
    # lit.to_s(:original) # => 'a +' # with quantifier AND intermittent decorations
    #
    def to_s(format = :full)
      buffer = ''.dup
      parts.each do |part|
        next buffer << part if part.instance_of?(String)

        # parts flagged with custom_to_s_handling are left out here
        buffer << part.to_s(:original) unless part.custom_to_s_handling
      end
      "#{buffer}#{pre_quantifier_decoration(format)}#{quantifier_affix(format)}"
    end
    alias :to_str :to_s

    # The joined decorations, but only for the :original format.
    def pre_quantifier_decoration(expression_format = :original)
      return unless expression_format == :original

      pre_quantifier_decorations.to_a.join
    end

    # The quantifier as a String, unless unquantified or in :base format.
    def quantifier_affix(expression_format = :full)
      return unless quantified? && expression_format != :base

      quantifier.to_s
    end

    def offset
      [starts_at, full_length]
    end

    def coded_offset
      '@%d+%d' % offset
    end

    # Sets the nesting level, passes it on to the quantifier (if any), and
    # gives each child of a non-terminal expression the next deeper level.
    def nesting_level=(lvl)
      @nesting_level = lvl
      quantifier.nesting_level = lvl if quantifier
      each { |subexp| subexp.nesting_level = lvl + 1 } unless terminal?
    end

    def quantifier=(qtf)
      @quantifier = qtf
      @repetitions = nil # clear memoized value
    end
  end
end
54 | exp.custom_to_s_handling = true 55 | pre_quantifier_decorations << exp.text 56 | next 57 | end 58 | exp 59 | end 60 | target or raise Regexp::Parser::ParserError, 61 | "No valid target found for '#{quantifier_description}' quantifier" 62 | 63 | target.pre_quantifier_decorations = pre_quantifier_decorations 64 | target 65 | end 66 | end 67 | end 68 | -------------------------------------------------------------------------------- /lib/regexp_parser/scanner/char_type.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine re_char_type; 3 | 4 | single_codepoint_char_type = [dDhHsSwW]; 5 | multi_codepoint_char_type = [RX]; 6 | 7 | char_type_char = single_codepoint_char_type | multi_codepoint_char_type; 8 | 9 | # Char types scanner 10 | # -------------------------------------------------------------------------- 11 | char_type := |* 12 | char_type_char { 13 | case text = copy(data, ts-1, te) 14 | when '\d'; emit(:type, :digit, text) 15 | when '\D'; emit(:type, :nondigit, text) 16 | when '\h'; emit(:type, :hex, text) 17 | when '\H'; emit(:type, :nonhex, text) 18 | when '\s'; emit(:type, :space, text) 19 | when '\S'; emit(:type, :nonspace, text) 20 | when '\w'; emit(:type, :word, text) 21 | when '\W'; emit(:type, :nonword, text) 22 | when '\R'; emit(:type, :linebreak, text) 23 | when '\X'; emit(:type, :xgrapheme, text) 24 | end 25 | fret; 26 | }; 27 | *|; 28 | }%% 29 | -------------------------------------------------------------------------------- /lib/regexp_parser/scanner/errors/premature_end_error.rb: -------------------------------------------------------------------------------- 1 | class Regexp::Scanner 2 | # Unexpected end of pattern 3 | class PrematureEndError < ScannerError 4 | def initialize(where = '') 5 | super "Premature end of pattern at #{where}" 6 | end 7 | end 8 | end 9 | -------------------------------------------------------------------------------- 
class Regexp::Scanner
  # Common ancestor of all validation errors raised by the scanner.
  class ValidationError < ScannerError
    class << self
      # Single entry point for building validation errors: looks up the
      # error class registered for +type+ and instantiates it with
      # +problem+ and +reason+. An unregistered +type+ makes Hash#fetch
      # raise a KeyError.
      def for(type, problem, reason = nil)
        error_class = types.fetch(type)
        error_class.new(problem, reason)
      end

      # Memoized table mapping type symbols to error classes.
      def types
        @types ||= {
          backref:      InvalidBackrefError,
          group:        InvalidGroupError,
          group_option: InvalidGroupOption,
          posix_class:  UnknownPosixClassError,
          property:     UnknownUnicodePropertyError,
          sequence:     InvalidSequenceError,
        }
      end
    end
  end

  # Invalid sequence format. Mainly used for escape sequences.
  class InvalidSequenceError < ValidationError
    def initialize(what = 'sequence', where = '')
      super("Invalid #{what} at #{where}")
    end
  end

  # Invalid group. Used for named groups.
  class InvalidGroupError < ValidationError
    def initialize(what, reason)
      super("Invalid #{what}, #{reason}.")
    end
  end

  # Invalid group option. Used for inline options.
  # TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
  class InvalidGroupOption < ValidationError
    def initialize(option, text)
      super("Invalid group option #{option} in #{text}")
    end
  end

  # Invalid back reference. Used for name and number refs/calls.
  class InvalidBackrefError < ValidationError
    def initialize(what, reason)
      super("Invalid back reference #{what}, #{reason}")
    end
  end

  # The property name was not recognized by the scanner.
  # The second argument is accepted for signature parity and ignored.
  class UnknownUnicodePropertyError < ValidationError
    def initialize(name, _reason)
      super("Unknown unicode character property name #{name}")
    end
  end

  # The POSIX class name was not recognized by the scanner.
  # The second argument is accepted for signature parity and ignored.
  class UnknownPosixClassError < ValidationError
    def initialize(text, _reason)
      super("Unknown POSIX class #{text}")
    end
  end
end
:nonproperty : :property 19 | 20 | name = text[3..-2].gsub(/[\^\s_\-]/, '').downcase 21 | 22 | token = self.class.short_prop_map[name] || self.class.long_prop_map[name] 23 | raise ValidationError.for(:property, name) unless token 24 | 25 | self.emit(type, token.to_sym, text) 26 | 27 | fret; 28 | }; 29 | *|; 30 | }%% 31 | -------------------------------------------------------------------------------- /lib/regexp_parser/syntax.rb: -------------------------------------------------------------------------------- 1 | require_relative 'error' 2 | 3 | module Regexp::Syntax 4 | class SyntaxError < Regexp::Parser::Error; end 5 | end 6 | 7 | require_relative 'syntax/token' 8 | require_relative 'syntax/base' 9 | require_relative 'syntax/any' 10 | require_relative 'syntax/version_lookup' 11 | require_relative 'syntax/versions' 12 | -------------------------------------------------------------------------------- /lib/regexp_parser/syntax/any.rb: -------------------------------------------------------------------------------- 1 | module Regexp::Syntax 2 | # A syntax that always returns true, passing all tokens as implemented. This 3 | # is useful during development, testing, and should be useful for some types 4 | # of transformations as well. 
module Regexp::Syntax
  # Raised by Base.implements! for an unsupported type/token pair.
  class NotImplementedError < Regexp::Syntax::SyntaxError
    def initialize(syntax, type, token)
      super("#{syntax} does not implement: [#{type}:#{token}]")
    end
  end

  # A lookup map of the types, and the tokens per type, that a given
  # syntax supports. All of the API lives on the class itself.
  class Base
    include Regexp::Syntax::Token

    class << self
      attr_accessor :features

      # Gives each subclass its own copy of the parent's feature table
      # (token arrays are dup'ed), so features are inherited through the
      # syntax class hierarchy without sharing arrays.
      def inherited(subclass)
        super
        subclass.features =
          features.to_h.each_with_object({}) do |(type, tokens), copy|
            copy[type] = tokens.dup
          end
      end

      # Adds +tokens+ to the supported tokens of +type+ and records them
      # as the features added by this class for that type.
      def implements(type, tokens)
        features[type] ||= []
        features[type].concat(tokens)
        added_features[type] = tokens
      end

      # Removes +tokens+ from the supported tokens of +type+ and records
      # them as the features removed by this class for that type.
      def excludes(type, tokens)
        tokens.each do |token|
          features[type].delete(token)
        end
        removed_features[type] = tokens
      end

      # True when +token+ is listed among the tokens supported for +type+.
      def implements?(type, token)
        supported = implementations(type)
        supported.include?(token)
      end
      alias :check? :implements?

      # Tokens supported for +type+; an empty array for unknown types.
      def implementations(type)
        features[type] || []
      end

      # Like implements?, but raises NotImplementedError instead of
      # returning false.
      def implements!(type, token)
        return if implements?(type, token)

        raise NotImplementedError.new(self, type, token)
      end
      alias :check! :implements!

      # Features recorded by #implements on this class, keyed by type.
      def added_features
        @added_features ||= {}
      end

      # Features recorded by #excludes on this class, keyed by type.
      def removed_features
        @removed_features ||= {}
      end

      # Maps delimiter-specific group/backref tokens to their generic
      # [type, token] pair; any other pair is returned as given.
      def normalize(type, token)
        case type
        when :group   then normalize_group(type, token)
        when :backref then normalize_backref(type, token)
        else               [type, token]
        end
      end

      # :named_ab and :named_sq both normalize to [:group, :named].
      def normalize_group(type, token)
        %i[named_ab named_sq].include?(token) ? %i[group named] : [type, token]
      end

      # Strips the _ab / _sq suffix from the known backref tokens, e.g.
      # :name_ref_sq becomes [:backref, :name_ref].
      def normalize_backref(type, token)
        normalized = %i[
          name_ref name_call name_recursion_ref
          number_ref number_call
          number_rel_ref number_rel_call
          number_recursion_ref
        ].find do |name|
          [:"#{name}_ab", :"#{name}_sq"].include?(token)
        end

        normalized ? [:backref, normalized] : [type, token]
      end
    end

    # TODO: drop this backwards compatibility code in v3.0.0, do `private :new`
    def initialize
      warn 'Using instances of Regexp::Parser::Syntax is deprecated ' \
           "and will no longer be supported in v3.0.0."
    end

    # Forwards calls the class responds to (with a deprecation warning)
    # and falls back to the default behavior for everything else.
    def method_missing(name, *args)
      return super unless self.class.respond_to?(name)

      warn 'Using instances of Regexp::Parser::Syntax is deprecated ' \
           "and will no longer be supported in v3.0.0. Please call "\
           "methods on the class directly, e.g.: #{self.class}.#{name}"
      self.class.send(name, *args)
    end

    def respond_to_missing?(name, include_private = false)
      return true if self.class.respond_to?(name)

      super
    end
    # end of backwards compatibility code
  end
end
module Regexp::Syntax
  module Token
    # Anchor token names, grouped into subsets that are combined into All
    # and registered in the token Map under Type.
    module Anchor
      Basic      = [:bol, :eol]
      Extended   = [*Basic, :word_boundary, :nonword_boundary]
      String     = [:bos, :eos, :eos_ob_eol]
      MatchStart = [:match_start]

      All  = [*Extended, *String, *MatchStart]
      Type = :anchor
    end

    Map[Anchor::Type] = Anchor::All
  end
end
module Regexp::Syntax
  module Token
    # Character type token names, grouped into subsets that are combined
    # into All and registered in the token Map under Type.
    module CharacterType
      Basic     = []
      Extended  = [:digit, :nondigit, :space, :nonspace, :word, :nonword]
      Hex       = [:hex, :nonhex]

      Clustered = [:linebreak, :xgrapheme]

      All  = [*Basic, *Extended, *Hex, *Clustered]
      Type = :type
    end

    Map[CharacterType::Type] = CharacterType::All
  end
end
module Regexp::Syntax
  module Token
    # Escape token names, grouped into subsets that are combined into All
    # and registered in the token Map under Type.
    module Escape
      Basic   = [:backslash, :literal]

      Control = [:control, :meta_sequence]

      ASCII   = [
        :bell, :backspace, :escape, :form_feed, :newline, :carriage,
        :tab, :vertical_tab
      ]

      Unicode = [:codepoint, :codepoint_list]

      Meta    = [
        :dot, :alternation,
        :zero_or_one, :zero_or_more, :one_or_more,
        :bol, :eol,
        :group_open, :group_close,
        :interval_open, :interval_close,
        :set_open, :set_close
      ]

      Hex     = [:hex]

      Octal   = [:octal]

      All  = [*Basic, *Control, *ASCII, *Unicode, *Meta, *Hex, *Octal]
      Type = :escape
    end

    Map[Escape::Type] = Escape::All

    # alias for symmetry between Token::* and Expression::*
    EscapeSequence = Escape
  end
end
module Regexp::Syntax
  module Token
    # Meta token names, registered in the token Map under Type.
    module Meta
      Basic       = [:dot]
      Alternation = [:alternation]
      Extended    = [*Basic, *Alternation]

      All  = Extended
      Type = :meta
    end

    Map[Meta::Type] = Meta::All

    # alias for symmetry between Token::* and Expression::*
    module Alternation
      All  = Meta::Alternation
      Type = Meta::Type
    end
  end
end
module Regexp::Syntax
  VERSION_FORMAT = '\Aruby/\d+\.\d+(\.\d+)?\z'
  VERSION_REGEXP = /#{VERSION_FORMAT}/
  VERSION_CONST_REGEXP = /\AV\d+_\d+(?:_\d+)?\z/

  # Raised for names that do not match VERSION_FORMAT.
  class InvalidVersionNameError < Regexp::Syntax::SyntaxError
    def initialize(name)
      super("Invalid version name '#{name}'. Expected format is '#{VERSION_FORMAT}'")
    end
  end

  # Raised when no syntax class can be resolved for a well-formed name.
  class UnknownSyntaxNameError < Regexp::Syntax::SyntaxError
    def initialize(name)
      super("Unknown syntax name '#{name}'.")
    end
  end

  module_function

  # Returns the syntax specification class for the given syntax version
  # name, caching the result per name. The special names 'any' and '*'
  # return Syntax::Any.
  def for(name)
    @alias_map ||= {}
    @alias_map[name] ||= version_class(name)
  end

  # Deprecated alias of .for; emits a warning on every call.
  def new(name)
    warn 'Regexp::Syntax.new is deprecated in favor of Regexp::Syntax.for. '\
         'It does not return distinct instances and will be removed in v3.0.0.'
    self.for(name)
  end

  # Truthy when +name+ is a well-formed version name that compares as at
  # least '1.8.6'.
  def supported?(name)
    well_formed = name =~ VERSION_REGEXP
    well_formed && comparable(name) >= comparable('1.8.6')
  end

  # Resolves a version name such as 'ruby/2.4.1' to its V2_4_1 constant.
  # Raises InvalidVersionNameError for malformed names and
  # UnknownSyntaxNameError when the constant lookup yields nothing.
  def version_class(version)
    return Regexp::Syntax::Any if ['*', 'any'].include?(version.to_s)

    raise InvalidVersionNameError, version unless version =~ VERSION_REGEXP

    digits = version.to_s.scan(/\d+/)
    version_const_name = "V#{digits.join('_')}"
    const_get(version_const_name) || raise(UnknownSyntaxNameError, version)
  end

  # Version-like constants that are not defined resolve to the nearest
  # lower specified version; everything else takes the default path.
  def const_missing(const_name)
    return super unless const_name =~ VERSION_CONST_REGEXP

    fallback_version_class(const_name)
  end

  # Returns the class of the specified version that sorts directly before
  # +version+, or false when +version+ sorts first.
  def fallback_version_class(version)
    candidates = specified_versions + [version]
    sorted = candidates.sort_by { |ver| comparable(ver) }
    index = sorted.index(version)
    index > 0 ? const_get(sorted[index - 1]) : false
  end

  # Names of all constants in this module that look like version classes.
  def specified_versions
    constants.grep(VERSION_CONST_REGEXP)
  end

  def comparable(name)
    # add .99 to treat versions without a patch value as latest patch version
    segments = name.to_s.scan(/\d+/)
    Gem::Version.new([*segments, 99].join('.'))
  end
end
# Feature set declared for the 1.8.6 syntax; later versions inherit from it.
class Regexp::Syntax::V1_8_6 < Regexp::Syntax::Base
  {
    anchor:     Anchor::All,
    assertion:  Assertion::Lookahead,
    backref:    Backreference::V1_8_6,
    escape:     Escape::Basic + Escape::ASCII + Escape::Meta + Escape::Control,
    free_space: FreeSpace::All,
    group:      Group::V1_8_6,
    literal:    Literal::All,
    meta:       Meta::Extended,
    posixclass: PosixClass::Standard,
    quantifier: Quantifier::V1_8_6,
    set:        CharacterSet::All,
    type:       CharacterType::Extended,
  }.each do |type, tokens|
    implements type, tokens
  end
end
# Feature set declared for the 2.0.0 syntax, on top of 1.9.3.
class Regexp::Syntax::V2_0_0 < Regexp::Syntax::V1_9_3
  unicode_types = %i[property nonproperty]

  implements :keep,        Keep::All
  implements :conditional, Conditional::All
  unicode_types.each { |type| implements type, UnicodeProperty::V2_0_0 }
  implements :type,        CharacterType::Clustered

  # :newline is dropped from both property types in this version
  unicode_types.each { |type| excludes type, %i[newline] }
end
# Feature set declared for the 2.4.1 syntax: 2.4.0 plus the group tokens
# listed in Group::V2_4_1.
class Regexp::Syntax::V2_4_1 < Regexp::Syntax::V2_4_0
  implements(:group, Group::V2_4_1)
end
class Regexp
  # Member names of Regexp::Token, in positional order.
  TOKEN_KEYS = [
    :type,
    :token,
    :text,
    :ts,
    :te,
    :level,
    :set_level,
    :conditional_level
  ].freeze

  # A token struct with one member per TOKEN_KEYS entry, plus plain
  # accessors for its neighbouring tokens.
  Token = Struct.new(*TOKEN_KEYS) do
    attr_accessor :previous, :next

    # Returns the [ts, te] pair.
    def offset
      start, finish = ts, te
      [start, finish]
    end

    # Returns the distance between te and ts.
    def length
      start, finish = ts, te
      finish - start
    end
  end
end
expressions.' 11 | spec.homepage = 'https://github.com/ammar/regexp_parser' 12 | 13 | spec.metadata['bug_tracker_uri'] = "#{spec.homepage}/issues" 14 | spec.metadata['changelog_uri'] = "#{spec.homepage}/blob/master/CHANGELOG.md" 15 | spec.metadata['homepage_uri'] = spec.homepage 16 | spec.metadata['source_code_uri'] = spec.homepage 17 | spec.metadata['wiki_uri'] = "#{spec.homepage}/wiki" 18 | 19 | spec.metadata['rubygems_mfa_required'] = 'true' 20 | 21 | spec.authors = ['Ammar Ali', 'Janosch Müller'] 22 | spec.email = ['ammarabuali@gmail.com', 'janosch84@gmail.com'] 23 | 24 | spec.license = 'MIT' 25 | 26 | spec.require_paths = ['lib'] 27 | 28 | spec.files = Dir.glob('lib/**/*.{csv,rb,rl}') + 29 | %w[Gemfile Rakefile LICENSE regexp_parser.gemspec] 30 | 31 | spec.platform = Gem::Platform::RUBY 32 | 33 | spec.required_ruby_version = '>= 2.0.0' 34 | end 35 | -------------------------------------------------------------------------------- /spec/expression/base_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe(Regexp::Expression::Base) do 4 | # test #level 5 | include_examples 'parse', /^a(b(c(d)))e$/, 6 | [0] => [to_s: '^', level: 0], 7 | [1] => [to_s: 'a', level: 0], 8 | [2] => [to_s: '(b(c(d)))', level: 0], 9 | [2, 0] => [to_s: 'b', level: 1], 10 | [2, 1] => [to_s: '(c(d))', level: 1], 11 | [2, 1, 0] => [to_s: 'c', level: 2], 12 | [2, 1, 1] => [to_s: '(d)', level: 2], 13 | [2, 1, 1, 0] => [to_s: 'd', level: 3], 14 | [3] => [to_s: 'e', level: 0], 15 | [4] => [to_s: '$', level: 0] 16 | 17 | # test #coded_offset 18 | include_examples 'parse', /^a*(b+(c?))$/, 19 | [] => [Root, coded_offset: '@0+12'], 20 | [0] => [to_s: '^', coded_offset: '@0+1'], 21 | [1] => [to_s: 'a*', coded_offset: '@1+2'], 22 | [2] => [to_s: '(b+(c?))', coded_offset: '@3+8'], 23 | [2, 0] => [to_s: 'b+', coded_offset: '@4+2'], 24 | [2, 1] => [to_s: '(c?)', coded_offset: '@6+4'], 25 | [2, 1, 0] => [to_s: 'c?', 
coded_offset: '@7+2'], 26 | [3] => [to_s: '$', coded_offset: '@11+1'] 27 | 28 | # test #quantity 29 | include_examples 'parse', /aa/, [0] => [quantity: [nil, nil]] 30 | include_examples 'parse', /a?/, [0] => [quantity: [0, 1]] 31 | include_examples 'parse', /a*/, [0] => [quantity: [0, -1]] 32 | include_examples 'parse', /a+/, [0] => [quantity: [1, -1]] 33 | 34 | # test #repetitions 35 | include_examples 'parse', /aa/, [0] => [repetitions: 1..1] 36 | include_examples 'parse', /a?/, [0] => [repetitions: 0..1] 37 | include_examples 'parse', /a*/, [0] => [repetitions: 0..(Float::INFINITY)] 38 | include_examples 'parse', /a+/, [0] => [repetitions: 1..(Float::INFINITY)] 39 | 40 | # test #base_length, #full_length, #starts_at, #ends_at 41 | include_examples 'parse', /(aa)/, 42 | [] => [Root, base_length: 4, full_length: 4, starts_at: 0, ends_at: 4], 43 | [0] => [Group::Capture, base_length: 4, full_length: 4, starts_at: 0, ends_at: 4], 44 | [0, 0] => [Literal, base_length: 2, full_length: 2, starts_at: 1, ends_at: 3] 45 | include_examples 'parse', /(aa){42}/, 46 | [] => [Root, base_length: 8, full_length: 8, starts_at: 0, ends_at: 8], 47 | [0] => [Group::Capture, base_length: 4, full_length: 8, starts_at: 0, ends_at: 8], 48 | [0, 0] => [Literal, base_length: 2, full_length: 2, starts_at: 1, ends_at: 3] 49 | include_examples 'parse', /(aa) {42}/x, 50 | [] => [Root, base_length: 9, full_length: 9, starts_at: 0, ends_at: 9], 51 | [0] => [Group::Capture, base_length: 4, full_length: 9, starts_at: 0, ends_at: 9], 52 | [0, 0] => [Literal, base_length: 2, full_length: 2, starts_at: 1, ends_at: 3] 53 | 54 | # test #to_re 55 | include_examples 'parse', '^a*(b([cde]+))+f?$', 56 | [] => [Root, to_re: /^a*(b([cde]+))+f?$/] 57 | 58 | specify '#parent' do 59 | root = Regexp::Parser.parse(/(a(b)){42}/) 60 | 61 | expect(root.parent).to be_nil 62 | expect(root[0].parent).to eq root 63 | expect(root[0].quantifier.parent).to be_nil 64 | expect(root[0][0].parent).to eq root[0] 65 | 
expect(root[0][1].parent).to eq root[0] 66 | expect(root[0][1][0].parent).to eq root[0][1] 67 | end 68 | 69 | specify '#to_re warns when used on set members' do 70 | expect do 71 | result = Regexp::Parser.parse(/[\b]/)[0][0].to_re 72 | expect(result).to eq(/\b/) 73 | end.to output(/set member/).to_stderr 74 | end 75 | 76 | specify 'updating #quantifier updates #repetitions' do 77 | exp = Regexp::Parser.parse(/a{3}/)[0] 78 | expect(exp.repetitions).to eq 3..3 79 | exp.quantifier = Regexp::Parser.parse(/b{5}/)[0].quantifier 80 | expect(exp.repetitions).to eq 5..5 81 | end 82 | end 83 | -------------------------------------------------------------------------------- /spec/expression/conditional_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe(Regexp::Expression::Conditional) do 4 | specify('Conditional#condition, #branches') do 5 | conditional = RP.parse(/(?a)(?()T|F)/)[1] 6 | expect(conditional.condition).to eq conditional[0] 7 | expect(conditional.branches).to eq conditional[1..2] 8 | end 9 | 10 | specify('Condition#referenced_expression') do 11 | root = RP.parse(/(?a)(?()T|F)/) 12 | condition = root[1].condition 13 | expect(condition.referenced_expression).to eq root[0] 14 | expect(condition.referenced_expression.to_s).to eq '(?a)' 15 | 16 | root = RP.parse(/(a)(?(1)T|F)/) 17 | condition = root[1].condition 18 | expect(condition.referenced_expression).to eq root[0] 19 | expect(condition.referenced_expression.to_s).to eq '(a)' 20 | end 21 | 22 | specify('parse conditional excessive branches') do 23 | regexp = '(?a)(?()T|F|X)' 24 | 25 | expect { RP.parse(regexp) }.to raise_error(Conditional::TooManyBranches) 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /spec/expression/free_space_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | 
RSpec.describe(Regexp::Expression::FreeSpace) do 4 | specify('white space quantify raises error') do 5 | regexp = / 6 | a # Comment 7 | /x 8 | 9 | root = RP.parse(regexp) 10 | space = root[0] 11 | 12 | expect(space).to be_instance_of(FreeSpace::WhiteSpace) 13 | expect { space.quantify(:dummy, '#') }.to raise_error(Regexp::Parser::Error) 14 | end 15 | 16 | specify('comment quantify raises error') do 17 | regexp = / 18 | a # Comment 19 | /x 20 | 21 | root = RP.parse(regexp) 22 | comment = root[3] 23 | 24 | expect(comment).to be_instance_of(FreeSpace::Comment) 25 | expect { comment.quantify(:dummy, '#') }.to raise_error(Regexp::Parser::Error) 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /spec/expression/methods/construct_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe(Regexp::Expression::Shared) do 4 | describe '::construct' do 5 | { 6 | Alternation => :meta, 7 | Alternative => :expression, 8 | Anchor::Base => :anchor, 9 | Anchor::EndOfLine => :anchor, 10 | Assertion::Base => :assertion, 11 | Assertion::Lookahead => :assertion, 12 | Backreference::Base => :backref, 13 | Backreference::Number => :backref, 14 | CharacterSet => :set, 15 | CharacterSet::IntersectedSequence => :expression, 16 | CharacterSet::Intersection => :set, 17 | CharacterSet::Range => :set, 18 | CharacterType::Any => :meta, 19 | CharacterType::Base => :type, 20 | CharacterType::Digit => :type, 21 | Conditional::Branch => :expression, 22 | Conditional::Condition => :conditional, 23 | Conditional::Expression => :conditional, 24 | EscapeSequence::Base => :escape, 25 | EscapeSequence::Literal => :escape, 26 | FreeSpace => :free_space, 27 | Group::Base => :group, 28 | Group::Capture => :group, 29 | Keep::Mark => :keep, 30 | Literal => :literal, 31 | PosixClass => :posixclass, 32 | Quantifier => :quantifier, 33 | Root => :expression, 34 | 
UnicodeProperty::Base => :property, 35 | UnicodeProperty::Number::Decimal => :property, 36 | }.each do |klass, expected_type| 37 | it "works for #{klass}" do 38 | result = klass.construct 39 | expect(result).to be_a klass 40 | expect(result.type).to eq expected_type 41 | end 42 | end 43 | 44 | it 'allows overriding defaults' do 45 | expect(Literal.construct(type: :foo).type).to eq :foo 46 | end 47 | 48 | it 'allows passing options' do 49 | expect(Literal.construct(options: { i: true }).options[:i]).to eq true 50 | end 51 | 52 | it 'raises ArgumentError for unknown parameters' do 53 | expect { Literal.construct(foo: :foo) }.to raise_error(ArgumentError) 54 | end 55 | end 56 | end 57 | -------------------------------------------------------------------------------- /spec/expression/methods/human_name_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('Regexp::Expression::Shared#human_name') do 4 | include_examples 'parse', //, [] => [human_name: 'root'] 5 | include_examples 'parse', /a/, [0] => [human_name: 'literal'] 6 | include_examples 'parse', /./, [0] => [human_name: 'match-all'] 7 | include_examples 'parse', /[abc]/, [0] => [human_name: 'character set'] 8 | include_examples 'parse', /[a-c]/, [0, 0] => [human_name: 'character range'] 9 | include_examples 'parse', /\d/, [0] => [human_name: 'digit type'] 10 | include_examples 'parse', /\n/, [0] => [human_name: 'newline escape'] 11 | include_examples 'parse', /\u{61 62 63}/, [0] => [human_name: 'codepoint list escape'] 12 | include_examples 'parse', /\p{ascii}/, [0] => [human_name: 'ascii property'] 13 | include_examples 'parse', /[[:ascii:]]/, [0, 0] => [human_name: 'ascii posixclass'] 14 | include_examples 'parse', /a{5}/, [0, :q] => [human_name: 'interval quantifier'] 15 | include_examples 'parse', /^/, [0] => [human_name: 'beginning of line'] 16 | include_examples 'parse', /(?=abc)/, [0] => [human_name: 'lookahead'] 17 | 
include_examples 'parse', /(a)(b)/, [0] => [human_name: 'capture group 1'] 18 | include_examples 'parse', /(a)(b)/, [1] => [human_name: 'capture group 2'] 19 | include_examples 'parse', /(?abc)/, [0] => [human_name: 'named capture group'] 20 | include_examples 'parse', / /x, [0] => [human_name: 'free space'] 21 | include_examples 'parse', /#comment 22 | /x, [0] => [human_name: 'comment'] 23 | include_examples 'parse', /(?#comment)/x, [0] => [human_name: 'comment group'] 24 | include_examples 'parse', /(abc)\1/, [1] => [human_name: 'backreference'] 25 | include_examples 'parse', /(?)\k/, [1] => [human_name: 'backreference by name'] 26 | include_examples 'parse', /(abc)\g<-1>/, [1] => [human_name: 'relative subexpression call'] 27 | include_examples 'parse', /a|bc/, [0] => [human_name: 'alternation'] 28 | include_examples 'parse', /a|bc/, [0, 0] => [human_name: 'alternative'] 29 | end 30 | -------------------------------------------------------------------------------- /spec/expression/methods/match_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('Expression::Base#match') do 4 | it 'returns the #match result of the respective Regexp' do 5 | expect(RP.parse(/a/).match('a')[0]).to eq 'a' 6 | end 7 | 8 | it 'can be given an offset, just like Regexp#match' do 9 | expect(RP.parse(/./).match('ab', 1)[0]).to eq 'b' 10 | end 11 | 12 | it 'works with the #=~ alias' do 13 | expect(RP.parse(/a/) =~ 'a').to be_a MatchData 14 | end 15 | end 16 | 17 | RSpec.describe('Expression::Base#match?') do 18 | it 'returns true if the Respective Regexp matches' do 19 | expect(RP.parse(/a/).match?('a')).to be true 20 | end 21 | 22 | it 'returns false if the Respective Regexp does not match' do 23 | expect(RP.parse(/a/).match?('b')).to be false 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /spec/expression/methods/negative_spec.rb: 
-------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('Expression::Base#negative?') do 4 | include_examples 'parse', //, [] => [:root, negative?: false] 5 | include_examples 'parse', /a/, [0] => [:literal, negative?: false] 6 | 7 | include_examples 'parse', /\b/, [0] => [:word_boundary, negative?: false] 8 | include_examples 'parse', /\B/, [0] => [:nonword_boundary, negative?: true] 9 | 10 | include_examples 'parse', /(?=)/, [0] => [:lookahead, negative?: false] 11 | include_examples 'parse', /(?!)/, [0] => [:nlookahead, negative?: true] 12 | 13 | include_examples 'parse', /(?<=)/, [0] => [:lookbehind, negative?: false] 14 | include_examples 'parse', /(? [:nlookbehind, negative?: true] 15 | 16 | include_examples 'parse', /[a]/, [0] => [:character, negative?: false] 17 | include_examples 'parse', /[^a]/, [0] => [:character, negative?: true] 18 | 19 | include_examples 'parse', /\d/, [0] => [:digit, negative?: false] 20 | include_examples 'parse', /\D/, [0] => [:nondigit, negative?: true] 21 | 22 | include_examples 'parse', /[[:word:]]/, [0, 0] => [:word, negative?: false] 23 | include_examples 'parse', /[[:^word:]]/, [0, 0] => [:word, negative?: true] 24 | 25 | include_examples 'parse', /\p{word}/, [0] => [:word, negative?: false] 26 | include_examples 'parse', /\p{^word}/, [0] => [:word, negative?: true] 27 | 28 | include_examples 'parse', //, [] => [:root, negated?: false] 29 | include_examples 'parse', /[^a]/, [0] => [:character, negated?: true] 30 | end 31 | -------------------------------------------------------------------------------- /spec/expression/methods/parts_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('Expression::Base#parts') do 4 | include_examples 'parse', //, [] => [:root, parts: []] 5 | include_examples 'parse', /a/, [0] => [:literal, parts: ['a']] 6 | include_examples 'parse', 
/\K/, [0] => [:mark, parts: ['\K']] 7 | include_examples 'parse', /\p{any}/, [0] => [:any, parts: ['\p{any}']] 8 | include_examples 'parse', /[a]/, [0] => [:character, parts: ['[', s(Literal, 'a'), ']']] 9 | include_examples 'parse', /[^a]/, [0] => [:character, parts: ['[^', s(Literal, 'a'), ']']] 10 | include_examples 'parse', /(a)/, [0] => [:capture, parts: ['(', s(Literal, 'a'), ')']] 11 | include_examples 'parse', /(?>a)/, [0] => [:atomic, parts: ['(?>', s(Literal, 'a'), ')']] 12 | include_examples 'parse', /(?=a)/, [0] => [:lookahead, parts: ['(?=', s(Literal, 'a'), ')']] 13 | include_examples 'parse', /(?#a)/, [0] => [:comment, parts: ['(?#a)']] 14 | 15 | include_examples 'parse', /(a(b(c)))/, 16 | [0] => [:capture, parts: [ 17 | '(', 18 | s(Literal, 'a'), 19 | s(Group::Capture, '(', 20 | s(Literal, 'b'), 21 | s(Group::Capture, '(', 22 | s(Literal, 'c'), 23 | ) 24 | ), 25 | ')' 26 | ]] 27 | 28 | include_examples 'parse', /a|b|c/, 29 | [] => [:root, parts: [ 30 | s(Alternation, '|', 31 | s(Alternative, nil, s(Literal, 'a')), 32 | s(Alternative, nil, s(Literal, 'b')), 33 | s(Alternative, nil, s(Literal, 'c')) 34 | ) 35 | ]], 36 | [0] => [:alternation, parts: [ 37 | s(Alternative, nil, s(Literal, 'a')), 38 | '|', 39 | s(Alternative, nil, s(Literal, 'b')), 40 | '|', 41 | s(Alternative, nil, s(Literal, 'c')) 42 | ]] 43 | 44 | include_examples 'parse', /[a-z]/, 45 | [] => [:root, parts: [ 46 | s(CharacterSet, '[', 47 | s(CharacterSet::Range, '-', s(Literal, 'a'), s(Literal, 'z')), 48 | ) 49 | ]], 50 | [0] => [:character, parts: [ 51 | '[', 52 | s(CharacterSet::Range, '-', s(Literal, 'a'), s(Literal, 'z')), 53 | ']' 54 | ]], 55 | [0, 0] => [:range, parts: [ 56 | s(Literal, 'a'), 57 | '-', 58 | s(Literal, 'z') 59 | ]] 60 | 61 | include_examples 'parse', /[a&&b&&c]/, 62 | [] => [:root, parts: [ 63 | s(CharacterSet, '[', 64 | s(CharacterSet::Intersection, '&&', 65 | s(CharacterSet::IntersectedSequence, nil, s(Literal, 'a')), 66 | s(CharacterSet::IntersectedSequence, 
nil, s(Literal, 'b')), 67 | s(CharacterSet::IntersectedSequence, nil, s(Literal, 'c')) 68 | ) 69 | ) 70 | ]], 71 | [0, 0] => [:intersection, parts: [ 72 | s(CharacterSet::IntersectedSequence, nil, s(Literal, 'a')), 73 | '&&', 74 | s(CharacterSet::IntersectedSequence, nil, s(Literal, 'b')), 75 | '&&', 76 | s(CharacterSet::IntersectedSequence, nil, s(Literal, 'c')) 77 | ]] 78 | 79 | include_examples 'parse', /(a)(?(1)T|F)/, 80 | [1] => [Conditional::Expression, parts: [ 81 | '(?', 82 | s(Conditional::Condition, '(1)'), 83 | s(Conditional::Branch, nil, s(Literal, 'T')), 84 | '|', 85 | s(Conditional::Branch, nil, s(Literal, 'F')), 86 | ')' 87 | ]] 88 | end 89 | -------------------------------------------------------------------------------- /spec/expression/methods/printing_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('Expression::Shared#inspect') do 4 | it 'includes only essential information' do 5 | root = Regexp::Parser.parse(//) 6 | expect(root.inspect).to eq '#' 7 | 8 | root = Regexp::Parser.parse(/(a)+/) 9 | expect(root.inspect) 10 | .to match(/#' 15 | expect(root[0][0].inspect) 16 | .to eq '#' 17 | end 18 | end 19 | 20 | RSpec.describe('Expression::Shared#pretty_print') do 21 | it 'works' do 22 | require 'pp' 23 | pp_to_s = ->(arg) { ''.dup.tap { |buffer| PP.new(buffer).pp(arg) } } 24 | 25 | root = Regexp::Parser.parse(/(a)+/) 26 | 27 | expect(pp_to_s.(root)).to start_with '# [Alternation, ts: 0, te: 19], 7 | [0, 0] => [Alternative, ts: 0, te: 4], 8 | [0, 1] => [Alternative, ts: 5, te: 9], 9 | [0, 2] => [Alternative, ts: 10, te: 14], 10 | [0, 3] => [Alternative, ts: 15, te: 19] 11 | 12 | # check #nesting_level 13 | include_examples 'parse', /a(b(\d|[ef-g[h]]))/, 14 | [0] => [Literal, to_s: 'a', nesting_level: 1], 15 | [1, 0] => [Literal, to_s: 'b', nesting_level: 2], 16 | [1, 1, 0] => [Alternation, to_s: '\d|[ef-g[h]]', nesting_level: 3], 17 | [1, 1, 0, 0] => 
[Alternative, to_s: '\d', nesting_level: 4], 18 | [1, 1, 0, 0, 0] => [CharacterType::Digit, to_s: '\d', nesting_level: 5], 19 | [1, 1, 0, 1] => [Alternative, to_s: '[ef-g[h]]', nesting_level: 4], 20 | [1, 1, 0, 1, 0] => [CharacterSet, to_s: '[ef-g[h]]', nesting_level: 5], 21 | [1, 1, 0, 1, 0, 0] => [Literal, to_s: 'e', nesting_level: 6], 22 | [1, 1, 0, 1, 0, 1] => [CharacterSet::Range, to_s: 'f-g', nesting_level: 6], 23 | [1, 1, 0, 1, 0, 1, 0] => [Literal, to_s: 'f', nesting_level: 7], 24 | [1, 1, 0, 1, 0, 2, 0] => [Literal, to_s: 'h', nesting_level: 7] 25 | 26 | specify('#dig') do 27 | root = RP.parse(/(((a)))/) 28 | 29 | expect(root.dig(0).to_s).to eq '(((a)))' 30 | expect(root.dig(0, 0, 0, 0).to_s).to eq 'a' 31 | expect(root.dig(0, 0, 0, 0, 0)).to be_nil 32 | expect(root.dig(3, 7)).to be_nil 33 | end 34 | end 35 | -------------------------------------------------------------------------------- /spec/expression/te_ts_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('Expression::Shared#te,ts') do 4 | # Many tokens/expressions have their own tests for #te and #ts. 5 | # This is an integration-like test to ensure they are correct in conjunction. 
6 | it 'is correct irrespective of nesting or preceding tokens' do 7 | regexp = regexp_with_all_features 8 | source = regexp.source 9 | root = RP.parse(regexp) 10 | 11 | checked_exps = root.each_expression.with_object([]) do |(exp), acc| 12 | acc.each { |e| fail "dupe: #{[e, exp]}" if e.to_s == exp.to_s } 13 | acc << exp unless exp.is_a?(Sequence) || exp.is_a?(WhiteSpace) 14 | end 15 | expect(checked_exps).not_to be_empty 16 | 17 | checked_exps.each do |exp| 18 | start = source.index(exp.to_s(:original)) 19 | expect(exp.ts).to eq(start), 20 | "expected #{exp.class} #{exp} to start at #{start}, got #{exp.ts}" 21 | 22 | end_idx = start + exp.base_length 23 | expect(exp.te).to eq(end_idx), 24 | "expected #{exp.class} #{exp} to end at #{end_idx}, got #{exp.te}" 25 | end 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /spec/expression/to_h_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('Expression::Base#to_h') do 4 | include_examples 'parse', /abc/, [] => [Root, to_h: { 5 | token: :root, 6 | type: :expression, 7 | text: 'abc', 8 | starts_at: 0, 9 | length: 3, 10 | quantifier: nil, 11 | options: {}, 12 | level: 0, 13 | set_level: 0, 14 | conditional_level: 0, 15 | expressions: [ 16 | { 17 | token: :literal, 18 | type: :literal, 19 | text: 'abc', 20 | starts_at: 0, 21 | length: 3, 22 | quantifier: nil, 23 | options: {}, 24 | level: 0, 25 | set_level: 0, 26 | conditional_level: 0 27 | } 28 | ] 29 | }] 30 | 31 | include_examples 'parse', /a{2,4}/, [0, :q] => [Quantifier, to_h: { 32 | max: 4, 33 | min: 2, 34 | mode: :greedy, 35 | text: '{2,4}', 36 | token: :interval, 37 | }] 38 | 39 | specify('Conditional#to_h') do 40 | root = RP.parse('(?a)(?()b|c)') 41 | expect { root.to_h }.not_to(raise_error) 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /spec/expression/to_s_spec.rb: 
-------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('Expression::Base#to_s') do 4 | def parse_frozen(pattern) 5 | Leto.deep_freeze(RP.parse(pattern)) 6 | end 7 | 8 | def expect_round_trip(pattern) 9 | parsed = parse_frozen(pattern) 10 | 11 | expect(parsed.to_s).to eql(pattern) 12 | end 13 | 14 | specify('literal alternation') do 15 | expect_round_trip('abcd|ghij|klmn|pqur') 16 | end 17 | 18 | specify('quantified alternations') do 19 | expect_round_trip('(?:a?[b]+(c){2}|d+[e]*(f)?)|(?:g+[h]?(i){2,3}|j*[k]{3,5}(l)?)') 20 | end 21 | 22 | specify('quantified sets') do 23 | expect_round_trip('[abc]+|[^def]{3,6}') 24 | end 25 | 26 | specify('property sets') do 27 | expect_round_trip('[\a\b\p{Lu}\P{Z}\c\d]+') 28 | end 29 | 30 | specify('groups') do 31 | expect_round_trip("(a(?>b(?:c(?d(?'N'e)??f)+g)*+h)*i)++") 32 | end 33 | 34 | specify('assertions') do 35 | expect_round_trip('(a+(?=b+(?!c+(?<=d+(?a)(?()b|c)/, 5 | 3 => [:conditional, :open, '(?', 7, 9, 0, 0, 0], 6 | 4 => [:conditional, :condition, '()', 9, 14, 0, 0, 1], 7 | 6 => [:conditional, :separator, '|', 15, 16, 0, 0, 1], 8 | 8 => [:conditional, :close, ')', 17, 18, 0, 0, 0] 9 | 10 | include_examples 'lex', /((?a)(?(?()b|((?()[e-g]|[h-j])))))/, 11 | 0 => [:group, :capture, '(', 0, 1, 0, 0, 0], 12 | 1 => [:group, :named, '(?', 1, 6, 1, 0, 0], 13 | 5 => [:conditional, :open, '(?', 13, 15, 2, 0, 0], 14 | 6 => [:conditional, :condition, '()', 15, 20, 2, 0, 1], 15 | 8 => [:conditional, :separator, '|', 21, 22, 2, 0, 1], 16 | 10 => [:conditional, :open, '(?', 23, 25, 3, 0, 1], 17 | 11 => [:conditional, :condition, '()', 25, 30, 3, 0, 2], 18 | 12 => [:set, :open, '[', 30, 31, 3, 0, 2], 19 | 13 => [:literal, :literal, 'e', 31, 32, 3, 1, 2], 20 | 14 => [:set, :range, '-', 32, 33, 3, 1, 2], 21 | 15 => [:literal, :literal, 'g', 33, 34, 3, 1, 2], 22 | 16 => [:set, :close, ']', 34, 35, 3, 0, 2], 23 | 17 => [:conditional, :separator, '|', 35, 36, 3, 0, 
2], 24 | 23 => [:conditional, :close, ')', 41, 42, 3, 0, 1], 25 | 25 => [:conditional, :close, ')', 43, 44, 2, 0, 0], 26 | 26 => [:group, :close, ')', 44, 45, 1, 0, 0], 27 | 27 => [:group, :close, ')', 45, 46, 0, 0, 0] 28 | 29 | include_examples 'lex', /(a(b(c)))(?(1)(?(2)(?(3)d|e))|(?(3)(?(2)f|g)|(?(1)f|g)))/, 30 | 9 => [:conditional, :open, '(?', 9, 11, 0, 0, 0], 31 | 10 => [:conditional, :condition, '(1)', 11, 14, 0, 0, 1], 32 | 11 => [:conditional, :open, '(?', 14, 16, 0, 0, 1], 33 | 12 => [:conditional, :condition, '(2)', 16, 19, 0, 0, 2], 34 | 13 => [:conditional, :open, '(?', 19, 21, 0, 0, 2], 35 | 14 => [:conditional, :condition, '(3)', 21, 24, 0, 0, 3], 36 | 16 => [:conditional, :separator, '|', 25, 26, 0, 0, 3], 37 | 18 => [:conditional, :close, ')', 27, 28, 0, 0, 2], 38 | 19 => [:conditional, :close, ')', 28, 29, 0, 0, 1], 39 | 20 => [:conditional, :separator, '|', 29, 30, 0, 0, 1], 40 | 21 => [:conditional, :open, '(?', 30, 32, 0, 0, 1], 41 | 22 => [:conditional, :condition, '(3)', 32, 35, 0, 0, 2], 42 | 23 => [:conditional, :open, '(?', 35, 37, 0, 0, 2], 43 | 24 => [:conditional, :condition, '(2)', 37, 40, 0, 0, 3], 44 | 26 => [:conditional, :separator, '|', 41, 42, 0, 0, 3], 45 | 28 => [:conditional, :close, ')', 43, 44, 0, 0, 2], 46 | 29 => [:conditional, :separator, '|', 44, 45, 0, 0, 2], 47 | 30 => [:conditional, :open, '(?', 45, 47, 0, 0, 2], 48 | 31 => [:conditional, :condition, '(1)', 47, 50, 0, 0, 3], 49 | 33 => [:conditional, :separator, '|', 51, 52, 0, 0, 3], 50 | 35 => [:conditional, :close, ')', 53, 54, 0, 0, 2], 51 | 36 => [:conditional, :close, ')', 54, 55, 0, 0, 1], 52 | 37 => [:conditional, :close, ')', 55, 56, 0, 0, 0] 53 | end 54 | -------------------------------------------------------------------------------- /spec/lexer/delimiters_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('Literal delimiter lexing') do 4 | include_examples 'lex', '}', 5 
| 0 => [:literal, :literal, '}', 0, 1, 0, 0, 0] 6 | 7 | include_examples 'lex', '}}', 8 | 0 => [:literal, :literal, '}}', 0, 2, 0, 0, 0] 9 | 10 | include_examples 'lex', '{', 11 | 0 => [:literal, :literal, '{', 0, 1, 0, 0, 0] 12 | 13 | include_examples 'lex', '{{', 14 | 0 => [:literal, :literal, '{{', 0, 2, 0, 0, 0] 15 | 16 | include_examples 'lex', '{}', 17 | 0 => [:literal, :literal, '{}', 0, 2, 0, 0, 0] 18 | 19 | include_examples 'lex', '}{', 20 | 0 => [:literal, :literal, '}{', 0, 2, 0, 0, 0] 21 | 22 | include_examples 'lex', '}{+', 23 | 0 => [:literal, :literal, '}', 0, 1, 0, 0, 0], 24 | 1 => [:literal, :literal, '{', 1, 2, 0, 0, 0], 25 | 2 => [:quantifier, :one_or_more, '+', 2, 3, 0, 0, 0] 26 | 27 | include_examples 'lex', '{{var}}', 28 | 0 => [:literal, :literal, '{{var}}', 0, 7, 0, 0, 0] 29 | 30 | include_examples 'lex', 'a{b}c', 31 | 0 => [:literal, :literal, 'a{b}c', 0, 5, 0, 0, 0] 32 | 33 | include_examples 'lex', 'a{1,2', 34 | 0 => [:literal, :literal, 'a{1,2', 0, 5, 0, 0, 0] 35 | 36 | include_examples 'lex', '({.+})', 37 | 0 => [:group, :capture, '(', 0, 1, 0, 0, 0], 38 | 1 => [:literal, :literal, '{', 1, 2, 1, 0, 0], 39 | 2 => [:meta, :dot, '.', 2, 3, 1, 0, 0], 40 | 3 => [:quantifier, :one_or_more, '+', 3, 4, 1, 0, 0], 41 | 4 => [:literal, :literal, '}', 4, 5, 1, 0, 0], 42 | 5 => [:group, :close, ')', 5, 6, 0, 0, 0] 43 | 44 | include_examples 'lex', ']', 45 | 0 => [:literal, :literal, ']', 0, 1, 0, 0, 0] 46 | 47 | include_examples 'lex', ']]', 48 | 0 => [:literal, :literal, ']]', 0, 2, 0, 0, 0] 49 | 50 | include_examples 'lex', ']\[', 51 | 0 => [:literal, :literal, ']', 0, 1, 0, 0, 0], 52 | 1 => [:escape, :set_open, '\[', 1, 3, 0, 0, 0] 53 | 54 | include_examples 'lex', '()', 55 | 0 => [:group, :capture, '(', 0, 1, 0, 0, 0], 56 | 1 => [:group, :close, ')', 1, 2, 0, 0, 0] 57 | 58 | include_examples 'lex', '{abc:.+}}}[^}]]}', 59 | 0 => [:literal, :literal, '{abc:', 0, 5, 0, 0, 0], 60 | 1 => [:meta, :dot, '.', 5, 6, 0, 0, 0], 61 | 2 => [:quantifier, 
:one_or_more, '+', 6, 7, 0, 0, 0], 62 | 3 => [:literal, :literal, '}}}', 7, 10, 0, 0, 0], 63 | 4 => [:set, :open, '[', 10, 11, 0, 0, 0], 64 | 5 => [:set, :negate, '^', 11, 12, 0, 1, 0], 65 | 6 => [:literal, :literal, '}', 12, 13, 0, 1, 0], 66 | 7 => [:set, :close, ']', 13, 14, 0, 0, 0], 67 | 8 => [:literal, :literal, ']}', 14, 16, 0, 0, 0] 68 | end 69 | -------------------------------------------------------------------------------- /spec/lexer/escapes_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('Escape lexing') do 4 | include_examples 'lex', '\u{62}', 5 | 0 => [:escape, :codepoint_list, '\u{62}', 0, 6, 0, 0, 0] 6 | 7 | include_examples 'lex', '\u{62 63 64}', 8 | 0 => [:escape, :codepoint_list, '\u{62 63 64}', 0, 12, 0, 0, 0] 9 | 10 | include_examples 'lex', '\u{62 63 64}+', 11 | 0 => [:escape, :codepoint_list, '\u{62 63}', 0, 9, 0, 0, 0], 12 | 1 => [:escape, :codepoint_list, '\u{64}', 9, 15, 0, 0, 0], 13 | 2 => [:quantifier, :one_or_more, '+', 15, 16, 0, 0, 0] 14 | end 15 | -------------------------------------------------------------------------------- /spec/lexer/keep_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('Keep lexing') do 4 | include_examples 'lex', /ab\Kcd/, 5 | 1 => [:keep, :mark, '\K', 2, 4, 0, 0, 0] 6 | 7 | include_examples 'lex', /(a\Kb)|(c\\\Kd)ef/, 8 | 2 => [:keep, :mark, '\K', 2, 4, 1, 0, 0], 9 | 9 => [:keep, :mark, '\K', 11, 13, 1, 0, 0] 10 | end 11 | -------------------------------------------------------------------------------- /spec/lexer/literals_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('Literal lexing') do 4 | # ascii, single byte characters 5 | include_examples 'lex', 'a', 6 | 0 => [:literal, :literal, 'a', 0, 1, 0, 0, 0] 7 | 8 | include_examples 'lex', 
'ab+', 9 | 0 => [:literal, :literal, 'a', 0, 1, 0, 0, 0], 10 | 1 => [:literal, :literal, 'b', 1, 2, 0, 0, 0], 11 | 2 => [:quantifier, :one_or_more, '+', 2, 3, 0, 0, 0] 12 | 13 | # 2 byte wide characters 14 | include_examples 'lex', 'äöü+', 15 | 0 => [:literal, :literal, 'äö', 0, 2, 0, 0, 0], 16 | 1 => [:literal, :literal, 'ü', 2, 3, 0, 0, 0], 17 | 2 => [:quantifier, :one_or_more, '+', 3, 4, 0, 0, 0] 18 | 19 | # 3 byte wide characters, Japanese 20 | include_examples 'lex', 'ab?れます+cd', 21 | 0 => [:literal, :literal, 'a', 0, 1, 0, 0, 0], 22 | 1 => [:literal, :literal, 'b', 1, 2, 0, 0, 0], 23 | 2 => [:quantifier, :zero_or_one, '?', 2, 3, 0, 0, 0], 24 | 3 => [:literal, :literal, 'れま', 3, 5, 0, 0, 0], 25 | 4 => [:literal, :literal, 'す', 5, 6, 0, 0, 0], 26 | 5 => [:quantifier, :one_or_more, '+', 6, 7, 0, 0, 0], 27 | 6 => [:literal, :literal, 'cd', 7, 9, 0, 0, 0] 28 | 29 | # 4 byte wide characters, Osmanya 30 | include_examples 'lex', '𐒀𐒁?𐒂ab+𐒃', 31 | 0 => [:literal, :literal, '𐒀', 0, 1, 0, 0, 0], 32 | 1 => [:literal, :literal, '𐒁', 1, 2, 0, 0, 0], 33 | 2 => [:quantifier, :zero_or_one, '?', 2, 3, 0, 0, 0], 34 | 3 => [:literal, :literal, '𐒂a', 3, 5, 0, 0, 0], 35 | 4 => [:literal, :literal, 'b', 5, 6, 0, 0, 0], 36 | 5 => [:quantifier, :one_or_more, '+', 6, 7, 0, 0, 0], 37 | 6 => [:literal, :literal, '𐒃', 7, 8, 0, 0, 0] 38 | 39 | include_examples 'lex', 'mu𝄞?si*𝄫c+', 40 | 0 => [:literal, :literal, 'mu', 0, 2, 0, 0, 0], 41 | 1 => [:literal, :literal, '𝄞', 2, 3, 0, 0, 0], 42 | 2 => [:quantifier, :zero_or_one, '?', 3, 4, 0, 0, 0], 43 | 3 => [:literal, :literal, 's', 4, 5, 0, 0, 0], 44 | 4 => [:literal, :literal, 'i', 5, 6, 0, 0, 0], 45 | 5 => [:quantifier, :zero_or_more, '*', 6, 7, 0, 0, 0], 46 | 6 => [:literal, :literal, '𝄫', 7, 8, 0, 0, 0], 47 | 7 => [:literal, :literal, 'c', 8, 9, 0, 0, 0], 48 | 8 => [:quantifier, :one_or_more, '+', 9, 10, 0, 0, 0] 49 | 50 | specify('lex single 2 byte char') do 51 | tokens = RL.lex("\u0627+") 52 | expect(tokens.count).to eq 2 53 | end 54 | 
55 | specify('lex single 3 byte char') do 56 | tokens = RL.lex("\u308C+") 57 | expect(tokens.count).to eq 2 58 | end 59 | 60 | specify('lex single 4 byte char') do 61 | tokens = RL.lex("\u{1D11E}+") 62 | expect(tokens.count).to eq 2 63 | end 64 | end 65 | -------------------------------------------------------------------------------- /spec/lexer/refcalls_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('RefCall lexing') do 4 | # Traditional numerical group back-reference 5 | include_examples 'lex', '(abc)\1', 6 | 3 => [:backref, :number, '\1', 5, 7, 0, 0, 0] 7 | 8 | # Group back-references, named, numbered, and relative 9 | include_examples 'lex', '(?abc)\k', 10 | 3 => [:backref, :name_ref, '\k', 9, 14, 0, 0, 0] 11 | include_examples 'lex', "(?abc)\\k'X'", 12 | 3 => [:backref, :name_ref, "\\k'X'", 9, 14, 0, 0, 0] 13 | 14 | include_examples 'lex', '(abc)\k<1>', 15 | 3 => [:backref, :number_ref, '\k<1>', 5, 10, 0, 0, 0] 16 | include_examples 'lex', "(abc)\\k'1'", 17 | 3 => [:backref, :number_ref, "\\k'1'", 5, 10, 0, 0, 0] 18 | 19 | include_examples 'lex', '(abc)\k<-1>', 20 | 3 => [:backref, :number_rel_ref, '\k<-1>', 5, 11, 0, 0, 0] 21 | include_examples 'lex', "(abc)\\k'-1'", 22 | 3 => [:backref, :number_rel_ref, "\\k'-1'", 5, 11, 0, 0, 0] 23 | 24 | # Sub-expression invocation, named, numbered, and relative 25 | include_examples 'lex', '(?abc)\g', 26 | 3 => [:backref, :name_call, '\g', 9, 14, 0, 0, 0] 27 | include_examples 'lex', "(?abc)\\g'X'", 28 | 3 => [:backref, :name_call, "\\g'X'", 9, 14, 0, 0, 0] 29 | 30 | include_examples 'lex', '(abc)\g<1>', 31 | 3 => [:backref, :number_call, '\g<1>', 5, 10, 0, 0, 0] 32 | include_examples 'lex', "(abc)\\g'1'", 33 | 3 => [:backref, :number_call, "\\g'1'", 5, 10, 0, 0, 0] 34 | 35 | include_examples 'lex', '\g<0>', 36 | 0 => [:backref, :number_call, '\g<0>', 0, 5, 0, 0, 0] 37 | include_examples 'lex', "\\g'0'", 38 | 0 => [:backref, 
:number_call, "\\g'0'", 0, 5, 0, 0, 0] 39 | 40 | include_examples 'lex', '(abc)\g<-1>', 41 | 3 => [:backref, :number_rel_call, '\g<-1>', 5, 11, 0, 0, 0] 42 | include_examples 'lex', "(abc)\\g'-1'", 43 | 3 => [:backref, :number_rel_call, "\\g'-1'", 5, 11, 0, 0, 0] 44 | 45 | include_examples 'lex', '(abc)\g<+1>', 46 | 3 => [:backref, :number_rel_call, '\g<+1>', 5, 11, 0, 0, 0] 47 | include_examples 'lex', "(abc)\\g'+1'", 48 | 3 => [:backref, :number_rel_call, "\\g'+1'", 5, 11, 0, 0, 0] 49 | 50 | # Group back-references, with nesting level 51 | include_examples 'lex', '(?abc)\k', 52 | 3 => [:backref, :name_recursion_ref, '\k', 9, 16, 0, 0, 0] 53 | include_examples 'lex', "(?abc)\\k'X-0'", 54 | 3 => [:backref, :name_recursion_ref, "\\k'X-0'", 9, 16, 0, 0, 0] 55 | 56 | include_examples 'lex', '(abc)\k<1-0>', 57 | 3 => [:backref, :number_recursion_ref, '\k<1-0>', 5, 12, 0, 0, 0] 58 | include_examples 'lex', "(abc)\\k'1-0'", 59 | 3 => [:backref, :number_recursion_ref, "\\k'1-0'", 5, 12, 0, 0, 0] 60 | end 61 | -------------------------------------------------------------------------------- /spec/parser/all_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe(Regexp::Parser) do 4 | specify('parse returns a root expression') do 5 | expect(RP.parse('abc')).to be_instance_of(Root) 6 | end 7 | 8 | specify('parse can be called with block') do 9 | expect(RP.parse('abc') { |root| root.class }).to eq Root 10 | end 11 | 12 | specify('parse root contains expressions') do 13 | root = RP.parse(/^a.c+[^one]{2,3}\b\d\\\C-C$/) 14 | expect(root.expressions).to all(be_a Regexp::Expression::Base) 15 | end 16 | 17 | specify('parse root options mi') do 18 | root = RP.parse(/[abc]/mi) 19 | 20 | expect(root.m?).to be true 21 | expect(root.i?).to be true 22 | expect(root.x?).to be false 23 | end 24 | 25 | specify('parse no quantifier target raises error') do 26 | expect { RP.parse('?abc') }.to 
raise_error(Regexp::Parser::Error) 27 | end 28 | 29 | specify('parse sequence no quantifier target raises error') do 30 | expect { RP.parse('abc|?def') }.to raise_error(Regexp::Parser::Error) 31 | end 32 | end 33 | -------------------------------------------------------------------------------- /spec/parser/alternation_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('Alternation parsing') do 4 | include_examples 'parse', /a|b/, 5 | [0] => [Alternation, text: '|', count: 2], 6 | [0, 0] => [Alternative, text: '', count: 1], 7 | [0, 0, 0] => [:literal, text: 'a' ], 8 | [0, 1] => [Alternative, text: '', count: 1], 9 | [0, 1, 0] => [:literal, text: 'b' ] 10 | 11 | include_examples 'parse', /a|(b)c/, 12 | [0] => [Alternation, text: '|', count: 2], 13 | [0, 0] => [Alternative, text: '', count: 1], 14 | [0, 0, 0] => [:literal, text: 'a' ], 15 | [0, 1] => [Alternative, text: '', count: 2], 16 | [0, 1, 0] => [:capture, to_s: '(b)' ], 17 | [0, 1, 1] => [:literal, text: 'c' ] 18 | 19 | include_examples 'parse', /(ab??|cd*|ef+)*|(gh|ij|kl)?/, 20 | [0] => [Alternation, text: '|', count: 2, quantified?: false], 21 | [0, 0] => [Alternative, text: '', count: 1, quantified?: false], 22 | [0, 0, 0] => [:capture, count: 1, quantified?: true ], 23 | [0, 0, 0, 0] => [Alternation, text: '|', count: 3 ], 24 | [0, 0, 0, 0, 0] => [Alternative, text: '', count: 2 ], 25 | [0, 0, 0, 0, 0, 0] => [:literal, to_s: 'a' ], 26 | [0, 0, 0, 0, 0, 1] => [:literal, to_s: 'b??' 
], 27 | [0, 1] => [Alternative, text: '', count: 1, quantified?: false], 28 | [0, 1, 0] => [:capture, count: 1, quantified?: true ] 29 | 30 | # test correct ts values for empty sequences 31 | include_examples 'parse', /|||/, 32 | [0] => [Alternation, text: '|', count: 4, starts_at: 0], 33 | [0, 0] => [Alternative, to_s: '', count: 0, starts_at: 0], 34 | [0, 1] => [Alternative, to_s: '', count: 0, starts_at: 1], 35 | [0, 2] => [Alternative, to_s: '', count: 0, starts_at: 2], 36 | [0, 3] => [Alternative, to_s: '', count: 0, starts_at: 3] 37 | 38 | # test correct ts values for non-empty sequences 39 | include_examples 'parse', /ab|cd|ef|gh/, 40 | [0] => [Alternation, text: '|', count: 4, starts_at: 0], 41 | [0, 0] => [Alternative, to_s: 'ab', count: 1, starts_at: 0], 42 | [0, 1] => [Alternative, to_s: 'cd', count: 1, starts_at: 3], 43 | [0, 2] => [Alternative, to_s: 'ef', count: 1, starts_at: 6], 44 | [0, 3] => [Alternative, to_s: 'gh', count: 1, starts_at: 9] 45 | end 46 | -------------------------------------------------------------------------------- /spec/parser/anchors_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('Anchor parsing') do 4 | include_examples 'parse', /^a/, 0 => [:anchor, :bol, Anchor::BOL] 5 | include_examples 'parse', /a$/, 1 => [:anchor, :eol, Anchor::EOL] 6 | 7 | include_examples 'parse', /\Aa/, 0 => [:anchor, :bos, Anchor::BOS] 8 | include_examples 'parse', /a\z/, 1 => [:anchor, :eos, Anchor::EOS] 9 | include_examples 'parse', /a\Z/, 1 => [:anchor, :eos_ob_eol, Anchor::EOSobEOL] 10 | 11 | include_examples 'parse', /a\b/, 1 => [:anchor, :word_boundary, Anchor::WordBoundary] 12 | include_examples 'parse', /a\B/, 1 => [:anchor, :nonword_boundary, Anchor::NonWordBoundary] 13 | 14 | include_examples 'parse', /a\G/, 1 => [:anchor, :match_start, Anchor::MatchStart] 15 | 16 | include_examples 'parse', /\\A/, 0 => [:escape, :backslash, EscapeSequence::Literal] 17 
| end 18 | -------------------------------------------------------------------------------- /spec/parser/conditionals_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('Conditional parsing') do 4 | include_examples 'parse', /(?a)(?()T|F)/, 5 | [1] => [:conditional, :open, Conditional::Expression, to_s: '(?()T|F)', reference: 'A', ts: 7], 6 | [1, 0] => [:conditional, :condition, Conditional::Condition, to_s: '()', reference: 'A', ts: 9], 7 | [1, 1] => [:expression, :sequence, Conditional::Branch, to_s: 'T', ts: 14], 8 | [1, 1, 0] => [:literal, text: 'T', ts: 14], 9 | [1, 2] => [:expression, :sequence, Conditional::Branch, to_s: 'F', ts: 16], 10 | [1, 2, 0] => [:literal, text: 'F', ts: 16] 11 | 12 | include_examples 'parse', /(a)(?(1)T|F)/, 13 | [1] => [:conditional, :open, Conditional::Expression, to_s: '(?(1)T|F)', reference: 1, ts: 3], 14 | [1, 0] => [:conditional, :condition, Conditional::Condition, to_s: '(1)', reference: 1, ts: 5], 15 | [1, 1] => [:expression, :sequence, Conditional::Branch, to_s: 'T', ts: 8], 16 | [1, 1, 0] => [:literal, text: 'T', ts: 8], 17 | [1, 2] => [:expression, :sequence, Conditional::Branch, to_s: 'F', ts: 10], 18 | [1, 2, 0] => [:literal, text: 'F', ts: 10] 19 | 20 | include_examples 'parse', /(foo)(?(1)\d+|(\w)){42}/, 21 | [1] => [Conditional::Expression, quantified?: true, to_s: '(?(1)\d+|(\w)){42}'], 22 | [1, 0] => [Conditional::Condition, quantified?: false], 23 | [1, 1] => [Conditional::Branch, quantified?: false], 24 | [1, 1, 0] => [:digit, quantified?: true, to_s: '\d+'], 25 | [1, 2] => [Conditional::Branch, quantified?: false] 26 | 27 | # test nested and mixed with alternations 28 | include_examples 'parse', <<-EOS.gsub(/\s/, ''), 29 | ( 30 | (a) 31 | | 32 | (b) 33 | | 34 | ( 35 | ( 36 | ?(2) 37 | (c(d|e)+)? 
38 | | 39 | ( 40 | ?(3) 41 | f 42 | | 43 | ( 44 | ?(4) 45 | (g|(h)(i)) 46 | ) 47 | ) 48 | ) 49 | ) 50 | ) 51 | EOS 52 | [0] => [Group::Capture, count: 1], 53 | [0, 0] => [Alternation, count: 3], 54 | [0, 0, 2] => [Alternative, count: 1], 55 | [0, 0, 2, 0] => [Group::Capture, count: 1], 56 | [0, 0, 2, 0, 0] => [Conditional::Expression, count: 3, conditional_level: 0], 57 | [0, 0, 2, 0, 0, 0] => [Conditional::Condition, to_s: '(2)', conditional_level: 1], 58 | [0, 0, 2, 0, 0, 1] => [Conditional::Branch, to_s: '(c(d|e)+)?', conditional_level: 1], 59 | [0, 0, 2, 0, 0, 2] => [Conditional::Branch, to_s: '(?(3)f|(?(4)(g|(h)(i))))', conditional_level: 1], 60 | [0, 0, 2, 0, 0, 2, 0] => [Conditional::Expression, count: 3, conditional_level: 1], 61 | [0, 0, 2, 0, 0, 2, 0, 0] => [Conditional::Condition, to_s: '(3)', conditional_level: 2], 62 | [0, 0, 2, 0, 0, 2, 0, 1] => [Conditional::Branch, count: 1, to_s: 'f', conditional_level: 2], 63 | [0, 0, 2, 0, 0, 2, 0, 1, 0] => [Literal, text: 'f', conditional_level: 2] 64 | 65 | # test empty branch 66 | include_examples 'parse', /(?a)(?()T|)/, 67 | [1] => [Conditional::Expression, count: 3, to_s: '(?()T|)'], 68 | [1, 2] => [Conditional::Branch, to_s: '', ts: 16] 69 | 70 | # test insignificant leading zeros in the condition's group number ref 71 | include_examples 'parse', /(a)(?(001)T)/, 72 | [1, 0] => [Conditional::Condition, to_s: '(001)', reference: 1] 73 | end 74 | -------------------------------------------------------------------------------- /spec/parser/errors_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('Parsing errors') do 4 | let(:parser) { Regexp::Parser.new } 5 | before { parser.parse(/foo/) } # initializes ivars 6 | 7 | it('raises UnknownTokenTypeError for unknown token types') do 8 | expect { parser.send(:parse_token, Regexp::Token.new(:foo, :bar)) } 9 | .to raise_error(Regexp::Parser::UnknownTokenTypeError) 10 | end 11 | 12 
| RSpec.shared_examples 'UnknownTokenError' do |type| 13 | it "raises for unknown tokens of type #{type}" do 14 | expect { parser.send(:parse_token, Regexp::Token.new(type, :foo)) } 15 | .to raise_error(Regexp::Parser::UnknownTokenError) 16 | end 17 | end 18 | 19 | include_examples 'UnknownTokenError', :anchor 20 | include_examples 'UnknownTokenError', :backref 21 | include_examples 'UnknownTokenError', :conditional 22 | include_examples 'UnknownTokenError', :free_space 23 | include_examples 'UnknownTokenError', :group 24 | include_examples 'UnknownTokenError', :meta 25 | include_examples 'UnknownTokenError', :nonproperty 26 | include_examples 'UnknownTokenError', :property 27 | include_examples 'UnknownTokenError', :quantifier 28 | include_examples 'UnknownTokenError', :set 29 | include_examples 'UnknownTokenError', :type 30 | end 31 | -------------------------------------------------------------------------------- /spec/parser/escapes_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('EscapeSequence parsing') do 4 | es = EscapeSequence 5 | 6 | include_examples 'parse', /a\ac/, 1 => [:escape, :bell, es::Bell] 7 | include_examples 'parse', /a\ec/, 1 => [:escape, :escape, es::AsciiEscape] 8 | include_examples 'parse', /a\fc/, 1 => [:escape, :form_feed, es::FormFeed] 9 | include_examples 'parse', /a\nc/, 1 => [:escape, :newline, es::Newline] 10 | include_examples 'parse', /a\rc/, 1 => [:escape, :carriage, es::Return] 11 | include_examples 'parse', /a\tc/, 1 => [:escape, :tab, es::Tab] 12 | include_examples 'parse', /a\vc/, 1 => [:escape, :vertical_tab, es::VerticalTab] 13 | 14 | # meta character escapes 15 | include_examples 'parse', /a\.c/, 1 => [:escape, :dot, es::Literal] 16 | include_examples 'parse', /a\?c/, 1 => [:escape, :zero_or_one, es::Literal] 17 | include_examples 'parse', /a\*c/, 1 => [:escape, :zero_or_more, es::Literal] 18 | include_examples 'parse', /a\+c/, 1 => 
[:escape, :one_or_more, es::Literal] 19 | include_examples 'parse', /a\|c/, 1 => [:escape, :alternation, es::Literal] 20 | include_examples 'parse', /a\(c/, 1 => [:escape, :group_open, es::Literal] 21 | include_examples 'parse', /a\)c/, 1 => [:escape, :group_close, es::Literal] 22 | include_examples 'parse', /a\{c/, 1 => [:escape, :interval_open, es::Literal] 23 | include_examples 'parse', /a\}c/, 1 => [:escape, :interval_close, es::Literal] 24 | 25 | # unicode escapes 26 | include_examples 'parse', /a\u0640/, 1 => [:escape, :codepoint, es::Codepoint] 27 | include_examples 'parse', /a\u{41 1F60D}/, 1 => [:escape, :codepoint_list, es::CodepointList] 28 | include_examples 'parse', /a\u{10FFFF}/, 1 => [:escape, :codepoint_list, es::CodepointList] 29 | 30 | # hex escapes 31 | include_examples 'parse', /a\xFF/n, 1 => [:escape, :hex, es::Hex] 32 | 33 | # octal escapes 34 | include_examples 'parse', /a\177/n, 1 => [:escape, :octal, es::Octal] 35 | 36 | # test #char and #codepoint 37 | include_examples 'parse', /\n/, 0 => [char: "\n", codepoint: 10 ] 38 | include_examples 'parse', /\?/, 0 => [char: '?', codepoint: 63 ] 39 | include_examples 'parse', /\101/, 0 => [char: 'A', codepoint: 65 ] 40 | include_examples 'parse', /\x42/, 0 => [char: 'B', codepoint: 66 ] 41 | include_examples 'parse', /\xA/, 0 => [char: "\n", codepoint: 10 ] 42 | include_examples 'parse', /\u0043/, 0 => [char: 'C', codepoint: 67 ] 43 | include_examples 'parse', /\u{44 45}/, 0 => [chars: %w[D E], codepoints: [68, 69]] 44 | 45 | specify('codepoint_list #char and #codepoint raise errors') do 46 | exp = RP.parse(/\u{44 45}/)[0] 47 | expect { exp.char }.to raise_error(/#chars/) 48 | expect { exp.codepoint }.to raise_error(/#codepoints/) 49 | end 50 | 51 | # Meta/control escapes 52 | # 53 | # After the following fix in Ruby 3.1, a Regexp#source containing meta/control 54 | # escapes can only be set with the Regexp::new constructor. 
55 | # In Regexp literals, these escapes are now pre-processed to hex escapes. 56 | # 57 | # https://github.com/ruby/ruby/commit/11ae581a4a7f5d5f5ec6378872eab8f25381b1b9 58 | n = ->(regexp_body){ Regexp.new(regexp_body.force_encoding('ascii-8bit')) } 59 | 60 | include_examples 'parse', n.('\\\\\c2b'), 1 => [es::Control, text: '\c2', char: "\x12", codepoint: 18 ] 61 | include_examples 'parse', n.('\d\C-C\w'), 1 => [es::Control, text: '\C-C', char: "\x03", codepoint: 3 ] 62 | include_examples 'parse', n.('\Z\M-Z'), 1 => [es::Meta, text: '\M-Z', char: "\u00DA", codepoint: 218] 63 | include_examples 'parse', n.('\A\M-\C-X'), 1 => [es::MetaControl, text: '\M-\C-X', char: "\u0098", codepoint: 152] 64 | include_examples 'parse', n.('\A\M-\cX'), 1 => [es::MetaControl, text: '\M-\cX', char: "\u0098", codepoint: 152] 65 | include_examples 'parse', n.('\A\C-\M-X'), 1 => [es::MetaControl, text: '\C-\M-X', char: "\u0098", codepoint: 152] 66 | include_examples 'parse', n.('\A\c\M-X'), 1 => [es::MetaControl, text: '\c\M-X', char: "\u0098", codepoint: 152] 67 | end 68 | -------------------------------------------------------------------------------- /spec/parser/free_space_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('FreeSpace parsing') do 4 | include_examples 'parse', /a b c/, 5 | [0] => [Literal, text: 'a b c'] 6 | 7 | include_examples 'parse', /a b c/x, 8 | [0] => [Literal, text: 'a'], 9 | [1] => [WhiteSpace, text: ' '], 10 | [2] => [Literal, text: 'b'], 11 | [3] => [WhiteSpace, text: ' '], 12 | [4] => [Literal, text: 'c'] 13 | 14 | include_examples 'parse', /a * b + c/x, 15 | [0] => [Literal, to_s: 'a*', quantified?: true], 16 | [1] => [WhiteSpace, text: ' '], 17 | [2] => [WhiteSpace, text: ' '], 18 | [3] => [Literal, to_s: 'b+', quantified?: true], 19 | [4] => [WhiteSpace, text: ' '], 20 | [5] => [WhiteSpace, text: ' '], 21 | [6] => [Literal, to_s: 'c'] 22 | 23 | include_examples 
'parse', / 24 | a ? # One letter 25 | b {2,5} # Another one 26 | [c-g] + # A set 27 | (h|i|j) # A group 28 | /x, 29 | [0] => [WhiteSpace], 30 | [1] => [Literal, to_s: 'a?', quantified?: true], 31 | [2] => [WhiteSpace, text: ' '], 32 | [3] => [WhiteSpace, text: ' '], 33 | [4] => [Comment, to_s: "# One letter\n"], 34 | [5] => [WhiteSpace], 35 | [6] => [Literal, to_s: 'b{2,5}', quantified?: true], 36 | [7] => [WhiteSpace, text: ' '], 37 | [8] => [WhiteSpace, text: ' '], 38 | [9] => [Comment, to_s: "# Another one\n"], 39 | [10] => [WhiteSpace], 40 | [11] => [CharacterSet, to_s: '[c-g]+', quantified?: true], 41 | [12] => [WhiteSpace], 42 | [13] => [WhiteSpace], 43 | [14] => [Comment, to_s: "# A set\n"], 44 | [15] => [WhiteSpace], 45 | [16] => [Group::Capture], 46 | [17] => [WhiteSpace], 47 | [18] => [Comment, to_s: "# A group\n",] 48 | 49 | include_examples 'parse', / 50 | a 51 | # comment 1 52 | ? 53 | ( 54 | b # comment 2 55 | # comment 3 56 | + 57 | ) 58 | # comment 4 59 | * 60 | /x, 61 | [0] => [WhiteSpace], 62 | [1] => [Literal, to_s: 'a?', quantified?: true], 63 | [2] => [WhiteSpace], 64 | [3] => [Comment], 65 | [4] => [WhiteSpace], 66 | [5] => [WhiteSpace], 67 | [6] => [Group::Capture, quantified?: true], 68 | [6, 0] => [WhiteSpace], 69 | [6, 1] => [Literal, to_s: 'b+', quantified?: true], 70 | [6, 2] => [WhiteSpace], 71 | [6, 3] => [Comment, to_s: "# comment 2\n"], 72 | [6, 4] => [WhiteSpace], 73 | [6, 5] => [Comment, to_s: "# comment 3\n"], 74 | [6, 6] => [WhiteSpace], 75 | [6, 7] => [WhiteSpace] 76 | end 77 | -------------------------------------------------------------------------------- /spec/parser/keep_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('Keep parsing') do 4 | include_examples 'parse', /ab\Kcd/, 1 => [:keep, :mark, Keep::Mark, text: '\K'] 5 | include_examples 'parse', /(a\K)/, [0, 1] => [:keep, :mark, Keep::Mark, text: '\K'] 6 | end 7 | 
# --------------------------------------------------------------------------------
# /spec/parser/options_spec.rb:
# --------------------------------------------------------------------------------
require 'spec_helper'

# Specs for the `options:` keyword of the parser entry point, which is only
# accepted when the input is a String.
RSpec.describe('passing options to parse') do
  it 'raises if parsing from a Regexp and options are passed' do
    expect { RP.parse(/a+/, options: ::Regexp::EXTENDED) }.to raise_error(
      ArgumentError,
      'options cannot be supplied unless parsing a String'
    )
  end

  it 'sets options if parsing from a String' do
    root = RP.parse('a+', options: ::Regexp::MULTILINE | ::Regexp::EXTENDED)

    expect(root.options).to eq(m: true, x: true)
  end

  it 'allows options to not be supplied when parsing from a Regexp' do
    root = RP.parse(/a+/ix)

    expect(root.options).to eq(i: true, x: true)
  end

  it 'has an empty option-hash when parsing from a String and passing no options' do
    root = RP.parse('a+')

    expect(root.options).to be_empty
  end
end

# --------------------------------------------------------------------------------
# /spec/parser/posix_classes_spec.rb:
# --------------------------------------------------------------------------------
require 'spec_helper'

# Parser specs for POSIX bracket classes such as [:word:] and [:^word:].
RSpec.describe('PosixClass parsing') do
  include_examples 'parse', /[[:word:]]/,
    [0]    => [CharacterSet, count: 1],
    [0, 0] => [:posixclass, :word, PosixClass, name: 'word', text: '[:word:]']
  include_examples 'parse', /[[:^word:]]/,
    [0]    => [CharacterSet, count: 1],
    [0, 0] => [:nonposixclass, :word, PosixClass, name: 'word', text: '[:^word:]']

  # cases treated as regular subsets by Ruby, not as (invalid) posix classes
  include_examples 'parse', '[[:ab]c:]',
    [0, 0]    => [CharacterSet, count: 3],
    [0, 0, 0] => [Literal, text: ':']

  include_examples 'parse', '[[:a[b]c:]]',
    [0, 0]    => [CharacterSet, count: 5],
    [0, 0, 0] => [Literal, text: ':']
end

# --------------------------------------------------------------------------------
# /spec/parser/properties_spec.rb:
# --------------------------------------------------------------------------------
require 'spec_helper'

# Parser specs for unicode properties: name notations, negation, shortcuts,
# classification, and agreement with the properties of the running Ruby.
RSpec.describe('Property parsing') do
  # test various notations supported by Ruby
  include_examples 'parse', '\p{sd}', 0 => [:property, :soft_dotted]
  include_examples 'parse', '\p{SD}', 0 => [:property, :soft_dotted]
  include_examples 'parse', '\p{Soft Dotted}', 0 => [:property, :soft_dotted]
  include_examples 'parse', '\p{Soft-Dotted}', 0 => [:property, :soft_dotted]
  include_examples 'parse', '\p{sOfT_dOtTeD}', 0 => [:property, :soft_dotted]

  # test ^-negation
  include_examples 'parse', '\p{^sd}', 0 => [:nonproperty, :soft_dotted]
  include_examples 'parse', '\p{^SD}', 0 => [:nonproperty, :soft_dotted]
  include_examples 'parse', '\p{^Soft Dotted}', 0 => [:nonproperty, :soft_dotted]
  include_examples 'parse', '\p{^Soft-Dotted}', 0 => [:nonproperty, :soft_dotted]
  include_examples 'parse', '\p{^sOfT_dOtTeD}', 0 => [:nonproperty, :soft_dotted]

  # test P-negation
  include_examples 'parse', '\P{sd}', 0 => [:nonproperty, :soft_dotted]
  include_examples 'parse', '\P{SD}', 0 => [:nonproperty, :soft_dotted]
  include_examples 'parse', '\P{Soft Dotted}', 0 => [:nonproperty, :soft_dotted]
  include_examples 'parse', '\P{Soft-Dotted}', 0 => [:nonproperty, :soft_dotted]
  include_examples 'parse', '\P{sOfT_dOtTeD}', 0 => [:nonproperty, :soft_dotted]

  # double negation is positive again
  include_examples 'parse', '\P{^sd}', 0 => [:property, :soft_dotted]
  include_examples 'parse', '\P{^SD}', 0 => [:property, :soft_dotted]
  include_examples 'parse', '\P{^Soft Dotted}', 0 => [:property, :soft_dotted]
  include_examples 'parse', '\P{^Soft-Dotted}', 0 => [:property, :soft_dotted]
  include_examples 'parse', '\P{^sOfT_dOtTeD}', 0 => [:property, :soft_dotted]

  # test #shortcut
  include_examples 'parse', '\p{soft_dotted}', 0 => [:property, :soft_dotted, shortcut: 'sd']
  include_examples 'parse', '\p{sd}', 0 => [:property, :soft_dotted, shortcut: 'sd']
  include_examples 'parse', '\p{in_bengali}', 0 => [:property, :in_bengali, shortcut: nil]

  # test classification
  include_examples 'parse', '\p{age=5.2}', 0 => [UnicodeProperty::Age]
  include_examples 'parse', '\p{InArmenian}', 0 => [UnicodeProperty::Block]
  include_examples 'parse', '\p{Math}', 0 => [UnicodeProperty::Derived]
  include_examples 'parse', '\p{Emoji}', 0 => [UnicodeProperty::Emoji]
  include_examples 'parse', '\p{GraphemeClusterBreak=Extend}', 0 => [UnicodeProperty::Enumerated]
  include_examples 'parse', '\p{Hiragana}', 0 => [UnicodeProperty::Script]

  specify('parse abandoned newline property') do
    root = RP.parse('\p{newline}', 'ruby/1.9')
    expect(root.expressions.last).to be_a(UnicodeProperty::Base)

    expect { RP.parse('\p{newline}', 'ruby/2.0') }.to raise_error(Regexp::Syntax::NotImplementedError)
  end

  # cannot test older Rubies because of https://bugs.ruby-lang.org/issues/18686
  if ruby_version_at_least('3.2.0')
    specify('parse all properties of current ruby') do
      # Keep only the properties that the parser fails to parse: a
      # successful parse is truthy (rejected), a raised error yields false.
      unsupported = RegexpPropertyValues.all_for_current_ruby.reject do |prop|
        begin
          RP.parse("\\p{#{prop}}")
        rescue StandardError
          false
        end
      end
      expect(unsupported).to be_empty
    end
  end

  # Ruby 2.3 supports a short prop name (sterm) without supporting the long name
  # of the same prop (sentence_terminal). Let's ignore this unique case.
  if ruby_version_at_least('2.4.0')
    specify('parse only properties of current ruby') do
      syntax = Regexp::Syntax.for("ruby/#{RUBY_VERSION}")
      # Keep only the properties that the running Ruby itself rejects.
      excessive = syntax.features.fetch(:property, []).reject do |prop|
        begin
          Regexp.new("\\p{#{prop}}")
        rescue RegexpError, SyntaxError # error class depends on Ruby version
          false
        end
      end
      expect(excessive).to be_empty
    end
  end
end

# --------------------------------------------------------------------------------
# /spec/parser/quantifiers_spec.rb:
# --------------------------------------------------------------------------------
require 'spec_helper'

# Parser specs for quantifiers. A `:q` at the end of an index path addresses
# the quantifier of the expression found at the preceding path.
RSpec.describe('Quantifier parsing') do
  include_examples 'parse', /a?b/, [0, :q] => [:zero_or_one, text: '?', mode: :greedy, min: 0, max: 1, ts: 1]
  include_examples 'parse', /a??b/, [0, :q] => [:zero_or_one, text: '??', mode: :reluctant, min: 0, max: 1, ts: 1]
  include_examples 'parse', /a?+b/, [0, :q] => [:zero_or_one, text: '?+', mode: :possessive, min: 0, max: 1, ts: 1]
  include_examples 'parse', /a*b/, [0, :q] => [:zero_or_more, text: '*', mode: :greedy, min: 0, max: -1, ts: 1]
  include_examples 'parse', /a*?b/, [0, :q] => [:zero_or_more, text: '*?', mode: :reluctant, min: 0, max: -1, ts: 1]
  include_examples 'parse', /a*+b/, [0, :q] => [:zero_or_more, text: '*+', mode: :possessive, min: 0, max: -1, ts: 1]
  include_examples 'parse', /a+b/, [0, :q] => [:one_or_more, text: '+', mode: :greedy, min: 1, max: -1, ts: 1]
  include_examples 'parse', /a+?b/, [0, :q] => [:one_or_more, text: '+?', mode: :reluctant, min: 1, max: -1, ts: 1]
  include_examples 'parse', /a++b/, [0, :q] => [:one_or_more, text: '++', mode: :possessive, min: 1, max: -1, ts: 1]
  include_examples 'parse', /a{2,4}b/, [0, :q] => [:interval, text: '{2,4}', mode: :greedy, min: 2, max: 4, ts: 1]
  include_examples 'parse', /a{2,}b/, [0, :q] => [:interval, text: '{2,}', mode: :greedy, min: 2, max: -1, ts: 1]
  include_examples 'parse', /a{,3}b/, [0, :q] => [:interval, text: '{,3}', mode: :greedy, min: 0, max: 3, ts: 1]
  include_examples 'parse', /a{4}b/, [0, :q] => [:interval, text: '{4}', mode: :greedy, min: 4, max: 4, ts: 1]
  include_examples 'parse', /a{004}b/, [0, :q] => [:interval, text: '{004}', mode: :greedy, min: 4, max: 4, ts: 1]

  # special case: exps with chained quantifiers are wrapped in implicit passive groups
  include_examples 'parse', /a+{2}{3}/,
    [0]           => [:group, :passive, Group::Passive, implicit?: true, level: 0],
    [0, :q]       => [:quantifier, :interval, Quantifier, text: '{3}', level: 0],
    [0, 0]        => [:group, :passive, Group::Passive, implicit?: true, level: 1],
    [0, 0, :q]    => [:quantifier, :interval, Quantifier, text: '{2}', level: 1],
    [0, 0, 0]     => [:literal, :literal, Literal, text: 'a', level: 2],
    [0, 0, 0, :q] => [:quantifier, :one_or_more, Quantifier, text: '+', level: 2]

  # Ruby does not support modes for intervals, following `?` and `+` are read as chained quantifiers
  include_examples 'parse', /a{2,4}?b/,
    [0, :q]    => [:quantifier, :zero_or_one, Quantifier, text: '?', mode: :greedy, min: 0, max: 1, ts: 6],
    [0, 0, :q] => [:quantifier, :interval, Quantifier, text: '{2,4}', mode: :greedy, min: 2, max: 4, ts: 1]
  include_examples 'parse', /a{2,4}+b/,
    [0, :q]    => [:quantifier, :one_or_more, Quantifier, text: '+', mode: :greedy, min: 1, max: -1, ts: 6],
    [0, 0, :q] => [:quantifier, :interval, Quantifier, text: '{2,4}', mode: :greedy, min: 2, max: 4, ts: 1]

  # The mode predicates are available on the expression and on its quantifier.
  specify('mode-checking methods') do
    exp = RP.parse(/a??/).first

    expect(exp).to be_reluctant
    expect(exp).to be_lazy
    expect(exp).not_to be_greedy
    expect(exp).not_to be_possessive
    expect(exp.quantifier).to be_reluctant
    expect(exp.quantifier).to be_lazy
    expect(exp.quantifier).not_to be_greedy
    expect(exp.quantifier).not_to be_possessive
  end
end

-------------------------------------------------------------------------------- /spec/parser/set/intersections_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | # edge cases with `...-&&...` and `...&&-...` are checked in ./ranges_spec.rb 4 | 5 | RSpec.describe('CharacterSet::Intersection parsing') do 6 | include_examples 'parse', /[a&&z]/, 7 | [0] => [CharacterSet, count: 1], 8 | [0, 0] => [CharacterSet::Intersection, count: 2], 9 | [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 1], 10 | [0, 0, 0, 0] => [:literal, text: 'a'], 11 | [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 1], 12 | [0, 0, 1, 0] => [:literal, text: 'z'] 13 | 14 | include_examples 'parse', /[a-z&&[^a]]/, 15 | [0] => [CharacterSet, count: 1], 16 | [0, 0] => [CharacterSet::Intersection, count: 2], 17 | [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 1], 18 | [0, 0, 0, 0] => [CharacterSet::Range, count: 2], 19 | [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 1], 20 | [0, 0, 1, 0] => [CharacterSet, count: 1] 21 | 22 | include_examples 'parse', /[a&&a-z]/, 23 | [0] => [CharacterSet, count: 1], 24 | [0, 0] => [CharacterSet::Intersection, count: 2], 25 | [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 1], 26 | [0, 0, 0, 0] => [:literal, text: 'a'], 27 | [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 1], 28 | [0, 0, 1, 0] => [CharacterSet::Range, count: 2] 29 | 30 | include_examples 'parse', /[a&&\w]/, 31 | [0] => [CharacterSet, count: 1], 32 | [0, 0] => [CharacterSet::Intersection, count: 2], 33 | [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 1], 34 | [0, 0, 1, 0] => [:word, text: '\w'] 35 | 36 | include_examples 'parse', /[\h&&\w&&efg]/, 37 | [0] => [CharacterSet, count: 1], 38 | [0, 0] => [CharacterSet::Intersection, count: 3], 39 | [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 1], 40 | [0, 0, 0, 0] => [:hex, text: '\h'], 41 | [0, 0, 1] => 
[CharacterSet::IntersectedSequence, count: 1], 42 | [0, 0, 1, 0] => [:word, text: '\w'], 43 | [0, 0, 2] => [CharacterSet::IntersectedSequence, count: 3], 44 | [0, 0, 2, 0] => [:literal, text: 'e'], 45 | [0, 0, 2, 1] => [:literal, text: 'f'], 46 | [0, 0, 2, 2] => [:literal, text: 'g'] 47 | 48 | # test correct ts values for empty sequences 49 | include_examples 'parse', /[&&]/, 50 | [0, 0] => [CharacterSet::Intersection, text: '&&', count: 2, ts: 1], 51 | [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 0, ts: 1], 52 | [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 0, ts: 3] 53 | 54 | # test correct ts values for non-empty sequences 55 | include_examples 'parse', /[ab&&cd&&ef]/, 56 | [0, 0] => [CharacterSet::Intersection, count: 3, text: '&&', ts: 1], 57 | [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 2, to_s: 'ab', ts: 1], 58 | [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 2, to_s: 'cd', ts: 5], 59 | [0, 0, 2] => [CharacterSet::IntersectedSequence, count: 2, to_s: 'ef', ts: 9] 60 | 61 | # Some edge-case patterns are evaluated with #match to make sure that 62 | # their matching behavior still reflects the way they are parsed. 63 | # #capturing_stderr is used to skip any warnings generated by this. 
64 | specify('intersections behavior remains unchanged') do 65 | capturing_stderr do 66 | expect(/[a&&z]/).not_to match 'a' 67 | expect(/[a&&z]/).not_to match '&' 68 | expect(/[a&&z]/).not_to match 'z' 69 | expect(/[a-z&&[^a]]/).not_to match 'a' 70 | expect(/[a-z&&[^a]]/).not_to match '&' 71 | expect(/[a-z&&[^a]]/).to match 'b' 72 | expect(/[a&&a-z]/).to match 'a' 73 | expect(/[a&&a-z]/).not_to match '&' 74 | expect(/[a&&a-z]/).not_to match 'b' 75 | expect(/[a&&\w]/).to match 'a' 76 | expect(/[a&&\w]/).not_to match '&' 77 | expect(/[a&&\w]/).not_to match 'b' 78 | expect(/[\h&&\w&&efg]/).to match 'e' 79 | expect(/[\h&&\w&&efg]/).to match 'f' 80 | expect(/[\h&&\w&&efg]/).not_to match 'a' 81 | expect(/[\h&&\w&&efg]/).not_to match 'g' 82 | end 83 | end 84 | end 85 | -------------------------------------------------------------------------------- /spec/parser/set/ranges_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('CharacterSet::Range parsing') do 4 | include_examples 'parse', '[a-z]', 5 | [0] => [CharacterSet, count: 1], 6 | [0, 0] => [CharacterSet::Range, count: 2], 7 | [0, 0, 0] => [:literal, text: 'a'], 8 | [0, 0, 1] => [:literal, text: 'z'] 9 | 10 | include_examples 'parse', '[\x00-\x22]', 11 | [0] => [CharacterSet, count: 1], 12 | [0, 0] => [CharacterSet::Range, count: 2], 13 | [0, 0, 0] => [:hex, text: '\x00'], 14 | [0, 0, 1] => [:hex, text: '\x22'] 15 | 16 | include_examples 'parse', '[\u{40 42}-\u1234]', 17 | [0] => [CharacterSet, count: 1], 18 | [0, 0] => [CharacterSet::Range, count: 2], 19 | [0, 0, 0] => [:codepoint_list, text: '\u{40 42}'], 20 | [0, 0, 1] => [:codepoint, text: '\u1234'] 21 | 22 | include_examples 'parse', '[--z]', 23 | [0] => [CharacterSet, count: 1], 24 | [0, 0] => [CharacterSet::Range, count: 2], 25 | [0, 0, 0] => [:literal, text: '-'], 26 | [0, 0, 1] => [:literal, text: 'z'] 27 | 28 | include_examples 'parse', '[!--]', 29 | [0] => [CharacterSet, 
count: 1], 30 | [0, 0] => [CharacterSet::Range, count: 2], 31 | [0, 0, 0] => [:literal, text: '!'], 32 | [0, 0, 1] => [:literal, text: '-'] 33 | 34 | include_examples 'parse', '[!-^]', 35 | [0] => [CharacterSet, count: 1], 36 | [0, 0] => [CharacterSet::Range, count: 2], 37 | [0, 0, 0] => [:literal, text: '!'], 38 | [0, 0, 1] => [:literal, text: '^'] 39 | 40 | # edge cases that are NOT treated as range 41 | 42 | include_examples 'parse', '[^-z]', 43 | [0] => [CharacterSet, count: 2], 44 | [0, 0] => [:literal, text: '-'], 45 | [0, 1] => [:literal, text: 'z'] 46 | 47 | include_examples 'parse', '[[\-ab]&&-bc]', 48 | [0] => [CharacterSet, count: 1], 49 | [0, 0] => [CharacterSet::Intersection, count: 2], 50 | [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 1], 51 | [0, 0, 0, 0] => [CharacterSet, count: 3], 52 | [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 3], 53 | [0, 0, 1, 0] => [:literal, text: '-'] 54 | 55 | include_examples 'parse', '[bc-&&[\-ab]]', 56 | [0] => [CharacterSet, count: 1], 57 | [0, 0] => [CharacterSet::Intersection, count: 2], 58 | [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 3], 59 | [0, 0, 0, 2] => [:literal, text: '-'], 60 | [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 1], 61 | [0, 0, 1, 0] => [CharacterSet, count: 3] 62 | 63 | # Some edge-case patterns are evaluated with #match to make sure that 64 | # their matching behavior still reflects the way they are parsed. 65 | # #capturing_stderr is used to skip any warnings generated by this. 
66 | specify('ranges behavior remains unchanged') do 67 | capturing_stderr do 68 | expect(Regexp.new('[\x00-\x22]')).to match "\x11" 69 | expect(Regexp.new('[\u{40 42}-\u1234]')).to match "\u0600" 70 | expect(Regexp.new('[--z]')).to match 'a' 71 | expect(Regexp.new('[!--]')).to match '$' 72 | expect(Regexp.new('[!-^]')).to match '$' 73 | 74 | # edge cases that are NOT treated as ranges 75 | expect(Regexp.new('[^-z]')).to match 'a' 76 | expect(Regexp.new('[^-z]')).not_to match 'z' 77 | expect(Regexp.new('[[\-ab]&&-bc]')).to match '-' 78 | expect(Regexp.new('[[\-ab]&&-bc]')).to match 'b' 79 | expect(Regexp.new('[[\-ab]&&-bc]')).not_to match 'a' 80 | expect(Regexp.new('[[\-ab]&&-bc]')).not_to match 'c' 81 | expect(Regexp.new('[bc-&&[\-ab]]')).to match '-' 82 | expect(Regexp.new('[bc-&&[\-ab]]')).to match 'b' 83 | expect(Regexp.new('[bc-&&[\-ab]]')).not_to match 'a' 84 | expect(Regexp.new('[bc-&&[\-ab]]')).not_to match 'c' 85 | end 86 | end 87 | end 88 | -------------------------------------------------------------------------------- /spec/parser/sets_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('CharacterSet parsing') do 4 | include_examples 'parse', /[ab]+/, 5 | [0] => [:set, :character, CharacterSet, text: '[', count: 2, quantified?: true], 6 | [0, 0] => [:literal, :literal, Literal, text: 'a', set_level: 1], 7 | [0, 1] => [:literal, :literal, Literal, text: 'b', set_level: 1] 8 | 9 | include_examples 'parse', /[a\dc]/, 10 | [0] => [:set, :character, CharacterSet, text: '[', count: 3], 11 | [0, 1] => [:type, :digit, CharacterType::Digit] 12 | 13 | include_examples 'parse', /[a\bc]/, 14 | [0] => [:set, :character, CharacterSet, text: '[', count: 3], 15 | [0, 1] => [:escape, :backspace, EscapeSequence::Backspace, text: '\b'] 16 | 17 | include_examples 'parse', '[a\xFz]', 18 | [0] => [:set, :character, CharacterSet, text: '[', count: 3], 19 | [0, 1] => [:escape, :hex, 
EscapeSequence::Hex, text: '\xF'] 20 | 21 | include_examples 'parse', '[a\x20c]', 22 | [0] => [:set, :character, CharacterSet, text: '[', count: 3], 23 | [0, 1] => [:escape, :hex, EscapeSequence::Hex, text: '\x20'] 24 | 25 | include_examples 'parse', '[a\77c]', 26 | [0] => [:set, :character, CharacterSet, text: '[', count: 3], 27 | [0, 1] => [:escape, :octal, EscapeSequence::Octal, text: '\77'] 28 | 29 | include_examples 'parse', '[a\u0640c]', 30 | [0] => [:set, :character, CharacterSet, text: '[', count: 3], 31 | [0, 1] => [:escape, :codepoint, EscapeSequence::Codepoint, text: '\u0640'] 32 | 33 | include_examples 'parse', '[a\u{41 1F60D}c]', 34 | [0] => [:set, :character, CharacterSet, text: '[', count: 3], 35 | [0, 1] => [:escape, :codepoint_list, EscapeSequence::CodepointList, text: '\u{41 1F60D}'] 36 | 37 | include_examples 'parse', '[[:digit:][:^lower:]]+', 38 | [0] => [:set, :character, CharacterSet, text: '[', count: 2], 39 | [0, 0] => [:posixclass, :digit, PosixClass, text: '[:digit:]'], 40 | [0, 1] => [:nonposixclass, :lower, PosixClass, text: '[:^lower:]'] 41 | 42 | include_examples 'parse', '[a[b[c]d]e]', 43 | [0] => [:set, :character, CharacterSet, text: '[', count: 3, set_level: 0], 44 | [0, 0] => [:literal, :literal, Literal, text: 'a', set_level: 1], 45 | [0, 1] => [:set, :character, CharacterSet, text: '[', count: 3, set_level: 1], 46 | [0, 2] => [:literal, :literal, Literal, text: 'e', set_level: 1], 47 | [0, 1, 1] => [:set, :character, CharacterSet, text: '[', count: 1, set_level: 2], 48 | [0, 1, 1, 0] => [:literal, :literal, Literal, text: 'c', set_level: 3] 49 | 50 | include_examples 'parse', '[a[^b[c]]]', 51 | [0] => [:set, :character, CharacterSet, text: '[', count: 2, set_level: 0], 52 | [0, 0] => [:literal, :literal, Literal, text: 'a', set_level: 1], 53 | [0, 1] => [:set, :character, CharacterSet, text: '[', count: 2, set_level: 1], 54 | [0, 1, 0] => [:literal, :literal, Literal, text: 'b', set_level: 2], 55 | [0, 1, 1] => [:set, 
:character, CharacterSet, text: '[', count: 1, set_level: 2], 56 | [0, 1, 1, 0] => [:literal, :literal, Literal, text: 'c', set_level: 3] 57 | 58 | include_examples 'parse', '[aaa]', 59 | [0] => [:set, :character, CharacterSet, text: '[', count: 3], 60 | [0, 0] => [:literal, :literal, Literal, text: 'a'], 61 | [0, 1] => [:literal, :literal, Literal, text: 'a'], 62 | [0, 2] => [:literal, :literal, Literal, text: 'a'] 63 | 64 | include_examples 'parse', '[ ]', 65 | [0] => [:set, :character, CharacterSet, text: '[', count: 3], 66 | [0, 0] => [:literal, :literal, Literal, text: ' '], 67 | [0, 1] => [:literal, :literal, Literal, text: ' '], 68 | [0, 2] => [:literal, :literal, Literal, text: ' '] 69 | 70 | include_examples 'parse', '(?x)[ ]', # shouldn't merge whitespace even in x-mode 71 | [1] => [:set, :character, CharacterSet, text: '[', count: 3], 72 | [1, 0] => [:literal, :literal, Literal, text: ' '], 73 | [1, 1] => [:literal, :literal, Literal, text: ' '], 74 | [1, 2] => [:literal, :literal, Literal, text: ' '] 75 | 76 | include_examples 'parse', '[[.span-ll.]]', # collating sequences are disabled in Onigmo 77 | [0, 0] => [:set, :character, CharacterSet, text: '[', count: 7], 78 | [0, 0, 0] => [:literal, :literal, Literal, text: '.'] 79 | 80 | include_examples 'parse', '[[=e=]]', # character equivalents are disabled in Onigmo 81 | [0, 0] => [:set, :character, CharacterSet, text: '[', count: 3], 82 | [0, 0, 0] => [:literal, :literal, Literal, text: '='] 83 | end 84 | -------------------------------------------------------------------------------- /spec/parser/types_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('CharacterType parsing') do 4 | include_examples 'parse', /a\dc/, 1 => [:type, :digit, CharacterType::Digit] 5 | include_examples 'parse', /a\Dc/, 1 => [:type, :nondigit, CharacterType::NonDigit] 6 | 7 | include_examples 'parse', /a\sc/, 1 => [:type, :space, 
CharacterType::Space] 8 | include_examples 'parse', /a\Sc/, 1 => [:type, :nonspace, CharacterType::NonSpace] 9 | 10 | include_examples 'parse', /a\hc/, 1 => [:type, :hex, CharacterType::Hex] 11 | include_examples 'parse', /a\Hc/, 1 => [:type, :nonhex, CharacterType::NonHex] 12 | 13 | include_examples 'parse', /a\wc/, 1 => [:type, :word, CharacterType::Word] 14 | include_examples 'parse', /a\Wc/, 1 => [:type, :nonword, CharacterType::NonWord] 15 | 16 | include_examples 'parse', 'a\Rc', 1 => [:type, :linebreak, CharacterType::Linebreak] 17 | include_examples 'parse', 'a\Xc', 1 => [:type, :xgrapheme, CharacterType::ExtendedGrapheme] 18 | end 19 | -------------------------------------------------------------------------------- /spec/scanner/all_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe(Regexp::Scanner) do 4 | specify('scanner returns an array') do 5 | expect(RS.scan('abc')).to be_instance_of(Array) 6 | end 7 | 8 | specify('scanner returns tokens as arrays') do 9 | tokens = RS.scan('^abc+[^one]{2,3}\b\d\C-C$') 10 | expect(tokens).to all(be_a Array) 11 | expect(tokens.map(&:length)).to all(eq 5) 12 | end 13 | 14 | specify('scanner token count') do 15 | re = /^(one|two){2,3}([^d\]efm-qz\,\-]*)(ghi)+$/i 16 | expect(RS.scan(re).length).to eq 28 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /spec/scanner/anchors_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('Anchor scanning') do 4 | include_examples 'scan', '^abc', 0 => [:anchor, :bol, '^', 0, 1] 5 | include_examples 'scan', 'abc$', 1 => [:anchor, :eol, '$', 3, 4] 6 | 7 | include_examples 'scan', '\Aabc', 0 => [:anchor, :bos, '\A', 0, 2] 8 | include_examples 'scan', 'abc\z', 1 => [:anchor, :eos, '\z', 3, 5] 9 | include_examples 'scan', 'abc\Z', 1 => [:anchor, :eos_ob_eol, '\Z', 3, 
5] 10 | 11 | include_examples 'scan', 'a\bc', 1 => [:anchor, :word_boundary, '\b', 1, 3] 12 | include_examples 'scan', 'a\Bc', 1 => [:anchor, :nonword_boundary, '\B', 1, 3] 13 | 14 | include_examples 'scan', 'a\Gc', 1 => [:anchor, :match_start, '\G', 1, 3] 15 | 16 | include_examples 'scan', "\\\\Ac", 0 => [:escape, :backslash, '\\\\', 0, 2] 17 | include_examples 'scan', "a\\\\z", 1 => [:escape, :backslash, '\\\\', 1, 3] 18 | include_examples 'scan', "a\\\\Z", 1 => [:escape, :backslash, '\\\\', 1, 3] 19 | include_examples 'scan', "a\\\\bc", 1 => [:escape, :backslash, '\\\\', 1, 3] 20 | include_examples 'scan', "a\\\\Bc", 1 => [:escape, :backslash, '\\\\', 1, 3] 21 | end 22 | -------------------------------------------------------------------------------- /spec/scanner/delimiters_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('Literal delimiter scanning') do 4 | include_examples 'scan', '}', 5 | 0 => [:literal, :literal, '}', 0, 1] 6 | 7 | include_examples 'scan', '}}', 8 | 0 => [:literal, :literal, '}}', 0, 2] 9 | 10 | include_examples 'scan', '{', 11 | 0 => [:literal, :literal, '{', 0, 1] 12 | 13 | include_examples 'scan', '{{', 14 | 0 => [:literal, :literal, '{{', 0, 2] 15 | 16 | include_examples 'scan', '{}', 17 | 0 => [:literal, :literal, '{}', 0, 2] 18 | 19 | include_examples 'scan', '}{', 20 | 0 => [:literal, :literal, '}{', 0, 2] 21 | 22 | include_examples 'scan', '}{+', 23 | 0 => [:literal, :literal, '}{', 0, 2] 24 | 25 | include_examples 'scan', '{{var}}', 26 | 0 => [:literal, :literal, '{{var}}', 0, 7] 27 | 28 | include_examples 'scan', 'a{1,2', 29 | 0 => [:literal, :literal, 'a{1,2', 0, 5] 30 | 31 | include_examples 'scan', '({.+})', 32 | 0 => [:group, :capture, '(', 0, 1], 33 | 1 => [:literal, :literal, '{', 1, 2], 34 | 2 => [:meta, :dot, '.', 2, 3], 35 | 3 => [:quantifier, :one_or_more, '+', 3, 4], 36 | 4 => [:literal, :literal, '}', 4, 5], 37 | 5 => [:group, 
:close, ')', 5, 6] 38 | 39 | include_examples 'scan', ']', 40 | 0 => [:literal, :literal, ']', 0, 1] 41 | 42 | include_examples 'scan', ']]', 43 | 0 => [:literal, :literal, ']]', 0, 2] 44 | 45 | include_examples 'scan', ']\[', 46 | 0 => [:literal, :literal, ']', 0, 1], 47 | 1 => [:escape, :set_open, '\[', 1, 3] 48 | 49 | include_examples 'scan', '()', 50 | 0 => [:group, :capture, '(', 0, 1], 51 | 1 => [:group, :close, ')', 1, 2] 52 | end 53 | -------------------------------------------------------------------------------- /spec/scanner/keep_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('Keep scanning') do 4 | include_examples 'scan', /ab\Kcd/, 5 | 1 => [:keep, :mark, '\K', 2, 4] 6 | 7 | include_examples 'scan', /(a\Kb)|(c\\\Kd)ef/, 8 | 2 => [:keep, :mark, '\K', 2, 4], 9 | 9 => [:keep, :mark, '\K', 11, 13] 10 | end 11 | -------------------------------------------------------------------------------- /spec/scanner/literals_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('UTF8 scanning') do 4 | # ascii, single byte characters 5 | include_examples 'scan', 'a', 6 | 0 => [:literal, :literal, 'a', 0, 1] 7 | 8 | include_examples 'scan', 'ab+', 9 | 0 => [:literal, :literal, 'ab', 0, 2], 10 | 1 => [:quantifier, :one_or_more, '+', 2, 3] 11 | 12 | # 2 byte wide characters 13 | include_examples 'scan', 'äöü', 14 | 0 => [:literal, :literal, 'äöü', 0, 3] 15 | 16 | # 3 byte wide characters, Japanese 17 | include_examples 'scan', 'ab?れます+cd', 18 | 0 => [:literal, :literal, 'ab', 0, 2], 19 | 1 => [:quantifier, :zero_or_one, '?', 2, 3], 20 | 2 => [:literal, :literal, 'れます', 3, 6], 21 | 3 => [:quantifier, :one_or_more, '+', 6, 7], 22 | 4 => [:literal, :literal, 'cd', 7, 9] 23 | 24 | # 4 byte wide characters, Osmanya 25 | include_examples 'scan', '𐒀𐒁?𐒂ab+𐒃', 26 | 0 => [:literal, :literal, '𐒀𐒁', 0, 2], 27 
| 1 => [:quantifier, :zero_or_one, '?', 2, 3], 28 | 2 => [:literal, :literal, '𐒂ab', 3, 6], 29 | 3 => [:quantifier, :one_or_more, '+', 6, 7], 30 | 4 => [:literal, :literal, '𐒃', 7, 8] 31 | 32 | include_examples 'scan', 'mu𝄞?si*𝄫c+', 33 | 0 => [:literal, :literal, 'mu𝄞', 0, 3], 34 | 1 => [:quantifier, :zero_or_one, '?', 3, 4], 35 | 2 => [:literal, :literal, 'si', 4, 6], 36 | 3 => [:quantifier, :zero_or_more, '*', 6, 7], 37 | 4 => [:literal, :literal, '𝄫c', 7, 9], 38 | 5 => [:quantifier, :one_or_more, '+', 9, 10] 39 | end 40 | -------------------------------------------------------------------------------- /spec/scanner/meta_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('Meta scanning') do 4 | include_examples 'scan', /abc??|def*+|ghi+/, 5 | 0 => [:literal, :literal, 'abc', 0, 3], 6 | 1 => [:quantifier, :zero_or_one_reluctant, '??', 3, 5], 7 | 2 => [:meta, :alternation, '|', 5, 6], 8 | 3 => [:literal, :literal, 'def', 6, 9], 9 | 4 => [:quantifier, :zero_or_more_possessive, '*+', 9, 11], 10 | 5 => [:meta, :alternation, '|', 11, 12] 11 | 12 | include_examples 'scan', /(a\|b)|(c|d)\|(e[|]f)/, 13 | 2 => [:escape, :alternation, '\|', 2, 4], 14 | 5 => [:meta, :alternation, '|', 6, 7], 15 | 8 => [:meta, :alternation, '|', 9, 10], 16 | 11 => [:escape, :alternation, '\|', 12, 14], 17 | 15 => [:literal, :literal, '|', 17, 18] 18 | end 19 | -------------------------------------------------------------------------------- /spec/scanner/options_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('passing options to scan') do 4 | def expect_type_tokens(tokens, type_tokens) 5 | expect(tokens.map { |type, token, *| [type, token] }).to eq(type_tokens) 6 | end 7 | 8 | it 'raises if if scanning from a Regexp and options are passed' do 9 | expect { RS.scan(/a+/, options: ::Regexp::EXTENDED) }.to raise_error( 10 
| ArgumentError, 11 | 'options cannot be supplied unless scanning a String' 12 | ) 13 | end 14 | 15 | it 'sets free_spacing based on options if scanning from a String' do 16 | expect_type_tokens( 17 | RS.scan('a+#c', options: ::Regexp::MULTILINE | ::Regexp::EXTENDED), 18 | [ 19 | %i[literal literal], 20 | %i[quantifier one_or_more], 21 | %i[free_space comment] 22 | ] 23 | ) 24 | end 25 | 26 | it 'does not set free_spacing if scanning from a String and passing no options' do 27 | expect_type_tokens( 28 | RS.scan('a+#c'), 29 | [ 30 | %i[literal literal], 31 | %i[quantifier one_or_more], 32 | %i[literal literal] 33 | ] 34 | ) 35 | end 36 | end 37 | -------------------------------------------------------------------------------- /spec/scanner/properties_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('Property scanning') do 4 | RSpec.shared_examples 'scan property' do |text, token| 5 | it("scans \\p{#{text}} as property #{token}") do 6 | result = RS.scan("\\p{#{text}}")[0] 7 | expect(result[0..1]).to eq [:property, token] 8 | end 9 | 10 | it("scans \\P{#{text}} as nonproperty #{token}") do 11 | result = RS.scan("\\P{#{text}}")[0] 12 | expect(result[0..1]).to eq [:nonproperty, token] 13 | end 14 | 15 | it("scans \\p{^#{text}} as nonproperty #{token}") do 16 | result = RS.scan("\\p{^#{text}}")[0] 17 | expect(result[0..1]).to eq [:nonproperty, token] 18 | end 19 | 20 | it("scans double-negated \\P{^#{text}} as property #{token}") do 21 | result = RS.scan("\\P{^#{text}}")[0] 22 | expect(result[0..1]).to eq [:property, token] 23 | end 24 | end 25 | 26 | include_examples 'scan property', 'Alnum', :alnum 27 | 28 | include_examples 'scan property', 'XPosixPunct', :xposixpunct 29 | 30 | include_examples 'scan property', 'Newline', :newline 31 | 32 | include_examples 'scan property', 'Any', :any 33 | 34 | include_examples 'scan property', 'Assigned', :assigned 35 | 36 | include_examples 
'scan property', 'Age=1.1', :'age=1.1' 37 | include_examples 'scan property', 'Age=10.0', :'age=10.0' 38 | 39 | include_examples 'scan property', 'ahex', :ascii_hex_digit 40 | include_examples 'scan property', 'ASCII_Hex_Digit', :ascii_hex_digit # test underscore 41 | 42 | include_examples 'scan property', 'sd', :soft_dotted 43 | include_examples 'scan property', 'Soft-Dotted', :soft_dotted # test dash 44 | 45 | include_examples 'scan property', 'Egyp', :egyptian_hieroglyphs 46 | include_examples 'scan property', 'Egyptian Hieroglyphs', :egyptian_hieroglyphs # test whitespace 47 | 48 | include_examples 'scan property', 'Linb', :linear_b 49 | include_examples 'scan property', 'Linear-B', :linear_b # test dash 50 | 51 | include_examples 'scan property', 'InArabic', :in_arabic # test block 52 | include_examples 'scan property', 'in Arabic', :in_arabic # test block w. whitespace 53 | include_examples 'scan property', 'In_Arabic', :in_arabic # test block w. underscore 54 | 55 | include_examples 'scan property', 'Yiii', :yi 56 | include_examples 'scan property', 'Yi', :yi 57 | 58 | include_examples 'scan property', 'Zinh', :inherited 59 | include_examples 'scan property', 'Inherited', :inherited 60 | include_examples 'scan property', 'Qaai', :inherited 61 | 62 | include_examples 'scan property', 'Zzzz', :unknown 63 | include_examples 'scan property', 'Unknown', :unknown 64 | end 65 | -------------------------------------------------------------------------------- /spec/scanner/quantifiers_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('Quantifier scanning') do 4 | include_examples 'scan', 'a?', 1 => [:quantifier, :zero_or_one, '?', 1, 2] 5 | include_examples 'scan', 'a??', 1 => [:quantifier, :zero_or_one_reluctant, '??', 1, 3] 6 | include_examples 'scan', 'a?+', 1 => [:quantifier, :zero_or_one_possessive, '?+', 1, 3] 7 | 8 | include_examples 'scan', 'a*', 1 => [:quantifier, 
:zero_or_more, '*', 1, 2] 9 | include_examples 'scan', 'a*?', 1 => [:quantifier, :zero_or_more_reluctant, '*?', 1, 3] 10 | include_examples 'scan', 'a*+', 1 => [:quantifier, :zero_or_more_possessive, '*+', 1, 3] 11 | 12 | include_examples 'scan', 'a+', 1 => [:quantifier, :one_or_more, '+', 1, 2] 13 | include_examples 'scan', 'a+?', 1 => [:quantifier, :one_or_more_reluctant, '+?', 1, 3] 14 | include_examples 'scan', 'a++', 1 => [:quantifier, :one_or_more_possessive, '++', 1, 3] 15 | 16 | include_examples 'scan', 'a{2}', 1 => [:quantifier, :interval, '{2}', 1, 4] 17 | include_examples 'scan', 'a{2,}', 1 => [:quantifier, :interval, '{2,}', 1, 5] 18 | include_examples 'scan', 'a{,2}', 1 => [:quantifier, :interval, '{,2}', 1, 5] 19 | include_examples 'scan', 'a{2,4}', 1 => [:quantifier, :interval, '{2,4}', 1, 6] 20 | 21 | # special case: chained quantifiers 22 | include_examples 'scan', 'a+{2}{3}', 1 => [:quantifier, :one_or_more, '+', 1, 2] 23 | include_examples 'scan', 'a+{2}{3}', 2 => [:quantifier, :interval, '{2}', 2, 5] 24 | include_examples 'scan', 'a+{2}{3}', 3 => [:quantifier, :interval, '{3}', 5, 8] 25 | end 26 | -------------------------------------------------------------------------------- /spec/scanner/refcalls_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('RefCall scanning') do 4 | # Traditional numerical group back-reference 5 | include_examples 'scan', '(abc)\1' , 3 => [:backref, :number, '\1', 5, 7] 6 | 7 | # Group back-references, named, numbered, and relative 8 | # 9 | # NOTE: only \g supports forward-looking references using '+', e.g. \g<+1> 10 | # refers to the next group, but \k<+1> refers to a group named '+1'. 11 | # Inversely, only \k supports addition or subtraction of a recursion level. 12 | # E.g. \k<x+0> refers to a group named 'x' at the current recursion level, 13 | # but \g<x+0> refers to a group named 'x+0'. 
14 | # 15 | include_examples 'scan', '(?<X>abc)\k<X>', 3 => [:backref, :name_ref_ab, '\k<X>', 9, 14] 16 | include_examples 'scan', "(?<X>abc)\\k'X'", 3 => [:backref, :name_ref_sq, "\\k'X'", 9, 14] 17 | 18 | include_examples 'scan', '(?<+1>abc)\k<+1>', 3 => [:backref, :name_ref_ab, '\k<+1>', 10, 16] 19 | include_examples 'scan', "(?<+1>abc)\\k'+1'", 3 => [:backref, :name_ref_sq, "\\k'+1'", 10, 16] 20 | 21 | include_examples 'scan', '(abc)\k<1>', 3 => [:backref, :number_ref_ab, '\k<1>', 5, 10] 22 | include_examples 'scan', "(abc)\\k'1'", 3 => [:backref, :number_ref_sq, "\\k'1'", 5, 10] 23 | include_examples 'scan', "(abc)\\k'001'", 3 => [:backref, :number_ref_sq, "\\k'001'", 5, 12] 24 | 25 | include_examples 'scan', '(abc)\k<-1>', 3 => [:backref, :number_rel_ref_ab, '\k<-1>', 5, 11] 26 | include_examples 'scan', "(abc)\\k'-1'", 3 => [:backref, :number_rel_ref_sq, "\\k'-1'", 5, 11] 27 | include_examples 'scan', '(abc)\k<-001>', 3 => [:backref, :number_rel_ref_ab, '\k<-001>', 5, 13] 28 | 29 | # Sub-expression invocation, named, numbered, and relative 30 | include_examples 'scan', '(?<X>abc)\g<X>', 3 => [:backref, :name_call_ab, '\g<X>', 9, 14] 31 | include_examples 'scan', "(?<X>abc)\\g'X'", 3 => [:backref, :name_call_sq, "\\g'X'", 9, 14] 32 | 33 | include_examples 'scan', '(?<X>abc)\g<X-1>', 3 => [:backref, :name_call_ab, '\g<X-1>', 9, 16] 34 | include_examples 'scan', "(?<X>abc)\\g'X-1'", 3 => [:backref, :name_call_sq, "\\g'X-1'", 9, 16] 35 | 36 | include_examples 'scan', '(abc)\g<1>', 3 => [:backref, :number_call_ab, '\g<1>', 5, 10] 37 | include_examples 'scan', "(abc)\\g'1'", 3 => [:backref, :number_call_sq, "\\g'1'", 5, 10] 38 | include_examples 'scan', '(abc)\g<001>', 3 => [:backref, :number_call_ab, '\g<001>', 5, 12] 39 | 40 | include_examples 'scan', 'a(b|\g<0>)', 4 => [:backref, :number_call_ab, '\g<0>', 4, 9] 41 | include_examples 'scan', "a(b|\\g'0')", 4 => [:backref, :number_call_sq, "\\g'0'", 4, 9] 42 | 43 | include_examples 'scan', '(abc)\g<-1>', 3 => [:backref, :number_rel_call_ab, '\g<-1>', 
5, 11] 44 | include_examples 'scan', "(abc)\\g'-1'", 3 => [:backref, :number_rel_call_sq, "\\g'-1'", 5, 11] 45 | include_examples 'scan', '(abc)\g<-001>', 3 => [:backref, :number_rel_call_ab, '\g<-001>', 5, 13] 46 | 47 | include_examples 'scan', '\g<+1>(abc)', 0 => [:backref, :number_rel_call_ab, '\g<+1>', 0, 6] 48 | include_examples 'scan', "\\g'+1'(abc)", 0 => [:backref, :number_rel_call_sq, "\\g'+1'", 0, 6] 49 | 50 | # Group back-references, with recursion level 51 | include_examples 'scan', '(?<X>abc)\k<X-0>', 3 => [:backref, :name_recursion_ref_ab, '\k<X-0>', 9, 16] 52 | include_examples 'scan', "(?<X>abc)\\k'X-0'", 3 => [:backref, :name_recursion_ref_sq, "\\k'X-0'", 9, 16] 53 | 54 | include_examples 'scan', '(abc)\k<1-0>', 3 => [:backref, :number_recursion_ref_ab, '\k<1-0>', 5, 12] 55 | include_examples 'scan', "(abc)\\k'1-0'", 3 => [:backref, :number_recursion_ref_sq, "\\k'1-0'", 5, 12] 56 | 57 | include_examples 'scan', '(abc)\k<+1-0>', 3 => [:backref, :name_recursion_ref_ab, '\k<+1-0>', 5, 13] 58 | include_examples 'scan', "(abc)\\k'+1-0'", 3 => [:backref, :name_recursion_ref_sq, "\\k'+1-0'", 5, 13] 59 | end 60 | -------------------------------------------------------------------------------- /spec/scanner/types_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe('Type scanning') do 4 | include_examples 'scan', 'a\dc', 1 => [:type, :digit, '\d', 1, 3] 5 | include_examples 'scan', 'a\Dc', 1 => [:type, :nondigit, '\D', 1, 3] 6 | include_examples 'scan', 'a\hc', 1 => [:type, :hex, '\h', 1, 3] 7 | include_examples 'scan', 'a\Hc', 1 => [:type, :nonhex, '\H', 1, 3] 8 | include_examples 'scan', 'a\sc', 1 => [:type, :space, '\s', 1, 3] 9 | include_examples 'scan', 'a\Sc', 1 => [:type, :nonspace, '\S', 1, 3] 10 | include_examples 'scan', 'a\wc', 1 => [:type, :word, '\w', 1, 3] 11 | include_examples 'scan', 'a\Wc', 1 => [:type, :nonword, '\W', 1, 3] 12 | include_examples 'scan', 'a\Rc', 1 => 
[:type, :linebreak, '\R', 1, 3] 13 | include_examples 'scan', 'a\Xc', 1 => [:type, :xgrapheme, '\X', 1, 3] 14 | end 15 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | $VERBOSE = true 2 | 3 | require 'leto' 4 | require 'regexp_property_values' 5 | require_relative 'support/capturing_stderr' 6 | require_relative 'support/shared_examples' 7 | 8 | req_warn = capturing_stderr { @required_now = require('regexp_parser') } 9 | req_warn.empty? || fail("requiring parser generated warnings:\n#{req_warn}") 10 | @required_now || fail("regexp_parser was required earlier than expected") 11 | 12 | RS = Regexp::Scanner 13 | RL = Regexp::Lexer 14 | RP = Regexp::Parser 15 | RE = Regexp::Expression 16 | T = Regexp::Syntax::Token 17 | 18 | include Regexp::Expression 19 | 20 | def ruby_version_at_least(version) 21 | Gem::Version.new(RUBY_VERSION.dup) >= Gem::Version.new(version) 22 | end 23 | 24 | RSpec.configure do |config| 25 | config.around(:example) do |example| 26 | # treat unexpected warnings as failures 27 | expect { example.run }.not_to output.to_stderr 28 | end 29 | end 30 | 31 | def s(klass, text = '', *children) 32 | exp = klass.construct(text: text.to_s) 33 | children.each { |child| exp.expressions << child } 34 | exp 35 | end 36 | 37 | def regexp_with_all_features 38 | return /dummy/ unless ruby_version_at_least('2.4.1') 39 | 40 | Regexp.new(<<-'REGEXP', Regexp::EXTENDED) 41 | \A 42 | a++ 43 | (?: 44 | \b {2} 45 | (?> 46 | c ?? 
47 | 😀😀😀 48 | # 😄😄😄 49 | (?# 😃😃😃 ) 50 | ( 51 | \d *+ 52 | ( 53 | ALT1 54 | | 55 | ALT2 56 | ) 57 | ) {004} 58 | | 59 | [ä-ü&&ö[:ascii:]\p{thai}] {6} 60 | | 61 | \z 62 | ) 63 | (?=lm{8}) ?+ 64 | \K 65 | (?~ 66 | \1 67 | \g<-1> {10} 68 | \uFFFF 69 | \012 70 | ) 71 | (?(1) 72 | BRANCH1 73 | | 74 | BRANCH2 75 | ) 76 | ) 77 | REGEXP 78 | end 79 | -------------------------------------------------------------------------------- /spec/support/capturing_stderr.rb: -------------------------------------------------------------------------------- 1 | require 'stringio' 2 | 3 | def capturing_stderr(&block) 4 | old_stderr, $stderr = $stderr, StringIO.new 5 | block.call 6 | $stderr.string 7 | ensure 8 | $stderr = old_stderr 9 | end 10 | -------------------------------------------------------------------------------- /spec/support/shared_examples.rb: -------------------------------------------------------------------------------- 1 | RSpec.shared_examples 'syntax' do |opts| 2 | opts[:implements].each do |type, tokens| 3 | tokens.each do |token| 4 | it("implements #{token} #{type}") do 5 | expect(described_class.implements?(type, token)).to be true 6 | end 7 | end 8 | end 9 | 10 | opts[:excludes] && opts[:excludes].each do |type, tokens| 11 | tokens.each do |token| 12 | it("does not implement #{token} #{type}") do 13 | expect(described_class.implements?(type, token)).to be false 14 | end 15 | end 16 | end 17 | end 18 | 19 | RSpec.shared_examples 'scan' do |pattern, checks| 20 | context "given the pattern #{pattern}" do 21 | before(:all) { @tokens = Regexp::Scanner.scan(pattern) } 22 | 23 | checks.each do |index, (type, token, text, ts, te)| 24 | it "scans token #{index} as #{token} #{type} at #{ts}..#{te}" do 25 | result = @tokens.at(index) 26 | result || fail("no token at index #{index}, max is #{@tokens.size - 1}") 27 | 28 | expect(result[0]).to eq type 29 | expect(result[1]).to eq token 30 | expect(result[2]).to eq text 31 | expect(result[3]).to eq ts 32 | expect(result[4]).to 
eq te 33 | end 34 | end 35 | end 36 | end 37 | 38 | RSpec.shared_examples 'lex' do |pattern, checks| 39 | context "given the pattern #{pattern}" do 40 | before(:all) { @tokens = Regexp::Lexer.lex(pattern) } 41 | 42 | checks.each do |index, (type, token, text, ts, te, lvl, set_lvl, cond_lvl)| 43 | it "lexes token #{index} as #{token} #{type} at #{lvl}, #{set_lvl}, #{cond_lvl}" do 44 | struct = @tokens.at(index) 45 | 46 | expect(struct.type).to eq type 47 | expect(struct.token).to eq token 48 | expect(struct.text).to eq text 49 | expect(struct.ts).to eq ts 50 | expect(struct.te).to eq te 51 | expect(struct.level).to eq lvl 52 | expect(struct.set_level).to eq set_lvl 53 | expect(struct.conditional_level).to eq cond_lvl 54 | end 55 | end 56 | end 57 | end 58 | 59 | RSpec.shared_examples 'parse' do |pattern, checks| 60 | context "given the pattern #{pattern}" do 61 | before(:all) { @root = Regexp::Parser.parse(pattern, '*') } 62 | 63 | checks.each do |path, expectations| 64 | path = Array(path) 65 | inspect_quantifier = path.last == :q && path.pop 66 | 67 | attributes = expectations.pop if expectations.last.is_a?(Hash) 68 | klass = expectations.pop if expectations.last.is_a?(Class) 69 | token = expectations.pop 70 | type = expectations.pop 71 | 72 | description = klass || token || type || 'Expression' 73 | 74 | it "parses expression at #{path} as #{description}" do 75 | exp = @root.dig(*path) 76 | exp = exp.quantifier if inspect_quantifier 77 | 78 | klass && expect(exp).to(be_instance_of(klass)) 79 | type && expect(exp.type).to(eq(type)) 80 | token && expect(exp.token).to(eq(token)) 81 | 82 | attributes && attributes.each do |method, value| 83 | actual = exp.send(method) 84 | expect(actual).to eq(value), 85 | "expected #{description} at #{path} to "\ 86 | "have #{method} #{value.inspect}, got #{actual.inspect}" 87 | end 88 | end 89 | end 90 | end 91 | end 92 | -------------------------------------------------------------------------------- /spec/syntax/syntax_spec.rb: 
-------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe(Regexp::Syntax) do 4 | describe('::for') do 5 | it { expect(Regexp::Syntax.for('ruby/1.8.6')).to eq Regexp::Syntax::V1_8_6 } 6 | it { expect(Regexp::Syntax.for('ruby/1.8')).to eq Regexp::Syntax::V1_8_6 } 7 | it { expect(Regexp::Syntax.for('ruby/1.9.1')).to eq Regexp::Syntax::V1_9_1 } 8 | it { expect(Regexp::Syntax.for('ruby/1.9')).to eq Regexp::Syntax::V1_9_3 } 9 | it { expect(Regexp::Syntax.for('ruby/2.0.0')).to eq Regexp::Syntax::V2_0_0 } 10 | it { expect(Regexp::Syntax.for('ruby/2.0')).to eq Regexp::Syntax::V2_0_0 } 11 | it { expect(Regexp::Syntax.for('ruby/2.1')).to eq Regexp::Syntax::V2_0_0 } 12 | it { expect(Regexp::Syntax.for('ruby/2.2.0')).to eq Regexp::Syntax::V2_2_0 } 13 | it { expect(Regexp::Syntax.for('ruby/2.2.10')).to eq Regexp::Syntax::V2_2_0 } 14 | it { expect(Regexp::Syntax.for('ruby/2.2')).to eq Regexp::Syntax::V2_2_0 } 15 | it { expect(Regexp::Syntax.for('ruby/2.3.0')).to eq Regexp::Syntax::V2_3_0 } 16 | it { expect(Regexp::Syntax.for('ruby/2.3')).to eq Regexp::Syntax::V2_3_0 } 17 | it { expect(Regexp::Syntax.for('ruby/2.4.0')).to eq Regexp::Syntax::V2_4_0 } 18 | it { expect(Regexp::Syntax.for('ruby/2.4.1')).to eq Regexp::Syntax::V2_4_1 } 19 | it { expect(Regexp::Syntax.for('ruby/2.5.0')).to eq Regexp::Syntax::V2_5_0 } 20 | it { expect(Regexp::Syntax.for('ruby/2.5')).to eq Regexp::Syntax::V2_5_0 } 21 | it { expect(Regexp::Syntax.for('ruby/2.6.0')).to eq Regexp::Syntax::V2_6_0 } 22 | it { expect(Regexp::Syntax.for('ruby/2.6.2')).to eq Regexp::Syntax::V2_6_2 } 23 | it { expect(Regexp::Syntax.for('ruby/2.6.3')).to eq Regexp::Syntax::V2_6_3 } 24 | it { expect(Regexp::Syntax.for('ruby/2.6')).to eq Regexp::Syntax::V2_6_3 } 25 | it { expect(Regexp::Syntax.for('ruby/3.0.0')).to eq Regexp::Syntax::V2_6_3 } 26 | it { expect(Regexp::Syntax.for('ruby/3.0')).to eq Regexp::Syntax::V2_6_3 } 27 | it { 
expect(Regexp::Syntax.for('ruby/3.1.0')).to eq Regexp::Syntax::V3_1_0 } 28 | it { expect(Regexp::Syntax.for('ruby/3.1')).to eq Regexp::Syntax::V3_1_0 } 29 | it { expect(Regexp::Syntax.for('ruby/3.2.0')).to eq Regexp::Syntax::V3_2_0 } 30 | it { expect(Regexp::Syntax.for('ruby/3.2')).to eq Regexp::Syntax::V3_2_0 } 31 | 32 | it { expect(Regexp::Syntax.for('any')).to eq Regexp::Syntax::Any } 33 | it { expect(Regexp::Syntax.for('*')).to eq Regexp::Syntax::Any } 34 | 35 | it 'raises for unknown names' do 36 | expect { Regexp::Syntax.for('ruby/1.0') }.to raise_error(Regexp::Syntax::UnknownSyntaxNameError) 37 | end 38 | 39 | it 'raises for invalid names' do 40 | expect { Regexp::Syntax.version_class('2.0.0') }.to raise_error(Regexp::Syntax::InvalidVersionNameError) 41 | expect { Regexp::Syntax.version_class('ruby/20') }.to raise_error(Regexp::Syntax::InvalidVersionNameError) 42 | end 43 | end 44 | 45 | specify('::new is a deprecated alias of ::for') do 46 | expect { expect(Regexp::Syntax.new('ruby/2.0.0')).to eq Regexp::Syntax::V2_0_0 } 47 | .to output(/deprecated/).to_stderr 48 | end 49 | 50 | specify('not implemented') do 51 | expect { RP.parse('\p{alpha}', 'ruby/1.8') }.to raise_error(Regexp::Syntax::NotImplementedError) 52 | end 53 | 54 | specify('supported?') do 55 | expect(Regexp::Syntax.supported?('ruby/1.1.1')).to be false 56 | expect(Regexp::Syntax.supported?('ruby/2.4.3')).to be true 57 | expect(Regexp::Syntax.supported?('ruby/2.5')).to be true 58 | end 59 | 60 | specify('raises for unknown constant lookups') do 61 | expect { Regexp::Syntax::V1 }.to raise_error(/V1/) 62 | end 63 | 64 | specify('instantiation is deprecated but still works') do 65 | expect { @instance = Regexp::Syntax::V3_1_0.new } 66 | .to output(/deprecated/).to_stderr 67 | expect { expect(@instance.implements?(:literal, :literal)).to be true } 68 | .to output(/deprecated/).to_stderr 69 | end 70 | end 71 | -------------------------------------------------------------------------------- 
/spec/syntax/syntax_token_map_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe(Regexp::Syntax::Token::Map) do 4 | let(:map) { Regexp::Syntax::Token::Map } 5 | let(:current_syntax) { Regexp::Syntax::CURRENT } 6 | 7 | specify('is complete') do 8 | current_syntax.features.each do |type, tokens| 9 | tokens.each { |token| expect(map[type]).to include(token) } 10 | end 11 | end 12 | 13 | specify('contains no duplicate tokens') do 14 | current_syntax.features.each do |_type, tokens| 15 | expect(tokens).to eq tokens.uniq 16 | end 17 | end 18 | 19 | specify('contains no duplicate type/token combinations') do 20 | combinations = map.flat_map do |type, tokens| 21 | tokens.map { |token| "#{type} #{token}" } 22 | end 23 | 24 | non_uniq = combinations.group_by { |str| str }.select { |_, v| v.count > 1 } 25 | 26 | expect(non_uniq.keys).to be_empty 27 | end 28 | end 29 | -------------------------------------------------------------------------------- /spec/syntax/versions/1.8.6_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe(Regexp::Syntax::V1_8_6) do 4 | include_examples 'syntax', 5 | implements: { 6 | assertion: T::Assertion::Lookahead, 7 | backref: T::Backreference::Plain, 8 | escape: T::Escape::Basic + T::Escape::ASCII + T::Escape::Meta + T::Escape::Control, 9 | group: T::Group::V1_8_6, 10 | quantifier: T::Quantifier::Greedy + T::Quantifier::Reluctant + T::Quantifier::Interval + T::Quantifier::IntervalReluctant 11 | }, 12 | excludes: { 13 | assertion: T::Assertion::Lookbehind, 14 | backref: T::Backreference::All - T::Backreference::Plain + T::SubexpressionCall::All, 15 | quantifier: T::Quantifier::Possessive 16 | } 17 | end 18 | -------------------------------------------------------------------------------- /spec/syntax/versions/1.9.1_spec.rb: 
-------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe(Regexp::Syntax::V1_9_1) do 4 | include_examples 'syntax', 5 | implements: { 6 | escape: T::Escape::Hex + T::Escape::Octal + T::Escape::Unicode, 7 | type: T::CharacterType::Hex, 8 | quantifier: T::Quantifier::Greedy + T::Quantifier::Reluctant + T::Quantifier::Possessive 9 | } 10 | end 11 | -------------------------------------------------------------------------------- /spec/syntax/versions/1.9.3_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe(Regexp::Syntax::V1_9_3) do 4 | include_examples 'syntax', 5 | implements: { 6 | property: T::UnicodeProperty::Script_V1_9_3 + T::UnicodeProperty::Age_V1_9_3, 7 | nonproperty: T::UnicodeProperty::Script_V1_9_3 + T::UnicodeProperty::Age_V1_9_3 8 | } 9 | end 10 | -------------------------------------------------------------------------------- /spec/syntax/versions/2.0.0_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe(Regexp::Syntax::V2_0_0) do 4 | include_examples 'syntax', 5 | implements: { 6 | property: T::UnicodeProperty::Age_V2_0_0, 7 | nonproperty: T::UnicodeProperty::Age_V2_0_0 8 | }, 9 | excludes: { 10 | property: %i[newline], 11 | nonproperty: %i[newline] 12 | } 13 | end 14 | -------------------------------------------------------------------------------- /spec/syntax/versions/2.2.0_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe(Regexp::Syntax::V2_2_0) do 4 | include_examples 'syntax', 5 | implements: { 6 | property: T::UnicodeProperty::Script_V2_2_0 + T::UnicodeProperty::Age_V2_2_0, 7 | nonproperty: T::UnicodeProperty::Script_V2_2_0 + T::UnicodeProperty::Age_V2_2_0 8 | } 9 | end 10 | 
-------------------------------------------------------------------------------- /spec/syntax/versions/3.2.0_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe(Regexp::Syntax::V3_2_0) do 4 | include_examples 'syntax', 5 | implements: { 6 | property: T::UnicodeProperty::Script_V3_2_0 + T::UnicodeProperty::Age_V3_2_0, 7 | nonproperty: T::UnicodeProperty::Script_V3_2_0 + T::UnicodeProperty::Age_V3_2_0 8 | } 9 | end 10 | -------------------------------------------------------------------------------- /spec/token/token_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe(Regexp::Token) do 4 | specify('#offset') do 5 | regexp = /ab?cd/ 6 | tokens = RL.lex(regexp) 7 | 8 | expect(tokens[1].text).to eq 'b' 9 | expect(tokens[1].offset).to eq [1, 2] 10 | 11 | expect(tokens[2].text).to eq '?' 12 | expect(tokens[2].offset).to eq [2, 3] 13 | 14 | expect(tokens[3].text).to eq 'cd' 15 | expect(tokens[3].offset).to eq [3, 5] 16 | end 17 | 18 | specify('#length') do 19 | regexp = /abc?def/ 20 | tokens = RL.lex(regexp) 21 | 22 | expect(tokens[0].text).to eq 'ab' 23 | expect(tokens[0].length).to eq 2 24 | 25 | expect(tokens[1].text).to eq 'c' 26 | expect(tokens[1].length).to eq 1 27 | 28 | expect(tokens[2].text).to eq '?' 29 | expect(tokens[2].length).to eq 1 30 | 31 | expect(tokens[3].text).to eq 'def' 32 | expect(tokens[3].length).to eq 3 33 | end 34 | 35 | specify('#to_h') do 36 | regexp = /abc?def/ 37 | tokens = RL.lex(regexp) 38 | 39 | expect(tokens[0].text).to eq 'ab' 40 | expect(tokens[0].to_h).to eq type: :literal, token: :literal, text: 'ab', ts: 0, te: 2, level: 0, set_level: 0, conditional_level: 0 41 | 42 | expect(tokens[2].text).to eq '?' 
43 | expect(tokens[2].to_h).to eq type: :quantifier, token: :zero_or_one, text: '?', ts: 3, te: 4, level: 0, set_level: 0, conditional_level: 0 44 | end 45 | 46 | specify('#next') do 47 | regexp = /a+b?c*d{2,3}/ 48 | tokens = RL.lex(regexp) 49 | 50 | a = tokens.first 51 | expect(a.text).to eq 'a' 52 | 53 | plus = a.next 54 | expect(plus.text).to eq '+' 55 | 56 | b = plus.next 57 | expect(b.text).to eq 'b' 58 | 59 | interval = tokens.last 60 | expect(interval.text).to eq '{2,3}' 61 | 62 | expect(interval.next).to be_nil 63 | end 64 | 65 | specify('#previous') do 66 | regexp = /a+b?c*d{2,3}/ 67 | tokens = RL.lex(regexp) 68 | 69 | interval = tokens.last 70 | expect(interval.text).to eq '{2,3}' 71 | 72 | d = interval.previous 73 | expect(d.text).to eq 'd' 74 | 75 | star = d.previous 76 | expect(star.text).to eq '*' 77 | 78 | c = star.previous 79 | expect(c.text).to eq 'c' 80 | 81 | a = tokens.first 82 | expect(a.text).to eq 'a' 83 | expect(a.previous).to be_nil 84 | end 85 | end 86 | -------------------------------------------------------------------------------- /tasks/benchmark.rake: -------------------------------------------------------------------------------- 1 | BENCHMARKS_DIR = "#{__dir__}/benchmarks" 2 | 3 | desc 'Run all IPS benchmarks' 4 | task :benchmark do 5 | Dir["#{BENCHMARKS_DIR}/*.rb"].sort.each { |file| load(file) } 6 | end 7 | 8 | namespace :benchmark do 9 | desc 'Run all IPS benchmarks and store the comparison results in BENCHMARK.md' 10 | task :write_to_file do 11 | require 'stringio' 12 | 13 | string_io = StringIO.new 14 | with_stdouts(STDOUT, string_io) { Rake.application[:benchmark].invoke } 15 | 16 | File.write "#{BENCHMARKS_DIR}/log", 17 | "Results of rake:benchmark on #{RUBY_DESCRIPTION}\n\n" + 18 | string_io.string.gsub(/Warming up.*?Comparison:/m, '') 19 | end 20 | end 21 | 22 | def with_stdouts(*ios) 23 | old_stdout = $stdout 24 | ios.define_singleton_method(:method_missing) { |*args| each { |io| io.send(*args) } } 25 | 
ios.define_singleton_method(:respond_to?) { |*args| all? { |io| io.respond_to?(*args) } } # ask the wrapped IOs themselves, not the IO class 26 | $stdout = ios 27 | yield 28 | ensure 29 | $stdout = old_stdout 30 | end 31 | -------------------------------------------------------------------------------- /tasks/benchmarks/log: -------------------------------------------------------------------------------- 1 | Results of rake:benchmark on ruby 3.1.0p0 (2021-12-25 revision fb4df44d16) [arm64-darwin21] 2 | 3 | Parsing a minimal Regexp 4 | 5 | Scanner::scan: 32069.4 i/s 6 | Lexer::lex: 30700.6 i/s - same-ish: difference falls within error 7 | Parser::parse: 26248.5 i/s - 1.22x (± 0.00) slower 8 | 9 | Parsing a complex Regexp (URI.regexp) 10 | 11 | Scanner::scan: 843.4 i/s 12 | Lexer::lex: 546.3 i/s - 1.54x (± 0.00) slower 13 | Parser::parse: 332.5 i/s - 2.54x (± 0.00) slower 14 | 15 | -------------------------------------------------------------------------------- /tasks/benchmarks/minimal_regexp.rb: -------------------------------------------------------------------------------- 1 | require 'benchmark/ips' 2 | require_relative '../../lib/regexp_parser' 3 | 4 | puts 'Parsing a minimal Regexp' 5 | 6 | regexp = /./ 7 | 8 | Benchmark.ips do |x| 9 | x.report('Scanner::scan') { Regexp::Scanner.scan(regexp) } 10 | x.report('Lexer::lex') { Regexp::Lexer.lex(regexp) } 11 | x.report('Parser::parse') { Regexp::Parser.parse(regexp) } 12 | x.compare!
13 | end 14 | -------------------------------------------------------------------------------- /tasks/benchmarks/uri_regexp.rb: -------------------------------------------------------------------------------- 1 | require 'benchmark/ips' 2 | require_relative '../../lib/regexp_parser' 3 | 4 | puts 'Parsing a complex Regexp (URI.regexp)' 5 | 6 | require 'uri' 7 | regexp = URI::DEFAULT_PARSER.make_regexp 8 | 9 | Benchmark.ips do |x| 10 | x.report('Scanner::scan') { Regexp::Scanner.scan(regexp) } 11 | x.report('Lexer::lex') { Regexp::Lexer.lex(regexp) } 12 | x.report('Parser::parse') { Regexp::Parser.parse(regexp) } 13 | x.compare! 14 | end 15 | -------------------------------------------------------------------------------- /tasks/props.rake: -------------------------------------------------------------------------------- 1 | namespace :props do 2 | desc 'Write new property value hashes for the properties scanner' 3 | task :update do 4 | require 'regexp_property_values' 5 | RegexpPropertyValues.update 6 | dir = File.join(__dir__, '../lib/regexp_parser/scanner/properties') 7 | 8 | write_hash_to_file = ->(hash, path) do 9 | File.open(path, 'w') do |f| 10 | f.puts '# THIS FILE IS AUTO-GENERATED BY `rake props:update` - DO NOT EDIT', 11 | *hash.sort.map { |pair| pair.join(',') } 12 | end 13 | puts "Wrote #{hash.count} aliases to `#{path}`" 14 | end 15 | 16 | long_names_to_tokens = RegexpPropertyValues.all.map do |val| 17 | [val.identifier, val.full_name.downcase] 18 | end 19 | write_hash_to_file.call(long_names_to_tokens, "#{dir}/long.csv") 20 | 21 | short_names_to_tokens = RegexpPropertyValues.alias_hash.map do |k, v| 22 | [k.identifier, v.full_name.downcase] 23 | end 24 | write_hash_to_file.call(short_names_to_tokens, "#{dir}/short.csv") 25 | end 26 | end 27 | -------------------------------------------------------------------------------- /tasks/ragel.rake: -------------------------------------------------------------------------------- 1 | RAGEL_SOURCE_DIR = 
File.join(__dir__, '../lib/regexp_parser/scanner') 2 | RAGEL_OUTPUT_DIR = File.join(__dir__, '../lib/regexp_parser') 3 | RAGEL_SOURCE_FILES = %w[scanner] # scanner.rl imports the other files 4 | 5 | namespace :ragel do 6 | desc 'Process the ragel source files and output ruby code' 7 | task rb: :install do |task| 8 | RAGEL_SOURCE_FILES.each do |source_file| 9 | source_path = "#{RAGEL_SOURCE_DIR}/#{source_file}.rl" 10 | output_path = "#{RAGEL_OUTPUT_DIR}/#{source_file}.rb" 11 | # -L = omit line hint comments 12 | flags = ENV['DEBUG_RAGEL'].to_i == 1 ? ['-p'] : ['-L'] 13 | # using faster flat table driven FSM, about 25% larger code, but about 30% faster 14 | flags << '-F1' 15 | sh "ragel -R #{source_path} -o #{output_path} #{flags.join(' ')}" 16 | 17 | contents = File 18 | .read(output_path) 19 | .gsub(/[ \t]+$/, '') # remove trailing whitespace emitted by ragel 20 | .gsub(/(?<=\d,)[ \t]+|^[ \t]+(?=-?\d)/, '') # compact FSM tables (saves ~6KB) 21 | .gsub(/\n(?:[ \t]*\n){2,}/, "\n\n") # compact blank lines 22 | 23 | File.open(output_path, 'w') do |file| 24 | file.puts <<~RUBY 25 | # -*- warn-indent:false; -*- 26 | # 27 | # THIS IS A GENERATED FILE, DO NOT EDIT DIRECTLY 28 | # 29 | # This file was generated from #{source_path.split('/').last} 30 | # by running `bundle exec rake #{task.name}` 31 | RUBY 32 | 33 | file.write(contents) 34 | end 35 | end 36 | end 37 | 38 | desc 'Delete the ragel generated source file(s)' 39 | task :clean do 40 | RAGEL_SOURCE_FILES.each do |file| 41 | sh "rm -f #{RAGEL_OUTPUT_DIR}/#{file}.rb" 42 | end 43 | end 44 | 45 | desc 'Make sure that ragel is installed' 46 | task :install do 47 | next if ENV['CI'] 48 | 49 | if system('command -v ragel') 50 | # already installed 51 | elsif system('command -v brew') 52 | puts 'ragel not found, installing with homebrew ...' 53 | `brew install ragel` 54 | elsif system('command -v apt-get') 55 | puts 'ragel not found, installing with apt-get ...' 
56 | `sudo apt-get install -y ragel` 57 | else 58 | raise 'Could not install ragel. Please install it manually.' 59 | end 60 | end 61 | end 62 | --------------------------------------------------------------------------------