├── .github
└── workflows
│ ├── gouteur.yml
│ ├── lint.yml
│ └── tests.yml
├── .gitignore
├── .gouteur.yml
├── .rubocop.yml
├── CHANGELOG.md
├── Gemfile
├── LICENSE
├── README.md
├── Rakefile
├── bin
├── console
└── setup
├── lib
├── regexp_parser.rb
└── regexp_parser
│ ├── error.rb
│ ├── expression.rb
│ ├── expression
│ ├── base.rb
│ ├── classes
│ │ ├── alternation.rb
│ │ ├── anchor.rb
│ │ ├── backreference.rb
│ │ ├── character_set.rb
│ │ ├── character_set
│ │ │ ├── intersection.rb
│ │ │ └── range.rb
│ │ ├── character_type.rb
│ │ ├── conditional.rb
│ │ ├── escape_sequence.rb
│ │ ├── free_space.rb
│ │ ├── group.rb
│ │ ├── keep.rb
│ │ ├── literal.rb
│ │ ├── posix_class.rb
│ │ ├── root.rb
│ │ └── unicode_property.rb
│ ├── methods
│ │ ├── construct.rb
│ │ ├── escape_sequence_char.rb
│ │ ├── escape_sequence_codepoint.rb
│ │ ├── human_name.rb
│ │ ├── match.rb
│ │ ├── match_length.rb
│ │ ├── negative.rb
│ │ ├── options.rb
│ │ ├── parts.rb
│ │ ├── printing.rb
│ │ ├── referenced_expressions.rb
│ │ ├── strfregexp.rb
│ │ ├── tests.rb
│ │ └── traverse.rb
│ ├── quantifier.rb
│ ├── sequence.rb
│ ├── sequence_operation.rb
│ ├── shared.rb
│ └── subexpression.rb
│ ├── lexer.rb
│ ├── parser.rb
│ ├── scanner
│ ├── char_type.rl
│ ├── errors
│ │ ├── premature_end_error.rb
│ │ ├── scanner_error.rb
│ │ └── validation_error.rb
│ ├── properties
│ │ ├── long.csv
│ │ └── short.csv
│ ├── property.rl
│ └── scanner.rl
│ ├── syntax.rb
│ ├── syntax
│ ├── any.rb
│ ├── base.rb
│ ├── token.rb
│ ├── token
│ │ ├── anchor.rb
│ │ ├── assertion.rb
│ │ ├── backreference.rb
│ │ ├── character_set.rb
│ │ ├── character_type.rb
│ │ ├── conditional.rb
│ │ ├── escape.rb
│ │ ├── group.rb
│ │ ├── keep.rb
│ │ ├── meta.rb
│ │ ├── posix_class.rb
│ │ ├── quantifier.rb
│ │ ├── unicode_property.rb
│ │ └── virtual.rb
│ ├── version_lookup.rb
│ ├── versions.rb
│ └── versions
│ │ ├── 1.8.6.rb
│ │ ├── 1.9.1.rb
│ │ ├── 1.9.3.rb
│ │ ├── 2.0.0.rb
│ │ ├── 2.2.0.rb
│ │ ├── 2.3.0.rb
│ │ ├── 2.4.0.rb
│ │ ├── 2.4.1.rb
│ │ ├── 2.5.0.rb
│ │ ├── 2.6.0.rb
│ │ ├── 2.6.2.rb
│ │ ├── 2.6.3.rb
│ │ ├── 3.1.0.rb
│ │ └── 3.2.0.rb
│ ├── token.rb
│ └── version.rb
├── regexp_parser.gemspec
├── spec
├── expression
│ ├── base_spec.rb
│ ├── clone_spec.rb
│ ├── conditional_spec.rb
│ ├── free_space_spec.rb
│ ├── methods
│ │ ├── construct_spec.rb
│ │ ├── human_name_spec.rb
│ │ ├── match_length_spec.rb
│ │ ├── match_spec.rb
│ │ ├── negative_spec.rb
│ │ ├── parts_spec.rb
│ │ ├── printing_spec.rb
│ │ ├── strfregexp_spec.rb
│ │ ├── tests_spec.rb
│ │ └── traverse_spec.rb
│ ├── options_spec.rb
│ ├── subexpression_spec.rb
│ ├── te_ts_spec.rb
│ ├── to_h_spec.rb
│ └── to_s_spec.rb
├── lexer
│ ├── all_spec.rb
│ ├── conditionals_spec.rb
│ ├── delimiters_spec.rb
│ ├── escapes_spec.rb
│ ├── keep_spec.rb
│ ├── literals_spec.rb
│ ├── nesting_spec.rb
│ └── refcalls_spec.rb
├── parser
│ ├── all_spec.rb
│ ├── alternation_spec.rb
│ ├── anchors_spec.rb
│ ├── conditionals_spec.rb
│ ├── errors_spec.rb
│ ├── escapes_spec.rb
│ ├── free_space_spec.rb
│ ├── groups_spec.rb
│ ├── keep_spec.rb
│ ├── options_spec.rb
│ ├── posix_classes_spec.rb
│ ├── properties_spec.rb
│ ├── quantifiers_spec.rb
│ ├── refcalls_spec.rb
│ ├── set
│ │ ├── intersections_spec.rb
│ │ └── ranges_spec.rb
│ ├── sets_spec.rb
│ └── types_spec.rb
├── scanner
│ ├── all_spec.rb
│ ├── anchors_spec.rb
│ ├── conditionals_spec.rb
│ ├── delimiters_spec.rb
│ ├── errors_spec.rb
│ ├── escapes_spec.rb
│ ├── free_space_spec.rb
│ ├── groups_spec.rb
│ ├── keep_spec.rb
│ ├── literals_spec.rb
│ ├── meta_spec.rb
│ ├── options_spec.rb
│ ├── properties_spec.rb
│ ├── quantifiers_spec.rb
│ ├── refcalls_spec.rb
│ ├── sets_spec.rb
│ └── types_spec.rb
├── spec_helper.rb
├── support
│ ├── capturing_stderr.rb
│ └── shared_examples.rb
├── syntax
│ ├── syntax_spec.rb
│ ├── syntax_token_map_spec.rb
│ └── versions
│ │ ├── 1.8.6_spec.rb
│ │ ├── 1.9.1_spec.rb
│ │ ├── 1.9.3_spec.rb
│ │ ├── 2.0.0_spec.rb
│ │ ├── 2.2.0_spec.rb
│ │ └── 3.2.0_spec.rb
└── token
│ └── token_spec.rb
└── tasks
├── benchmark.rake
├── benchmarks
├── log
├── minimal_regexp.rb
└── uri_regexp.rb
├── props.rake
└── ragel.rake
/.github/workflows/gouteur.yml:
--------------------------------------------------------------------------------
1 | name: gouteur
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | build:
7 | runs-on: ubuntu-latest
8 |
9 | steps:
10 | - uses: actions/checkout@v4
11 | - name: Set up Ruby
12 | uses: ruby/setup-ruby@v1
13 | with:
14 | ruby-version: 3.2
15 | bundler-cache: true
16 | - name: Install and run ragel
17 | run: |
18 | sudo apt-get install -yqq ragel
19 | bundle exec rake ragel:rb
20 | - name: Test
21 | run: bundle exec gouteur
22 |
--------------------------------------------------------------------------------
/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
1 | # based on https://github.com/rails/rails/blob/4a78dcb/.github/workflows/rubocop.yml
2 |
3 | name: rubocop linting
4 |
5 | on: [push, pull_request]
6 |
7 | jobs:
8 | build:
9 | runs-on: ubuntu-latest
10 |
11 | steps:
12 | - uses: actions/checkout@v4
13 | - name: Set up Ruby
14 | uses: ruby/setup-ruby@v1
15 | with:
16 | ruby-version: 3.2
17 | bundler-cache: true
18 | - name: Install and run ragel
19 | run: |
20 | sudo apt-get install -yqq ragel
21 | bundle exec rake ragel:rb
22 | - name: Run rubocop
23 | run: bundle exec rubocop
24 |
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: tests
2 |
3 | on:
4 | push:
5 | pull_request:
6 | schedule:
7 | - cron: '11 11 14 * *' # at 11:11 am on the 14th of every month
8 |
9 | jobs:
10 | build:
11 | runs-on: ubuntu-latest
12 |
13 | strategy:
14 | matrix:
15 | ruby: [ '2.3', '2.4', '2.5', '2.6', '2.7', '3.0', '3.1', '3.2', '3.3', 'ruby-head' ]
16 |
17 | steps:
18 | - uses: actions/checkout@v4
19 | - name: Set up Ruby ${{ matrix.ruby }}
20 | uses: ruby/setup-ruby@v1
21 | with:
22 | ruby-version: ${{ matrix.ruby }}
23 | bundler-cache: true
24 | - name: Install ragel
25 | run: sudo apt-get install -yqq ragel
26 | - name: Test with Rake
27 | run: bundle exec rake test:full
28 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.gem
2 | .*.swp
3 | .DS_Store
4 | .ruby-version
5 | .tags
6 | .tags1
7 | .tool-versions
8 |
9 | Gemfile.lock
10 |
11 | lib/regexp_parser/scanner.rb
12 |
13 | doc
14 | .yardoc
15 |
16 | .bundle/*
17 | pkg/*
18 | coverage/*
19 | tmp/*
20 |
--------------------------------------------------------------------------------
/.gouteur.yml:
--------------------------------------------------------------------------------
1 | # Usage: https://github.com/jaynetics/gouteur/blob/main/README.md
2 |
3 | repos:
4 | - uri: https://github.com/jaynetics/js_regex
5 |
6 | - uri: https://github.com/jaynetics/repper
7 |
8 | - uri: https://github.com/rubocop-hq/rubocop
9 | tasks: rspec --pattern "**/{,*}regexp{,*,*/**/*}_spec.rb"
10 |
11 | - uri: https://github.com/mbj/mutant
12 | tasks: rspec --pattern "**/{,*}regexp{,*,*/**/*}_spec.rb"
13 |
14 | - uri: https://github.com/teamcapybara/capybara
15 | tasks: rspec spec/regexp_dissassembler_spec.rb
16 |
--------------------------------------------------------------------------------
/.rubocop.yml:
--------------------------------------------------------------------------------
1 | AllCops:
2 | DisabledByDefault: true
3 | Exclude:
4 | - '{bin,pkg,tmp,vendor}/**/*' # vendored dependencies etc.
5 | - 'lib/regexp_parser/scanner.rb' # Ragel-generated code
6 | NewCops: enable
7 | RubyInterpreters:
8 | - ruby
9 | - rake
10 | SuggestExtensions: false
11 | TargetRubyVersion: 2.6 # really 2.0, but 2.6 is lowest supported by rubocop
12 |
13 | Lint:
14 | Enabled: true
15 |
16 | # ignore weird looking regexps in specs, we have these on purpose
17 | Lint/DuplicateRegexpCharacterClassElement:
18 | Exclude: ['spec/**/*']
19 | Lint/MixedRegexpCaptureTypes:
20 | Exclude: ['spec/**/*']
21 |
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 |
3 | gemspec
4 |
5 | group :development, :test do
6 | gem 'leto', '~> 2.1'
7 | gem 'rake', '~> 13.1'
8 | gem 'regexp_property_values', '~> 1.5'
9 | gem 'rspec', '~> 3.10'
10 | if RUBY_VERSION.to_f >= 2.7
11 | gem 'benchmark-ips', '~> 2.1'
12 | gem 'gouteur', '~> 1.1'
13 | gem 'rubocop', '~> 1.59'
14 | end
15 | end
16 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2010, 2012-2024, Ammar Ali
2 |
3 | Permission is hereby granted, free of charge, to any person
4 | obtaining a copy of this software and associated documentation
5 | files (the "Software"), to deal in the Software without
6 | restriction, including without limitation the rights to use,
7 | copy, modify, merge, publish, distribute, sublicense, and/or sell
8 | copies of the Software, and to permit persons to whom the
9 | Software is furnished to do so, subject to the following
10 | conditions:
11 |
12 | The above copyright notice and this permission notice shall be
13 | included in all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
19 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 | OTHER DEALINGS IN THE SOFTWARE.
23 |
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | require 'bundler'
2 | require 'rubygems'
3 | require 'rubygems/package_task'
4 | require 'rake'
5 | require 'rake/testtask'
6 | require 'rspec/core/rake_task'
7 |
8 | Dir['tasks/**/*.rake'].each { |file| load(file) }
9 |
10 | Bundler::GemHelper.install_tasks
11 |
12 | RSpec::Core::RakeTask.new(:spec)
13 |
14 | task :default => [:'test:full']
15 |
16 | namespace :test do
17 | task full: [:'ragel:rb', :spec]
18 | end
19 |
20 | # Add ragel task as a prerequisite for building the gem to ensure that the
21 | # latest scanner code is generated and included in the build.
22 | desc "Runs ragel:rb before building the gem"
23 | task :build => ['ragel:rb']
24 |
--------------------------------------------------------------------------------
/bin/console:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | require 'bundler/setup'
4 | require 'regexp_parser'
5 | require 'regexp_property_values'
6 |
7 | RL = Regexp::Lexer
8 | RP = Regexp::Parser
9 | RS = Regexp::Scanner
10 | PV = RegexpPropertyValues
11 |
12 | def lex(...); Regexp::Lexer.lex(...) end
13 | def parse(...); Regexp::Parser.parse(...) end
14 | def scan(...); Regexp::Scanner.scan(...) end
15 |
16 | require 'irb'
17 | IRB.start(__FILE__)
18 |
--------------------------------------------------------------------------------
/bin/setup:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Bootstraps a development checkout: installs the gems, then ragel.
3 | #
4 | # Only the portable -e and -u flags are enabled. `-o pipefail` is not
5 | # supported by every /bin/sh (dash, the /bin/sh of Debian and Ubuntu, aborts
6 | # with "Illegal option -o pipefail"), and this script runs no pipelines.
7 | set -eu
8 |
9 | # install gems
10 | bundle
11 |
12 | # install ragel
13 | rake ragel:install
14 |
--------------------------------------------------------------------------------
/lib/regexp_parser.rb:
--------------------------------------------------------------------------------
1 | require_relative 'regexp_parser/version'
2 | require_relative 'regexp_parser/token'
3 | require_relative 'regexp_parser/scanner'
4 | require_relative 'regexp_parser/syntax'
5 | require_relative 'regexp_parser/lexer'
6 | require_relative 'regexp_parser/parser'
7 |
--------------------------------------------------------------------------------
/lib/regexp_parser/error.rb:
--------------------------------------------------------------------------------
1 | class Regexp::Parser
2 | # base class for all gem-specific errors
3 | class Error < StandardError; end
4 | end
5 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression.rb:
--------------------------------------------------------------------------------
1 | require_relative 'error'
2 |
3 | require_relative 'expression/shared'
4 | require_relative 'expression/base'
5 | require_relative 'expression/quantifier'
6 | require_relative 'expression/subexpression'
7 | require_relative 'expression/sequence'
8 | require_relative 'expression/sequence_operation'
9 |
10 | require_relative 'expression/classes/alternation'
11 | require_relative 'expression/classes/anchor'
12 | require_relative 'expression/classes/backreference'
13 | require_relative 'expression/classes/character_set'
14 | require_relative 'expression/classes/character_set/intersection'
15 | require_relative 'expression/classes/character_set/range'
16 | require_relative 'expression/classes/character_type'
17 | require_relative 'expression/classes/conditional'
18 | require_relative 'expression/classes/escape_sequence'
19 | require_relative 'expression/classes/free_space'
20 | require_relative 'expression/classes/group'
21 | require_relative 'expression/classes/keep'
22 | require_relative 'expression/classes/literal'
23 | require_relative 'expression/classes/posix_class'
24 | require_relative 'expression/classes/root'
25 | require_relative 'expression/classes/unicode_property'
26 |
27 | require_relative 'expression/methods/construct'
28 | require_relative 'expression/methods/escape_sequence_char'
29 | require_relative 'expression/methods/escape_sequence_codepoint'
30 | require_relative 'expression/methods/human_name'
31 | require_relative 'expression/methods/match'
32 | require_relative 'expression/methods/match_length'
33 | require_relative 'expression/methods/negative'
34 | require_relative 'expression/methods/options'
35 | require_relative 'expression/methods/parts'
36 | require_relative 'expression/methods/printing'
37 | require_relative 'expression/methods/referenced_expressions'
38 | require_relative 'expression/methods/strfregexp'
39 | require_relative 'expression/methods/tests'
40 | require_relative 'expression/methods/traverse'
41 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/base.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | class Base
3 | include Regexp::Expression::Shared
4 |
5 | def initialize(token, options = {})
6 | init_from_token_and_options(token, options)
7 | end
8 |
9 | def to_re(format = :full)
10 | if set_level > 0
11 | warn "Calling #to_re on character set members is deprecated - "\
12 | "their behavior might not be equivalent outside of the set."
13 | end
14 | ::Regexp.new(to_s(format))
15 | end
16 |
17 | def quantify(*args)
18 | self.quantifier = Quantifier.new(*args)
19 | end
20 |
21 | def unquantified_clone
22 | clone.tap { |exp| exp.quantifier = nil }
23 | end
24 |
25 | # Deprecated. Prefer `#repetitions` which has a more uniform interface.
26 | def quantity
27 | return [nil,nil] unless quantified?
28 | [quantifier.min, quantifier.max]
29 | end
30 |
31 | def repetitions
32 | @repetitions ||=
33 | if quantified?
34 | min = quantifier.min
35 | max = quantifier.max < 0 ? Float::INFINITY : quantifier.max
36 | range = min..max
37 | # fix Range#minmax on old Rubies - https://bugs.ruby-lang.org/issues/15807
38 | if RUBY_VERSION.to_f < 2.7
39 | range.define_singleton_method(:minmax) { [min, max] }
40 | end
41 | range
42 | else
43 | 1..1
44 | end
45 | end
46 |
47 | def greedy?
48 | quantified? and quantifier.greedy?
49 | end
50 |
51 | def reluctant?
52 | quantified? and quantifier.reluctant?
53 | end
54 | alias :lazy? :reluctant?
55 |
56 | def possessive?
57 | quantified? and quantifier.possessive?
58 | end
59 |
60 | def to_h
61 | {
62 | type: type,
63 | token: token,
64 | text: to_s(:base),
65 | starts_at: ts,
66 | length: full_length,
67 | level: level,
68 | set_level: set_level,
69 | conditional_level: conditional_level,
70 | options: options,
71 | quantifier: quantified? ? quantifier.to_h : nil,
72 | }
73 | end
74 | alias :attributes :to_h
75 | end
76 | end
77 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/classes/alternation.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | # A sequence of expressions, used by Alternation as one of its alternatives.
3 | class Alternative < Regexp::Expression::Sequence; end
4 |
5 | class Alternation < Regexp::Expression::SequenceOperation
6 | OPERAND = Alternative
7 |
8 | alias :alternatives :expressions
9 | end
10 | end
11 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/classes/anchor.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | module Anchor
3 | class Base < Regexp::Expression::Base; end
4 |
5 | class BeginningOfLine < Anchor::Base; end
6 | class EndOfLine < Anchor::Base; end
7 |
8 | class BeginningOfString < Anchor::Base; end
9 | class EndOfString < Anchor::Base; end
10 |
11 | class EndOfStringOrBeforeEndOfLine < Anchor::Base; end
12 |
13 | class WordBoundary < Anchor::Base; end
14 | class NonWordBoundary < Anchor::Base; end
15 |
16 | class MatchStart < Anchor::Base; end
17 |
18 | BOL = BeginningOfLine
19 | EOL = EndOfLine
20 | BOS = BeginningOfString
21 | EOS = EndOfString
22 | EOSobEOL = EndOfStringOrBeforeEndOfLine
23 | end
24 | end
25 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/classes/backreference.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | module Backreference
3 | class Base < Regexp::Expression::Base; end
4 |
5 | class Number < Backreference::Base
6 | attr_reader :number
7 | alias reference number
8 |
9 | def initialize(token, options = {})
10 | @number = token.text[/-?\d+/].to_i
11 | super
12 | end
13 | end
14 |
15 | class Name < Backreference::Base
16 | attr_reader :name
17 | alias reference name
18 |
19 | def initialize(token, options = {})
20 | @name = token.text[3..-2]
21 | super
22 | end
23 | end
24 |
25 | class NumberRelative < Backreference::Number
26 | attr_accessor :effective_number
27 | alias reference effective_number
28 | end
29 |
30 | class NumberCall < Backreference::Number; end
31 | class NameCall < Backreference::Name; end
32 | class NumberCallRelative < Backreference::NumberRelative; end
33 |
34 | class NumberRecursionLevel < Backreference::NumberRelative
35 | attr_reader :recursion_level
36 |
37 | def initialize(token, options = {})
38 | super
39 | @number, @recursion_level = token.text[3..-2].split(/(?=[+-])/).map(&:to_i)
40 | end
41 | end
42 |
43 | class NameRecursionLevel < Backreference::Name
44 | attr_reader :recursion_level
45 |
46 | def initialize(token, options = {})
47 | super
48 | @name, recursion_level = token.text[3..-2].split(/(?=[+-])/)
49 | @recursion_level = recursion_level.to_i
50 | end
51 | end
52 | end
53 |
54 | # alias for symmetry between token symbol and Expression class name
55 | Backref = Backreference
56 | end
57 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/classes/character_set.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | class CharacterSet < Regexp::Expression::Subexpression
3 | attr_accessor :closed, :negative
4 | alias :closed? :closed
5 |
6 | def initialize(token, options = {})
7 | self.negative = false
8 | self.closed = false
9 | super
10 | end
11 |
12 | def negate
13 | self.negative = true
14 | end
15 |
16 | def close
17 | self.closed = true
18 | end
19 | end
20 |
21 | # alias for symmetry between token symbol and Expression class name
22 | Set = CharacterSet
23 | end # module Regexp::Expression
24 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/classes/character_set/intersection.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | class CharacterSet < Regexp::Expression::Subexpression
3 | class IntersectedSequence < Regexp::Expression::Sequence; end
4 |
5 | class Intersection < Regexp::Expression::SequenceOperation
6 | OPERAND = IntersectedSequence
7 | end
8 | end
9 | end
10 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/classes/character_set/range.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | class CharacterSet < Regexp::Expression::Subexpression
3 | class Range < Regexp::Expression::Subexpression
4 | def ts
5 | (head = expressions.first) ? head.ts : @ts
6 | end
7 |
8 | def <<(exp)
9 | complete? and raise Regexp::Parser::Error,
10 | "Can't add more than 2 expressions to a Range"
11 | super
12 | end
13 |
14 | def complete?
15 | count == 2
16 | end
17 | end
18 | end
19 | end
20 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/classes/character_type.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | module CharacterType
3 | class Base < Regexp::Expression::Base; end
4 |
5 | class Any < CharacterType::Base; end
6 | class Digit < CharacterType::Base; end
7 | class NonDigit < CharacterType::Base; end
8 | class Hex < CharacterType::Base; end
9 | class NonHex < CharacterType::Base; end
10 | class Word < CharacterType::Base; end
11 | class NonWord < CharacterType::Base; end
12 | class Space < CharacterType::Base; end
13 | class NonSpace < CharacterType::Base; end
14 | class Linebreak < CharacterType::Base; end
15 | class ExtendedGrapheme < CharacterType::Base; end
16 | end
17 | end
18 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/classes/conditional.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | module Conditional
3 | class TooManyBranches < Regexp::Parser::Error
4 | def initialize
5 | super('The conditional expression has more than 2 branches')
6 | end
7 | end
8 |
9 | class Condition < Regexp::Expression::Base
10 | # Name or number of the referenced capturing group that determines state.
11 | # Returns a String if reference is by name, Integer if by number.
12 | def reference
13 | ref = text.tr("'<>()", "")
14 | ref =~ /\D/ ? ref : Integer(ref)
15 | end
16 | end
17 |
18 | class Branch < Regexp::Expression::Sequence; end
19 |
20 | class Expression < Regexp::Expression::Subexpression
21 | def <<(exp)
22 | expressions.last << exp
23 | end
24 |
25 | def add_sequence(active_opts = {}, params = { ts: 0 })
26 | raise TooManyBranches.new if branches.length == 2
27 | params = params.merge({ conditional_level: conditional_level + 1 })
28 | Branch.add_to(self, params, active_opts)
29 | end
30 | alias :branch :add_sequence
31 |
32 | def condition=(exp)
33 | expressions.delete(condition)
34 | expressions.unshift(exp)
35 | end
36 |
37 | def condition
38 | find { |subexp| subexp.is_a?(Condition) }
39 | end
40 |
41 | def branches
42 | select { |subexp| subexp.is_a?(Sequence) }
43 | end
44 |
45 | def reference
46 | condition.reference
47 | end
48 | end
49 | end
50 | end
51 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/classes/escape_sequence.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | module EscapeSequence
3 | Base = Class.new(Regexp::Expression::Base)
4 |
5 | AsciiEscape = Class.new(Base) # \e
6 | Backspace = Class.new(Base) # \b
7 | Bell = Class.new(Base) # \a
8 | FormFeed = Class.new(Base) # \f
9 | Newline = Class.new(Base) # \n
10 | Return = Class.new(Base) # \r
11 | Tab = Class.new(Base) # \t
12 | VerticalTab = Class.new(Base) # \v
13 |
14 | Literal = Class.new(Base) # e.g. \j, \@, \😀 (ineffectual escapes)
15 |
16 | Octal = Class.new(Base) # e.g. \012
17 | Hex = Class.new(Base) # e.g. \x0A
18 | Codepoint = Class.new(Base) # e.g. \u000A
19 |
20 | CodepointList = Class.new(Base) # e.g. \u{A B}
21 |
22 | AbstractMetaControlSequence = Class.new(Base)
23 | Control = Class.new(AbstractMetaControlSequence) # e.g. \cB
24 | Meta = Class.new(AbstractMetaControlSequence) # e.g. \M-Z
25 | MetaControl = Class.new(AbstractMetaControlSequence) # e.g. \M-\cX
26 | end
27 |
28 | # alias for symmetry between Token::* and Expression::*
29 | Escape = EscapeSequence
30 | end
31 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/classes/free_space.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | class FreeSpace < Regexp::Expression::Base
3 | def quantify(*_args)
4 | raise Regexp::Parser::Error, 'Can not quantify a free space object'
5 | end
6 | end
7 |
8 | class Comment < Regexp::Expression::FreeSpace
9 | end
10 |
11 | class WhiteSpace < Regexp::Expression::FreeSpace
12 | def merge(exp)
13 | warn("#{self.class}##{__method__} is deprecated and will be removed in v3.0.0.")
14 | text << exp.text
15 | end
16 | end
17 | end
18 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/classes/group.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | module Group
3 | class Base < Regexp::Expression::Subexpression
4 | end
5 |
6 | class Passive < Group::Base
7 | attr_writer :implicit
8 |
9 | def initialize(*)
10 | @implicit = false
11 | super
12 | end
13 |
14 | def implicit?
15 | @implicit
16 | end
17 | end
18 |
19 | class Absence < Group::Base; end
20 | class Atomic < Group::Base; end
21 | # TODO: should split off OptionsSwitch in v3.0.0. Maybe even make it no
22 | # longer inherit from Group because it is effectively a terminal expression.
23 | class Options < Group::Base
24 | attr_accessor :option_changes
25 |
26 | def initialize_copy(orig)
27 | self.option_changes = orig.option_changes.dup
28 | super
29 | end
30 |
31 | def quantify(*args)
32 | if token == :options_switch
33 | raise Regexp::Parser::Error, 'Can not quantify an option switch'
34 | else
35 | super
36 | end
37 | end
38 | end
39 |
40 | class Capture < Group::Base
41 | attr_accessor :number, :number_at_level
42 | alias identifier number
43 | end
44 |
45 | class Named < Group::Capture
46 | attr_reader :name
47 | alias identifier name
48 |
49 | def initialize(token, options = {})
50 | @name = token.text[3..-2]
51 | super
52 | end
53 |
54 | def initialize_copy(orig)
55 | @name = orig.name.dup
56 | super
57 | end
58 | end
59 |
60 | class Comment < Group::Base
61 | end
62 | end
63 |
64 | module Assertion
65 | class Base < Regexp::Expression::Group::Base; end
66 |
67 | class Lookahead < Assertion::Base; end
68 | class NegativeLookahead < Assertion::Base; end
69 |
70 | class Lookbehind < Assertion::Base; end
71 | class NegativeLookbehind < Assertion::Base; end
72 | end
73 | end
74 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/classes/keep.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | module Keep
3 | # TODO: in regexp_parser v3.0.0 this should possibly be a Subexpression
4 | # that contains all expressions to its left.
5 | class Mark < Regexp::Expression::Base; end
6 | end
7 | end
8 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/classes/literal.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | class Literal < Regexp::Expression::Base; end
3 | end
4 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/classes/posix_class.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | class PosixClass < Regexp::Expression::Base
3 | def name
4 | text[/\w+/]
5 | end
6 | end
7 |
8 | # alias for symmetry between token symbol and Expression class name
9 | Posixclass = PosixClass
10 | Nonposixclass = PosixClass
11 | end
12 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/classes/root.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | class Root < Regexp::Expression::Subexpression
3 | def self.build(options = {})
4 | warn "`#{self.class}.build(options)` is deprecated and will raise in "\
5 | "regexp_parser v3.0.0. Please use `.construct(options: options)`."
6 | construct(options: options)
7 | end
8 | end
9 | end
10 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/classes/unicode_property.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | module UnicodeProperty
3 | class Base < Regexp::Expression::Base
4 | def name
5 | text[/\A\\[pP]\{([^}]+)\}\z/, 1]
6 | end
7 |
8 | def shortcut
9 | Regexp::Scanner.short_prop_map.key(token.to_s)
10 | end
11 | end
12 |
13 | class Alnum < Base; end
14 | class Alpha < Base; end
15 | class Ascii < Base; end
16 | class Blank < Base; end
17 | class Cntrl < Base; end
18 | class Digit < Base; end
19 | class Graph < Base; end
20 | class Lower < Base; end
21 | class Print < Base; end
22 | class Punct < Base; end
23 | class Space < Base; end
24 | class Upper < Base; end
25 | class Word < Base; end
26 | class Xdigit < Base; end
27 | class XPosixPunct < Base; end
28 |
29 | class Newline < Base; end
30 |
31 | class Any < Base; end
32 | class Assigned < Base; end
33 |
34 | module Letter
35 | class Base < UnicodeProperty::Base; end
36 |
37 | class Any < Letter::Base; end
38 | class Cased < Letter::Base; end
39 | class Uppercase < Letter::Base; end
40 | class Lowercase < Letter::Base; end
41 | class Titlecase < Letter::Base; end
42 | class Modifier < Letter::Base; end
43 | class Other < Letter::Base; end
44 | end
45 |
46 | module Mark
47 | class Base < UnicodeProperty::Base; end
48 |
49 | class Any < Mark::Base; end
50 | class Combining < Mark::Base; end
51 | class Nonspacing < Mark::Base; end
52 | class Spacing < Mark::Base; end
53 | class Enclosing < Mark::Base; end
54 | end
55 |
56 | module Number
57 | class Base < UnicodeProperty::Base; end
58 |
59 | class Any < Number::Base; end
60 | class Decimal < Number::Base; end
61 | class Letter < Number::Base; end
62 | class Other < Number::Base; end
63 | end
64 |
65 | module Punctuation
66 | class Base < UnicodeProperty::Base; end
67 |
68 | class Any < Punctuation::Base; end
69 | class Connector < Punctuation::Base; end
70 | class Dash < Punctuation::Base; end
71 | class Open < Punctuation::Base; end
72 | class Close < Punctuation::Base; end
73 | class Initial < Punctuation::Base; end
74 | class Final < Punctuation::Base; end
75 | class Other < Punctuation::Base; end
76 | end
77 |
78 | module Separator
79 | class Base < UnicodeProperty::Base; end
80 |
81 | class Any < Separator::Base; end
82 | class Space < Separator::Base; end
83 | class Line < Separator::Base; end
84 | class Paragraph < Separator::Base; end
85 | end
86 |
87 | module Symbol
88 | class Base < UnicodeProperty::Base; end
89 |
90 | class Any < Symbol::Base; end
91 | class Math < Symbol::Base; end
92 | class Currency < Symbol::Base; end
93 | class Modifier < Symbol::Base; end
94 | class Other < Symbol::Base; end
95 | end
96 |
97 | module Codepoint
98 | class Base < UnicodeProperty::Base; end
99 |
100 | class Any < Codepoint::Base; end
101 | class Control < Codepoint::Base; end
102 | class Format < Codepoint::Base; end
103 | class Surrogate < Codepoint::Base; end
104 | class PrivateUse < Codepoint::Base; end
105 | class Unassigned < Codepoint::Base; end
106 | end
107 |
108 | class Age < UnicodeProperty::Base; end
109 | class Block < UnicodeProperty::Base; end
110 | class Derived < UnicodeProperty::Base; end
111 | class Emoji < UnicodeProperty::Base; end
112 | class Enumerated < UnicodeProperty::Base; end
113 | class Script < UnicodeProperty::Base; end
114 | end
115 |
116 | # alias for symmetry between token symbol and Expression class name
117 | Property = UnicodeProperty
118 | Nonproperty = UnicodeProperty
119 | end # module Regexp::Expression
120 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/methods/construct.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | module Shared
3 | module ClassMethods
# Convenience method to init a valid Expression without a Regexp::Token.
#
# +params+ is merged over #construct_defaults. It may contain :options and
# any key listed in Regexp::TOKEN_KEYS; leftover keys raise ArgumentError.
def construct(params = {})
  settings    = construct_defaults.merge(params)
  exp_options = settings.delete(:options)
  token_vals  = Regexp::TOKEN_KEYS.map { |key| settings.delete(key) }
  built_token = Regexp::Token.new(*token_vals)
  # whatever is still left in settings was not consumed by the token
  raise ArgumentError, "unsupported attribute(s): #{settings}" if settings.any?

  new(built_token, exp_options)
end
14 |
# Default attributes used by #construct for this expression class.
def construct_defaults
  class_specific =
    if self == Root
      { type: :expression, token: :root, ts: 0 }
    elsif self < Sequence
      { type: :expression, token: :sequence }
    else
      { type: token_class::Type }
    end

  class_specific.merge(level: 0, set_level: 0, conditional_level: 0, text: '')
end
24 |
# Returns the Regexp::Syntax::Token module matching this expression class,
# or nil for Root and Sequence classes.
def token_class
  # no token class because these objects are Parser-generated
  # TODO: synch exp class, token class & type names for this in v3.0.0
  return nil if self == Root || self < Sequence
  return Regexp::Syntax::Token::Meta if self == CharacterType::Any

  # third segment of the fully qualified class name
  Regexp::Syntax::Token.const_get(name.split('::')[2])
end
35 | end
36 |
37 | def token_class
38 | self.class.token_class
39 | end
40 | end
41 | end
42 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/methods/escape_sequence_char.rb:
--------------------------------------------------------------------------------
1 | Regexp::Expression::EscapeSequence::Base.class_eval do
# Returns the character for #codepoint as a UTF-8 encoded String.
def char
  codepoint.chr(Encoding::UTF_8)
end
5 | end
6 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression::EscapeSequence
2 | AsciiEscape.class_eval { def codepoint; 0x1B end }
3 | Backspace.class_eval { def codepoint; 0x8 end }
4 | Bell.class_eval { def codepoint; 0x7 end }
5 | FormFeed.class_eval { def codepoint; 0xC end }
6 | Newline.class_eval { def codepoint; 0xA end }
7 | Return.class_eval { def codepoint; 0xD end }
8 | Tab.class_eval { def codepoint; 0x9 end }
9 | VerticalTab.class_eval { def codepoint; 0xB end }
10 |
11 | Literal.class_eval { def codepoint; text[1].ord end }
12 |
13 | Octal.class_eval { def codepoint; text[/\d+/].to_i(8) end }
14 |
15 | Hex.class_eval { def codepoint; text[/\h+/].hex end }
16 | Codepoint.class_eval { def codepoint; text[/\h+/].hex end }
17 |
18 | CodepointList.class_eval do
19 | # Maybe this should be a unique top-level expression class?
20 | def char
21 | raise NoMethodError, 'CodepointList responds only to #chars'
22 | end
23 |
24 | def codepoint
25 | raise NoMethodError, 'CodepointList responds only to #codepoints'
26 | end
27 |
# Returns one UTF-8 String per entry of #codepoints.
def chars
  codepoints.each_with_object([]) do |cp, result|
    result << cp.chr('utf-8')
  end
end
31 |
# Returns every run of hex digits found in #text as an Integer.
def codepoints
  hex_runs = text.scan(/\h+/)
  hex_runs.map { |digits| digits.hex }
end
35 | end
36 |
37 | AbstractMetaControlSequence.class_eval do
38 | private
39 |
# Builds a one-byte String from the five least significant bits of
# +control_sequence+, padded with three leading zero bits.
def control_sequence_to_s(control_sequence)
  all_bits = control_sequence.unpack('B*').first
  low_bits = all_bits[-5..-1]
  ["000#{low_bits}"].pack('B*')
end
44 |
# Returns the ordinal of +meta_char+, raised by 128 when it is below 128.
def meta_char_to_codepoint(meta_char)
  ordinal = meta_char.ord
  return ordinal unless ordinal < 128

  ordinal + 128
end
49 | end
50 |
51 | Control.class_eval do
52 | def codepoint
53 | control_sequence_to_s(text).ord
54 | end
55 | end
56 |
57 | Meta.class_eval do
58 | def codepoint
59 | meta_char_to_codepoint(text[-1])
60 | end
61 | end
62 |
63 | MetaControl.class_eval do
64 | def codepoint
65 | meta_char_to_codepoint(control_sequence_to_s(text))
66 | end
67 | end
68 | end
69 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/methods/human_name.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | module Shared
# default implementation, e.g. "atomic group", "hex escape", "word type", ..
# Joins the non-nil token and type with a space; underscores become spaces.
def human_name
  words = [token, type].compact
  words.join(' ').tr('_', ' ')
end
7 | end
8 |
9 | Alternation.class_eval { def human_name; 'alternation' end }
10 | Alternative.class_eval { def human_name; 'alternative' end }
11 | Anchor::BOL.class_eval { def human_name; 'beginning of line' end }
12 | Anchor::BOS.class_eval { def human_name; 'beginning of string' end }
13 | Anchor::EOL.class_eval { def human_name; 'end of line' end }
14 | Anchor::EOS.class_eval { def human_name; 'end of string' end }
15 | Anchor::EOSobEOL.class_eval { def human_name; 'newline-ready end of string' end }
16 | Anchor::MatchStart.class_eval { def human_name; 'match start' end }
17 | Anchor::NonWordBoundary.class_eval { def human_name; 'no word boundary' end }
18 | Anchor::WordBoundary.class_eval { def human_name; 'word boundary' end }
19 | Assertion::Lookahead.class_eval { def human_name; 'lookahead' end }
20 | Assertion::Lookbehind.class_eval { def human_name; 'lookbehind' end }
21 | Assertion::NegativeLookahead.class_eval { def human_name; 'negative lookahead' end }
22 | Assertion::NegativeLookbehind.class_eval { def human_name; 'negative lookbehind' end }
23 | Backreference::Name.class_eval { def human_name; 'backreference by name' end }
24 | Backreference::NameCall.class_eval { def human_name; 'subexpression call by name' end }
25 | Backreference::Number.class_eval { def human_name; 'backreference' end }
26 | Backreference::NumberRelative.class_eval { def human_name; 'relative backreference' end }
27 | Backreference::NumberCall.class_eval { def human_name; 'subexpression call' end }
28 | Backreference::NumberCallRelative.class_eval { def human_name; 'relative subexpression call' end }
29 | CharacterSet::IntersectedSequence.class_eval { def human_name; 'intersected sequence' end }
30 | CharacterSet::Intersection.class_eval { def human_name; 'intersection' end }
31 | CharacterSet::Range.class_eval { def human_name; 'character range' end }
32 | CharacterType::Any.class_eval { def human_name; 'match-all' end }
33 | Comment.class_eval { def human_name; 'comment' end }
34 | Conditional::Branch.class_eval { def human_name; 'conditional branch' end }
35 | Conditional::Condition.class_eval { def human_name; 'condition' end }
36 | Conditional::Expression.class_eval { def human_name; 'conditional' end }
37 | Group::Capture.class_eval { def human_name; "capture group #{number}" end }
38 | Group::Named.class_eval { def human_name; 'named capture group' end }
39 | Keep::Mark.class_eval { def human_name; 'keep-mark lookbehind' end }
40 | Literal.class_eval { def human_name; 'literal' end }
41 | Root.class_eval { def human_name; 'root' end }
42 | WhiteSpace.class_eval { def human_name; 'free space' end }
43 | end
44 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/methods/match.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
class Base
  # Returns true if #match finds a match in +string+, false otherwise.
  def match?(string)
    match(string) ? true : false
  end
  alias :matches? :match?

  # Compiles #to_s into a Regexp and matches it against +string+, starting
  # at +offset+. Returns the MatchData or nil.
  def match(string, offset = 0)
    compiled = Regexp.new(to_s)
    compiled.match(string, offset)
  end
  alias :=~ :match
end
13 | end
14 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/methods/match_length.rb:
--------------------------------------------------------------------------------
# Describes the range of string lengths an expression can match.
#
# It combines the length range of a single occurrence (base_min..base_max)
# with the expression's repetition range (min_rep..max_rep), and is
# Enumerable over the lengths accepted by #include?.
class Regexp::MatchLength
  include Enumerable

  # Returns the match length of +obj+, which may be an expression or any
  # input accepted by Regexp::Parser.parse.
  def self.of(obj)
    exp = obj.is_a?(Regexp::Expression::Base) ? obj : Regexp::Parser.parse(obj)
    exp.match_length
  end

  # exp  - the expression; its class and #repetitions (min/max) are read.
  # opts - either :base (fixed length of one occurrence), or all three of
  #        :base_min, :base_max and :reify (a callable returning regexp
  #        source for one occurrence). Missing keys raise KeyError.
  def initialize(exp, opts = {})
    self.exp_class = exp.class
    self.min_rep = exp.repetitions.min
    self.max_rep = exp.repetitions.max
    if (base = opts[:base])
      self.base_min = base
      self.base_max = base
      self.reify = ->{ '.' * base }
    else
      self.base_min = opts.fetch(:base_min)
      self.base_max = opts.fetch(:base_max)
      self.reify = opts.fetch(:reify)
    end
  end

  # Yields each matchable length from min to max, stopping after
  # opts[:limit] (default 1000) lengths have been yielded.
  def each(opts = {})
    return enum_for(__method__, opts) unless block_given?
    limit = opts[:limit] || 1000
    yielded = 0
    (min..max).each do |num|
      next unless include?(num)
      yield(num)
      break if (yielded += 1) >= limit
    end
  end

  # Like #each, but without a limit on the number of yielded lengths.
  def endless_each
    return enum_for(__method__) unless block_given?
    (min..max).each { |num| yield(num) if include?(num) }
  end

  # Returns true if a string of the given length is accepted by the
  # regexp built from #to_re.
  def include?(length)
    test_regexp.match?('X' * length)
  end

  def fixed?
    min == max
  end

  # Smallest matchable length.
  def min
    scaled_length(min_rep, base_min)
  end

  # Largest matchable length; Float::INFINITY if unbounded.
  def max
    scaled_length(max_rep, base_max)
  end

  def minmax
    [min, max]
  end

  def inspect
    type = exp_class.name.sub('Regexp::Expression::', '')
    "#<#{self.class}<#{type}> min=#{min} max=#{max}>"
  end

  # Returns a Regexp that repeats the reified occurrence min_rep..max_rep
  # times. An infinite max_rep is rendered as an open-ended quantifier.
  def to_re
    /(?:#{reify.call}){#{min_rep},#{max_rep unless max_rep == Float::INFINITY}}/
  end

  private

  attr_accessor :base_min, :base_max, :min_rep, :max_rep, :exp_class, :reify

  # Multiplies a repetition count by a base length.
  #
  # Zero repetitions, or repetitions of a zero-length base, always give a
  # length of 0. Plain multiplication would give NaN for Infinity * 0,
  # which breaks #fixed? and makes the (min..max) range in #each raise.
  def scaled_length(reps, base)
    return 0 if reps == 0 || base == 0

    reps * base
  end

  if Regexp.method_defined?(:match?) # ruby >= 2.4
    def test_regexp
      @test_regexp ||= /^#{to_re}$/
    end
  else
    def test_regexp
      @test_regexp ||= /^#{to_re}$/.tap { |r| def r.match?(s); !!match(s) end }
    end
  end
end
83 |
84 | module Regexp::Expression
85 | MatchLength = Regexp::MatchLength
86 |
87 | [
88 | CharacterSet,
89 | CharacterSet::Intersection,
90 | CharacterSet::IntersectedSequence,
91 | CharacterSet::Range,
92 | CharacterType::Base,
93 | EscapeSequence::Base,
94 | PosixClass,
95 | UnicodeProperty::Base,
96 | ].each do |klass|
97 | klass.class_eval <<-RUBY, __FILE__, __LINE__ + 1
98 | def match_length
99 | MatchLength.new(self, base: 1)
100 | end
101 | RUBY
102 | end
103 |
104 | class Literal
105 | def match_length
106 | MatchLength.new(self, base: text.length)
107 | end
108 | end
109 |
110 | class Subexpression
111 | def match_length
112 | MatchLength.new(self,
113 | base_min: map { |exp| exp.match_length.min }.inject(0, :+),
114 | base_max: map { |exp| exp.match_length.max }.inject(0, :+),
115 | reify: ->{ map { |exp| exp.match_length.to_re }.join })
116 | end
117 |
118 | def inner_match_length
119 | dummy = Regexp::Expression::Root.construct
120 | dummy.expressions = expressions.map(&:clone)
121 | dummy.quantifier = quantifier && quantifier.clone
122 | dummy.match_length
123 | end
124 | end
125 |
126 | [
127 | Alternation,
128 | Conditional::Expression,
129 | ].each do |klass|
130 | klass.class_eval <<-RUBY, __FILE__, __LINE__ + 1
131 | def match_length
132 | MatchLength.new(self,
133 | base_min: map { |exp| exp.match_length.min }.min,
134 | base_max: map { |exp| exp.match_length.max }.max,
135 | reify: ->{ map { |exp| exp.match_length.to_re }.join('|') })
136 | end
137 | RUBY
138 | end
139 |
140 | [
141 | Anchor::Base,
142 | Assertion::Base,
143 | Conditional::Condition,
144 | FreeSpace,
145 | Keep::Mark,
146 | ].each do |klass|
147 | klass.class_eval <<-RUBY, __FILE__, __LINE__ + 1
148 | def match_length
149 | MatchLength.new(self, base: 0)
150 | end
151 | RUBY
152 | end
153 |
154 | class Backreference::Base
155 | def match_length
156 | if referenced_expression.nil?
157 | raise ArgumentError, 'Missing referenced_expression - not parsed?'
158 | end
159 | referenced_expression.unquantified_clone.match_length
160 | end
161 | end
162 |
163 | class EscapeSequence::CodepointList
164 | def match_length
165 | MatchLength.new(self, base: codepoints.count)
166 | end
167 | end
168 |
169 | # Special case. Absence group can match 0.. chars, irrespective of content.
170 | # TODO: in theory, they *can* exclude match lengths with `.`: `(?~.{3})`
171 | class Group::Absence
172 | def match_length
173 | MatchLength.new(self, base_min: 0, base_max: Float::INFINITY, reify: ->{ '.*' })
174 | end
175 | end
176 | end
177 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/methods/negative.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | module Shared
3 | def negative?
4 | false
5 | end
6 |
7 | # not an alias so as to respect overrides of #negative?
8 | def negated?
9 | negative?
10 | end
11 | end
12 |
13 | Anchor::NonWordBoundary.class_eval { def negative?; true end }
14 | Assertion::NegativeLookahead.class_eval { def negative?; true end }
15 | Assertion::NegativeLookbehind.class_eval { def negative?; true end }
16 | CharacterSet.class_eval { def negative?; negative end }
17 | CharacterType::Base.class_eval { def negative?; token.to_s.start_with?('non') end }
18 | PosixClass.class_eval { def negative?; type == :nonposixclass end }
19 | UnicodeProperty::Base.class_eval { def negative?; type == :nonproperty end }
20 | end
21 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/methods/options.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
class Base
  # Each predicate is true only when the matching key in #options holds
  # a literal `true`.
  {
    :multiline?        => :m,
    :case_insensitive? => :i,
    :free_spacing?     => :x,
    :default_classes?  => :d,
    :ascii_classes?    => :a,
    :unicode_classes?  => :u,
  }.each do |predicate, flag|
    define_method(predicate) { options[flag] == true }
  end

  alias :m? :multiline?

  alias :i? :case_insensitive?
  alias :ignore_case? :case_insensitive?

  alias :x? :free_spacing?
  alias :extended? :free_spacing?

  alias :d? :default_classes?

  alias :a? :ascii_classes?

  alias :u? :unicode_classes?
end
35 | end
36 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/methods/parts.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | module Shared
3 | # default implementation
4 | def parts
5 | [text.dup]
6 | end
7 |
8 | private
9 |
# Returns a flat array of +expressions+ with +separator+ placed between
# each pair of neighbours (not before the first or after the last).
def intersperse(expressions, separator)
  expressions.each_with_index.flat_map do |exp, position|
    position.zero? ? [exp] : [separator, exp]
  end
end
13 | end
14 |
15 | CharacterSet.class_eval { def parts; ["#{text}#{'^' if negated?}", *expressions, ']'] end }
16 | CharacterSet::Range.class_eval { def parts; intersperse(expressions, text.dup) end }
17 | Conditional::Expression.class_eval { def parts; [text.dup, condition, *intersperse(branches, '|'), ')'] end }
18 | Group::Base.class_eval { def parts; [text.dup, *expressions, ')'] end }
19 | Group::Passive.class_eval { def parts; implicit? ? expressions : super end }
20 | Group::Comment.class_eval { def parts; [text.dup] end }
21 | Subexpression.class_eval { def parts; expressions end }
22 | SequenceOperation.class_eval { def parts; intersperse(expressions, text.dup) end }
23 | end
24 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/methods/printing.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | module Shared
3 | def inspect
4 | [
5 | "#<#{self.class}",
6 | pretty_print_instance_variables.map { |v| " #{v}=#{instance_variable_get(v).inspect}" },
7 | ">"
8 | ].join
9 | end
10 |
11 | # Make pretty-print work despite #inspect implementation.
12 | def pretty_print(q)
13 | q.pp_object(self)
14 | end
15 |
# Called by pretty_print (ruby/pp) and #inspect.
# Lists only the instance variables that carry information: non-empty text,
# a quantifier if quantified, non-empty options, and expressions for
# non-terminal objects.
def pretty_print_instance_variables
  names = []
  names << :@text unless text.to_s.empty?
  names << :@quantifier if quantified?
  names << :@options unless options.empty?
  names << :@expressions unless terminal?
  names
end
25 | end
26 | end
27 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/methods/referenced_expressions.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | module ReferencedExpressions
3 | attr_accessor :referenced_expressions
4 |
5 | def referenced_expression
6 | referenced_expressions && referenced_expressions.first
7 | end
8 |
9 | def initialize_copy(orig)
10 | exp_id = [self.class, self.starts_at]
11 |
12 | # prevent infinite recursion for recursive subexp calls
13 | copied = self.class.instance_eval { @copied_ref_exps ||= {} }
14 | self.referenced_expressions =
15 | if copied[exp_id]
16 | orig.referenced_expressions
17 | else
18 | copied[exp_id] = true
19 | orig.referenced_expressions && orig.referenced_expressions.map(&:dup)
20 | end
21 | copied.clear
22 |
23 | super
24 | end
25 | end
26 |
27 | Base.include ReferencedExpressions
28 | end
29 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/methods/strfregexp.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | class Base
3 |
4 | # %l Level (depth) of the expression. Returns 'root' for the root
5 | # expression, returns zero or higher for all others.
6 | #
7 | # %> Indentation at expression's level.
8 | #
9 | # %x Index of the expression at its depth. Available when using
10 | # the sprintf_tree method only.
11 | #
12 | # %s Start offset within the whole expression.
13 | # %e End offset within the whole expression.
14 | # %S Length of expression.
15 | #
16 | # %o Coded offset and length, same as '@%s+%S'
17 | #
18 | # %y Type of expression.
19 | # %k Token of expression.
20 | # %i ID, same as '%y:%k'
21 | # %c Class name
22 | #
23 | # %q Quantifier info, as {m[,M]}
24 | # %Q Quantifier text
25 | #
26 | # %z Quantifier min
27 | # %Z Quantifier max
28 | #
29 | # %t Base text of the expression (excludes quantifier, if any)
30 | # %~t Full text if the expression is terminal, otherwise %i
31 | # %T Full text of the expression (includes quantifier, if any)
32 | #
33 | # %b Basic info, same as '%o %i'
34 | # %m Most info, same as '%b %q'
35 | # %a All info, same as '%m %t'
36 | #
# Renders a description of this expression according to +format+.
#
# format        - String with %-fields to substitute (default '%a').
# indent_offset - extra indentation levels added for the '%>' field.
# index         - value for the '%x' field; '%x' renders as an empty
#                 string when no index is given.
#
# Returns a new String; +format+ itself is not modified.
def strfregexp(format = '%a', indent_offset = 0, index = nil)
  have_index = index ? true : false

  part = {}

  print_level = nesting_level > 0 ? nesting_level - 1 : nil

  # Order is important! Fields that use other fields in their
  # definition must appear before the fields they use.
  part_keys = %w[a m b o i l x s e S y k c q Q z Z t ~t T >]

  part['>'] = print_level ? (' ' * (print_level + indent_offset)) : ''

  part['l'] = print_level ? "#{'%d' % print_level}" : 'root'
  part['x'] = "#{'%d' % index}" if have_index

  part['s'] = starts_at
  part['S'] = full_length
  part['e'] = starts_at + full_length
  part['o'] = coded_offset

  part['k'] = token
  part['y'] = type
  part['i'] = '%y:%k'
  part['c'] = self.class.name

  if quantified?
    if quantifier.max == -1
      part['q'] = "{#{quantifier.min}, or-more}"
    else
      part['q'] = "{#{quantifier.min}, #{quantifier.max}}"
    end

    part['Q'] = quantifier.text
    part['z'] = quantifier.min
    part['Z'] = quantifier.max
  else
    part['q'] = '{1}'
    part['Q'] = ''
    part['z'] = '1'
    part['Z'] = '1'
  end

  part['t'] = to_s(:base)
  part['~t'] = terminal? ? to_s : "#{type}:#{token}"
  part['T'] = to_s(:full)

  part['b'] = '%o %i'
  part['m'] = '%b %q'
  part['a'] = '%m %t'

  out = format.dup

  part_keys.each do |k|
    # Block form inserts the replacement verbatim. A String replacement
    # argument would interpret backslash sequences, corrupting expression
    # text such as '\1' or '\\'.
    out.gsub!("%#{k}") { part[k].to_s }
  end

  out
end
97 |
98 | alias :strfre :strfregexp
99 | end
100 |
101 | class Subexpression < Regexp::Expression::Base
# Renders this expression (optionally) and all nested expressions with
# #strfregexp, joined by +separator+. Nested expressions are indented one
# extra level when +include_self+ is true.
def strfregexp_tree(format = '%a', include_self = true, separator = "\n")
  extra_indent = include_self ? 1 : 0
  own_lines = include_self ? [strfregexp(format)] : []

  nested_lines = flat_map do |exp, index|
    exp.strfregexp(format, extra_indent, index)
  end

  (own_lines + nested_lines).join(separator)
end
111 |
112 | alias :strfre_tree :strfregexp_tree
113 | end
114 | end
115 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/methods/traverse.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | class Subexpression < Regexp::Expression::Base
3 |
4 | # Traverses the expression, passing each recursive child to the
5 | # given block.
6 | # If the block takes two arguments, the indices of the children within
7 | # their parents are also passed to it.
def each_expression(include_self = false, &block)
  return enum_for(__method__, include_self) unless block

  # only a block with exactly one parameter is called without indices
  wants_index = block.arity != 1

  if include_self
    wants_index ? block.call(self, 0) : block.call(self)
  end

  if wants_index
    each_expression_with_index(&block)
  else
    each_expression_without_index(&block)
  end
end
19 |
20 | # Traverses the subexpression (depth-first, pre-order) and calls the given
21 | # block for each expression with three arguments; the traversal event,
22 | # the expression, and the index of the expression within its parent.
23 | #
24 | # The event argument is passed as follows:
25 | #
26 | # - For subexpressions, :enter upon entering the subexpression, and
27 | # :exit upon exiting it.
28 | #
29 | # - For terminal expressions, :visit is called once.
30 | #
31 | # Returns self.
def traverse(include_self = false, &block)
  return enum_for(__method__, include_self) unless block_given?

  block.call(:enter, self, 0) if include_self

  each_with_index do |child, position|
    if child.terminal?
      block.call(:visit, child, position)
      next
    end

    # non-terminal: bracket the recursive walk with :enter / :exit
    block.call(:enter, child, position)
    child.traverse(&block)
    block.call(:exit, child, position)
  end

  block.call(:exit, self, 0) if include_self

  self
end
51 | alias :walk :traverse
52 |
53 | # Returns a new array with the results of calling the given block once
54 | # for every expression. If a block is not given, returns an array with
55 | # each expression and its level index as an array.
def flat_map(include_self = false, &block)
  all_expressions = each_expression(include_self)

  return all_expressions.to_a unless block
  # a two-parameter block receives the index as well
  return all_expressions.map(&block) if block.arity == 2

  all_expressions.map { |exp| block.call(exp) }
end
63 |
64 | protected
65 |
# Calls +block+ with every nested expression and its index within its
# parent, descending into non-terminal expressions.
def each_expression_with_index(&block)
  each_with_index do |child, position|
    block.call(child, position)
    next if child.terminal?

    child.each_expression_with_index(&block)
  end
end
72 |
# Calls +block+ with every nested expression, descending into
# non-terminal expressions.
def each_expression_without_index(&block)
  each do |child|
    block.call(child)
    next if child.terminal?

    child.each_expression_without_index(&block)
  end
end
79 | end
80 | end
81 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/quantifier.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | # TODO: in v3.0.0, maybe put Shared back into Base, and inherit from Base and
3 | # call super in #initialize, but raise in #quantifier= and #quantify,
4 | # or introduce an Expression::Quantifiable intermediate class.
5 | # Or actually allow chaining as a more concise but tricky solution than PR#69.
6 | class Quantifier
7 | include Regexp::Expression::Shared
8 |
9 | MODES = %i[greedy possessive reluctant]
10 |
11 | def initialize(*args)
12 | deprecated_old_init(*args) and return if args.count == 4 || args.count == 5
13 |
14 | init_from_token_and_options(*args)
15 | # TODO: remove in v3.0.0, stop removing parts of #token (?)
16 | self.token = token.to_s.sub(/_(greedy|possessive|reluctant)/, '').to_sym
17 | end
18 |
19 | def to_h
20 | {
21 | token: token,
22 | text: text,
23 | mode: mode,
24 | min: min,
25 | max: max,
26 | }
27 | end
28 |
29 | MODES.each do |mode|
30 | class_eval <<-RUBY, __FILE__, __LINE__ + 1
31 | def #{mode}?
32 | mode.equal?(:#{mode})
33 | end
34 | RUBY
35 | end
36 | alias :lazy? :reluctant?
37 |
38 | def min
39 | derived_data[:min]
40 | end
41 |
42 | def max
43 | derived_data[:max]
44 | end
45 |
46 | def mode
47 | derived_data[:mode]
48 | end
49 |
50 | private
51 |
52 | def deprecated_old_init(token, text, _min, _max, _mode = :greedy)
53 | warn "Calling `Expression::Base#quantify` or `#{self.class}.new` with 4+ arguments "\
54 | "is deprecated.\nIt will no longer be supported in regexp_parser v3.0.0.\n"\
55 | "Please pass a Regexp::Token instead, e.g. replace `token, text, min, max, mode` "\
56 | "with `::Regexp::Token.new(:quantifier, token, text)`. min, max, and mode "\
57 | "will be derived automatically.\n"\
58 | "Or do `exp.quantifier = #{self.class}.construct(token: token, text: str)`.\n"\
59 | "This is consistent with how Expression::Base instances are created. "
60 | @token = token
61 | @text = text
62 | end
63 |
# Parses #text once and memoizes { min:, max:, mode: }.
# A max of -1 stands for "no upper bound".
def derived_data
  @derived_data ||= begin
    bounds =
      case text[0]
      when '?' then [0, 1]
      when '*' then [0, -1]
      when '+' then [1, -1]
      else
        # interval syntax, e.g. {2}, {2,}, {,5} or {2,5}
        lower = text[/\{(\d*)/, 1]
        upper = text[/,?(\d*)\}/, 1]
        [lower.to_i, (upper.empty? ? -1 : upper.to_i)]
      end

    # a '?' or '+' after the first character selects the mode
    suffix = text[/.([?+])/, 1]
    chosen_mode =
      case suffix
      when '?' then :reluctant
      when '+' then :possessive
      else :greedy
      end

    { min: bounds[0], max: bounds[1], mode: chosen_mode }
  end
end
83 | end
84 | end
85 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/sequence.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | # A sequence of expressions. Differs from a Subexpression in how it handles
3 | # quantifiers, as it applies them to its last element instead of itself as
4 | # a whole subexpression.
5 | #
6 | # Used as the base class for the Alternation alternatives, Conditional
7 | # branches, and CharacterSet::Intersection intersected sequences.
8 | class Sequence < Regexp::Expression::Subexpression
9 | class << self
10 | def add_to(exp, params = {}, active_opts = {})
11 | sequence = construct(
12 | level: exp.level,
13 | set_level: exp.set_level,
14 | conditional_level: params[:conditional_level] || exp.conditional_level,
15 | ts: params[:ts],
16 | )
17 | sequence.options = active_opts
18 | exp.expressions << sequence
19 | sequence
20 | end
21 | end
22 |
23 | def ts
24 | (head = expressions.first) ? head.ts : @ts
25 | end
26 |
27 | def quantify(token, *args)
28 | extract_quantifier_target(token.text).quantify(token, *args)
29 | end
30 | end
31 | end
32 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/sequence_operation.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | # abstract class
3 | class SequenceOperation < Regexp::Expression::Subexpression
4 | alias :sequences :expressions
5 | alias :operands :expressions
6 | alias :operator :text
7 |
8 | def ts
9 | (head = expressions.first) ? head.ts : @ts
10 | end
11 |
12 | def <<(exp)
13 | expressions.last << exp
14 | end
15 |
16 | def add_sequence(active_opts = {}, params = { ts: 0 })
17 | self.class::OPERAND.add_to(self, params, active_opts)
18 | end
19 | end
20 | end
21 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/shared.rb:
--------------------------------------------------------------------------------
1 | module Regexp::Expression
2 | module Shared
3 | module ClassMethods; end # filled in ./methods/*.rb
4 |
5 | def self.included(mod)
6 | mod.class_eval do
7 | extend Shared::ClassMethods
8 |
9 | attr_accessor :type, :token, :text, :ts, :te,
10 | :level, :set_level, :conditional_level,
11 | :options, :parent,
12 | :custom_to_s_handling, :pre_quantifier_decorations
13 |
14 | attr_reader :nesting_level, :quantifier
15 | end
16 | end
17 |
18 | def init_from_token_and_options(token, options = {})
19 | self.type = token.type
20 | self.token = token.token
21 | self.text = token.text
22 | self.ts = token.ts
23 | self.te = token.te
24 | self.level = token.level
25 | self.set_level = token.set_level
26 | self.conditional_level = token.conditional_level
27 | self.nesting_level = 0
28 | self.options = options || {}
29 | end
30 | private :init_from_token_and_options
31 |
32 | def initialize_copy(orig)
33 | self.text = orig.text.dup if orig.text
34 | self.options = orig.options.dup if orig.options
35 | self.quantifier = orig.quantifier.clone if orig.quantifier
36 | self.parent = nil # updated by Subexpression#initialize_copy
37 | if orig.pre_quantifier_decorations
38 | self.pre_quantifier_decorations = orig.pre_quantifier_decorations.map(&:dup)
39 | end
40 | super
41 | end
42 |
43 | def starts_at
44 | ts
45 | end
46 |
# Offset at which the expression ends: #ts plus #full_length, or plus
# #base_length when +include_quantifier+ is false.
def ends_at(include_quantifier = true)
  length = include_quantifier ? full_length : base_length
  ts + length
end
50 |
51 | def base_length
52 | to_s(:base).length
53 | end
54 |
55 | def full_length
56 | to_s(:original).length
57 | end
58 |
59 | # #to_s reproduces the original source, as an unparser would.
60 | #
61 | # It takes an optional format argument.
62 | #
63 | # Example:
64 | #
65 | # lit = Regexp::Parser.parse(/a +/x)[0]
66 | #
67 | # lit.to_s # => 'a+' # default; with quantifier
68 | # lit.to_s(:full) # => 'a+' # default; with quantifier
69 | # lit.to_s(:base) # => 'a' # without quantifier
70 | # lit.to_s(:original) # => 'a +' # with quantifier AND intermittent decorations
71 | #
def to_s(format = :full)
  body = ''.dup
  parts.each do |part|
    if part.instance_of?(String)
      body << part
    elsif !part.custom_to_s_handling
      # nested expressions are always rendered in their :original form
      body << part.to_s(:original)
    end
  end

  "#{body}#{pre_quantifier_decoration(format)}#{quantifier_affix(format)}"
end
82 | alias :to_str :to_s
83 |
# Joined #pre_quantifier_decorations for the :original format, nil for any
# other format.
def pre_quantifier_decoration(expression_format = :original)
  return unless expression_format == :original

  pre_quantifier_decorations.to_a.join
end
87 |
# The quantifier as a String, or nil when the expression is not quantified
# or the :base format is requested.
def quantifier_affix(expression_format = :full)
  return unless quantified?
  return if expression_format == :base

  quantifier.to_s
end
91 |
92 | def offset
93 | [starts_at, full_length]
94 | end
95 |
# Formats #offset as '@<start>+<length>'.
def coded_offset
  format('@%d+%d', *offset)
end
99 |
# Stores the nesting level and propagates it: the quantifier (if any)
# receives the same level, and each child of a non-terminal expression
# receives the level plus one.
def nesting_level=(lvl)
  @nesting_level = lvl
  quantifier.nesting_level = lvl if quantifier
  terminal? || each { |child| child.nesting_level = lvl + 1 }
end
105 |
# Assigns the quantifier and resets the memoized @repetitions value.
def quantifier=(qtf)
  @quantifier = qtf
  # clear memoized value
  @repetitions = nil
end
110 | end
111 | end
112 |
--------------------------------------------------------------------------------
/lib/regexp_parser/expression/subexpression.rb:
--------------------------------------------------------------------------------
module Regexp::Expression
  # An expression that holds an ordered list of child expressions.
  class Subexpression < Regexp::Expression::Base
    include Enumerable

    attr_accessor :expressions

    def initialize(token, options = {})
      self.expressions = []
      super
    end

    # Override base method to clone the expressions as well.
    # Each cloned child gets this copy as its parent.
    def initialize_copy(orig)
      self.expressions = orig.expressions.map do |exp|
        child = exp.clone
        child.parent = self
        child
      end
      super
    end

    # Appends +exp+ as the last child and makes this node its parent.
    def <<(exp)
      exp.parent = self
      expressions << exp
    end

    # Forward common Array methods to the list of child expressions.
    %w[[] at each empty? fetch index join last length values_at].each do |method|
      class_eval <<-RUBY, __FILE__, __LINE__ + 1
        def #{method}(*args, &block)
          expressions.#{method}(*args, &block)
        end
      RUBY
    end

    # Follows +indices+ down the tree, one level per index. Yields nil as
    # soon as the path hits nil or a terminal expression.
    def dig(*indices)
      indices.reduce(self) do |exp, idx|
        exp.nil? || exp.terminal? ? nil : exp[idx]
      end
    end

    # End index without quantifier: +ts+ plus the base length.
    def te
      ts + base_length
    end

    # Hash form: own attributes plus base text and the children's hashes.
    def to_h
      own = {
        text: to_s(:base),
        expressions: expressions.map { |exp| exp.to_h }
      }
      attributes.merge(own)
    end

    # Finds the last non-decorative child, i.e. the expression a trailing
    # quantifier applies to. Decorative children passed on the way are
    # flagged for custom #to_s handling and their text is stored on the
    # target as its pre-quantifier decorations.
    #
    # Raises Regexp::Parser::ParserError if there is no such child.
    def extract_quantifier_target(quantifier_description)
      decorations = []
      target = expressions.reverse_each.find do |exp|
        next true unless exp.decorative?

        exp.custom_to_s_handling = true
        decorations << exp.text
        false
      end

      unless target
        raise Regexp::Parser::ParserError,
              "No valid target found for '#{quantifier_description}' quantifier"
      end

      target.pre_quantifier_decorations = decorations
      target
    end
  end
end
68 |
--------------------------------------------------------------------------------
/lib/regexp_parser/scanner/char_type.rl:
--------------------------------------------------------------------------------
%%{
  machine re_char_type;

  single_codepoint_char_type = [dDhHsSwW];
  multi_codepoint_char_type  = [RX];

  char_type_char = single_codepoint_char_type | multi_codepoint_char_type;

  # Char types scanner
  # --------------------------------------------------------------------------
  # The copied text starts one position before the match, so it includes
  # the character preceding the type letter. Each recognized sequence is
  # emitted as a :type token, then control returns to the calling machine.
  char_type := |*
    char_type_char {
      text = copy(data, ts-1, te)
      case text
      when '\d' then emit(:type, :digit,     text)
      when '\D' then emit(:type, :nondigit,  text)
      when '\h' then emit(:type, :hex,       text)
      when '\H' then emit(:type, :nonhex,    text)
      when '\s' then emit(:type, :space,     text)
      when '\S' then emit(:type, :nonspace,  text)
      when '\w' then emit(:type, :word,      text)
      when '\W' then emit(:type, :nonword,   text)
      when '\R' then emit(:type, :linebreak, text)
      when '\X' then emit(:type, :xgrapheme, text)
      end
      fret;
    };
  *|;
}%%
29 |
--------------------------------------------------------------------------------
/lib/regexp_parser/scanner/errors/premature_end_error.rb:
--------------------------------------------------------------------------------
class Regexp::Scanner
  # Unexpected end of pattern
  class PrematureEndError < ScannerError
    # +where+ names the construct that was cut short; it is appended to
    # the error message.
    def initialize(where = '')
      super("Premature end of pattern at #{where}")
    end
  end
end
9 |
--------------------------------------------------------------------------------
/lib/regexp_parser/scanner/errors/scanner_error.rb:
--------------------------------------------------------------------------------
1 | require_relative '../../../regexp_parser/error'
2 |
class Regexp::Scanner
  # General scanner error (catch all)
  class ScannerError < Regexp::Parser::Error
  end
end
7 |
--------------------------------------------------------------------------------
/lib/regexp_parser/scanner/errors/validation_error.rb:
--------------------------------------------------------------------------------
class Regexp::Scanner
  # Base for all scanner validation errors
  class ValidationError < ScannerError
    class << self
      # Centralizes and unifies the handling of validation related errors:
      # looks up the error class registered for +type+ and instantiates it
      # with +problem+ and +reason+. An unregistered +type+ raises KeyError.
      def for(type, problem, reason = nil)
        error_class = types.fetch(type)
        error_class.new(problem, reason)
      end

      # Memoized map of validation type => error class.
      def types
        @types ||= {
          backref:      InvalidBackrefError,
          group:        InvalidGroupError,
          group_option: InvalidGroupOption,
          posix_class:  UnknownPosixClassError,
          property:     UnknownUnicodePropertyError,
          sequence:     InvalidSequenceError,
        }
      end
    end
  end

  # Invalid sequence format. Used for escape sequences, mainly.
  class InvalidSequenceError < ValidationError
    def initialize(what = 'sequence', where = '')
      super("Invalid #{what} at #{where}")
    end
  end

  # Invalid group. Used for named groups.
  class InvalidGroupError < ValidationError
    def initialize(what, reason)
      super("Invalid #{what}, #{reason}.")
    end
  end

  # Invalid groupOption. Used for inline options.
  # TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
  class InvalidGroupOption < ValidationError
    def initialize(option, text)
      super("Invalid group option #{option} in #{text}")
    end
  end

  # Invalid back reference. Used for name and number refs/calls.
  class InvalidBackrefError < ValidationError
    def initialize(what, reason)
      super("Invalid back reference #{what}, #{reason}")
    end
  end

  # The property name was not recognized by the scanner.
  class UnknownUnicodePropertyError < ValidationError
    def initialize(name, _)
      super("Unknown unicode character property name #{name}")
    end
  end

  # The POSIX class name was not recognized by the scanner.
  class UnknownPosixClassError < ValidationError
    def initialize(text, _)
      super("Unknown POSIX class #{text}")
    end
  end
end
64 |
--------------------------------------------------------------------------------
/lib/regexp_parser/scanner/property.rl:
--------------------------------------------------------------------------------
%%{
  machine re_property;

  property_char = [pP];

  property_sequence = property_char . '{' . '^'? (alnum|space|[_\-\.=])+ '}';

  action premature_property_end {
    raise PrematureEndError.new('unicode property')
  }

  # Unicode properties scanner
  # --------------------------------------------------------------------------
  # A sequence is negated either by an uppercase P or by a leading caret
  # inside the braces; both together cancel out. The name is normalized
  # (carets, whitespace, underscores and dashes removed, downcased) before
  # it is looked up in the short and long property maps.
  unicode_property := |*

    property_sequence < eof(premature_property_end) {
      text = copy(data, ts-1, te)

      type =
        if (text[1] == 'P') ^ (text[3] == '^')
          :nonproperty
        else
          :property
        end

      name = text[3..-2].gsub(/[\^\s_\-]/, '').downcase

      token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
      unless token
        raise ValidationError.for(:property, name)
      end

      emit(type, token.to_sym, text)

      fret;
    };
  *|;
}%%
31 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax.rb:
--------------------------------------------------------------------------------
1 | require_relative 'error'
2 |
module Regexp::Syntax
  # Common parent of the errors raised in the Regexp::Syntax namespace.
  class SyntaxError < Regexp::Parser::Error
  end
end
6 |
7 | require_relative 'syntax/token'
8 | require_relative 'syntax/base'
9 | require_relative 'syntax/any'
10 | require_relative 'syntax/version_lookup'
11 | require_relative 'syntax/versions'
12 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/any.rb:
--------------------------------------------------------------------------------
module Regexp::Syntax
  # A syntax that always returns true, passing all tokens as implemented. This
  # is useful during development, testing, and should be useful for some types
  # of transformations as well.
  class Any < Base
    implements :*, [:*]

    class << self
      # Every type/token combination counts as implemented.
      def implements?(_type, _token)
        true
      end
    end
  end
end
11 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/base.rb:
--------------------------------------------------------------------------------
module Regexp::Syntax
  # Raised by Base.implements! when a syntax does not support a
  # type/token combination.
  class NotImplementedError < Regexp::Syntax::SyntaxError
    def initialize(syntax, type, token)
      super "#{syntax} does not implement: [#{type}:#{token}]"
    end
  end

  # A lookup map of supported types and tokens in a given syntax
  class Base
    include Regexp::Syntax::Token

    class << self
      # Hash of type => Array of supported tokens. It is nil on a class
      # that neither inherited features nor declared any.
      attr_accessor :features

      # automatically inherit features through the syntax class hierarchy
      # (token lists are duplicated so subclasses can change them freely)
      def inherited(subclass)
        super
        subclass.features = features.to_h.map { |k, v| [k, v.dup] }.to_h
      end

      # Declares +tokens+ as supported for +type+ and records them as added
      # by this class.
      def implements(type, tokens)
        # features is still nil when called on a class that did not go
        # through .inherited (i.e. Base itself)
        self.features ||= {}
        (features[type] ||= []).concat(tokens)
        added_features[type] = tokens
      end

      # Removes +tokens+ from the supported tokens of +type+ and records
      # them as removed by this class. Excluding from a type that has no
      # tokens is a no-op apart from that record.
      def excludes(type, tokens)
        supported = implementations(type)
        tokens.each { |tok| supported.delete(tok) }
        removed_features[type] = tokens
      end

      # True if +token+ is supported for +type+.
      def implements?(type, token)
        implementations(type).include?(token)
      end
      alias :check? :implements?

      # Supported tokens for +type+; an empty Array if there are none.
      def implementations(type)
        # nil.to_h is {}, Hash#to_h returns the hash itself
        features.to_h[type] || []
      end

      # Raises NotImplementedError unless +token+ is supported for +type+.
      def implements!(type, token)
        raise NotImplementedError.new(self, type, token) unless
          implements?(type, token)
      end
      alias :check! :implements!

      # Tokens declared via .implements on this class, keyed by type.
      def added_features
        @added_features ||= {}
      end

      # Tokens declared via .excludes on this class, keyed by type.
      def removed_features
        @removed_features ||= {}
      end

      # Maps delimiter-specific group and backref tokens (suffix _ab / _sq)
      # to their common [type, token] pair. Other pairs pass through.
      def normalize(type, token)
        case type
        when :group
          normalize_group(type, token)
        when :backref
          normalize_backref(type, token)
        else
          [type, token]
        end
      end

      def normalize_group(type, token)
        case token
        when :named_ab, :named_sq
          %i[group named]
        else
          [type, token]
        end
      end

      def normalize_backref(type, token)
        case token
        when :name_ref_ab, :name_ref_sq
          %i[backref name_ref]
        when :name_call_ab, :name_call_sq
          %i[backref name_call]
        when :name_recursion_ref_ab, :name_recursion_ref_sq
          %i[backref name_recursion_ref]
        when :number_ref_ab, :number_ref_sq
          %i[backref number_ref]
        when :number_call_ab, :number_call_sq
          %i[backref number_call]
        when :number_rel_ref_ab, :number_rel_ref_sq
          %i[backref number_rel_ref]
        when :number_rel_call_ab, :number_rel_call_sq
          %i[backref number_rel_call]
        when :number_recursion_ref_ab, :number_recursion_ref_sq
          %i[backref number_recursion_ref]
        else
          [type, token]
        end
      end
    end

    # TODO: drop this backwards compatibility code in v3.0.0, do `private :new`
    def initialize
      warn 'Using instances of Regexp::Parser::Syntax is deprecated ' \
           "and will no longer be supported in v3.0.0."
    end

    # Forwards calls made on an instance to the class, with a deprecation
    # warning. Arguments and the block are passed along.
    def method_missing(name, *args, &block)
      if self.class.respond_to?(name)
        warn 'Using instances of Regexp::Parser::Syntax is deprecated ' \
             "and will no longer be supported in v3.0.0. Please call "\
             "methods on the class directly, e.g.: #{self.class}.#{name}"
        self.class.send(name, *args, &block)
      else
        super
      end
    end

    def respond_to_missing?(name, include_private = false)
      self.class.respond_to?(name) || super
    end
    # end of backwards compatibility code
  end
end
121 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/token.rb:
--------------------------------------------------------------------------------
# Define the base module and the simplest of tokens.
module Regexp::Syntax
  module Token
    # Registry of token type => token names. Starts out empty and is
    # extended with one entry per token type.
    Map = {}

    module Literal
      All  = [:literal]
      Type = :literal
    end

    module FreeSpace
      All  = [:comment, :whitespace]
      Type = :free_space
    end

    Map.store(FreeSpace::Type, FreeSpace::All)
    Map.store(Literal::Type, Literal::All)
  end
end
20 |
21 |
22 | # Load all the token files, they will populate the Map constant.
23 | require_relative 'token/anchor'
24 | require_relative 'token/assertion'
25 | require_relative 'token/backreference'
26 | require_relative 'token/posix_class'
27 | require_relative 'token/character_set'
28 | require_relative 'token/character_type'
29 | require_relative 'token/conditional'
30 | require_relative 'token/escape'
31 | require_relative 'token/group'
32 | require_relative 'token/keep'
33 | require_relative 'token/meta'
34 | require_relative 'token/quantifier'
35 | require_relative 'token/unicode_property'
36 |
37 |
# After loading all the tokens the map is full. Collect every distinct token
# name (sorted) into All and every type into Types.
module Regexp::Syntax
  module Token
    All   = Map.values.reduce([], :|).sort.freeze
    Types = Map.keys.freeze
  end
end
46 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/token/anchor.rb:
--------------------------------------------------------------------------------
module Regexp::Syntax
  module Token
    # Token names of type :anchor.
    module Anchor
      Basic      = %i[bol eol]
      Extended   = [*Basic, :word_boundary, :nonword_boundary]
      String     = %i[bos eos eos_ob_eol]
      MatchStart = %i[match_start]

      All  = [*Extended, *String, *MatchStart]
      Type = :anchor
    end

    Map.store(Anchor::Type, Anchor::All)
  end
end
16 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/token/assertion.rb:
--------------------------------------------------------------------------------
module Regexp::Syntax
  module Token
    # Token names of type :assertion.
    module Assertion
      Lookahead  = %i[lookahead nlookahead]
      Lookbehind = %i[lookbehind nlookbehind]

      All  = [*Lookahead, *Lookbehind]
      Type = :assertion
    end

    Map.store(Assertion::Type, Assertion::All)
  end
end
14 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/token/backreference.rb:
--------------------------------------------------------------------------------
module Regexp::Syntax
  module Token
    # Token names of type :backref that reference a group.
    module Backreference
      Plain     = %i[number]
      NumberRef = %i[number_ref number_rel_ref]
      Number    = [*Plain, *NumberRef]
      Name      = %i[name_ref]

      RecursionLevel = %i[name_recursion_ref number_recursion_ref]

      V1_8_6 = Plain

      V1_9_1 = [*Name, *NumberRef, *RecursionLevel]

      All  = [*V1_8_6, *V1_9_1]
      Type = :backref
    end

    # Type is the same as Backreference so keeping it here, for now.
    module SubexpressionCall
      Name   = %i[name_call]
      Number = %i[number_call number_rel_call]

      All = [*Name, *Number]
    end

    Map.store(
      Backreference::Type,
      Backreference::All + SubexpressionCall::All
    )

    # alias for symmetry between token symbol and Expression class name
    Backref = Backreference
  end
end
34 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/token/character_set.rb:
--------------------------------------------------------------------------------
module Regexp::Syntax
  module Token
    # Token names of type :set.
    module CharacterSet
      Basic    = %i[open close negate range]
      Extended = [*Basic, :intersection]

      All  = Extended
      Type = :set
    end

    Map.store(CharacterSet::Type, CharacterSet::All)

    # alias for symmetry between token symbol and Token module name
    Set = CharacterSet
  end
end
17 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/token/character_type.rb:
--------------------------------------------------------------------------------
module Regexp::Syntax
  module Token
    # Token names of type :type.
    module CharacterType
      Basic    = []
      Extended = %i[digit nondigit space nonspace word nonword]
      Hex      = %i[hex nonhex]

      Clustered = %i[linebreak xgrapheme]

      All  = [*Basic, *Extended, *Hex, *Clustered]
      Type = :type
    end

    Map.store(CharacterType::Type, CharacterType::All)
  end
end
17 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/token/conditional.rb:
--------------------------------------------------------------------------------
module Regexp::Syntax
  module Token
    # Token names of type :conditional.
    module Conditional
      Delimiters = %i[open close]

      Condition = %i[condition_open condition condition_close]
      Separator = %i[separator]

      All = Delimiters + Condition + Separator

      Type = :conditional
    end

    Map.store(Conditional::Type, Conditional::All)
  end
end
17 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/token/escape.rb:
--------------------------------------------------------------------------------
module Regexp::Syntax
  module Token
    # Token names of type :escape.
    module Escape
      Basic = %i[backslash literal]

      Control = %i[control meta_sequence]

      ASCII = %i[
        bell backspace escape form_feed newline carriage
        tab vertical_tab
      ]

      Unicode = %i[codepoint codepoint_list]

      Meta = %i[
        dot alternation
        zero_or_one zero_or_more one_or_more
        bol eol
        group_open group_close
        interval_open interval_close
        set_open set_close
      ]

      Hex = %i[hex]

      Octal = %i[octal]

      All  = [Basic, Control, ASCII, Unicode, Meta, Hex, Octal].reduce(:+)
      Type = :escape
    end

    Map.store(Escape::Type, Escape::All)

    # alias for symmetry between Token::* and Expression::*
    EscapeSequence = Escape
  end
end
34 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/token/group.rb:
--------------------------------------------------------------------------------
module Regexp::Syntax
  module Token
    # Token names of type :group.
    module Group
      Basic    = %i[capture close]
      Extended = [*Basic, :options, :options_switch]

      Named   = %i[named]
      Atomic  = %i[atomic]
      Passive = %i[passive]
      Comment = %i[comment]

      V1_8_6 = Extended + Named + Atomic + Passive + Comment

      V2_4_1 = %i[absence]

      All  = [*V1_8_6, *V2_4_1]
      Type = :group
    end

    Map.store(Group::Type, Group::All)
  end
end
24 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/token/keep.rb:
--------------------------------------------------------------------------------
module Regexp::Syntax
  module Token
    # Token names of type :keep.
    module Keep
      Mark = [:mark]

      All  = Mark
      Type = :keep
    end

    Map.store(Keep::Type, Keep::All)
  end
end
13 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/token/meta.rb:
--------------------------------------------------------------------------------
module Regexp::Syntax
  module Token
    # Token names of type :meta.
    module Meta
      Basic       = %i[dot]
      Alternation = %i[alternation]
      Extended    = [*Basic, *Alternation]

      All  = Extended
      Type = :meta
    end

    Map.store(Meta::Type, Meta::All)

    # alias for symmetry between Token::* and Expression::*
    module Alternation
      All  = Meta::Alternation
      Type = Meta::Type
    end
  end
end
21 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/token/posix_class.rb:
--------------------------------------------------------------------------------
module Regexp::Syntax
  module Token
    # Token names shared by the :posixclass and :nonposixclass types.
    module PosixClass
      Standard = %i[
        alnum alpha blank cntrl digit graph
        lower print punct space upper xdigit
      ]

      Extensions = %i[ascii word]

      All     = [*Standard, *Extensions]
      Type    = :posixclass
      NonType = :nonposixclass
    end

    [PosixClass::Type, PosixClass::NonType].each do |type|
      Map.store(type, PosixClass::All)
    end
  end
end
18 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/token/quantifier.rb:
--------------------------------------------------------------------------------
module Regexp::Syntax
  module Token
    # Token names of type :quantifier.
    module Quantifier
      Greedy     = %i[zero_or_one zero_or_more one_or_more]
      Reluctant  = %i[zero_or_one_reluctant zero_or_more_reluctant one_or_more_reluctant]
      Possessive = %i[zero_or_one_possessive zero_or_more_possessive one_or_more_possessive]

      Interval           = %i[interval]
      IntervalReluctant  = %i[interval_reluctant]
      IntervalPossessive = %i[interval_possessive]

      IntervalAll = [*Interval, *IntervalReluctant, *IntervalPossessive]

      V1_8_6 = [*Greedy, *Reluctant, *Interval, *IntervalReluctant]
      All    = [*Greedy, *Reluctant, *Possessive, *IntervalAll]
      Type   = :quantifier
    end

    Map.store(Quantifier::Type, Quantifier::All)
  end
end
36 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/token/virtual.rb:
--------------------------------------------------------------------------------
module Regexp::Syntax
  module Token
    # Token names of type :expression.
    module Virtual
      Root     = [:root]
      Sequence = [:sequence]

      All  = [:root, :sequence]
      Type = :expression
    end
  end
end
12 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/version_lookup.rb:
--------------------------------------------------------------------------------
module Regexp::Syntax
  VERSION_FORMAT = '\Aruby/\d+\.\d+(\.\d+)?\z'
  VERSION_REGEXP = /#{VERSION_FORMAT}/
  VERSION_CONST_REGEXP = /\AV\d+_\d+(?:_\d+)?\z/

  # Raised when a version name does not match VERSION_FORMAT.
  class InvalidVersionNameError < Regexp::Syntax::SyntaxError
    def initialize(name)
      super "Invalid version name '#{name}'. Expected format is '#{VERSION_FORMAT}'"
    end
  end

  # Raised when no syntax class can be found for a well-formed version name.
  class UnknownSyntaxNameError < Regexp::Syntax::SyntaxError
    def initialize(name)
      super "Unknown syntax name '#{name}'."
    end
  end

  module_function

  # Returns the syntax specification class for the given syntax
  # version name. The special names 'any' and '*' return Syntax::Any.
  # Results are memoized per name.
  def for(name)
    (@alias_map ||= {})[name] ||= version_class(name)
  end

  # Deprecated alias of .for; prints a warning.
  def new(name)
    warn 'Regexp::Syntax.new is deprecated in favor of Regexp::Syntax.for. '\
         'It does not return distinct instances and will be removed in v3.0.0.'
    self.for(name)
  end

  # Truthy if +name+ is a well-formed version name that is not older than
  # 1.8.6.
  def supported?(name)
    # to_s: do not rely on the argument implementing #=~ itself
    name.to_s =~ VERSION_REGEXP && comparable(name) >= comparable('1.8.6')
  end

  # Resolves a version name to its syntax class.
  #
  # Raises InvalidVersionNameError for malformed names and
  # UnknownSyntaxNameError if no matching or earlier version is specified.
  def version_class(version)
    name = version.to_s
    return Regexp::Syntax::Any if ['*', 'any'].include?(name)

    # Match against the String form so that arguments without their own
    # #=~ are reported as invalid names instead of raising NoMethodError.
    name =~ VERSION_REGEXP || raise(InvalidVersionNameError, version)
    version_const_name = "V#{name.scan(/\d+/).join('_')}"
    # An unspecified version constant is resolved through const_missing.
    const_get(version_const_name) || raise(UnknownSyntaxNameError, version)
  end

  # Version-like constants that are not defined fall back to the closest
  # earlier specified version.
  def const_missing(const_name)
    if const_name =~ VERSION_CONST_REGEXP
      return fallback_version_class(const_name)
    end
    super
  end

  # Returns the class of the closest specified version below +version+, or
  # false if +version+ sorts before all specified versions.
  def fallback_version_class(version)
    sorted = (specified_versions + [version]).sort_by { |ver| comparable(ver) }
    index = sorted.index(version)
    index > 0 && const_get(sorted[index - 1])
  end

  # Names of all defined version constants (e.g. :V2_0_0).
  def specified_versions
    constants.select { |const_name| const_name =~ VERSION_CONST_REGEXP }
  end

  # Builds a Gem::Version from the digits in +name+ for comparisons.
  def comparable(name)
    # add .99 to treat versions without a patch value as latest patch version
    Gem::Version.new((name.to_s.scan(/\d+/) << 99).join('.'))
  end
end
66 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/versions.rb:
--------------------------------------------------------------------------------
# Ruby 1.x is no longer a supported runtime,
# but its regex features are still recognized.
#
# Aliases for the latest patch version are provided as 'ruby/n.n',
# e.g. 'ruby/1.9' refers to Ruby v1.9.3.
version_files = Dir[File.expand_path('../versions/*.rb', __FILE__)].sort
version_files.each { |path| require_relative path }

# Syntax class matching the running Ruby version.
Regexp::Syntax::CURRENT = Regexp::Syntax.for("ruby/#{RUBY_VERSION}")
9 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/versions/1.8.6.rb:
--------------------------------------------------------------------------------
# Features declared for Ruby 1.8.6, the root of the version hierarchy.
class Regexp::Syntax::V1_8_6 < Regexp::Syntax::Base
  {
    anchor:     Anchor::All,
    assertion:  Assertion::Lookahead,
    backref:    Backreference::V1_8_6,
    escape:     Escape::Basic + Escape::ASCII + Escape::Meta + Escape::Control,
    free_space: FreeSpace::All,
    group:      Group::V1_8_6,
    literal:    Literal::All,
    meta:       Meta::Extended,
    posixclass: PosixClass::Standard,
    quantifier: Quantifier::V1_8_6,
    set:        CharacterSet::All,
    type:       CharacterType::Extended,
  }.each { |type, tokens| implements type, tokens }
end
15 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/versions/1.9.1.rb:
--------------------------------------------------------------------------------
# Features added on top of V1_8_6 for Ruby 1.9.1.
class Regexp::Syntax::V1_9_1 < Regexp::Syntax::V1_8_6
  {
    assertion:     Assertion::Lookbehind,
    backref:       Backreference::V1_9_1 + SubexpressionCall::All,
    escape:        Escape::Unicode + Escape::Hex + Escape::Octal,
    posixclass:    PosixClass::Extensions,
    nonposixclass: PosixClass::All,
    property:      UnicodeProperty::V1_9_0,
    nonproperty:   UnicodeProperty::V1_9_0,
    quantifier:    Quantifier::Possessive + Quantifier::IntervalPossessive,
    type:          CharacterType::Hex,
  }.each { |type, tokens| implements type, tokens }
end
12 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/versions/1.9.3.rb:
--------------------------------------------------------------------------------
# Adds the V1_9_3 unicode properties, plain and negated.
class Regexp::Syntax::V1_9_3 < Regexp::Syntax::V1_9_1
  %i[property nonproperty].each do |type|
    implements type, UnicodeProperty::V1_9_3
  end
end
5 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/versions/2.0.0.rb:
--------------------------------------------------------------------------------
# Adds keep marks, conditionals, the V2_0_0 unicode properties and the
# clustered character types; removes the :newline property.
class Regexp::Syntax::V2_0_0 < Regexp::Syntax::V1_9_3
  implements :keep,        Keep::All
  implements :conditional, Conditional::All
  %i[property nonproperty].each do |type|
    implements type, UnicodeProperty::V2_0_0
  end
  implements :type, CharacterType::Clustered

  %i[property nonproperty].each do |type|
    excludes type, %i[newline]
  end
end
11 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/versions/2.2.0.rb:
--------------------------------------------------------------------------------
# Adds the V2_2_0 unicode properties, plain and negated.
class Regexp::Syntax::V2_2_0 < Regexp::Syntax::V2_0_0
  %i[property nonproperty].each do |type|
    implements type, UnicodeProperty::V2_2_0
  end
end
5 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/versions/2.3.0.rb:
--------------------------------------------------------------------------------
# Adds the V2_3_0 unicode properties, plain and negated.
class Regexp::Syntax::V2_3_0 < Regexp::Syntax::V2_2_0
  %i[property nonproperty].each do |type|
    implements type, UnicodeProperty::V2_3_0
  end
end
5 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/versions/2.4.0.rb:
--------------------------------------------------------------------------------
# Adds the V2_4_0 unicode properties, plain and negated.
class Regexp::Syntax::V2_4_0 < Regexp::Syntax::V2_3_0
  %i[property nonproperty].each do |type|
    implements type, UnicodeProperty::V2_4_0
  end
end
5 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/versions/2.4.1.rb:
--------------------------------------------------------------------------------
# Adds the V2_4_1 group tokens.
class Regexp::Syntax::V2_4_1 < Regexp::Syntax::V2_4_0
  implements(:group, Group::V2_4_1)
end
4 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/versions/2.5.0.rb:
--------------------------------------------------------------------------------
# Adds the V2_5_0 unicode properties, plain and negated.
class Regexp::Syntax::V2_5_0 < Regexp::Syntax::V2_4_1
  %i[property nonproperty].each do |type|
    implements type, UnicodeProperty::V2_5_0
  end
end
5 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/versions/2.6.0.rb:
--------------------------------------------------------------------------------
# Adds the V2_6_0 unicode properties, plain and negated.
class Regexp::Syntax::V2_6_0 < Regexp::Syntax::V2_5_0
  %i[property nonproperty].each do |type|
    implements type, UnicodeProperty::V2_6_0
  end
end
5 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/versions/2.6.2.rb:
--------------------------------------------------------------------------------
# Adds the V2_6_2 unicode properties, plain and negated.
class Regexp::Syntax::V2_6_2 < Regexp::Syntax::V2_6_0
  %i[property nonproperty].each do |type|
    implements type, UnicodeProperty::V2_6_2
  end
end
5 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/versions/2.6.3.rb:
--------------------------------------------------------------------------------
# Adds the V2_6_3 unicode properties, plain and negated.
class Regexp::Syntax::V2_6_3 < Regexp::Syntax::V2_6_2
  %i[property nonproperty].each do |type|
    implements type, UnicodeProperty::V2_6_3
  end
end
5 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/versions/3.1.0.rb:
--------------------------------------------------------------------------------
# Adds the V3_1_0 unicode properties, plain and negated.
class Regexp::Syntax::V3_1_0 < Regexp::Syntax::V2_6_3
  %i[property nonproperty].each do |type|
    implements type, UnicodeProperty::V3_1_0
  end
end
5 |
--------------------------------------------------------------------------------
/lib/regexp_parser/syntax/versions/3.2.0.rb:
--------------------------------------------------------------------------------
# Adds the V3_2_0 unicode properties, plain and negated.
class Regexp::Syntax::V3_2_0 < Regexp::Syntax::V3_1_0
  %i[property nonproperty].each do |type|
    implements type, UnicodeProperty::V3_2_0
  end
end
5 |
--------------------------------------------------------------------------------
/lib/regexp_parser/token.rb:
--------------------------------------------------------------------------------
class Regexp
  # Member names of Regexp::Token, in positional order.
  TOKEN_KEYS = [
    :type,
    :token,
    :text,
    :ts,
    :te,
    :level,
    :set_level,
    :conditional_level,
  ].freeze

  # Struct with one member per TOKEN_KEYS entry plus previous/next
  # accessors.
  Token = Struct.new(*TOKEN_KEYS) do
    attr_accessor :previous, :next

    # Start and end index as a two-element array.
    def offset
      [self[:ts], self[:te]]
    end

    # Difference between end and start index. Replaces Struct#length.
    def length
      self[:te] - self[:ts]
    end
  end
end
25 |
--------------------------------------------------------------------------------
/lib/regexp_parser/version.rb:
--------------------------------------------------------------------------------
# Version of the regexp_parser gem.
class Regexp::Parser
  VERSION = '2.10.0'
end
6 |
--------------------------------------------------------------------------------
/regexp_parser.gemspec:
--------------------------------------------------------------------------------
# Make lib/ loadable so the version file can be required below.
$LOAD_PATH.unshift File.join(File.dirname(__FILE__), 'lib')

require 'regexp_parser/version'

Gem::Specification.new do |spec|
  homepage = 'https://github.com/ammar/regexp_parser'

  spec.name    = 'regexp_parser'
  spec.version = ::Regexp::Parser::VERSION

  spec.summary     = "Scanner, lexer, parser for ruby's regular expressions"
  spec.description = 'A library for tokenizing, lexing, and parsing Ruby regular expressions.'
  spec.homepage    = homepage

  spec.metadata.merge!(
    'bug_tracker_uri'       => "#{homepage}/issues",
    'changelog_uri'         => "#{homepage}/blob/master/CHANGELOG.md",
    'homepage_uri'          => homepage,
    'source_code_uri'       => homepage,
    'wiki_uri'              => "#{homepage}/wiki",
    'rubygems_mfa_required' => 'true'
  )

  spec.authors = ['Ammar Ali', 'Janosch Müller']
  spec.email   = ['ammarabuali@gmail.com', 'janosch84@gmail.com']

  spec.license = 'MIT'

  spec.require_paths = ['lib']

  # Library sources (Ruby, Ragel, CSV data) plus the top-level project files.
  library_files = Dir.glob('lib/**/*.{csv,rb,rl}')
  project_files = %w[Gemfile Rakefile LICENSE regexp_parser.gemspec]
  spec.files = library_files + project_files

  spec.platform = Gem::Platform::RUBY

  spec.required_ruby_version = '>= 2.0.0'
end
35 |
--------------------------------------------------------------------------------
/spec/expression/base_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
# Specs for the generic Expression::Base API. The hash keys passed to the
# 'parse' shared example (defined elsewhere) look like index paths into the
# parsed tree, with [] addressing the root -- confirm in the spec helpers.
RSpec.describe(Regexp::Expression::Base) do
  # test #level
  include_examples 'parse', /^a(b(c(d)))e$/,
    [0]          => [to_s: '^',         level: 0],
    [1]          => [to_s: 'a',         level: 0],
    [2]          => [to_s: '(b(c(d)))', level: 0],
    [2, 0]       => [to_s: 'b',         level: 1],
    [2, 1]       => [to_s: '(c(d))',    level: 1],
    [2, 1, 0]    => [to_s: 'c',         level: 2],
    [2, 1, 1]    => [to_s: '(d)',       level: 2],
    [2, 1, 1, 0] => [to_s: 'd',         level: 3],
    [3]          => [to_s: 'e',         level: 0],
    [4]          => [to_s: '$',         level: 0]

  # test #coded_offset
  include_examples 'parse', /^a*(b+(c?))$/,
    []        => [Root,             coded_offset: '@0+12'],
    [0]       => [to_s: '^',        coded_offset: '@0+1'],
    [1]       => [to_s: 'a*',       coded_offset: '@1+2'],
    [2]       => [to_s: '(b+(c?))', coded_offset: '@3+8'],
    [2, 0]    => [to_s: 'b+',       coded_offset: '@4+2'],
    [2, 1]    => [to_s: '(c?)',     coded_offset: '@6+4'],
    [2, 1, 0] => [to_s: 'c?',       coded_offset: '@7+2'],
    [3]       => [to_s: '$',        coded_offset: '@11+1']

  # test #quantity
  include_examples 'parse', /aa/, [0] => [quantity: [nil, nil]]
  include_examples 'parse', /a?/, [0] => [quantity: [0, 1]]
  include_examples 'parse', /a*/, [0] => [quantity: [0, -1]]
  include_examples 'parse', /a+/, [0] => [quantity: [1, -1]]

  # test #repetitions
  include_examples 'parse', /aa/, [0] => [repetitions: 1..1]
  include_examples 'parse', /a?/, [0] => [repetitions: 0..1]
  include_examples 'parse', /a*/, [0] => [repetitions: 0..(Float::INFINITY)]
  include_examples 'parse', /a+/, [0] => [repetitions: 1..(Float::INFINITY)]

  # test #base_length, #full_length, #starts_at, #ends_at
  include_examples 'parse', /(aa)/,
    []     => [Root,           base_length: 4, full_length: 4, starts_at: 0, ends_at: 4],
    [0]    => [Group::Capture, base_length: 4, full_length: 4, starts_at: 0, ends_at: 4],
    [0, 0] => [Literal,        base_length: 2, full_length: 2, starts_at: 1, ends_at: 3]
  include_examples 'parse', /(aa){42}/,
    []     => [Root,           base_length: 8, full_length: 8, starts_at: 0, ends_at: 8],
    [0]    => [Group::Capture, base_length: 4, full_length: 8, starts_at: 0, ends_at: 8],
    [0, 0] => [Literal,        base_length: 2, full_length: 2, starts_at: 1, ends_at: 3]
  include_examples 'parse', /(aa) {42}/x,
    []     => [Root,           base_length: 9, full_length: 9, starts_at: 0, ends_at: 9],
    [0]    => [Group::Capture, base_length: 4, full_length: 9, starts_at: 0, ends_at: 9],
    [0, 0] => [Literal,        base_length: 2, full_length: 2, starts_at: 1, ends_at: 3]

  # test #to_re
  include_examples 'parse', '^a*(b([cde]+))+f?$',
    [] => [Root, to_re: /^a*(b([cde]+))+f?$/]

  specify '#parent' do
    root = Regexp::Parser.parse(/(a(b)){42}/)
    outer_group = root[0]
    inner_group = outer_group[1]

    expect(root.parent).to be_nil
    expect(outer_group.parent).to eq root
    # The quantifier itself has no parent.
    expect(outer_group.quantifier.parent).to be_nil
    expect(outer_group[0].parent).to eq outer_group
    expect(inner_group.parent).to eq outer_group
    expect(inner_group[0].parent).to eq inner_group
  end

  specify '#to_re warns when used on set members' do
    convert_set_member = lambda do
      regexp = Regexp::Parser.parse(/[\b]/)[0][0].to_re
      expect(regexp).to eq(/\b/)
    end

    expect(&convert_set_member).to output(/set member/).to_stderr
  end

  specify 'updating #quantifier updates #repetitions' do
    target = Regexp::Parser.parse(/a{3}/)[0]
    expect(target.repetitions).to eq 3..3

    donor = Regexp::Parser.parse(/b{5}/)[0]
    target.quantifier = donor.quantifier
    expect(target.repetitions).to eq 5..5
  end
end
83 |
--------------------------------------------------------------------------------
/spec/expression/conditional_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
# Specs for conditional expressions of the form (?(cond)yes|no).
#
# The named-group syntax below (`(?<A>...)` and the `(<A>)` condition) had
# been stripped from the patterns, which left invalid conditionals such as
# `(?()T|F)`. It is restored here with a consistent group name; the name
# itself is not significant to what is being tested.
RSpec.describe(Regexp::Expression::Conditional) do
  specify('Conditional#condition, #branches') do
    conditional = RP.parse(/(?<A>a)(?(<A>)T|F)/)[1]

    # The condition is the first child, the branches are the next two.
    expect(conditional.condition).to eq conditional[0]
    expect(conditional.branches).to eq conditional[1..2]
  end

  specify('Condition#referenced_expression') do
    # Condition referring to a group by name
    root = RP.parse(/(?<A>a)(?(<A>)T|F)/)
    condition = root[1].condition
    expect(condition.referenced_expression).to eq root[0]
    expect(condition.referenced_expression.to_s).to eq '(?<A>a)'

    # Condition referring to a group by number
    root = RP.parse(/(a)(?(1)T|F)/)
    condition = root[1].condition
    expect(condition.referenced_expression).to eq root[0]
    expect(condition.referenced_expression.to_s).to eq '(a)'
  end

  specify('parse conditional excessive branches') do
    # Given as a String (not a Regexp literal) so that the three-branch
    # conditional reaches the parser.
    regexp = '(?<A>a)(?(<A>)T|F|X)'

    expect { RP.parse(regexp) }.to raise_error(Conditional::TooManyBranches)
  end
end
28 |
--------------------------------------------------------------------------------
/spec/expression/free_space_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
# Free-space expressions (whitespace and comments in extended mode) must
# refuse to be quantified.
RSpec.describe(Regexp::Expression::FreeSpace) do
  # Parsed tree of a small extended-mode pattern; both examples pick their
  # subject out of it by index.
  let(:root) do
    regexp = /
      a # Comment
    /x

    RP.parse(regexp)
  end

  specify('white space quantify raises error') do
    space = root[0]

    expect(space).to be_instance_of(FreeSpace::WhiteSpace)
    expect { space.quantify(:dummy, '#') }.to raise_error(Regexp::Parser::Error)
  end

  specify('comment quantify raises error') do
    comment = root[3]

    expect(comment).to be_instance_of(FreeSpace::Comment)
    expect { comment.quantify(:dummy, '#') }.to raise_error(Regexp::Parser::Error)
  end
end
28 |
--------------------------------------------------------------------------------
/spec/expression/methods/construct_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
# Specs for the `construct` class method on expression classes.
RSpec.describe(Regexp::Expression::Shared) do
  describe '::construct' do
    # Expression class => the #type expected from a bare `construct` call.
    default_types = {
      Alternation                       => :meta,
      Alternative                       => :expression,
      Anchor::Base                      => :anchor,
      Anchor::EndOfLine                 => :anchor,
      Assertion::Base                   => :assertion,
      Assertion::Lookahead              => :assertion,
      Backreference::Base               => :backref,
      Backreference::Number             => :backref,
      CharacterSet                      => :set,
      CharacterSet::IntersectedSequence => :expression,
      CharacterSet::Intersection        => :set,
      CharacterSet::Range               => :set,
      CharacterType::Any                => :meta,
      CharacterType::Base               => :type,
      CharacterType::Digit              => :type,
      Conditional::Branch               => :expression,
      Conditional::Condition            => :conditional,
      Conditional::Expression           => :conditional,
      EscapeSequence::Base              => :escape,
      EscapeSequence::Literal           => :escape,
      FreeSpace                         => :free_space,
      Group::Base                       => :group,
      Group::Capture                    => :group,
      Keep::Mark                        => :keep,
      Literal                           => :literal,
      PosixClass                        => :posixclass,
      Quantifier                        => :quantifier,
      Root                              => :expression,
      UnicodeProperty::Base             => :property,
      UnicodeProperty::Number::Decimal  => :property,
    }

    default_types.each do |expression_class, type|
      it "works for #{expression_class}" do
        instance = expression_class.construct

        expect(instance).to be_a expression_class
        expect(instance.type).to eq type
      end
    end

    it 'allows overriding defaults' do
      literal = Literal.construct(type: :foo)
      expect(literal.type).to eq :foo
    end

    it 'allows passing options' do
      literal = Literal.construct(options: { i: true })
      expect(literal.options[:i]).to eq true
    end

    it 'raises ArgumentError for unknown parameters' do
      expect { Literal.construct(foo: :foo) }.to raise_error(ArgumentError)
    end
  end
end
57 |
--------------------------------------------------------------------------------
/spec/expression/methods/human_name_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
# Specs for #human_name across expression kinds.
#
# Two patterns had lost their angle-bracketed group names (the named capture
# group and the named backreference). They are restored here; the names
# themselves are arbitrary.
RSpec.describe('Regexp::Expression::Shared#human_name') do
  include_examples 'parse', //,              [] => [human_name: 'root']
  include_examples 'parse', /a/,             [0] => [human_name: 'literal']
  include_examples 'parse', /./,             [0] => [human_name: 'match-all']
  include_examples 'parse', /[abc]/,         [0] => [human_name: 'character set']
  include_examples 'parse', /[a-c]/,         [0, 0] => [human_name: 'character range']
  include_examples 'parse', /\d/,            [0] => [human_name: 'digit type']
  include_examples 'parse', /\n/,            [0] => [human_name: 'newline escape']
  include_examples 'parse', /\u{61 62 63}/,  [0] => [human_name: 'codepoint list escape']
  include_examples 'parse', /\p{ascii}/,     [0] => [human_name: 'ascii property']
  include_examples 'parse', /[[:ascii:]]/,   [0, 0] => [human_name: 'ascii posixclass']
  include_examples 'parse', /a{5}/,          [0, :q] => [human_name: 'interval quantifier']
  include_examples 'parse', /^/,             [0] => [human_name: 'beginning of line']
  include_examples 'parse', /(?=abc)/,       [0] => [human_name: 'lookahead']
  include_examples 'parse', /(a)(b)/,        [0] => [human_name: 'capture group 1']
  include_examples 'parse', /(a)(b)/,        [1] => [human_name: 'capture group 2']
  include_examples 'parse', /(?<name>abc)/,  [0] => [human_name: 'named capture group']
  include_examples 'parse', / /x,            [0] => [human_name: 'free space']
  # The comment needs a real line break inside the literal to end it.
  include_examples 'parse', /#comment
    /x,                                      [0] => [human_name: 'comment']
  include_examples 'parse', /(?#comment)/x,  [0] => [human_name: 'comment group']
  include_examples 'parse', /(abc)\1/,       [1] => [human_name: 'backreference']
  include_examples 'parse', /(?<x>)\k<x>/,   [1] => [human_name: 'backreference by name']
  include_examples 'parse', /(abc)\g<-1>/,   [1] => [human_name: 'relative subexpression call']
  include_examples 'parse', /a|bc/,          [0] => [human_name: 'alternation']
  include_examples 'parse', /a|bc/,          [0, 0] => [human_name: 'alternative']
end
30 |
--------------------------------------------------------------------------------
/spec/expression/methods/match_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
# Expression#match (and its #=~ alias) called on a parsed root.
RSpec.describe('Expression::Base#match') do
  it 'returns the #match result of the respective Regexp' do
    match_data = RP.parse(/a/).match('a')
    expect(match_data[0]).to eq 'a'
  end

  it 'can be given an offset, just like Regexp#match' do
    match_data = RP.parse(/./).match('ab', 1)
    expect(match_data[0]).to eq 'b'
  end

  it 'works with the #=~ alias' do
    outcome = RP.parse(/a/) =~ 'a'
    expect(outcome).to be_a MatchData
  end
end
16 |
# Expression#match? is expected to answer with strict booleans.
RSpec.describe('Expression::Base#match?') do
  it 'returns true if the Respective Regexp matches' do
    root = RP.parse(/a/)
    expect(root.match?('a')).to be true
  end

  it 'returns false if the Respective Regexp does not match' do
    root = RP.parse(/a/)
    expect(root.match?('b')).to be false
  end
end
26 |
--------------------------------------------------------------------------------
/spec/expression/methods/negative_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
# Specs for #negative? (and #negated?, exercised at the end) across
# expression kinds, listed as positive/negative pairs.
#
# The negative-lookbehind example had lost the text between `(?` and its
# expectation array, which broke the file's syntax. It is restored here to
# mirror the positive lookbehind example directly above it.
RSpec.describe('Expression::Base#negative?') do
  include_examples 'parse', //,  [] => [:root, negative?: false]
  include_examples 'parse', /a/, [0] => [:literal, negative?: false]

  # word boundaries
  include_examples 'parse', /\b/, [0] => [:word_boundary, negative?: false]
  include_examples 'parse', /\B/, [0] => [:nonword_boundary, negative?: true]

  # lookaheads
  include_examples 'parse', /(?=)/, [0] => [:lookahead, negative?: false]
  include_examples 'parse', /(?!)/, [0] => [:nlookahead, negative?: true]

  # lookbehinds
  include_examples 'parse', /(?<=)/, [0] => [:lookbehind, negative?: false]
  include_examples 'parse', /(?<!)/, [0] => [:nlookbehind, negative?: true]

  # character sets
  include_examples 'parse', /[a]/,  [0] => [:character, negative?: false]
  include_examples 'parse', /[^a]/, [0] => [:character, negative?: true]

  # character types
  include_examples 'parse', /\d/, [0] => [:digit, negative?: false]
  include_examples 'parse', /\D/, [0] => [:nondigit, negative?: true]

  # posix classes
  include_examples 'parse', /[[:word:]]/,  [0, 0] => [:word, negative?: false]
  include_examples 'parse', /[[:^word:]]/, [0, 0] => [:word, negative?: true]

  # unicode properties
  include_examples 'parse', /\p{word}/,  [0] => [:word, negative?: false]
  include_examples 'parse', /\p{^word}/, [0] => [:word, negative?: true]

  # #negated?
  include_examples 'parse', //,     [] => [:root, negated?: false]
  include_examples 'parse', /[^a]/, [0] => [:character, negated?: true]
end
31 |
--------------------------------------------------------------------------------
/spec/expression/methods/parts_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
# Specs for #parts. `s(...)` is a spec helper defined elsewhere; it appears
# to build a stand-in for an expression of the given class, text and children
# -- confirm in the spec support code.
RSpec.describe('Expression::Base#parts') do
  # Terminal expressions
  include_examples 'parse', //,        [] => [:root, parts: []]
  include_examples 'parse', /a/,       [0] => [:literal, parts: ['a']]
  include_examples 'parse', /\K/,      [0] => [:mark, parts: ['\K']]
  include_examples 'parse', /\p{any}/, [0] => [:any, parts: ['\p{any}']]

  # Sets and groups with a single child
  include_examples 'parse', /[a]/,   [0] => [:character, parts: ['[', s(Literal, 'a'), ']']]
  include_examples 'parse', /[^a]/,  [0] => [:character, parts: ['[^', s(Literal, 'a'), ']']]
  include_examples 'parse', /(a)/,   [0] => [:capture, parts: ['(', s(Literal, 'a'), ')']]
  include_examples 'parse', /(?>a)/, [0] => [:atomic, parts: ['(?>', s(Literal, 'a'), ')']]
  include_examples 'parse', /(?=a)/, [0] => [:lookahead, parts: ['(?=', s(Literal, 'a'), ')']]
  include_examples 'parse', /(?#a)/, [0] => [:comment, parts: ['(?#a)']]

  # Nested capture groups
  include_examples 'parse', /(a(b(c)))/,
    [0] => [:capture, parts: [
      '(',
      s(Literal, 'a'),
      s(Group::Capture, '(',
        s(Literal, 'b'),
        s(Group::Capture, '(',
          s(Literal, 'c'),
        )
      ),
      ')'
    ]]

  # Alternation
  include_examples 'parse', /a|b|c/,
    [] => [:root, parts: [
      s(Alternation, '|',
        s(Alternative, nil, s(Literal, 'a')),
        s(Alternative, nil, s(Literal, 'b')),
        s(Alternative, nil, s(Literal, 'c'))
      )
    ]],
    [0] => [:alternation, parts: [
      s(Alternative, nil, s(Literal, 'a')),
      '|',
      s(Alternative, nil, s(Literal, 'b')),
      '|',
      s(Alternative, nil, s(Literal, 'c'))
    ]]

  # Character range
  include_examples 'parse', /[a-z]/,
    [] => [:root, parts: [
      s(CharacterSet, '[',
        s(CharacterSet::Range, '-', s(Literal, 'a'), s(Literal, 'z')),
      )
    ]],
    [0] => [:character, parts: [
      '[',
      s(CharacterSet::Range, '-', s(Literal, 'a'), s(Literal, 'z')),
      ']'
    ]],
    [0, 0] => [:range, parts: [
      s(Literal, 'a'),
      '-',
      s(Literal, 'z')
    ]]

  # Set intersection
  include_examples 'parse', /[a&&b&&c]/,
    [] => [:root, parts: [
      s(CharacterSet, '[',
        s(CharacterSet::Intersection, '&&',
          s(CharacterSet::IntersectedSequence, nil, s(Literal, 'a')),
          s(CharacterSet::IntersectedSequence, nil, s(Literal, 'b')),
          s(CharacterSet::IntersectedSequence, nil, s(Literal, 'c'))
        )
      )
    ]],
    [0, 0] => [:intersection, parts: [
      s(CharacterSet::IntersectedSequence, nil, s(Literal, 'a')),
      '&&',
      s(CharacterSet::IntersectedSequence, nil, s(Literal, 'b')),
      '&&',
      s(CharacterSet::IntersectedSequence, nil, s(Literal, 'c'))
    ]]

  # Conditional
  include_examples 'parse', /(a)(?(1)T|F)/,
    [1] => [Conditional::Expression, parts: [
      '(?',
      s(Conditional::Condition, '(1)'),
      s(Conditional::Branch, nil, s(Literal, 'T')),
      '|',
      s(Conditional::Branch, nil, s(Literal, 'F')),
      ')'
    ]]
end
89 |
--------------------------------------------------------------------------------
/spec/expression/methods/printing_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe('Expression::Shared#inspect') do
4 | it 'includes only essential information' do
5 | root = Regexp::Parser.parse(//)
6 | expect(root.inspect).to eq '#'
7 |
8 | root = Regexp::Parser.parse(/(a)+/)
9 | expect(root.inspect)
10 | .to match(/#'
15 | expect(root[0][0].inspect)
16 | .to eq '#'
17 | end
18 | end
19 |
20 | RSpec.describe('Expression::Shared#pretty_print') do
21 | it 'works' do
22 | require 'pp'
23 | pp_to_s = ->(arg) { ''.dup.tap { |buffer| PP.new(buffer).pp(arg) } }
24 |
25 | root = Regexp::Parser.parse(/(a)+/)
26 |
27 | expect(pp_to_s.(root)).to start_with '# [Alternation, ts: 0, te: 19],
7 | [0, 0] => [Alternative, ts: 0, te: 4],
8 | [0, 1] => [Alternative, ts: 5, te: 9],
9 | [0, 2] => [Alternative, ts: 10, te: 14],
10 | [0, 3] => [Alternative, ts: 15, te: 19]
11 |
12 | # check #nesting_level
13 | include_examples 'parse', /a(b(\d|[ef-g[h]]))/,
14 | [0] => [Literal, to_s: 'a', nesting_level: 1],
15 | [1, 0] => [Literal, to_s: 'b', nesting_level: 2],
16 | [1, 1, 0] => [Alternation, to_s: '\d|[ef-g[h]]', nesting_level: 3],
17 | [1, 1, 0, 0] => [Alternative, to_s: '\d', nesting_level: 4],
18 | [1, 1, 0, 0, 0] => [CharacterType::Digit, to_s: '\d', nesting_level: 5],
19 | [1, 1, 0, 1] => [Alternative, to_s: '[ef-g[h]]', nesting_level: 4],
20 | [1, 1, 0, 1, 0] => [CharacterSet, to_s: '[ef-g[h]]', nesting_level: 5],
21 | [1, 1, 0, 1, 0, 0] => [Literal, to_s: 'e', nesting_level: 6],
22 | [1, 1, 0, 1, 0, 1] => [CharacterSet::Range, to_s: 'f-g', nesting_level: 6],
23 | [1, 1, 0, 1, 0, 1, 0] => [Literal, to_s: 'f', nesting_level: 7],
24 | [1, 1, 0, 1, 0, 2, 0] => [Literal, to_s: 'h', nesting_level: 7]
25 |
26 | specify('#dig') do
27 | root = RP.parse(/(((a)))/)
28 |
29 | expect(root.dig(0).to_s).to eq '(((a)))'
30 | expect(root.dig(0, 0, 0, 0).to_s).to eq 'a'
31 | expect(root.dig(0, 0, 0, 0, 0)).to be_nil
32 | expect(root.dig(3, 7)).to be_nil
33 | end
34 | end
35 |
--------------------------------------------------------------------------------
/spec/expression/te_ts_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
RSpec.describe('Expression::Shared#te,ts') do
  # Many tokens/expressions have their own tests for #te and #ts.
  # This is an integration-like test to ensure they are correct in conjunction.
  it 'is correct irrespective of nesting or preceding tokens' do
    regexp = regexp_with_all_features
    source = regexp.source
    root = RP.parse(regexp)

    # Collect the expressions to check. Locating an expression via
    # String#index below only works when its text is unique, so any two
    # expressions with the same text abort the example.
    checked_exps = root.each_expression.with_object([]) do |(exp), collected|
      collected.each do |earlier|
        fail "dupe: #{[earlier, exp]}" if earlier.to_s == exp.to_s
      end

      # Sequences and whitespace are excluded from the check.
      next if exp.is_a?(Sequence) || exp.is_a?(WhiteSpace)

      collected << exp
    end
    expect(checked_exps).not_to be_empty

    checked_exps.each do |exp|
      expected_start = source.index(exp.to_s(:original))
      expect(exp.ts).to eq(expected_start),
        "expected #{exp.class} #{exp} to start at #{expected_start}, got #{exp.ts}"

      expected_end = expected_start + exp.base_length
      expect(exp.te).to eq(expected_end),
        "expected #{exp.class} #{exp} to end at #{expected_end}, got #{exp.te}"
    end
  end
end
28 |
--------------------------------------------------------------------------------
/spec/expression/to_h_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
# Specs for #to_h on expressions and quantifiers.
#
# The pattern in the Conditional example had lost its named group and the
# condition referring to it (leaving an invalid `(?()b|c)`). Both are
# restored here with a consistent, arbitrary group name.
RSpec.describe('Expression::Base#to_h') do
  # Root hash, with its child expression nested under :expressions.
  include_examples 'parse', /abc/, [] => [Root, to_h: {
    token:             :root,
    type:              :expression,
    text:              'abc',
    starts_at:         0,
    length:            3,
    quantifier:        nil,
    options:           {},
    level:             0,
    set_level:         0,
    conditional_level: 0,
    expressions:       [
      {
        token:             :literal,
        type:              :literal,
        text:              'abc',
        starts_at:         0,
        length:            3,
        quantifier:        nil,
        options:           {},
        level:             0,
        set_level:         0,
        conditional_level: 0
      }
    ]
  }]

  # Quantifier hash
  include_examples 'parse', /a{2,4}/, [0, :q] => [Quantifier, to_h: {
    max:   4,
    min:   2,
    mode:  :greedy,
    text:  '{2,4}',
    token: :interval,
  }]

  specify('Conditional#to_h') do
    root = RP.parse('(?<A>a)(?(<A>)b|c)')
    expect { root.to_h }.not_to(raise_error)
  end
end
44 |
--------------------------------------------------------------------------------
/spec/expression/to_s_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe('Expression::Base#to_s') do
4 | def parse_frozen(pattern)
5 | Leto.deep_freeze(RP.parse(pattern))
6 | end
7 |
8 | def expect_round_trip(pattern)
9 | parsed = parse_frozen(pattern)
10 |
11 | expect(parsed.to_s).to eql(pattern)
12 | end
13 |
14 | specify('literal alternation') do
15 | expect_round_trip('abcd|ghij|klmn|pqur')
16 | end
17 |
18 | specify('quantified alternations') do
19 | expect_round_trip('(?:a?[b]+(c){2}|d+[e]*(f)?)|(?:g+[h]?(i){2,3}|j*[k]{3,5}(l)?)')
20 | end
21 |
22 | specify('quantified sets') do
23 | expect_round_trip('[abc]+|[^def]{3,6}')
24 | end
25 |
26 | specify('property sets') do
27 | expect_round_trip('[\a\b\p{Lu}\P{Z}\c\d]+')
28 | end
29 |
30 | specify('groups') do
31 | expect_round_trip("(a(?>b(?:c(?d(?'N'e)??f)+g)*+h)*i)++")
32 | end
33 |
34 | specify('assertions') do
35 | expect_round_trip('(a+(?=b+(?!c+(?<=d+(?a)(?()b|c)/,
5 | 3 => [:conditional, :open, '(?', 7, 9, 0, 0, 0],
6 | 4 => [:conditional, :condition, '()', 9, 14, 0, 0, 1],
7 | 6 => [:conditional, :separator, '|', 15, 16, 0, 0, 1],
8 | 8 => [:conditional, :close, ')', 17, 18, 0, 0, 0]
9 |
10 | include_examples 'lex', /((?a)(?(?()b|((?()[e-g]|[h-j])))))/,
11 | 0 => [:group, :capture, '(', 0, 1, 0, 0, 0],
12 | 1 => [:group, :named, '(?', 1, 6, 1, 0, 0],
13 | 5 => [:conditional, :open, '(?', 13, 15, 2, 0, 0],
14 | 6 => [:conditional, :condition, '()', 15, 20, 2, 0, 1],
15 | 8 => [:conditional, :separator, '|', 21, 22, 2, 0, 1],
16 | 10 => [:conditional, :open, '(?', 23, 25, 3, 0, 1],
17 | 11 => [:conditional, :condition, '()', 25, 30, 3, 0, 2],
18 | 12 => [:set, :open, '[', 30, 31, 3, 0, 2],
19 | 13 => [:literal, :literal, 'e', 31, 32, 3, 1, 2],
20 | 14 => [:set, :range, '-', 32, 33, 3, 1, 2],
21 | 15 => [:literal, :literal, 'g', 33, 34, 3, 1, 2],
22 | 16 => [:set, :close, ']', 34, 35, 3, 0, 2],
23 | 17 => [:conditional, :separator, '|', 35, 36, 3, 0, 2],
24 | 23 => [:conditional, :close, ')', 41, 42, 3, 0, 1],
25 | 25 => [:conditional, :close, ')', 43, 44, 2, 0, 0],
26 | 26 => [:group, :close, ')', 44, 45, 1, 0, 0],
27 | 27 => [:group, :close, ')', 45, 46, 0, 0, 0]
28 |
29 | include_examples 'lex', /(a(b(c)))(?(1)(?(2)(?(3)d|e))|(?(3)(?(2)f|g)|(?(1)f|g)))/,
30 | 9 => [:conditional, :open, '(?', 9, 11, 0, 0, 0],
31 | 10 => [:conditional, :condition, '(1)', 11, 14, 0, 0, 1],
32 | 11 => [:conditional, :open, '(?', 14, 16, 0, 0, 1],
33 | 12 => [:conditional, :condition, '(2)', 16, 19, 0, 0, 2],
34 | 13 => [:conditional, :open, '(?', 19, 21, 0, 0, 2],
35 | 14 => [:conditional, :condition, '(3)', 21, 24, 0, 0, 3],
36 | 16 => [:conditional, :separator, '|', 25, 26, 0, 0, 3],
37 | 18 => [:conditional, :close, ')', 27, 28, 0, 0, 2],
38 | 19 => [:conditional, :close, ')', 28, 29, 0, 0, 1],
39 | 20 => [:conditional, :separator, '|', 29, 30, 0, 0, 1],
40 | 21 => [:conditional, :open, '(?', 30, 32, 0, 0, 1],
41 | 22 => [:conditional, :condition, '(3)', 32, 35, 0, 0, 2],
42 | 23 => [:conditional, :open, '(?', 35, 37, 0, 0, 2],
43 | 24 => [:conditional, :condition, '(2)', 37, 40, 0, 0, 3],
44 | 26 => [:conditional, :separator, '|', 41, 42, 0, 0, 3],
45 | 28 => [:conditional, :close, ')', 43, 44, 0, 0, 2],
46 | 29 => [:conditional, :separator, '|', 44, 45, 0, 0, 2],
47 | 30 => [:conditional, :open, '(?', 45, 47, 0, 0, 2],
48 | 31 => [:conditional, :condition, '(1)', 47, 50, 0, 0, 3],
49 | 33 => [:conditional, :separator, '|', 51, 52, 0, 0, 3],
50 | 35 => [:conditional, :close, ')', 53, 54, 0, 0, 2],
51 | 36 => [:conditional, :close, ')', 54, 55, 0, 0, 1],
52 | 37 => [:conditional, :close, ')', 55, 56, 0, 0, 0]
53 | end
54 |
--------------------------------------------------------------------------------
/spec/lexer/delimiters_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
# Lexing of delimiter characters that end up as plain literals.
#
# Each expected tuple appears to follow Regexp::TOKEN_KEYS order:
#   [type, token, text, ts, te, level, set_level, conditional_level]
# The 'lex' shared example is defined elsewhere; confirm there.
RSpec.describe('Literal delimiter lexing') do
  # Curly braces that do not form an interval quantifier
  include_examples 'lex', '}',
    0 => [:literal, :literal, '}', 0, 1, 0, 0, 0]

  include_examples 'lex', '}}',
    0 => [:literal, :literal, '}}', 0, 2, 0, 0, 0]

  include_examples 'lex', '{',
    0 => [:literal, :literal, '{', 0, 1, 0, 0, 0]

  include_examples 'lex', '{{',
    0 => [:literal, :literal, '{{', 0, 2, 0, 0, 0]

  include_examples 'lex', '{}',
    0 => [:literal, :literal, '{}', 0, 2, 0, 0, 0]

  include_examples 'lex', '}{',
    0 => [:literal, :literal, '}{', 0, 2, 0, 0, 0]

  include_examples 'lex', '}{+',
    0 => [:literal,    :literal,     '}', 0, 1, 0, 0, 0],
    1 => [:literal,    :literal,     '{', 1, 2, 0, 0, 0],
    2 => [:quantifier, :one_or_more, '+', 2, 3, 0, 0, 0]

  include_examples 'lex', '{{var}}',
    0 => [:literal, :literal, '{{var}}', 0, 7, 0, 0, 0]

  include_examples 'lex', 'a{b}c',
    0 => [:literal, :literal, 'a{b}c', 0, 5, 0, 0, 0]

  include_examples 'lex', 'a{1,2',
    0 => [:literal, :literal, 'a{1,2', 0, 5, 0, 0, 0]

  include_examples 'lex', '({.+})',
    0 => [:group,      :capture,     '(', 0, 1, 0, 0, 0],
    1 => [:literal,    :literal,     '{', 1, 2, 1, 0, 0],
    2 => [:meta,       :dot,         '.', 2, 3, 1, 0, 0],
    3 => [:quantifier, :one_or_more, '+', 3, 4, 1, 0, 0],
    4 => [:literal,    :literal,     '}', 4, 5, 1, 0, 0],
    5 => [:group,      :close,       ')', 5, 6, 0, 0, 0]

  # Square brackets without an open set
  include_examples 'lex', ']',
    0 => [:literal, :literal, ']', 0, 1, 0, 0, 0]

  include_examples 'lex', ']]',
    0 => [:literal, :literal, ']]', 0, 2, 0, 0, 0]

  include_examples 'lex', ']\[',
    0 => [:literal, :literal,  ']',  0, 1, 0, 0, 0],
    1 => [:escape,  :set_open, '\[', 1, 3, 0, 0, 0]

  # Parentheses
  include_examples 'lex', '()',
    0 => [:group, :capture, '(', 0, 1, 0, 0, 0],
    1 => [:group, :close,   ')', 1, 2, 0, 0, 0]

  # Mixed delimiters
  include_examples 'lex', '{abc:.+}}}[^}]]}',
    0 => [:literal,    :literal,     '{abc:', 0,  5,  0, 0, 0],
    1 => [:meta,       :dot,         '.',     5,  6,  0, 0, 0],
    2 => [:quantifier, :one_or_more, '+',     6,  7,  0, 0, 0],
    3 => [:literal,    :literal,     '}}}',   7,  10, 0, 0, 0],
    4 => [:set,        :open,        '[',     10, 11, 0, 0, 0],
    5 => [:set,        :negate,      '^',     11, 12, 0, 1, 0],
    6 => [:literal,    :literal,     '}',     12, 13, 0, 1, 0],
    7 => [:set,        :close,       ']',     13, 14, 0, 0, 0],
    8 => [:literal,    :literal,     ']}',    14, 16, 0, 0, 0]
end
69 |
--------------------------------------------------------------------------------
/spec/lexer/escapes_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
# Lexing of \u{...} codepoint lists.
#
# Each expected tuple appears to follow Regexp::TOKEN_KEYS order:
#   [type, token, text, ts, te, level, set_level, conditional_level]
RSpec.describe('Escape lexing') do
  include_examples 'lex', '\u{62}',
    0 => [:escape, :codepoint_list, '\u{62}', 0, 6, 0, 0, 0]

  include_examples 'lex', '\u{62 63 64}',
    0 => [:escape, :codepoint_list, '\u{62 63 64}', 0, 12, 0, 0, 0]

  # With a trailing quantifier, the last codepoint is expected as a token of
  # its own, directly ahead of the quantifier token.
  include_examples 'lex', '\u{62 63 64}+',
    0 => [:escape,     :codepoint_list, '\u{62 63}', 0,  9,  0, 0, 0],
    1 => [:escape,     :codepoint_list, '\u{64}',    9,  15, 0, 0, 0],
    2 => [:quantifier, :one_or_more,    '+',         15, 16, 0, 0, 0]
end
15 |
--------------------------------------------------------------------------------
/spec/lexer/keep_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
# Lexing of the \K keep mark, at top level and inside groups.
#
# Each expected tuple appears to follow Regexp::TOKEN_KEYS order:
#   [type, token, text, ts, te, level, set_level, conditional_level]
RSpec.describe('Keep lexing') do
  include_examples 'lex', /ab\Kcd/,
    1 => [:keep, :mark, '\K', 2, 4, 0, 0, 0]

  include_examples 'lex', /(a\Kb)|(c\\\Kd)ef/,
    2 => [:keep, :mark, '\K', 2,  4,  1, 0, 0],
    9 => [:keep, :mark, '\K', 11, 13, 1, 0, 0]
end
11 |
--------------------------------------------------------------------------------
/spec/lexer/literals_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
# Lexing of literal runs for characters of different byte widths. The
# expectations show a quantified character as a token of its own, separate
# from the literal run ahead of it.
#
# Each expected tuple appears to follow Regexp::TOKEN_KEYS order:
#   [type, token, text, ts, te, level, set_level, conditional_level]
RSpec.describe('Literal lexing') do
  # ascii, single byte characters
  include_examples 'lex', 'a',
    0 => [:literal, :literal, 'a', 0, 1, 0, 0, 0]

  include_examples 'lex', 'ab+',
    0 => [:literal,    :literal,     'a', 0, 1, 0, 0, 0],
    1 => [:literal,    :literal,     'b', 1, 2, 0, 0, 0],
    2 => [:quantifier, :one_or_more, '+', 2, 3, 0, 0, 0]

  # 2 byte wide characters
  include_examples 'lex', 'äöü+',
    0 => [:literal,    :literal,     'äö', 0, 2, 0, 0, 0],
    1 => [:literal,    :literal,     'ü',  2, 3, 0, 0, 0],
    2 => [:quantifier, :one_or_more, '+',  3, 4, 0, 0, 0]

  # 3 byte wide characters, Japanese
  include_examples 'lex', 'ab?れます+cd',
    0 => [:literal,    :literal,     'a',    0, 1, 0, 0, 0],
    1 => [:literal,    :literal,     'b',    1, 2, 0, 0, 0],
    2 => [:quantifier, :zero_or_one, '?',    2, 3, 0, 0, 0],
    3 => [:literal,    :literal,     'れま', 3, 5, 0, 0, 0],
    4 => [:literal,    :literal,     'す',   5, 6, 0, 0, 0],
    5 => [:quantifier, :one_or_more, '+',    6, 7, 0, 0, 0],
    6 => [:literal,    :literal,     'cd',   7, 9, 0, 0, 0]

  # 4 byte wide characters, Osmanya
  include_examples 'lex', '𐒀𐒁?𐒂ab+𐒃',
    0 => [:literal,    :literal,     '𐒀',  0, 1, 0, 0, 0],
    1 => [:literal,    :literal,     '𐒁',  1, 2, 0, 0, 0],
    2 => [:quantifier, :zero_or_one, '?',  2, 3, 0, 0, 0],
    3 => [:literal,    :literal,     '𐒂a', 3, 5, 0, 0, 0],
    4 => [:literal,    :literal,     'b',  5, 6, 0, 0, 0],
    5 => [:quantifier, :one_or_more, '+',  6, 7, 0, 0, 0],
    6 => [:literal,    :literal,     '𐒃',  7, 8, 0, 0, 0]

  include_examples 'lex', 'mu𝄞?si*𝄫c+',
    0 => [:literal,    :literal,      'mu', 0, 2,  0, 0, 0],
    1 => [:literal,    :literal,      '𝄞',  2, 3,  0, 0, 0],
    2 => [:quantifier, :zero_or_one,  '?',  3, 4,  0, 0, 0],
    3 => [:literal,    :literal,      's',  4, 5,  0, 0, 0],
    4 => [:literal,    :literal,      'i',  5, 6,  0, 0, 0],
    5 => [:quantifier, :zero_or_more, '*',  6, 7,  0, 0, 0],
    6 => [:literal,    :literal,      '𝄫',  7, 8,  0, 0, 0],
    7 => [:literal,    :literal,      'c',  8, 9,  0, 0, 0],
    8 => [:quantifier, :one_or_more,  '+',  9, 10, 0, 0, 0]

  # A single quantified multibyte character must come out as exactly two
  # tokens: the literal and the quantifier.
  specify('lex single 2 byte char') do
    token_count = RL.lex("\u0627+").count
    expect(token_count).to eq 2
  end

  specify('lex single 3 byte char') do
    token_count = RL.lex("\u308C+").count
    expect(token_count).to eq 2
  end

  specify('lex single 4 byte char') do
    token_count = RL.lex("\u{1D11E}+").count
    expect(token_count).to eq 2
  end
end
65 |
--------------------------------------------------------------------------------
/spec/lexer/refcalls_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe('RefCall lexing') do
4 | # Traditional numerical group back-reference
5 | include_examples 'lex', '(abc)\1',
6 | 3 => [:backref, :number, '\1', 5, 7, 0, 0, 0]
7 |
8 | # Group back-references, named, numbered, and relative
9 | include_examples 'lex', '(?abc)\k',
10 | 3 => [:backref, :name_ref, '\k', 9, 14, 0, 0, 0]
11 | include_examples 'lex', "(?abc)\\k'X'",
12 | 3 => [:backref, :name_ref, "\\k'X'", 9, 14, 0, 0, 0]
13 |
14 | include_examples 'lex', '(abc)\k<1>',
15 | 3 => [:backref, :number_ref, '\k<1>', 5, 10, 0, 0, 0]
16 | include_examples 'lex', "(abc)\\k'1'",
17 | 3 => [:backref, :number_ref, "\\k'1'", 5, 10, 0, 0, 0]
18 |
19 | include_examples 'lex', '(abc)\k<-1>',
20 | 3 => [:backref, :number_rel_ref, '\k<-1>', 5, 11, 0, 0, 0]
21 | include_examples 'lex', "(abc)\\k'-1'",
22 | 3 => [:backref, :number_rel_ref, "\\k'-1'", 5, 11, 0, 0, 0]
23 |
24 | # Sub-expression invocation, named, numbered, and relative
25 | include_examples 'lex', '(?abc)\g',
26 | 3 => [:backref, :name_call, '\g', 9, 14, 0, 0, 0]
27 | include_examples 'lex', "(?abc)\\g'X'",
28 | 3 => [:backref, :name_call, "\\g'X'", 9, 14, 0, 0, 0]
29 |
30 | include_examples 'lex', '(abc)\g<1>',
31 | 3 => [:backref, :number_call, '\g<1>', 5, 10, 0, 0, 0]
32 | include_examples 'lex', "(abc)\\g'1'",
33 | 3 => [:backref, :number_call, "\\g'1'", 5, 10, 0, 0, 0]
34 |
35 | include_examples 'lex', '\g<0>',
36 | 0 => [:backref, :number_call, '\g<0>', 0, 5, 0, 0, 0]
37 | include_examples 'lex', "\\g'0'",
38 | 0 => [:backref, :number_call, "\\g'0'", 0, 5, 0, 0, 0]
39 |
40 | include_examples 'lex', '(abc)\g<-1>',
41 | 3 => [:backref, :number_rel_call, '\g<-1>', 5, 11, 0, 0, 0]
42 | include_examples 'lex', "(abc)\\g'-1'",
43 | 3 => [:backref, :number_rel_call, "\\g'-1'", 5, 11, 0, 0, 0]
44 |
45 | include_examples 'lex', '(abc)\g<+1>',
46 | 3 => [:backref, :number_rel_call, '\g<+1>', 5, 11, 0, 0, 0]
47 | include_examples 'lex', "(abc)\\g'+1'",
48 | 3 => [:backref, :number_rel_call, "\\g'+1'", 5, 11, 0, 0, 0]
49 |
50 | # Group back-references, with nesting level
51 | include_examples 'lex', '(?abc)\k',
52 | 3 => [:backref, :name_recursion_ref, '\k', 9, 16, 0, 0, 0]
53 | include_examples 'lex', "(?abc)\\k'X-0'",
54 | 3 => [:backref, :name_recursion_ref, "\\k'X-0'", 9, 16, 0, 0, 0]
55 |
56 | include_examples 'lex', '(abc)\k<1-0>',
57 | 3 => [:backref, :number_recursion_ref, '\k<1-0>', 5, 12, 0, 0, 0]
58 | include_examples 'lex', "(abc)\\k'1-0'",
59 | 3 => [:backref, :number_recursion_ref, "\\k'1-0'", 5, 12, 0, 0, 0]
60 | end
61 |
--------------------------------------------------------------------------------
/spec/parser/all_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe(Regexp::Parser) do # basic checks of RP.parse: return type, block form, root options, error cases
4 | specify('parse returns a root expression') do
5 | expect(RP.parse('abc')).to be_instance_of(Root)
6 | end
7 |
8 | specify('parse can be called with block') do
9 | expect(RP.parse('abc') { |root| root.class }).to eq Root # the block's result is what parse returns here
10 | end
11 |
12 | specify('parse root contains expressions') do
13 | root = RP.parse(/^a.c+[^one]{2,3}\b\d\\\C-C$/)
14 | expect(root.expressions).to all(be_a Regexp::Expression::Base)
15 | end
16 |
17 | specify('parse root options mi') do
18 | root = RP.parse(/[abc]/mi)
19 |
20 | expect(root.m?).to be true
21 | expect(root.i?).to be true
22 | expect(root.x?).to be false # x was not set on the literal
23 | end
24 |
25 | specify('parse no quantifier target raises error') do
26 | expect { RP.parse('?abc') }.to raise_error(Regexp::Parser::Error) # leading quantifier has nothing to quantify
27 | end
28 |
29 | specify('parse sequence no quantifier target raises error') do
30 | expect { RP.parse('abc|?def') }.to raise_error(Regexp::Parser::Error) # quantifier directly after the alternation bar
31 | end
32 | end
33 |
--------------------------------------------------------------------------------
/spec/parser/alternation_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe('Alternation parsing') do # expected tree shape for patterns with `|`; keys look like index paths into the parsed tree (assumption, the 'parse' shared example is defined elsewhere)
4 | include_examples 'parse', /a|b/,
5 | [0] => [Alternation, text: '|', count: 2],
6 | [0, 0] => [Alternative, text: '', count: 1],
7 | [0, 0, 0] => [:literal, text: 'a' ],
8 | [0, 1] => [Alternative, text: '', count: 1],
9 | [0, 1, 0] => [:literal, text: 'b' ]
10 |
11 | include_examples 'parse', /a|(b)c/,
12 | [0] => [Alternation, text: '|', count: 2],
13 | [0, 0] => [Alternative, text: '', count: 1],
14 | [0, 0, 0] => [:literal, text: 'a' ],
15 | [0, 1] => [Alternative, text: '', count: 2],
16 | [0, 1, 0] => [:capture, to_s: '(b)' ],
17 | [0, 1, 1] => [:literal, text: 'c' ]
18 |
19 | include_examples 'parse', /(ab??|cd*|ef+)*|(gh|ij|kl)?/, # nested alternations inside quantified groups
20 | [0] => [Alternation, text: '|', count: 2, quantified?: false],
21 | [0, 0] => [Alternative, text: '', count: 1, quantified?: false],
22 | [0, 0, 0] => [:capture, count: 1, quantified?: true ],
23 | [0, 0, 0, 0] => [Alternation, text: '|', count: 3 ],
24 | [0, 0, 0, 0, 0] => [Alternative, text: '', count: 2 ],
25 | [0, 0, 0, 0, 0, 0] => [:literal, to_s: 'a' ],
26 | [0, 0, 0, 0, 0, 1] => [:literal, to_s: 'b??' ],
27 | [0, 1] => [Alternative, text: '', count: 1, quantified?: false],
28 | [0, 1, 0] => [:capture, count: 1, quantified?: true ]
29 |
30 | # test correct ts values for empty sequences
31 | include_examples 'parse', /|||/,
32 | [0] => [Alternation, text: '|', count: 4, starts_at: 0],
33 | [0, 0] => [Alternative, to_s: '', count: 0, starts_at: 0],
34 | [0, 1] => [Alternative, to_s: '', count: 0, starts_at: 1],
35 | [0, 2] => [Alternative, to_s: '', count: 0, starts_at: 2],
36 | [0, 3] => [Alternative, to_s: '', count: 0, starts_at: 3]
37 |
38 | # test correct ts values for non-empty sequences
39 | include_examples 'parse', /ab|cd|ef|gh/,
40 | [0] => [Alternation, text: '|', count: 4, starts_at: 0],
41 | [0, 0] => [Alternative, to_s: 'ab', count: 1, starts_at: 0],
42 | [0, 1] => [Alternative, to_s: 'cd', count: 1, starts_at: 3],
43 | [0, 2] => [Alternative, to_s: 'ef', count: 1, starts_at: 6],
44 | [0, 3] => [Alternative, to_s: 'gh', count: 1, starts_at: 9]
45 | end
46 |
--------------------------------------------------------------------------------
/spec/parser/anchors_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe('Anchor parsing') do # each anchor syntax should parse to the listed type, token and Anchor class
4 | include_examples 'parse', /^a/, 0 => [:anchor, :bol, Anchor::BOL]
5 | include_examples 'parse', /a$/, 1 => [:anchor, :eol, Anchor::EOL]
6 |
7 | include_examples 'parse', /\Aa/, 0 => [:anchor, :bos, Anchor::BOS]
8 | include_examples 'parse', /a\z/, 1 => [:anchor, :eos, Anchor::EOS]
9 | include_examples 'parse', /a\Z/, 1 => [:anchor, :eos_ob_eol, Anchor::EOSobEOL]
10 |
11 | include_examples 'parse', /a\b/, 1 => [:anchor, :word_boundary, Anchor::WordBoundary]
12 | include_examples 'parse', /a\B/, 1 => [:anchor, :nonword_boundary, Anchor::NonWordBoundary]
13 |
14 | include_examples 'parse', /a\G/, 1 => [:anchor, :match_start, Anchor::MatchStart]
15 |
16 | include_examples 'parse', /\\A/, 0 => [:escape, :backslash, EscapeSequence::Literal] # escaped backslash followed by 'A' is not an anchor
17 | end
18 |
--------------------------------------------------------------------------------
/spec/parser/conditionals_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe('Conditional parsing') do
4 | include_examples 'parse', /(?a)(?()T|F)/,
5 | [1] => [:conditional, :open, Conditional::Expression, to_s: '(?()T|F)', reference: 'A', ts: 7],
6 | [1, 0] => [:conditional, :condition, Conditional::Condition, to_s: '()', reference: 'A', ts: 9],
7 | [1, 1] => [:expression, :sequence, Conditional::Branch, to_s: 'T', ts: 14],
8 | [1, 1, 0] => [:literal, text: 'T', ts: 14],
9 | [1, 2] => [:expression, :sequence, Conditional::Branch, to_s: 'F', ts: 16],
10 | [1, 2, 0] => [:literal, text: 'F', ts: 16]
11 |
12 | include_examples 'parse', /(a)(?(1)T|F)/,
13 | [1] => [:conditional, :open, Conditional::Expression, to_s: '(?(1)T|F)', reference: 1, ts: 3],
14 | [1, 0] => [:conditional, :condition, Conditional::Condition, to_s: '(1)', reference: 1, ts: 5],
15 | [1, 1] => [:expression, :sequence, Conditional::Branch, to_s: 'T', ts: 8],
16 | [1, 1, 0] => [:literal, text: 'T', ts: 8],
17 | [1, 2] => [:expression, :sequence, Conditional::Branch, to_s: 'F', ts: 10],
18 | [1, 2, 0] => [:literal, text: 'F', ts: 10]
19 |
20 | include_examples 'parse', /(foo)(?(1)\d+|(\w)){42}/,
21 | [1] => [Conditional::Expression, quantified?: true, to_s: '(?(1)\d+|(\w)){42}'],
22 | [1, 0] => [Conditional::Condition, quantified?: false],
23 | [1, 1] => [Conditional::Branch, quantified?: false],
24 | [1, 1, 0] => [:digit, quantified?: true, to_s: '\d+'],
25 | [1, 2] => [Conditional::Branch, quantified?: false]
26 |
27 | # test nested and mixed with alternations
28 | include_examples 'parse', <<-EOS.gsub(/\s/, ''),
29 | (
30 | (a)
31 | |
32 | (b)
33 | |
34 | (
35 | (
36 | ?(2)
37 | (c(d|e)+)?
38 | |
39 | (
40 | ?(3)
41 | f
42 | |
43 | (
44 | ?(4)
45 | (g|(h)(i))
46 | )
47 | )
48 | )
49 | )
50 | )
51 | EOS
52 | [0] => [Group::Capture, count: 1],
53 | [0, 0] => [Alternation, count: 3],
54 | [0, 0, 2] => [Alternative, count: 1],
55 | [0, 0, 2, 0] => [Group::Capture, count: 1],
56 | [0, 0, 2, 0, 0] => [Conditional::Expression, count: 3, conditional_level: 0],
57 | [0, 0, 2, 0, 0, 0] => [Conditional::Condition, to_s: '(2)', conditional_level: 1],
58 | [0, 0, 2, 0, 0, 1] => [Conditional::Branch, to_s: '(c(d|e)+)?', conditional_level: 1],
59 | [0, 0, 2, 0, 0, 2] => [Conditional::Branch, to_s: '(?(3)f|(?(4)(g|(h)(i))))', conditional_level: 1],
60 | [0, 0, 2, 0, 0, 2, 0] => [Conditional::Expression, count: 3, conditional_level: 1],
61 | [0, 0, 2, 0, 0, 2, 0, 0] => [Conditional::Condition, to_s: '(3)', conditional_level: 2],
62 | [0, 0, 2, 0, 0, 2, 0, 1] => [Conditional::Branch, count: 1, to_s: 'f', conditional_level: 2],
63 | [0, 0, 2, 0, 0, 2, 0, 1, 0] => [Literal, text: 'f', conditional_level: 2]
64 |
65 | # test empty branch
66 | include_examples 'parse', /(?a)(?()T|)/,
67 | [1] => [Conditional::Expression, count: 3, to_s: '(?()T|)'],
68 | [1, 2] => [Conditional::Branch, to_s: '', ts: 16]
69 |
70 | # test insignificant leading zeros in the condition's group number ref
71 | include_examples 'parse', /(a)(?(001)T)/,
72 | [1, 0] => [Conditional::Condition, to_s: '(001)', reference: 1]
73 | end
74 |
--------------------------------------------------------------------------------
/spec/parser/errors_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe('Parsing errors') do # feeds hand-built tokens to the private parse_token method and checks the error classes raised
4 | let(:parser) { Regexp::Parser.new }
5 | before { parser.parse(/foo/) } # initializes ivars
6 |
7 | it('raises UnknownTokenTypeError for unknown token types') do
8 | expect { parser.send(:parse_token, Regexp::Token.new(:foo, :bar)) } # :foo is not a known token type
9 | .to raise_error(Regexp::Parser::UnknownTokenTypeError)
10 | end
11 |
12 | RSpec.shared_examples 'UnknownTokenError' do |type| # type: a token type paired below with the unknown token :foo
13 | it "raises for unknown tokens of type #{type}" do
14 | expect { parser.send(:parse_token, Regexp::Token.new(type, :foo)) }
15 | .to raise_error(Regexp::Parser::UnknownTokenError)
16 | end
17 | end
18 |
19 | include_examples 'UnknownTokenError', :anchor
20 | include_examples 'UnknownTokenError', :backref
21 | include_examples 'UnknownTokenError', :conditional
22 | include_examples 'UnknownTokenError', :free_space
23 | include_examples 'UnknownTokenError', :group
24 | include_examples 'UnknownTokenError', :meta
25 | include_examples 'UnknownTokenError', :nonproperty
26 | include_examples 'UnknownTokenError', :property
27 | include_examples 'UnknownTokenError', :quantifier
28 | include_examples 'UnknownTokenError', :set
29 | include_examples 'UnknownTokenError', :type
30 | end
31 |
--------------------------------------------------------------------------------
/spec/parser/escapes_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe('EscapeSequence parsing') do # maps escape syntaxes to their expected type, token and EscapeSequence class
4 | es = EscapeSequence # short alias used in the expectation tables below
5 |
6 | include_examples 'parse', /a\ac/, 1 => [:escape, :bell, es::Bell]
7 | include_examples 'parse', /a\ec/, 1 => [:escape, :escape, es::AsciiEscape]
8 | include_examples 'parse', /a\fc/, 1 => [:escape, :form_feed, es::FormFeed]
9 | include_examples 'parse', /a\nc/, 1 => [:escape, :newline, es::Newline]
10 | include_examples 'parse', /a\rc/, 1 => [:escape, :carriage, es::Return]
11 | include_examples 'parse', /a\tc/, 1 => [:escape, :tab, es::Tab]
12 | include_examples 'parse', /a\vc/, 1 => [:escape, :vertical_tab, es::VerticalTab]
13 |
14 | # meta character escapes
15 | include_examples 'parse', /a\.c/, 1 => [:escape, :dot, es::Literal]
16 | include_examples 'parse', /a\?c/, 1 => [:escape, :zero_or_one, es::Literal]
17 | include_examples 'parse', /a\*c/, 1 => [:escape, :zero_or_more, es::Literal]
18 | include_examples 'parse', /a\+c/, 1 => [:escape, :one_or_more, es::Literal]
19 | include_examples 'parse', /a\|c/, 1 => [:escape, :alternation, es::Literal]
20 | include_examples 'parse', /a\(c/, 1 => [:escape, :group_open, es::Literal]
21 | include_examples 'parse', /a\)c/, 1 => [:escape, :group_close, es::Literal]
22 | include_examples 'parse', /a\{c/, 1 => [:escape, :interval_open, es::Literal]
23 | include_examples 'parse', /a\}c/, 1 => [:escape, :interval_close, es::Literal]
24 |
25 | # unicode escapes
26 | include_examples 'parse', /a\u0640/, 1 => [:escape, :codepoint, es::Codepoint]
27 | include_examples 'parse', /a\u{41 1F60D}/, 1 => [:escape, :codepoint_list, es::CodepointList]
28 | include_examples 'parse', /a\u{10FFFF}/, 1 => [:escape, :codepoint_list, es::CodepointList]
29 |
30 | # hex escapes
31 | include_examples 'parse', /a\xFF/n, 1 => [:escape, :hex, es::Hex]
32 |
33 | # octal escapes
34 | include_examples 'parse', /a\177/n, 1 => [:escape, :octal, es::Octal]
35 |
36 | # test #char and #codepoint
37 | include_examples 'parse', /\n/, 0 => [char: "\n", codepoint: 10 ]
38 | include_examples 'parse', /\?/, 0 => [char: '?', codepoint: 63 ]
39 | include_examples 'parse', /\101/, 0 => [char: 'A', codepoint: 65 ]
40 | include_examples 'parse', /\x42/, 0 => [char: 'B', codepoint: 66 ]
41 | include_examples 'parse', /\xA/, 0 => [char: "\n", codepoint: 10 ]
42 | include_examples 'parse', /\u0043/, 0 => [char: 'C', codepoint: 67 ]
43 | include_examples 'parse', /\u{44 45}/, 0 => [chars: %w[D E], codepoints: [68, 69]]
44 |
45 | specify('codepoint_list #char and #codepoint raise errors') do
46 | exp = RP.parse(/\u{44 45}/)[0]
47 | expect { exp.char }.to raise_error(/#chars/) # error message is expected to mention the plural method
48 | expect { exp.codepoint }.to raise_error(/#codepoints/)
49 | end
50 |
51 | # Meta/control escapes
52 | #
53 | # After the following fix in Ruby 3.1, a Regexp#source containing meta/control
54 | # escapes can only be set with the Regexp::new constructor.
55 | # In Regexp literals, these escapes are now pre-processed to hex escapes.
56 | #
57 | # https://github.com/ruby/ruby/commit/11ae581a4a7f5d5f5ec6378872eab8f25381b1b9
58 | n = ->(regexp_body){ Regexp.new(regexp_body.force_encoding('ascii-8bit')) } # builds a Regexp from a binary-encoded source string
59 |
60 | include_examples 'parse', n.('\\\\\c2b'), 1 => [es::Control, text: '\c2', char: "\x12", codepoint: 18 ]
61 | include_examples 'parse', n.('\d\C-C\w'), 1 => [es::Control, text: '\C-C', char: "\x03", codepoint: 3 ]
62 | include_examples 'parse', n.('\Z\M-Z'), 1 => [es::Meta, text: '\M-Z', char: "\u00DA", codepoint: 218]
63 | include_examples 'parse', n.('\A\M-\C-X'), 1 => [es::MetaControl, text: '\M-\C-X', char: "\u0098", codepoint: 152]
64 | include_examples 'parse', n.('\A\M-\cX'), 1 => [es::MetaControl, text: '\M-\cX', char: "\u0098", codepoint: 152]
65 | include_examples 'parse', n.('\A\C-\M-X'), 1 => [es::MetaControl, text: '\C-\M-X', char: "\u0098", codepoint: 152]
66 | include_examples 'parse', n.('\A\c\M-X'), 1 => [es::MetaControl, text: '\c\M-X', char: "\u0098", codepoint: 152]
67 | end
68 |
--------------------------------------------------------------------------------
/spec/parser/free_space_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe('FreeSpace parsing') do # whitespace and comments become their own nodes only under the x flag
4 | include_examples 'parse', /a b c/, # no x flag: spaces stay part of the literal
5 | [0] => [Literal, text: 'a b c']
6 |
7 | include_examples 'parse', /a b c/x,
8 | [0] => [Literal, text: 'a'],
9 | [1] => [WhiteSpace, text: ' '],
10 | [2] => [Literal, text: 'b'],
11 | [3] => [WhiteSpace, text: ' '],
12 | [4] => [Literal, text: 'c']
13 |
14 | include_examples 'parse', /a * b + c/x, # quantifiers attach to the preceding literal across the whitespace
15 | [0] => [Literal, to_s: 'a*', quantified?: true],
16 | [1] => [WhiteSpace, text: ' '],
17 | [2] => [WhiteSpace, text: ' '],
18 | [3] => [Literal, to_s: 'b+', quantified?: true],
19 | [4] => [WhiteSpace, text: ' '],
20 | [5] => [WhiteSpace, text: ' '],
21 | [6] => [Literal, to_s: 'c']
22 |
23 | include_examples 'parse', /
24 | a ? # One letter
25 | b {2,5} # Another one
26 | [c-g] + # A set
27 | (h|i|j) # A group
28 | /x,
29 | [0] => [WhiteSpace],
30 | [1] => [Literal, to_s: 'a?', quantified?: true],
31 | [2] => [WhiteSpace, text: ' '],
32 | [3] => [WhiteSpace, text: ' '],
33 | [4] => [Comment, to_s: "# One letter\n"],
34 | [5] => [WhiteSpace],
35 | [6] => [Literal, to_s: 'b{2,5}', quantified?: true],
36 | [7] => [WhiteSpace, text: ' '],
37 | [8] => [WhiteSpace, text: ' '],
38 | [9] => [Comment, to_s: "# Another one\n"],
39 | [10] => [WhiteSpace],
40 | [11] => [CharacterSet, to_s: '[c-g]+', quantified?: true],
41 | [12] => [WhiteSpace],
42 | [13] => [WhiteSpace],
43 | [14] => [Comment, to_s: "# A set\n"],
44 | [15] => [WhiteSpace],
45 | [16] => [Group::Capture],
46 | [17] => [WhiteSpace],
47 | [18] => [Comment, to_s: "# A group\n",]
48 |
49 | include_examples 'parse', /
50 | a
51 | # comment 1
52 | ?
53 | (
54 | b # comment 2
55 | # comment 3
56 | +
57 | )
58 | # comment 4
59 | *
60 | /x,
61 | [0] => [WhiteSpace],
62 | [1] => [Literal, to_s: 'a?', quantified?: true],
63 | [2] => [WhiteSpace],
64 | [3] => [Comment],
65 | [4] => [WhiteSpace],
66 | [5] => [WhiteSpace],
67 | [6] => [Group::Capture, quantified?: true],
68 | [6, 0] => [WhiteSpace],
69 | [6, 1] => [Literal, to_s: 'b+', quantified?: true],
70 | [6, 2] => [WhiteSpace],
71 | [6, 3] => [Comment, to_s: "# comment 2\n"],
72 | [6, 4] => [WhiteSpace],
73 | [6, 5] => [Comment, to_s: "# comment 3\n"],
74 | [6, 6] => [WhiteSpace],
75 | [6, 7] => [WhiteSpace]
76 | end
77 |
--------------------------------------------------------------------------------
/spec/parser/keep_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe('Keep parsing') do # \K should parse to a Keep::Mark at top level and inside a group
4 | include_examples 'parse', /ab\Kcd/, 1 => [:keep, :mark, Keep::Mark, text: '\K']
5 | include_examples 'parse', /(a\K)/, [0, 1] => [:keep, :mark, Keep::Mark, text: '\K']
6 | end
7 |
--------------------------------------------------------------------------------
/spec/parser/options_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe('passing options to parse') do
4 | it 'raises if if parsing from a Regexp and options are passed' do
5 | expect { RP.parse(/a+/, options: ::Regexp::EXTENDED) }.to raise_error(
6 | ArgumentError,
7 | 'options cannot be supplied unless parsing a String'
8 | )
9 | end
10 |
11 | it 'sets options if parsing from a String' do
12 | root = RP.parse('a+', options: ::Regexp::MULTILINE | ::Regexp::EXTENDED)
13 |
14 | expect(root.options).to eq(m: true, x: true)
15 | end
16 |
17 | it 'allows options to not be supplied when parsing from a Regexp' do
18 | root = RP.parse(/a+/ix)
19 |
20 | expect(root.options).to eq(i: true, x: true)
21 | end
22 |
23 | it 'has an empty option-hash when parsing from a String and passing no options' do
24 | root = RP.parse('a+')
25 |
26 | expect(root.options).to be_empty
27 | end
28 | end
29 |
--------------------------------------------------------------------------------
/spec/parser/posix_classes_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe('PosixClass parsing') do # [:name:] and [:^name:] inside sets, plus look-alikes that are plain nested sets
4 | include_examples 'parse', /[[:word:]]/,
5 | [0] => [CharacterSet, count: 1],
6 | [0, 0] => [:posixclass, :word, PosixClass, name: 'word', text: '[:word:]']
7 | include_examples 'parse', /[[:^word:]]/, # negated form gets the :nonposixclass type but keeps the same name
8 | [0] => [CharacterSet, count: 1],
9 | [0, 0] => [:nonposixclass, :word, PosixClass, name: 'word', text: '[:^word:]']
10 |
11 | # cases treated as regular subsets by Ruby, not as (invalid) posix classes
12 | include_examples 'parse', '[[:ab]c:]',
13 | [0, 0] => [CharacterSet, count: 3],
14 | [0, 0, 0] => [Literal, text: ':']
15 |
16 | include_examples 'parse', '[[:a[b]c:]]',
17 | [0, 0] => [CharacterSet, count: 5],
18 | [0, 0, 0] => [Literal, text: ':']
19 | end
20 |
--------------------------------------------------------------------------------
/spec/parser/properties_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe('Property parsing') do # \p{...} / \P{...} notations, negation, shortcuts, classification and per-Ruby-version support
4 | # test various notations supported by Ruby
5 | include_examples 'parse', '\p{sd}', 0 => [:property, :soft_dotted]
6 | include_examples 'parse', '\p{SD}', 0 => [:property, :soft_dotted]
7 | include_examples 'parse', '\p{Soft Dotted}', 0 => [:property, :soft_dotted]
8 | include_examples 'parse', '\p{Soft-Dotted}', 0 => [:property, :soft_dotted]
9 | include_examples 'parse', '\p{sOfT_dOtTeD}', 0 => [:property, :soft_dotted]
10 |
11 | # test ^-negation
12 | include_examples 'parse', '\p{^sd}', 0 => [:nonproperty, :soft_dotted]
13 | include_examples 'parse', '\p{^SD}', 0 => [:nonproperty, :soft_dotted]
14 | include_examples 'parse', '\p{^Soft Dotted}', 0 => [:nonproperty, :soft_dotted]
15 | include_examples 'parse', '\p{^Soft-Dotted}', 0 => [:nonproperty, :soft_dotted]
16 | include_examples 'parse', '\p{^sOfT_dOtTeD}', 0 => [:nonproperty, :soft_dotted]
17 |
18 | # test P-negation
19 | include_examples 'parse', '\P{sd}', 0 => [:nonproperty, :soft_dotted]
20 | include_examples 'parse', '\P{SD}', 0 => [:nonproperty, :soft_dotted]
21 | include_examples 'parse', '\P{Soft Dotted}', 0 => [:nonproperty, :soft_dotted]
22 | include_examples 'parse', '\P{Soft-Dotted}', 0 => [:nonproperty, :soft_dotted]
23 | include_examples 'parse', '\P{sOfT_dOtTeD}', 0 => [:nonproperty, :soft_dotted]
24 |
25 | # double negation is positive again
26 | include_examples 'parse', '\P{^sd}', 0 => [:property, :soft_dotted]
27 | include_examples 'parse', '\P{^SD}', 0 => [:property, :soft_dotted]
28 | include_examples 'parse', '\P{^Soft Dotted}', 0 => [:property, :soft_dotted]
29 | include_examples 'parse', '\P{^Soft-Dotted}', 0 => [:property, :soft_dotted]
30 | include_examples 'parse', '\P{^sOfT_dOtTeD}', 0 => [:property, :soft_dotted]
31 |
32 | # test #shortcut
33 | include_examples 'parse', '\p{soft_dotted}', 0 => [:property, :soft_dotted, shortcut: 'sd']
34 | include_examples 'parse', '\p{sd}', 0 => [:property, :soft_dotted, shortcut: 'sd']
35 | include_examples 'parse', '\p{in_bengali}', 0 => [:property, :in_bengali, shortcut: nil]
36 |
37 | # test classification
38 | include_examples 'parse', '\p{age=5.2}', 0 => [UnicodeProperty::Age]
39 | include_examples 'parse', '\p{InArmenian}', 0 => [UnicodeProperty::Block]
40 | include_examples 'parse', '\p{Math}', 0 => [UnicodeProperty::Derived]
41 | include_examples 'parse', '\p{Emoji}', 0 => [UnicodeProperty::Emoji]
42 | include_examples 'parse', '\p{GraphemeClusterBreak=Extend}', 0 => [UnicodeProperty::Enumerated]
43 | include_examples 'parse', '\p{Hiragana}', 0 => [UnicodeProperty::Script]
44 |
45 | specify('parse abandoned newline property') do
46 | root = RP.parse('\p{newline}', 'ruby/1.9') # accepted under the 1.9 syntax ...
47 | expect(root.expressions.last).to be_a(UnicodeProperty::Base)
48 |
49 | expect { RP.parse('\p{newline}', 'ruby/2.0') }.to raise_error(Regexp::Syntax::NotImplementedError) # ... but rejected under 2.0
50 | end
51 |
52 | # cannot test older Rubies because of https://bugs.ruby-lang.org/issues/18686
53 | if ruby_version_at_least('3.2.0')
54 | specify('parse all properties of current ruby') do
55 | unsupported = RegexpPropertyValues.all_for_current_ruby.reject do |prop|
56 | RP.parse("\\p{#{prop}}") rescue false # a parse error keeps the property in the unsupported list
57 | end
58 | expect(unsupported).to be_empty
59 | end
60 | end
61 |
62 | # Ruby 2.3 supports a short prop name (sterm) without supporting the long name
63 | # of the same prop (sentence_terminal). Let's ignore this unique case.
64 | if ruby_version_at_least('2.4.0')
65 | specify('parse only properties of current ruby') do
66 | syntax = Regexp::Syntax.for("ruby/#{RUBY_VERSION}")
67 | excessive = syntax.features.fetch(:property, []).reject do |prop|
68 | begin
69 | Regexp.new("\\p{#{prop}}") # properties the running Ruby compiles are dropped from the list
70 | rescue RegexpError, SyntaxError # error class depends on Ruby version
71 | false
72 | end
73 | end
74 | expect(excessive).to be_empty
75 | end
76 | end
77 | end
78 |
--------------------------------------------------------------------------------
/spec/parser/quantifiers_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe('Quantifier parsing') do # token, text, mode, min/max and offset of quantifiers; :q in a key presumably selects the quantifier of the addressed expression (defined by the 'parse' shared example, not visible here)
4 | include_examples 'parse', /a?b/, [0, :q] => [:zero_or_one, text: '?', mode: :greedy, min: 0, max: 1, ts: 1]
5 | include_examples 'parse', /a??b/, [0, :q] => [:zero_or_one, text: '??', mode: :reluctant, min: 0, max: 1, ts: 1]
6 | include_examples 'parse', /a?+b/, [0, :q] => [:zero_or_one, text: '?+', mode: :possessive, min: 0, max: 1, ts: 1]
7 | include_examples 'parse', /a*b/, [0, :q] => [:zero_or_more, text: '*', mode: :greedy, min: 0, max: -1, ts: 1] # max: -1 is the expected value for an unbounded quantifier
8 | include_examples 'parse', /a*?b/, [0, :q] => [:zero_or_more, text: '*?', mode: :reluctant, min: 0, max: -1, ts: 1]
9 | include_examples 'parse', /a*+b/, [0, :q] => [:zero_or_more, text: '*+', mode: :possessive, min: 0, max: -1, ts: 1]
10 | include_examples 'parse', /a+b/, [0, :q] => [:one_or_more, text: '+', mode: :greedy, min: 1, max: -1, ts: 1]
11 | include_examples 'parse', /a+?b/, [0, :q] => [:one_or_more, text: '+?', mode: :reluctant, min: 1, max: -1, ts: 1]
12 | include_examples 'parse', /a++b/, [0, :q] => [:one_or_more, text: '++', mode: :possessive, min: 1, max: -1, ts: 1]
13 | include_examples 'parse', /a{2,4}b/, [0, :q] => [:interval, text: '{2,4}', mode: :greedy, min: 2, max: 4, ts: 1]
14 | include_examples 'parse', /a{2,}b/, [0, :q] => [:interval, text: '{2,}', mode: :greedy, min: 2, max: -1, ts: 1]
15 | include_examples 'parse', /a{,3}b/, [0, :q] => [:interval, text: '{,3}', mode: :greedy, min: 0, max: 3, ts: 1]
16 | include_examples 'parse', /a{4}b/, [0, :q] => [:interval, text: '{4}', mode: :greedy, min: 4, max: 4, ts: 1]
17 | include_examples 'parse', /a{004}b/, [0, :q] => [:interval, text: '{004}', mode: :greedy, min: 4, max: 4, ts: 1] # leading zeros do not change min/max
18 |
19 | # special case: exps with chained quantifiers are wrapped in implicit passive groups
20 | include_examples 'parse', /a+{2}{3}/,
21 | [0] => [:group, :passive, Group::Passive, implicit?: true, level: 0],
22 | [0, :q] => [:quantifier, :interval, Quantifier, text: '{3}', level: 0],
23 | [0, 0] => [:group, :passive, Group::Passive, implicit?: true, level: 1],
24 | [0, 0, :q] => [:quantifier, :interval, Quantifier, text: '{2}', level: 1],
25 | [0, 0, 0] => [:literal, :literal, Literal, text: 'a', level: 2],
26 | [0, 0, 0, :q] => [:quantifier, :one_or_more, Quantifier, text: '+', level: 2]
27 |
28 | # Ruby does not support modes for intervals, following `?` and `+` are read as chained quantifiers
29 | include_examples 'parse', /a{2,4}?b/,
30 | [0, :q] => [:quantifier, :zero_or_one, Quantifier, text: '?', mode: :greedy, min: 0, max: 1, ts: 6],
31 | [0, 0, :q] => [:quantifier, :interval, Quantifier, text: '{2,4}', mode: :greedy, min: 2, max: 4, ts: 1]
32 | include_examples 'parse', /a{2,4}+b/,
33 | [0, :q] => [:quantifier, :one_or_more, Quantifier, text: '+', mode: :greedy, min: 1, max: -1, ts: 6],
34 | [0, 0, :q] => [:quantifier, :interval, Quantifier, text: '{2,4}', mode: :greedy, min: 2, max: 4, ts: 1]
35 |
36 | specify('mode-checking methods') do
37 | exp = RP.parse(/a??/).first
38 |
39 | expect(exp).to be_reluctant # the predicates are checked on the expression ...
40 | expect(exp).to be_lazy
41 | expect(exp).not_to be_greedy
42 | expect(exp).not_to be_possessive
43 | expect(exp.quantifier).to be_reluctant # ... and on its quantifier
44 | expect(exp.quantifier).to be_lazy
45 | expect(exp.quantifier).not_to be_greedy
46 | expect(exp.quantifier).not_to be_possessive
47 | end
48 | end
49 |
--------------------------------------------------------------------------------
/spec/parser/set/intersections_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | # edge cases with `...-&&...` and `...&&-...` are checked in ./ranges_spec.rb
4 |
5 | RSpec.describe('CharacterSet::Intersection parsing') do # tree shape and offsets for `&&` inside character sets
6 | include_examples 'parse', /[a&&z]/,
7 | [0] => [CharacterSet, count: 1],
8 | [0, 0] => [CharacterSet::Intersection, count: 2],
9 | [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 1],
10 | [0, 0, 0, 0] => [:literal, text: 'a'],
11 | [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 1],
12 | [0, 0, 1, 0] => [:literal, text: 'z']
13 |
14 | include_examples 'parse', /[a-z&&[^a]]/, # a range intersected with a nested negated set
15 | [0] => [CharacterSet, count: 1],
16 | [0, 0] => [CharacterSet::Intersection, count: 2],
17 | [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 1],
18 | [0, 0, 0, 0] => [CharacterSet::Range, count: 2],
19 | [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 1],
20 | [0, 0, 1, 0] => [CharacterSet, count: 1]
21 |
22 | include_examples 'parse', /[a&&a-z]/,
23 | [0] => [CharacterSet, count: 1],
24 | [0, 0] => [CharacterSet::Intersection, count: 2],
25 | [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 1],
26 | [0, 0, 0, 0] => [:literal, text: 'a'],
27 | [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 1],
28 | [0, 0, 1, 0] => [CharacterSet::Range, count: 2]
29 |
30 | include_examples 'parse', /[a&&\w]/,
31 | [0] => [CharacterSet, count: 1],
32 | [0, 0] => [CharacterSet::Intersection, count: 2],
33 | [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 1],
34 | [0, 0, 1, 0] => [:word, text: '\w']
35 |
36 | include_examples 'parse', /[\h&&\w&&efg]/, # chained && yields one Intersection with three sequences
37 | [0] => [CharacterSet, count: 1],
38 | [0, 0] => [CharacterSet::Intersection, count: 3],
39 | [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 1],
40 | [0, 0, 0, 0] => [:hex, text: '\h'],
41 | [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 1],
42 | [0, 0, 1, 0] => [:word, text: '\w'],
43 | [0, 0, 2] => [CharacterSet::IntersectedSequence, count: 3],
44 | [0, 0, 2, 0] => [:literal, text: 'e'],
45 | [0, 0, 2, 1] => [:literal, text: 'f'],
46 | [0, 0, 2, 2] => [:literal, text: 'g']
47 |
48 | # test correct ts values for empty sequences
49 | include_examples 'parse', /[&&]/,
50 | [0, 0] => [CharacterSet::Intersection, text: '&&', count: 2, ts: 1],
51 | [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 0, ts: 1],
52 | [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 0, ts: 3]
53 |
54 | # test correct ts values for non-empty sequences
55 | include_examples 'parse', /[ab&&cd&&ef]/,
56 | [0, 0] => [CharacterSet::Intersection, count: 3, text: '&&', ts: 1],
57 | [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 2, to_s: 'ab', ts: 1],
58 | [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 2, to_s: 'cd', ts: 5],
59 | [0, 0, 2] => [CharacterSet::IntersectedSequence, count: 2, to_s: 'ef', ts: 9]
60 |
61 | # Some edge-case patterns are evaluated with #match to make sure that
62 | # their matching behavior still reflects the way they are parsed.
63 | # #capturing_stderr is used to skip any warnings generated by this.
64 | specify('intersections behavior remains unchanged') do
65 | capturing_stderr do
66 | expect(/[a&&z]/).not_to match 'a'
67 | expect(/[a&&z]/).not_to match '&'
68 | expect(/[a&&z]/).not_to match 'z'
69 | expect(/[a-z&&[^a]]/).not_to match 'a'
70 | expect(/[a-z&&[^a]]/).not_to match '&'
71 | expect(/[a-z&&[^a]]/).to match 'b'
72 | expect(/[a&&a-z]/).to match 'a'
73 | expect(/[a&&a-z]/).not_to match '&'
74 | expect(/[a&&a-z]/).not_to match 'b'
75 | expect(/[a&&\w]/).to match 'a'
76 | expect(/[a&&\w]/).not_to match '&'
77 | expect(/[a&&\w]/).not_to match 'b'
78 | expect(/[\h&&\w&&efg]/).to match 'e'
79 | expect(/[\h&&\w&&efg]/).to match 'f'
80 | expect(/[\h&&\w&&efg]/).not_to match 'a'
81 | expect(/[\h&&\w&&efg]/).not_to match 'g'
82 | end
83 | end
84 | end
85 |
--------------------------------------------------------------------------------
/spec/parser/set/ranges_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe('CharacterSet::Range parsing') do
4 | include_examples 'parse', '[a-z]',
5 | [0] => [CharacterSet, count: 1],
6 | [0, 0] => [CharacterSet::Range, count: 2],
7 | [0, 0, 0] => [:literal, text: 'a'],
8 | [0, 0, 1] => [:literal, text: 'z']
9 |
10 | include_examples 'parse', '[\x00-\x22]',
11 | [0] => [CharacterSet, count: 1],
12 | [0, 0] => [CharacterSet::Range, count: 2],
13 | [0, 0, 0] => [:hex, text: '\x00'],
14 | [0, 0, 1] => [:hex, text: '\x22']
15 |
16 | include_examples 'parse', '[\u{40 42}-\u1234]',
17 | [0] => [CharacterSet, count: 1],
18 | [0, 0] => [CharacterSet::Range, count: 2],
19 | [0, 0, 0] => [:codepoint_list, text: '\u{40 42}'],
20 | [0, 0, 1] => [:codepoint, text: '\u1234']
21 |
22 | include_examples 'parse', '[--z]',
23 | [0] => [CharacterSet, count: 1],
24 | [0, 0] => [CharacterSet::Range, count: 2],
25 | [0, 0, 0] => [:literal, text: '-'],
26 | [0, 0, 1] => [:literal, text: 'z']
27 |
28 | include_examples 'parse', '[!--]',
29 | [0] => [CharacterSet, count: 1],
30 | [0, 0] => [CharacterSet::Range, count: 2],
31 | [0, 0, 0] => [:literal, text: '!'],
32 | [0, 0, 1] => [:literal, text: '-']
33 |
34 | include_examples 'parse', '[!-^]',
35 | [0] => [CharacterSet, count: 1],
36 | [0, 0] => [CharacterSet::Range, count: 2],
37 | [0, 0, 0] => [:literal, text: '!'],
38 | [0, 0, 1] => [:literal, text: '^']
39 |
40 | # edge cases that are NOT treated as ranges
41 |
42 | include_examples 'parse', '[^-z]',
43 | [0] => [CharacterSet, count: 2],
44 | [0, 0] => [:literal, text: '-'],
45 | [0, 1] => [:literal, text: 'z']
46 |
47 | include_examples 'parse', '[[\-ab]&&-bc]',
48 | [0] => [CharacterSet, count: 1],
49 | [0, 0] => [CharacterSet::Intersection, count: 2],
50 | [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 1],
51 | [0, 0, 0, 0] => [CharacterSet, count: 3],
52 | [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 3],
53 | [0, 0, 1, 0] => [:literal, text: '-']
54 |
55 | include_examples 'parse', '[bc-&&[\-ab]]',
56 | [0] => [CharacterSet, count: 1],
57 | [0, 0] => [CharacterSet::Intersection, count: 2],
58 | [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 3],
59 | [0, 0, 0, 2] => [:literal, text: '-'],
60 | [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 1],
61 | [0, 0, 1, 0] => [CharacterSet, count: 3]
62 |
63 | # Some edge-case patterns are evaluated with #match to make sure that
64 | # their matching behavior still reflects the way they are parsed.
65 | # #capturing_stderr is used to skip any warnings generated by this.
66 | specify('ranges behavior remains unchanged') do
67 | capturing_stderr do
68 | expect(Regexp.new('[\x00-\x22]')).to match "\x11"
69 | expect(Regexp.new('[\u{40 42}-\u1234]')).to match "\u0600"
70 | expect(Regexp.new('[--z]')).to match 'a'
71 | expect(Regexp.new('[!--]')).to match '$'
72 | expect(Regexp.new('[!-^]')).to match '$'
73 |
74 | # edge cases that are NOT treated as ranges
75 | expect(Regexp.new('[^-z]')).to match 'a'
76 | expect(Regexp.new('[^-z]')).not_to match 'z'
77 | expect(Regexp.new('[[\-ab]&&-bc]')).to match '-'
78 | expect(Regexp.new('[[\-ab]&&-bc]')).to match 'b'
79 | expect(Regexp.new('[[\-ab]&&-bc]')).not_to match 'a'
80 | expect(Regexp.new('[[\-ab]&&-bc]')).not_to match 'c'
81 | expect(Regexp.new('[bc-&&[\-ab]]')).to match '-'
82 | expect(Regexp.new('[bc-&&[\-ab]]')).to match 'b'
83 | expect(Regexp.new('[bc-&&[\-ab]]')).not_to match 'a'
84 | expect(Regexp.new('[bc-&&[\-ab]]')).not_to match 'c'
85 | end
86 | end
87 | end
88 |
--------------------------------------------------------------------------------
/spec/parser/sets_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe('CharacterSet parsing') do
4 | include_examples 'parse', /[ab]+/,
5 | [0] => [:set, :character, CharacterSet, text: '[', count: 2, quantified?: true],
6 | [0, 0] => [:literal, :literal, Literal, text: 'a', set_level: 1],
7 | [0, 1] => [:literal, :literal, Literal, text: 'b', set_level: 1]
8 |
9 | include_examples 'parse', /[a\dc]/,
10 | [0] => [:set, :character, CharacterSet, text: '[', count: 3],
11 | [0, 1] => [:type, :digit, CharacterType::Digit]
12 |
13 | include_examples 'parse', /[a\bc]/,
14 | [0] => [:set, :character, CharacterSet, text: '[', count: 3],
15 | [0, 1] => [:escape, :backspace, EscapeSequence::Backspace, text: '\b']
16 |
17 | include_examples 'parse', '[a\xFz]',
18 | [0] => [:set, :character, CharacterSet, text: '[', count: 3],
19 | [0, 1] => [:escape, :hex, EscapeSequence::Hex, text: '\xF']
20 |
21 | include_examples 'parse', '[a\x20c]',
22 | [0] => [:set, :character, CharacterSet, text: '[', count: 3],
23 | [0, 1] => [:escape, :hex, EscapeSequence::Hex, text: '\x20']
24 |
25 | include_examples 'parse', '[a\77c]',
26 | [0] => [:set, :character, CharacterSet, text: '[', count: 3],
27 | [0, 1] => [:escape, :octal, EscapeSequence::Octal, text: '\77']
28 |
29 | include_examples 'parse', '[a\u0640c]',
30 | [0] => [:set, :character, CharacterSet, text: '[', count: 3],
31 | [0, 1] => [:escape, :codepoint, EscapeSequence::Codepoint, text: '\u0640']
32 |
33 | include_examples 'parse', '[a\u{41 1F60D}c]',
34 | [0] => [:set, :character, CharacterSet, text: '[', count: 3],
35 | [0, 1] => [:escape, :codepoint_list, EscapeSequence::CodepointList, text: '\u{41 1F60D}']
36 |
37 | include_examples 'parse', '[[:digit:][:^lower:]]+',
38 | [0] => [:set, :character, CharacterSet, text: '[', count: 2],
39 | [0, 0] => [:posixclass, :digit, PosixClass, text: '[:digit:]'],
40 | [0, 1] => [:nonposixclass, :lower, PosixClass, text: '[:^lower:]']
41 |
42 | include_examples 'parse', '[a[b[c]d]e]',
43 | [0] => [:set, :character, CharacterSet, text: '[', count: 3, set_level: 0],
44 | [0, 0] => [:literal, :literal, Literal, text: 'a', set_level: 1],
45 | [0, 1] => [:set, :character, CharacterSet, text: '[', count: 3, set_level: 1],
46 | [0, 2] => [:literal, :literal, Literal, text: 'e', set_level: 1],
47 | [0, 1, 1] => [:set, :character, CharacterSet, text: '[', count: 1, set_level: 2],
48 | [0, 1, 1, 0] => [:literal, :literal, Literal, text: 'c', set_level: 3]
49 |
50 | include_examples 'parse', '[a[^b[c]]]',
51 | [0] => [:set, :character, CharacterSet, text: '[', count: 2, set_level: 0],
52 | [0, 0] => [:literal, :literal, Literal, text: 'a', set_level: 1],
53 | [0, 1] => [:set, :character, CharacterSet, text: '[', count: 2, set_level: 1],
54 | [0, 1, 0] => [:literal, :literal, Literal, text: 'b', set_level: 2],
55 | [0, 1, 1] => [:set, :character, CharacterSet, text: '[', count: 1, set_level: 2],
56 | [0, 1, 1, 0] => [:literal, :literal, Literal, text: 'c', set_level: 3]
57 |
58 | include_examples 'parse', '[aaa]',
59 | [0] => [:set, :character, CharacterSet, text: '[', count: 3],
60 | [0, 0] => [:literal, :literal, Literal, text: 'a'],
61 | [0, 1] => [:literal, :literal, Literal, text: 'a'],
62 | [0, 2] => [:literal, :literal, Literal, text: 'a']
63 |
64 | include_examples 'parse', '[ ]',
65 | [0] => [:set, :character, CharacterSet, text: '[', count: 3],
66 | [0, 0] => [:literal, :literal, Literal, text: ' '],
67 | [0, 1] => [:literal, :literal, Literal, text: ' '],
68 | [0, 2] => [:literal, :literal, Literal, text: ' ']
69 |
70 | include_examples 'parse', '(?x)[ ]', # shouldn't merge whitespace even in x-mode
71 | [1] => [:set, :character, CharacterSet, text: '[', count: 3],
72 | [1, 0] => [:literal, :literal, Literal, text: ' '],
73 | [1, 1] => [:literal, :literal, Literal, text: ' '],
74 | [1, 2] => [:literal, :literal, Literal, text: ' ']
75 |
76 | include_examples 'parse', '[[.span-ll.]]', # collating sequences are disabled in Onigmo
77 | [0, 0] => [:set, :character, CharacterSet, text: '[', count: 7],
78 | [0, 0, 0] => [:literal, :literal, Literal, text: '.']
79 |
80 | include_examples 'parse', '[[=e=]]', # character equivalents are disabled in Onigmo
81 | [0, 0] => [:set, :character, CharacterSet, text: '[', count: 3],
82 | [0, 0, 0] => [:literal, :literal, Literal, text: '=']
83 | end
84 |
--------------------------------------------------------------------------------
/spec/parser/types_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe('CharacterType parsing') do
4 | include_examples 'parse', /a\dc/, 1 => [:type, :digit, CharacterType::Digit]
5 | include_examples 'parse', /a\Dc/, 1 => [:type, :nondigit, CharacterType::NonDigit]
6 |
7 | include_examples 'parse', /a\sc/, 1 => [:type, :space, CharacterType::Space]
8 | include_examples 'parse', /a\Sc/, 1 => [:type, :nonspace, CharacterType::NonSpace]
9 |
10 | include_examples 'parse', /a\hc/, 1 => [:type, :hex, CharacterType::Hex]
11 | include_examples 'parse', /a\Hc/, 1 => [:type, :nonhex, CharacterType::NonHex]
12 |
13 | include_examples 'parse', /a\wc/, 1 => [:type, :word, CharacterType::Word]
14 | include_examples 'parse', /a\Wc/, 1 => [:type, :nonword, CharacterType::NonWord]
15 |
16 | include_examples 'parse', 'a\Rc', 1 => [:type, :linebreak, CharacterType::Linebreak]
17 | include_examples 'parse', 'a\Xc', 1 => [:type, :xgrapheme, CharacterType::ExtendedGrapheme]
18 | end
19 |
--------------------------------------------------------------------------------
/spec/scanner/all_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe(Regexp::Scanner) do
4 | specify('scanner returns an array') do
5 | expect(RS.scan('abc')).to be_instance_of(Array)
6 | end
7 |
8 | specify('scanner returns tokens as arrays') do
9 | tokens = RS.scan('^abc+[^one]{2,3}\b\d\C-C$')
10 | expect(tokens).to all(be_a Array)
11 | expect(tokens.map(&:length)).to all(eq 5)
12 | end
13 |
14 | specify('scanner token count') do
15 | re = /^(one|two){2,3}([^d\]efm-qz\,\-]*)(ghi)+$/i
16 | expect(RS.scan(re).length).to eq 28
17 | end
18 | end
19 |
--------------------------------------------------------------------------------
/spec/scanner/anchors_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe('Anchor scanning') do
4 | include_examples 'scan', '^abc', 0 => [:anchor, :bol, '^', 0, 1]
5 | include_examples 'scan', 'abc$', 1 => [:anchor, :eol, '$', 3, 4]
6 |
7 | include_examples 'scan', '\Aabc', 0 => [:anchor, :bos, '\A', 0, 2]
8 | include_examples 'scan', 'abc\z', 1 => [:anchor, :eos, '\z', 3, 5]
9 | include_examples 'scan', 'abc\Z', 1 => [:anchor, :eos_ob_eol, '\Z', 3, 5]
10 |
11 | include_examples 'scan', 'a\bc', 1 => [:anchor, :word_boundary, '\b', 1, 3]
12 | include_examples 'scan', 'a\Bc', 1 => [:anchor, :nonword_boundary, '\B', 1, 3]
13 |
14 | include_examples 'scan', 'a\Gc', 1 => [:anchor, :match_start, '\G', 1, 3]
15 |
16 | include_examples 'scan', "\\\\Ac", 0 => [:escape, :backslash, '\\\\', 0, 2]
17 | include_examples 'scan', "a\\\\z", 1 => [:escape, :backslash, '\\\\', 1, 3]
18 | include_examples 'scan', "a\\\\Z", 1 => [:escape, :backslash, '\\\\', 1, 3]
19 | include_examples 'scan', "a\\\\bc", 1 => [:escape, :backslash, '\\\\', 1, 3]
20 | include_examples 'scan', "a\\\\Bc", 1 => [:escape, :backslash, '\\\\', 1, 3]
21 | end
22 |
--------------------------------------------------------------------------------
/spec/scanner/delimiters_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe('Literal delimiter scanning') do
4 | include_examples 'scan', '}',
5 | 0 => [:literal, :literal, '}', 0, 1]
6 |
7 | include_examples 'scan', '}}',
8 | 0 => [:literal, :literal, '}}', 0, 2]
9 |
10 | include_examples 'scan', '{',
11 | 0 => [:literal, :literal, '{', 0, 1]
12 |
13 | include_examples 'scan', '{{',
14 | 0 => [:literal, :literal, '{{', 0, 2]
15 |
16 | include_examples 'scan', '{}',
17 | 0 => [:literal, :literal, '{}', 0, 2]
18 |
19 | include_examples 'scan', '}{',
20 | 0 => [:literal, :literal, '}{', 0, 2]
21 |
22 | include_examples 'scan', '}{+',
23 | 0 => [:literal, :literal, '}{', 0, 2]
24 |
25 | include_examples 'scan', '{{var}}',
26 | 0 => [:literal, :literal, '{{var}}', 0, 7]
27 |
28 | include_examples 'scan', 'a{1,2',
29 | 0 => [:literal, :literal, 'a{1,2', 0, 5]
30 |
31 | include_examples 'scan', '({.+})',
32 | 0 => [:group, :capture, '(', 0, 1],
33 | 1 => [:literal, :literal, '{', 1, 2],
34 | 2 => [:meta, :dot, '.', 2, 3],
35 | 3 => [:quantifier, :one_or_more, '+', 3, 4],
36 | 4 => [:literal, :literal, '}', 4, 5],
37 | 5 => [:group, :close, ')', 5, 6]
38 |
39 | include_examples 'scan', ']',
40 | 0 => [:literal, :literal, ']', 0, 1]
41 |
42 | include_examples 'scan', ']]',
43 | 0 => [:literal, :literal, ']]', 0, 2]
44 |
45 | include_examples 'scan', ']\[',
46 | 0 => [:literal, :literal, ']', 0, 1],
47 | 1 => [:escape, :set_open, '\[', 1, 3]
48 |
49 | include_examples 'scan', '()',
50 | 0 => [:group, :capture, '(', 0, 1],
51 | 1 => [:group, :close, ')', 1, 2]
52 | end
53 |
--------------------------------------------------------------------------------
/spec/scanner/keep_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe('Keep scanning') do
4 | include_examples 'scan', /ab\Kcd/,
5 | 1 => [:keep, :mark, '\K', 2, 4]
6 |
7 | include_examples 'scan', /(a\Kb)|(c\\\Kd)ef/,
8 | 2 => [:keep, :mark, '\K', 2, 4],
9 | 9 => [:keep, :mark, '\K', 11, 13]
10 | end
11 |
--------------------------------------------------------------------------------
/spec/scanner/literals_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe('UTF8 scanning') do
4 | # ascii, single byte characters
5 | include_examples 'scan', 'a',
6 | 0 => [:literal, :literal, 'a', 0, 1]
7 |
8 | include_examples 'scan', 'ab+',
9 | 0 => [:literal, :literal, 'ab', 0, 2],
10 | 1 => [:quantifier, :one_or_more, '+', 2, 3]
11 |
12 | # 2 byte wide characters
13 | include_examples 'scan', 'äöü',
14 | 0 => [:literal, :literal, 'äöü', 0, 3]
15 |
16 | # 3 byte wide characters, Japanese
17 | include_examples 'scan', 'ab?れます+cd',
18 | 0 => [:literal, :literal, 'ab', 0, 2],
19 | 1 => [:quantifier, :zero_or_one, '?', 2, 3],
20 | 2 => [:literal, :literal, 'れます', 3, 6],
21 | 3 => [:quantifier, :one_or_more, '+', 6, 7],
22 | 4 => [:literal, :literal, 'cd', 7, 9]
23 |
24 | # 4 byte wide characters, Osmanya
25 | include_examples 'scan', '𐒀𐒁?𐒂ab+𐒃',
26 | 0 => [:literal, :literal, '𐒀𐒁', 0, 2],
27 | 1 => [:quantifier, :zero_or_one, '?', 2, 3],
28 | 2 => [:literal, :literal, '𐒂ab', 3, 6],
29 | 3 => [:quantifier, :one_or_more, '+', 6, 7],
30 | 4 => [:literal, :literal, '𐒃', 7, 8]
31 |
32 | include_examples 'scan', 'mu𝄞?si*𝄫c+',
33 | 0 => [:literal, :literal, 'mu𝄞', 0, 3],
34 | 1 => [:quantifier, :zero_or_one, '?', 3, 4],
35 | 2 => [:literal, :literal, 'si', 4, 6],
36 | 3 => [:quantifier, :zero_or_more, '*', 6, 7],
37 | 4 => [:literal, :literal, '𝄫c', 7, 9],
38 | 5 => [:quantifier, :one_or_more, '+', 9, 10]
39 | end
40 |
--------------------------------------------------------------------------------
/spec/scanner/meta_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe('Meta scanning') do
4 | include_examples 'scan', /abc??|def*+|ghi+/,
5 | 0 => [:literal, :literal, 'abc', 0, 3],
6 | 1 => [:quantifier, :zero_or_one_reluctant, '??', 3, 5],
7 | 2 => [:meta, :alternation, '|', 5, 6],
8 | 3 => [:literal, :literal, 'def', 6, 9],
9 | 4 => [:quantifier, :zero_or_more_possessive, '*+', 9, 11],
10 | 5 => [:meta, :alternation, '|', 11, 12]
11 |
12 | include_examples 'scan', /(a\|b)|(c|d)\|(e[|]f)/,
13 | 2 => [:escape, :alternation, '\|', 2, 4],
14 | 5 => [:meta, :alternation, '|', 6, 7],
15 | 8 => [:meta, :alternation, '|', 9, 10],
16 | 11 => [:escape, :alternation, '\|', 12, 14],
17 | 15 => [:literal, :literal, '|', 17, 18]
18 | end
19 |
--------------------------------------------------------------------------------
/spec/scanner/options_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe('passing options to scan') do
4 | def expect_type_tokens(tokens, type_tokens)
5 | expect(tokens.map { |type, token, *| [type, token] }).to eq(type_tokens)
6 | end
7 |
8 | it 'raises if scanning from a Regexp and options are passed' do
9 | expect { RS.scan(/a+/, options: ::Regexp::EXTENDED) }.to raise_error(
10 | ArgumentError,
11 | 'options cannot be supplied unless scanning a String'
12 | )
13 | end
14 |
15 | it 'sets free_spacing based on options if scanning from a String' do
16 | expect_type_tokens(
17 | RS.scan('a+#c', options: ::Regexp::MULTILINE | ::Regexp::EXTENDED),
18 | [
19 | %i[literal literal],
20 | %i[quantifier one_or_more],
21 | %i[free_space comment]
22 | ]
23 | )
24 | end
25 |
26 | it 'does not set free_spacing if scanning from a String and passing no options' do
27 | expect_type_tokens(
28 | RS.scan('a+#c'),
29 | [
30 | %i[literal literal],
31 | %i[quantifier one_or_more],
32 | %i[literal literal]
33 | ]
34 | )
35 | end
36 | end
37 |
--------------------------------------------------------------------------------
/spec/scanner/properties_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe('Property scanning') do
4 | RSpec.shared_examples 'scan property' do |text, token|
5 | it("scans \\p{#{text}} as property #{token}") do
6 | result = RS.scan("\\p{#{text}}")[0]
7 | expect(result[0..1]).to eq [:property, token]
8 | end
9 |
10 | it("scans \\P{#{text}} as nonproperty #{token}") do
11 | result = RS.scan("\\P{#{text}}")[0]
12 | expect(result[0..1]).to eq [:nonproperty, token]
13 | end
14 |
15 | it("scans \\p{^#{text}} as nonproperty #{token}") do
16 | result = RS.scan("\\p{^#{text}}")[0]
17 | expect(result[0..1]).to eq [:nonproperty, token]
18 | end
19 |
20 | it("scans double-negated \\P{^#{text}} as property #{token}") do
21 | result = RS.scan("\\P{^#{text}}")[0]
22 | expect(result[0..1]).to eq [:property, token]
23 | end
24 | end
25 |
26 | include_examples 'scan property', 'Alnum', :alnum
27 |
28 | include_examples 'scan property', 'XPosixPunct', :xposixpunct
29 |
30 | include_examples 'scan property', 'Newline', :newline
31 |
32 | include_examples 'scan property', 'Any', :any
33 |
34 | include_examples 'scan property', 'Assigned', :assigned
35 |
36 | include_examples 'scan property', 'Age=1.1', :'age=1.1'
37 | include_examples 'scan property', 'Age=10.0', :'age=10.0'
38 |
39 | include_examples 'scan property', 'ahex', :ascii_hex_digit
40 | include_examples 'scan property', 'ASCII_Hex_Digit', :ascii_hex_digit # test underscore
41 |
42 | include_examples 'scan property', 'sd', :soft_dotted
43 | include_examples 'scan property', 'Soft-Dotted', :soft_dotted # test dash
44 |
45 | include_examples 'scan property', 'Egyp', :egyptian_hieroglyphs
46 | include_examples 'scan property', 'Egyptian Hieroglyphs', :egyptian_hieroglyphs # test whitespace
47 |
48 | include_examples 'scan property', 'Linb', :linear_b
49 | include_examples 'scan property', 'Linear-B', :linear_b # test dash
50 |
51 | include_examples 'scan property', 'InArabic', :in_arabic # test block
52 | include_examples 'scan property', 'in Arabic', :in_arabic # test block w. whitespace
53 | include_examples 'scan property', 'In_Arabic', :in_arabic # test block w. underscore
54 |
55 | include_examples 'scan property', 'Yiii', :yi
56 | include_examples 'scan property', 'Yi', :yi
57 |
58 | include_examples 'scan property', 'Zinh', :inherited
59 | include_examples 'scan property', 'Inherited', :inherited
60 | include_examples 'scan property', 'Qaai', :inherited
61 |
62 | include_examples 'scan property', 'Zzzz', :unknown
63 | include_examples 'scan property', 'Unknown', :unknown
64 | end
65 |
--------------------------------------------------------------------------------
/spec/scanner/quantifiers_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe('Quantifier scanning') do
4 | include_examples 'scan', 'a?', 1 => [:quantifier, :zero_or_one, '?', 1, 2]
5 | include_examples 'scan', 'a??', 1 => [:quantifier, :zero_or_one_reluctant, '??', 1, 3]
6 | include_examples 'scan', 'a?+', 1 => [:quantifier, :zero_or_one_possessive, '?+', 1, 3]
7 |
8 | include_examples 'scan', 'a*', 1 => [:quantifier, :zero_or_more, '*', 1, 2]
9 | include_examples 'scan', 'a*?', 1 => [:quantifier, :zero_or_more_reluctant, '*?', 1, 3]
10 | include_examples 'scan', 'a*+', 1 => [:quantifier, :zero_or_more_possessive, '*+', 1, 3]
11 |
12 | include_examples 'scan', 'a+', 1 => [:quantifier, :one_or_more, '+', 1, 2]
13 | include_examples 'scan', 'a+?', 1 => [:quantifier, :one_or_more_reluctant, '+?', 1, 3]
14 | include_examples 'scan', 'a++', 1 => [:quantifier, :one_or_more_possessive, '++', 1, 3]
15 |
16 | include_examples 'scan', 'a{2}', 1 => [:quantifier, :interval, '{2}', 1, 4]
17 | include_examples 'scan', 'a{2,}', 1 => [:quantifier, :interval, '{2,}', 1, 5]
18 | include_examples 'scan', 'a{,2}', 1 => [:quantifier, :interval, '{,2}', 1, 5]
19 | include_examples 'scan', 'a{2,4}', 1 => [:quantifier, :interval, '{2,4}', 1, 6]
20 |
21 | # special case: chained quantifiers
22 | include_examples 'scan', 'a+{2}{3}', 1 => [:quantifier, :one_or_more, '+', 1, 2]
23 | include_examples 'scan', 'a+{2}{3}', 2 => [:quantifier, :interval, '{2}', 2, 5]
24 | include_examples 'scan', 'a+{2}{3}', 3 => [:quantifier, :interval, '{3}', 5, 8]
25 | end
26 |
--------------------------------------------------------------------------------
/spec/scanner/refcalls_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe('RefCall scanning') do
4 | # Traditional numerical group back-reference
5 | include_examples 'scan', '(abc)\1' , 3 => [:backref, :number, '\1', 5, 7]
6 |
7 | # Group back-references, named, numbered, and relative
8 | #
9 | # NOTE: only \g supports forward-looking references using '+', e.g. \g<+1>
10 | # refers to the next group, but \k<+1> refers to a group named '+1'.
11 | # Inversely, only \k supports addition or subtraction of a recursion level.
12 | # E.g. \k<x+0> refers to a group named 'x' at the current recursion level,
13 | # but \g<x+0> refers to a group named 'x+0'.
14 | #
15 | include_examples 'scan', '(?<X>abc)\k<X>', 3 => [:backref, :name_ref_ab, '\k<X>', 9, 14]
16 | include_examples 'scan', "(?<X>abc)\\k'X'", 3 => [:backref, :name_ref_sq, "\\k'X'", 9, 14]
17 |
18 | include_examples 'scan', '(?<+1>abc)\k<+1>', 3 => [:backref, :name_ref_ab, '\k<+1>', 10, 16]
19 | include_examples 'scan', "(?<+1>abc)\\k'+1'", 3 => [:backref, :name_ref_sq, "\\k'+1'", 10, 16]
20 |
21 | include_examples 'scan', '(abc)\k<1>', 3 => [:backref, :number_ref_ab, '\k<1>', 5, 10]
22 | include_examples 'scan', "(abc)\\k'1'", 3 => [:backref, :number_ref_sq, "\\k'1'", 5, 10]
23 | include_examples 'scan', "(abc)\\k'001'", 3 => [:backref, :number_ref_sq, "\\k'001'", 5, 12]
24 |
25 | include_examples 'scan', '(abc)\k<-1>', 3 => [:backref, :number_rel_ref_ab, '\k<-1>', 5, 11]
26 | include_examples 'scan', "(abc)\\k'-1'", 3 => [:backref, :number_rel_ref_sq, "\\k'-1'", 5, 11]
27 | include_examples 'scan', '(abc)\k<-001>', 3 => [:backref, :number_rel_ref_ab, '\k<-001>', 5, 13]
28 |
29 | # Sub-expression invocation, named, numbered, and relative
30 | include_examples 'scan', '(?<X>abc)\g<X>', 3 => [:backref, :name_call_ab, '\g<X>', 9, 14]
31 | include_examples 'scan', "(?<X>abc)\\g'X'", 3 => [:backref, :name_call_sq, "\\g'X'", 9, 14]
32 |
33 | include_examples 'scan', '(?<X>abc)\g<X-1>', 3 => [:backref, :name_call_ab, '\g<X-1>', 9, 16]
34 | include_examples 'scan', "(?<X>abc)\\g'X-1'", 3 => [:backref, :name_call_sq, "\\g'X-1'", 9, 16]
35 |
36 | include_examples 'scan', '(abc)\g<1>', 3 => [:backref, :number_call_ab, '\g<1>', 5, 10]
37 | include_examples 'scan', "(abc)\\g'1'", 3 => [:backref, :number_call_sq, "\\g'1'", 5, 10]
38 | include_examples 'scan', '(abc)\g<001>', 3 => [:backref, :number_call_ab, '\g<001>', 5, 12]
39 |
40 | include_examples 'scan', 'a(b|\g<0>)', 4 => [:backref, :number_call_ab, '\g<0>', 4, 9]
41 | include_examples 'scan', "a(b|\\g'0')", 4 => [:backref, :number_call_sq, "\\g'0'", 4, 9]
42 |
43 | include_examples 'scan', '(abc)\g<-1>', 3 => [:backref, :number_rel_call_ab, '\g<-1>', 5, 11]
44 | include_examples 'scan', "(abc)\\g'-1'", 3 => [:backref, :number_rel_call_sq, "\\g'-1'", 5, 11]
45 | include_examples 'scan', '(abc)\g<-001>', 3 => [:backref, :number_rel_call_ab, '\g<-001>', 5, 13]
46 |
47 | include_examples 'scan', '\g<+1>(abc)', 0 => [:backref, :number_rel_call_ab, '\g<+1>', 0, 6]
48 | include_examples 'scan', "\\g'+1'(abc)", 0 => [:backref, :number_rel_call_sq, "\\g'+1'", 0, 6]
49 |
50 | # Group back-references, with recursion level
51 | include_examples 'scan', '(?<X>abc)\k<X-0>', 3 => [:backref, :name_recursion_ref_ab, '\k<X-0>', 9, 16]
52 | include_examples 'scan', "(?<X>abc)\\k'X-0'", 3 => [:backref, :name_recursion_ref_sq, "\\k'X-0'", 9, 16]
53 |
54 | include_examples 'scan', '(abc)\k<1-0>', 3 => [:backref, :number_recursion_ref_ab, '\k<1-0>', 5, 12]
55 | include_examples 'scan', "(abc)\\k'1-0'", 3 => [:backref, :number_recursion_ref_sq, "\\k'1-0'", 5, 12]
56 |
57 | include_examples 'scan', '(abc)\k<+1-0>', 3 => [:backref, :name_recursion_ref_ab, '\k<+1-0>', 5, 13]
58 | include_examples 'scan', "(abc)\\k'+1-0'", 3 => [:backref, :name_recursion_ref_sq, "\\k'+1-0'", 5, 13]
59 | end
60 |
--------------------------------------------------------------------------------
/spec/scanner/types_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe('Type scanning') do
4 | include_examples 'scan', 'a\dc', 1 => [:type, :digit, '\d', 1, 3]
5 | include_examples 'scan', 'a\Dc', 1 => [:type, :nondigit, '\D', 1, 3]
6 | include_examples 'scan', 'a\hc', 1 => [:type, :hex, '\h', 1, 3]
7 | include_examples 'scan', 'a\Hc', 1 => [:type, :nonhex, '\H', 1, 3]
8 | include_examples 'scan', 'a\sc', 1 => [:type, :space, '\s', 1, 3]
9 | include_examples 'scan', 'a\Sc', 1 => [:type, :nonspace, '\S', 1, 3]
10 | include_examples 'scan', 'a\wc', 1 => [:type, :word, '\w', 1, 3]
11 | include_examples 'scan', 'a\Wc', 1 => [:type, :nonword, '\W', 1, 3]
12 | include_examples 'scan', 'a\Rc', 1 => [:type, :linebreak, '\R', 1, 3]
13 | include_examples 'scan', 'a\Xc', 1 => [:type, :xgrapheme, '\X', 1, 3]
14 | end
15 |
--------------------------------------------------------------------------------
/spec/spec_helper.rb:
--------------------------------------------------------------------------------
1 | $VERBOSE = true
2 |
3 | require 'leto'
4 | require 'regexp_property_values'
5 | require_relative 'support/capturing_stderr'
6 | require_relative 'support/shared_examples'
7 |
8 | req_warn = capturing_stderr { @required_now = require('regexp_parser') }
9 | req_warn.empty? || fail("requiring parser generated warnings:\n#{req_warn}")
10 | @required_now || fail("regexp_parser was required earlier than expected")
11 |
12 | RS = Regexp::Scanner
13 | RL = Regexp::Lexer
14 | RP = Regexp::Parser
15 | RE = Regexp::Expression
16 | T = Regexp::Syntax::Token
17 |
18 | include Regexp::Expression
19 |
20 | def ruby_version_at_least(version)
21 | Gem::Version.new(RUBY_VERSION.dup) >= Gem::Version.new(version)
22 | end
23 |
24 | RSpec.configure do |config|
25 | config.around(:example) do |example|
26 | # treat unexpected warnings as failures
27 | expect { example.run }.not_to output.to_stderr
28 | end
29 | end
30 |
31 | def s(klass, text = '', *children)
32 | exp = klass.construct(text: text.to_s)
33 | children.each { |child| exp.expressions << child }
34 | exp
35 | end
36 |
37 | def regexp_with_all_features
38 | return /dummy/ unless ruby_version_at_least('2.4.1')
39 |
40 | Regexp.new(<<-'REGEXP', Regexp::EXTENDED)
41 | \A
42 | a++
43 | (?:
44 | \b {2}
45 | (?>
46 | c ??
47 | 😀😀😀
48 | # 😄😄😄
49 | (?# 😃😃😃 )
50 | (
51 | \d *+
52 | (
53 | ALT1
54 | |
55 | ALT2
56 | )
57 | ) {004}
58 | |
59 | [ä-ü&&ö[:ascii:]\p{thai}] {6}
60 | |
61 | \z
62 | )
63 | (?=lm{8}) ?+
64 | \K
65 | (?~
66 | \1
67 | \g<-1> {10}
68 | \uFFFF
69 | \012
70 | )
71 | (?(1)
72 | BRANCH1
73 | |
74 | BRANCH2
75 | )
76 | )
77 | REGEXP
78 | end
79 |
--------------------------------------------------------------------------------
/spec/support/capturing_stderr.rb:
--------------------------------------------------------------------------------
1 | require 'stringio'
2 |
3 | def capturing_stderr(&block)
4 | old_stderr, $stderr = $stderr, StringIO.new
5 | block.call
6 | $stderr.string
7 | ensure
8 | $stderr = old_stderr
9 | end
10 |
--------------------------------------------------------------------------------
/spec/support/shared_examples.rb:
--------------------------------------------------------------------------------
1 | RSpec.shared_examples 'syntax' do |opts|
2 | opts[:implements].each do |type, tokens|
3 | tokens.each do |token|
4 | it("implements #{token} #{type}") do
5 | expect(described_class.implements?(type, token)).to be true
6 | end
7 | end
8 | end
9 |
10 | opts[:excludes] && opts[:excludes].each do |type, tokens|
11 | tokens.each do |token|
12 | it("does not implement #{token} #{type}") do
13 | expect(described_class.implements?(type, token)).to be false
14 | end
15 | end
16 | end
17 | end
18 |
19 | RSpec.shared_examples 'scan' do |pattern, checks|
20 | context "given the pattern #{pattern}" do
21 | before(:all) { @tokens = Regexp::Scanner.scan(pattern) }
22 |
23 | checks.each do |index, (type, token, text, ts, te)|
24 | it "scans token #{index} as #{token} #{type} at #{ts}..#{te}" do
25 | result = @tokens.at(index)
26 | result || fail("no token at index #{index}, max is #{@tokens.size - 1}")
27 |
28 | expect(result[0]).to eq type
29 | expect(result[1]).to eq token
30 | expect(result[2]).to eq text
31 | expect(result[3]).to eq ts
32 | expect(result[4]).to eq te
33 | end
34 | end
35 | end
36 | end
37 |
38 | RSpec.shared_examples 'lex' do |pattern, checks|
39 | context "given the pattern #{pattern}" do
40 | before(:all) { @tokens = Regexp::Lexer.lex(pattern) }
41 |
42 | checks.each do |index, (type, token, text, ts, te, lvl, set_lvl, cond_lvl)|
43 | it "lexes token #{index} as #{token} #{type} at #{lvl}, #{set_lvl}, #{cond_lvl}" do
44 | struct = @tokens.at(index)
45 |
46 | expect(struct.type).to eq type
47 | expect(struct.token).to eq token
48 | expect(struct.text).to eq text
49 | expect(struct.ts).to eq ts
50 | expect(struct.te).to eq te
51 | expect(struct.level).to eq lvl
52 | expect(struct.set_level).to eq set_lvl
53 | expect(struct.conditional_level).to eq cond_lvl
54 | end
55 | end
56 | end
57 | end
58 |
# Shared examples that check expressions in the tree returned by
# Regexp::Parser.parse (parsed with the '*' syntax).
#
# pattern - the pattern handed to the parser.
# checks  - maps a dig path (a single index or an array of indices,
#           optionally ending in :q to inspect the quantifier of the
#           expression) to an array of expectations. Read from the end, the
#           array may hold a Hash of attribute => value pairs, then a Class,
#           then the token, then the type.
RSpec.shared_examples 'parse' do |pattern, checks|
  context "given the pattern #{pattern}" do
    # Parse once per pattern; all examples below read the same tree.
    before(:all) { @root = Regexp::Parser.parse(pattern, '*') }

    checks.each do |path, expectations|
      # Work on copies: the pops below must not modify the caller's hash
      # keys or values (Array() returns an array argument as-is).
      path = Array(path).dup
      expectations = expectations.dup

      inspect_quantifier = path.last == :q && path.pop

      attributes = expectations.pop if expectations.last.is_a?(Hash)
      klass      = expectations.pop if expectations.last.is_a?(Class)
      token      = expectations.pop
      type       = expectations.pop

      description = klass || token || type || 'Expression'

      it "parses expression at #{path} as #{description}" do
        exp = @root.dig(*path)
        exp = exp.quantifier if inspect_quantifier

        # Each expectation is optional and only checked when it was given.
        klass && expect(exp).to(be_instance_of(klass))
        type  && expect(exp.type).to(eq(type))
        token && expect(exp.token).to(eq(token))

        attributes && attributes.each do |method, value|
          actual = exp.send(method)
          expect(actual).to eq(value),
            "expected #{description} at #{path} to "\
            "have #{method} #{value.inspect}, got #{actual.inspect}"
        end
      end
    end
  end
end
92 |
--------------------------------------------------------------------------------
/spec/syntax/syntax_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
RSpec.describe(Regexp::Syntax) do
  describe('::for') do
    # Syntax name => name of the constant under Regexp::Syntax that the
    # lookup is expected to return. The constant is resolved inside each
    # example, i.e. when the example runs.
    {
      'ruby/1.8.6'  => :V1_8_6,
      'ruby/1.8'    => :V1_8_6,
      'ruby/1.9.1'  => :V1_9_1,
      'ruby/1.9'    => :V1_9_3,
      'ruby/2.0.0'  => :V2_0_0,
      'ruby/2.0'    => :V2_0_0,
      'ruby/2.1'    => :V2_0_0,
      'ruby/2.2.0'  => :V2_2_0,
      'ruby/2.2.10' => :V2_2_0,
      'ruby/2.2'    => :V2_2_0,
      'ruby/2.3.0'  => :V2_3_0,
      'ruby/2.3'    => :V2_3_0,
      'ruby/2.4.0'  => :V2_4_0,
      'ruby/2.4.1'  => :V2_4_1,
      'ruby/2.5.0'  => :V2_5_0,
      'ruby/2.5'    => :V2_5_0,
      'ruby/2.6.0'  => :V2_6_0,
      'ruby/2.6.2'  => :V2_6_2,
      'ruby/2.6.3'  => :V2_6_3,
      'ruby/2.6'    => :V2_6_3,
      'ruby/3.0.0'  => :V2_6_3,
      'ruby/3.0'    => :V2_6_3,
      'ruby/3.1.0'  => :V3_1_0,
      'ruby/3.1'    => :V3_1_0,
      'ruby/3.2.0'  => :V3_2_0,
      'ruby/3.2'    => :V3_2_0,
      'any'         => :Any,
      '*'           => :Any,
    }.each do |name, constant|
      it { expect(Regexp::Syntax.for(name)).to eq Regexp::Syntax.const_get(constant) }
    end

    it 'raises for unknown names' do
      lookup = -> { Regexp::Syntax.for('ruby/1.0') }
      expect(&lookup).to raise_error(Regexp::Syntax::UnknownSyntaxNameError)
    end

    it 'raises for invalid names' do
      %w[2.0.0 ruby/20].each do |invalid_name|
        expect { Regexp::Syntax.version_class(invalid_name) }
          .to raise_error(Regexp::Syntax::InvalidVersionNameError)
      end
    end
  end

  specify('::new is a deprecated alias of ::for') do
    # The inner expectation checks the return value, the outer one the warning.
    expect do
      expect(Regexp::Syntax.new('ruby/2.0.0')).to eq Regexp::Syntax::V2_0_0
    end.to output(/deprecated/).to_stderr
  end

  specify('not implemented') do
    expect { RP.parse('\p{alpha}', 'ruby/1.8') }
      .to raise_error(Regexp::Syntax::NotImplementedError)
  end

  specify('supported?') do
    expect(Regexp::Syntax.supported?('ruby/1.1.1')).to be false

    %w[ruby/2.4.3 ruby/2.5].each do |name|
      expect(Regexp::Syntax.supported?(name)).to be true
    end
  end

  specify('raises for unknown constant lookups') do
    expect { Regexp::Syntax::V1 }.to raise_error(/V1/)
  end

  specify('instantiation is deprecated but still works') do
    expect { @instance = Regexp::Syntax::V3_1_0.new }
      .to output(/deprecated/).to_stderr

    expect do
      expect(@instance.implements?(:literal, :literal)).to be true
    end.to output(/deprecated/).to_stderr
  end
end
71 |
--------------------------------------------------------------------------------
/spec/syntax/syntax_token_map_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
RSpec.describe(Regexp::Syntax::Token::Map) do
  let(:map)            { Regexp::Syntax::Token::Map }
  let(:current_syntax) { Regexp::Syntax::CURRENT }

  # Every token listed in the current syntax' features must appear in the
  # map under the same type.
  specify('is complete') do
    current_syntax.features.each do |type, tokens|
      tokens.each do |token|
        expect(map[type]).to include(token)
      end
    end
  end

  specify('contains no duplicate tokens') do
    current_syntax.features.each do |_type, tokens|
      deduplicated = tokens.uniq
      expect(tokens).to eq deduplicated
    end
  end

  specify('contains no duplicate type/token combinations') do
    combinations = map.flat_map do |type, tokens|
      tokens.map { |token| "#{type} #{token}" }
    end

    # Group identical strings and keep those that occur more than once.
    repeated = combinations
      .group_by { |combination| combination }
      .reject { |_, occurrences| occurrences.one? }
      .keys

    expect(repeated).to be_empty
  end
end
29 |
--------------------------------------------------------------------------------
/spec/syntax/versions/1.8.6_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
RSpec.describe(Regexp::Syntax::V1_8_6) do
  # Token sets this syntax version is expected to implement, keyed by type.
  implemented = {
    assertion:  T::Assertion::Lookahead,
    backref:    T::Backreference::Plain,
    escape:     [
      T::Escape::Basic,
      T::Escape::ASCII,
      T::Escape::Meta,
      T::Escape::Control,
    ].reduce(:+),
    group:      T::Group::V1_8_6,
    quantifier: [
      T::Quantifier::Greedy,
      T::Quantifier::Reluctant,
      T::Quantifier::Interval,
      T::Quantifier::IntervalReluctant,
    ].reduce(:+),
  }

  # Token sets this syntax version is expected not to implement.
  non_plain_backrefs = T::Backreference::All - T::Backreference::Plain
  excluded = {
    assertion:  T::Assertion::Lookbehind,
    backref:    non_plain_backrefs + T::SubexpressionCall::All,
    quantifier: T::Quantifier::Possessive,
  }

  include_examples 'syntax', implements: implemented, excludes: excluded
end
18 |
--------------------------------------------------------------------------------
/spec/syntax/versions/1.9.1_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
RSpec.describe(Regexp::Syntax::V1_9_1) do
  # Token sets this syntax version is expected to implement, keyed by type.
  implemented = {
    escape:     [T::Escape::Hex, T::Escape::Octal, T::Escape::Unicode].reduce(:+),
    type:       T::CharacterType::Hex,
    quantifier: [
      T::Quantifier::Greedy,
      T::Quantifier::Reluctant,
      T::Quantifier::Possessive,
    ].reduce(:+),
  }

  include_examples 'syntax', implements: implemented
end
11 |
--------------------------------------------------------------------------------
/spec/syntax/versions/1.9.3_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
RSpec.describe(Regexp::Syntax::V1_9_3) do
  # The same tokens are expected for the positive and the negated property type.
  unicode_tokens = T::UnicodeProperty::Script_V1_9_3 + T::UnicodeProperty::Age_V1_9_3

  include_examples 'syntax',
    implements: { property: unicode_tokens, nonproperty: unicode_tokens }
end
10 |
--------------------------------------------------------------------------------
/spec/syntax/versions/2.0.0_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
RSpec.describe(Regexp::Syntax::V2_0_0) do
  # Token sets expected to be implemented and excluded, keyed by type.
  implemented = {
    property:    T::UnicodeProperty::Age_V2_0_0,
    nonproperty: T::UnicodeProperty::Age_V2_0_0,
  }
  excluded = {
    property:    [:newline],
    nonproperty: [:newline],
  }

  include_examples 'syntax', implements: implemented, excludes: excluded
end
14 |
--------------------------------------------------------------------------------
/spec/syntax/versions/2.2.0_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
RSpec.describe(Regexp::Syntax::V2_2_0) do
  # The same tokens are expected for the positive and the negated property type.
  unicode_tokens = T::UnicodeProperty::Script_V2_2_0 + T::UnicodeProperty::Age_V2_2_0

  include_examples 'syntax',
    implements: { property: unicode_tokens, nonproperty: unicode_tokens }
end
10 |
--------------------------------------------------------------------------------
/spec/syntax/versions/3.2.0_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
RSpec.describe(Regexp::Syntax::V3_2_0) do
  # The same tokens are expected for the positive and the negated property type.
  unicode_tokens = T::UnicodeProperty::Script_V3_2_0 + T::UnicodeProperty::Age_V3_2_0

  include_examples 'syntax',
    implements: { property: unicode_tokens, nonproperty: unicode_tokens }
end
10 |
--------------------------------------------------------------------------------
/spec/token/token_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
RSpec.describe(Regexp::Token) do
  specify('#offset') do
    tokens = RL.lex(/ab?cd/)

    # token index => [expected text, expected offset]
    expected = { 1 => ['b', [1, 2]], 2 => ['?', [2, 3]], 3 => ['cd', [3, 5]] }

    expected.each do |index, (text, offset)|
      expect(tokens[index].text).to eq text
      expect(tokens[index].offset).to eq offset
    end
  end

  specify('#length') do
    tokens = RL.lex(/abc?def/)

    # One [expected text, expected length] pair per token, in token order.
    [['ab', 2], ['c', 1], ['?', 1], ['def', 3]].each_with_index do |(text, length), index|
      expect(tokens[index].text).to eq text
      expect(tokens[index].length).to eq length
    end
  end

  specify('#to_h') do
    tokens = RL.lex(/abc?def/)

    # token index => expected hash representation
    expected = {
      0 => {
        type: :literal, token: :literal, text: 'ab', ts: 0, te: 2,
        level: 0, set_level: 0, conditional_level: 0
      },
      2 => {
        type: :quantifier, token: :zero_or_one, text: '?', ts: 3, te: 4,
        level: 0, set_level: 0, conditional_level: 0
      },
    }

    expected.each do |index, hash|
      expect(tokens[index].text).to eq hash[:text]
      expect(tokens[index].to_h).to eq hash
    end
  end

  specify('#next') do
    tokens = RL.lex(/a+b?c*d{2,3}/)

    # Walk forwards from the first token.
    first_token = tokens.first
    expect(first_token.text).to eq 'a'

    second_token = first_token.next
    expect(second_token.text).to eq '+'

    third_token = second_token.next
    expect(third_token.text).to eq 'b'

    # The last token has no successor.
    last_token = tokens.last
    expect(last_token.text).to eq '{2,3}'

    expect(last_token.next).to be_nil
  end

  specify('#previous') do
    tokens = RL.lex(/a+b?c*d{2,3}/)

    # Walk backwards from the last token.
    last_token = tokens.last
    expect(last_token.text).to eq '{2,3}'

    literal_d = last_token.previous
    expect(literal_d.text).to eq 'd'

    zero_or_more = literal_d.previous
    expect(zero_or_more.text).to eq '*'

    literal_c = zero_or_more.previous
    expect(literal_c.text).to eq 'c'

    # The first token has no predecessor.
    first_token = tokens.first
    expect(first_token.text).to eq 'a'
    expect(first_token.previous).to be_nil
  end
end
86 |
--------------------------------------------------------------------------------
/tasks/benchmark.rake:
--------------------------------------------------------------------------------
# Directory holding the benchmark scripts and the stored results log.
BENCHMARKS_DIR = "#{__dir__}/benchmarks"

desc 'Run all IPS benchmarks'
task :benchmark do
  # Load the scripts in a stable (alphabetical) order.
  Dir["#{BENCHMARKS_DIR}/*.rb"].sort.each { |file| load(file) }
end

namespace :benchmark do
  # The description names the file that is actually written below.
  desc 'Run all IPS benchmarks and store the comparison results in tasks/benchmarks/log'
  task :write_to_file do
    require 'stringio'

    # Print to the terminal and capture the same output in memory.
    string_io = StringIO.new
    with_stdouts(STDOUT, string_io) { Rake.application[:benchmark].invoke }

    # Drop everything from "Warming up" through "Comparison:" in each
    # benchmark's captured output before storing it.
    File.write "#{BENCHMARKS_DIR}/log",
      "Results of rake:benchmark on #{RUBY_DESCRIPTION}\n\n" +
      string_io.string.gsub(/Warming up.*?Comparison:/m, '')
  end
end
21 |
# Runs the given block with $stdout replaced by an object that forwards
# messages to every stream in +ios+, then restores the previous $stdout,
# even if the block raises.
#
# ios - the streams to write to (e.g. STDOUT and a StringIO).
#
# Returns the value of the block.
#
# Only messages that Array does not respond to publicly are forwarded;
# e.g. `$stdout << 'x'` would append to the array itself.
def with_stdouts(*ios)
  old_stdout = $stdout

  # Forward unknown messages (puts, print, write, sync=, ...) to each stream.
  ios.define_singleton_method(:method_missing) do |name, *args, &block|
    each { |io| io.send(name, *args, &block) }
  end

  # Report a forwarded message as supported only if all wrapped streams
  # support it. Assigning to $stdout requires the object to respond to
  # :write, which is answered by this hook.
  ios.define_singleton_method(:respond_to_missing?) do |name, include_private = false|
    all? { |io| io.respond_to?(name, include_private) }
  end

  $stdout = ios
  yield
ensure
  $stdout = old_stdout
end
31 |
--------------------------------------------------------------------------------
/tasks/benchmarks/log:
--------------------------------------------------------------------------------
1 | Results of rake:benchmark on ruby 3.1.0p0 (2021-12-25 revision fb4df44d16) [arm64-darwin21]
2 |
3 | Parsing a minimal Regexp
4 |
5 | Scanner::scan: 32069.4 i/s
6 | Lexer::lex: 30700.6 i/s - same-ish: difference falls within error
7 | Parser::parse: 26248.5 i/s - 1.22x (± 0.00) slower
8 |
9 | Parsing a complex Regexp (URI.regexp)
10 |
11 | Scanner::scan: 843.4 i/s
12 | Lexer::lex: 546.3 i/s - 1.54x (± 0.00) slower
13 | Parser::parse: 332.5 i/s - 2.54x (± 0.00) slower
14 |
15 |
--------------------------------------------------------------------------------
/tasks/benchmarks/minimal_regexp.rb:
--------------------------------------------------------------------------------
1 | require 'benchmark/ips'
2 | require_relative '../../lib/regexp_parser'
3 |
# Heading printed before the benchmark output.
puts 'Parsing a minimal Regexp'

pattern = /./

# Measure each processing stage on the same pattern and compare the results.
Benchmark.ips do |bench|
  bench.report('Scanner::scan') { Regexp::Scanner.scan(pattern) }
  bench.report('Lexer::lex')    { Regexp::Lexer.lex(pattern) }
  bench.report('Parser::parse') { Regexp::Parser.parse(pattern) }
  bench.compare!
end
14 |
--------------------------------------------------------------------------------
/tasks/benchmarks/uri_regexp.rb:
--------------------------------------------------------------------------------
1 | require 'benchmark/ips'
2 | require_relative '../../lib/regexp_parser'
3 |
# Heading printed before the benchmark output.
puts 'Parsing a complex Regexp (URI.regexp)'

require 'uri'
pattern = URI::DEFAULT_PARSER.make_regexp

# Measure each processing stage on the same pattern and compare the results.
Benchmark.ips do |bench|
  bench.report('Scanner::scan') { Regexp::Scanner.scan(pattern) }
  bench.report('Lexer::lex')    { Regexp::Lexer.lex(pattern) }
  bench.report('Parser::parse') { Regexp::Parser.parse(pattern) }
  bench.compare!
end
15 |
--------------------------------------------------------------------------------
/tasks/props.rake:
--------------------------------------------------------------------------------
namespace :props do
  desc 'Write new property value hashes for the properties scanner'
  task :update do
    require 'regexp_property_values'
    RegexpPropertyValues.update
    dir = File.join(__dir__, '../lib/regexp_parser/scanner/properties')

    # Writes the sorted pairs to `path`, one comma-joined pair per line,
    # below a header comment, and reports how many entries were written.
    write_pairs = lambda do |pairs, path|
      rows = pairs.sort.map { |pair| pair.join(',') }

      File.open(path, 'w') do |file|
        file.puts '# THIS FILE IS AUTO-GENERATED BY `rake props:update` - DO NOT EDIT',
                  *rows
      end

      puts "Wrote #{pairs.count} aliases to `#{path}`"
    end

    # identifier => downcased full name, for every property value
    long_pairs = RegexpPropertyValues.all.map do |value|
      [value.identifier, value.full_name.downcase]
    end
    write_pairs.call(long_pairs, "#{dir}/long.csv")

    # alias identifier => downcased full name of the aliased value
    short_pairs = RegexpPropertyValues.alias_hash.map do |alias_value, target|
      [alias_value.identifier, target.full_name.downcase]
    end
    write_pairs.call(short_pairs, "#{dir}/short.csv")
  end
end
27 |
--------------------------------------------------------------------------------
/tasks/ragel.rake:
--------------------------------------------------------------------------------
RAGEL_SOURCE_DIR = File.join(__dir__, '../lib/regexp_parser/scanner')
RAGEL_OUTPUT_DIR = File.join(__dir__, '../lib/regexp_parser')
RAGEL_SOURCE_FILES = %w[scanner] # scanner.rl imports the other files

namespace :ragel do
  desc 'Process the ragel source files and output ruby code'
  task rb: :install do |task|
    RAGEL_SOURCE_FILES.each do |source_file|
      source_path = "#{RAGEL_SOURCE_DIR}/#{source_file}.rl"
      output_path = "#{RAGEL_OUTPUT_DIR}/#{source_file}.rb"
      # -L = omit line hint comments
      flags = ENV['DEBUG_RAGEL'].to_i == 1 ? ['-p'] : ['-L']
      # using faster flat table driven FSM, about 25% larger code, but about 30% faster
      flags << '-F1'
      # Pass each argument separately so that no shell is involved and
      # paths containing spaces or shell metacharacters work.
      sh 'ragel', '-R', source_path, '-o', output_path, *flags

      contents = File
        .read(output_path)
        .gsub(/[ \t]+$/, '') # remove trailing whitespace emitted by ragel
        .gsub(/(?<=\d,)[ \t]+|^[ \t]+(?=-?\d)/, '') # compact FSM tables (saves ~6KB)
        .gsub(/\n(?:[ \t]*\n){2,}/, "\n\n") # compact blank lines

      # Rewrite the generated file with a header comment in front of the
      # cleaned-up ragel output.
      File.open(output_path, 'w') do |file|
        file.puts <<~RUBY
          # -*- warn-indent:false; -*-
          #
          # THIS IS A GENERATED FILE, DO NOT EDIT DIRECTLY
          #
          # This file was generated from #{source_path.split('/').last}
          # by running `bundle exec rake #{task.name}`
        RUBY

        file.write(contents)
      end
    end
  end

  desc 'Delete the ragel generated source file(s)'
  task :clean do
    RAGEL_SOURCE_FILES.each do |file|
      # rm_f does not go through a shell, so the path needs no quoting.
      rm_f "#{RAGEL_OUTPUT_DIR}/#{file}.rb"
    end
  end

  desc 'Make sure that ragel is installed'
  task :install do
    next if ENV['CI']

    if system('command -v ragel')
      # already installed
    elsif system('command -v brew')
      puts 'ragel not found, installing with homebrew ...'
      # sh shows the installer output and raises if the installation fails.
      sh 'brew install ragel'
    elsif system('command -v apt-get')
      puts 'ragel not found, installing with apt-get ...'
      sh 'sudo apt-get install -y ragel'
    else
      raise 'Could not install ragel. Please install it manually.'
    end
  end
end
62 |
--------------------------------------------------------------------------------