├── .rspec ├── lib ├── pcre2 │ ├── version.rb │ ├── error.rb │ ├── matchdata.rb │ ├── string_utils.rb │ ├── regexp.rb │ ├── lib.rb │ └── lib │ │ └── constants.rb └── pcre2.rb ├── spec ├── pcre2_spec.rb ├── regexp_spec.rb ├── lib │ └── pcre2 │ │ ├── error_spec.rb │ │ ├── lib_spec.rb │ │ ├── matchdata_spec.rb │ │ ├── string_utils_spec.rb │ │ └── regexp_spec.rb └── spec_helper.rb ├── Gemfile ├── bin ├── setup └── console ├── Rakefile ├── .gitignore ├── .github └── workflows │ └── tests.yml ├── LICENSE.txt ├── pcre2.gemspec ├── benchmark.rake └── README.md /.rspec: -------------------------------------------------------------------------------- 1 | --format documentation 2 | --color 3 | --require spec_helper 4 | -------------------------------------------------------------------------------- /lib/pcre2/version.rb: -------------------------------------------------------------------------------- 1 | module PCRE2 2 | VERSION = "0.2.0" 3 | end 4 | -------------------------------------------------------------------------------- /spec/pcre2_spec.rb: -------------------------------------------------------------------------------- 1 | RSpec.describe PCRE2 do 2 | it "has a version number" do 3 | expect(PCRE2::VERSION).not_to be nil 4 | end 5 | end 6 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | 3 | # Specify your gem's dependencies in pcre2.gemspec 4 | gemspec 5 | 6 | gem "rake", "~> 12.0" 7 | gem "rspec", "~> 3.0" 8 | -------------------------------------------------------------------------------- /bin/setup: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | IFS=$'\n\t' 4 | set -vx 5 | 6 | bundle install 7 | 8 | # Do any other automated setup that you need to do here 9 | 
-------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | require "rspec/core/rake_task" 3 | 4 | RSpec::Core::RakeTask.new(:spec) 5 | 6 | task :default => :spec 7 | 8 | load "benchmark.rake" 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.bundle/ 2 | /.yardoc 3 | /_yardoc/ 4 | /coverage/ 5 | /doc/ 6 | /pkg/ 7 | /spec/reports/ 8 | /tmp/ 9 | 10 | # rspec failure tracking 11 | .rspec_status 12 | /Gemfile.lock 13 | -------------------------------------------------------------------------------- /lib/pcre2.rb: -------------------------------------------------------------------------------- 1 | require "pcre2/version" 2 | require "pcre2/lib" 3 | require "pcre2/lib/constants" 4 | require "pcre2/string_utils" 5 | 6 | # Classes 7 | require "pcre2/error" 8 | require "pcre2/regexp" 9 | require "pcre2/matchdata" 10 | 11 | module PCRE2 12 | # Your code goes here... 
13 | end 14 | -------------------------------------------------------------------------------- /spec/regexp_spec.rb: -------------------------------------------------------------------------------- 1 | RSpec.describe Regexp do 2 | it "responds to `to_pcre2`", :skip do 3 | regexp = /hello/ 4 | 5 | pcre2_regexp = regexp.to_pcre2 6 | 7 | expect(pcre2_regexp).to be_a(PCRE2::Regexp) 8 | expect(pcre2_regexp.source).to eq(regexp.source) 9 | end 10 | end 11 | -------------------------------------------------------------------------------- /spec/lib/pcre2/error_spec.rb: -------------------------------------------------------------------------------- 1 | RSpec.describe PCRE2::Error do 2 | describe ".from_error_code" do 3 | it "has the correct error message" do 4 | error = PCRE2::Error.from_error_code(PCRE2::PCRE2_ERROR_BADDATA) 5 | 6 | expect(error.message).to match(/bad data value/) 7 | end 8 | end 9 | end 10 | -------------------------------------------------------------------------------- /lib/pcre2/error.rb: -------------------------------------------------------------------------------- 1 | class PCRE2::Error < StandardError 2 | def self.from_error_code(error_code, extra_message = nil) 3 | message = "Error #{error_code}: " 4 | message += PCRE2::Lib.get_error_message(error_code) 5 | message += " - #{extra_message}" if extra_message 6 | 7 | self.new(message) 8 | end 9 | end 10 | -------------------------------------------------------------------------------- /bin/console: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "bundler/setup" 4 | require "pcre2" 5 | 6 | # You can add fixtures and/or initialization code here to make experimenting 7 | # with your gem easier. You can also use a different console, if you like. 8 | 9 | # (If you use this, don't forget to add pry to your Gemfile!) 
10 | # require "pry" 11 | # Pry.start 12 | 13 | require "irb" 14 | IRB.start(__FILE__) 15 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | require "bundler/setup" 2 | require "pcre2" 3 | 4 | RSpec.configure do |config| 5 | # Enable flags like --only-failures and --next-failure 6 | config.example_status_persistence_file_path = ".rspec_status" 7 | 8 | # Disable RSpec exposing methods globally on `Module` and `main` 9 | config.disable_monkey_patching! 10 | 11 | config.expect_with :rspec do |c| 12 | c.syntax = :expect 13 | end 14 | end 15 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | # Setup Ruby, install gems, cache gems, and run test suite 2 | # - https://github.com/ruby/setup-ruby 3 | 4 | name: Tests 5 | on: [push, pull_request] 6 | 7 | jobs: 8 | test: 9 | runs-on: ubuntu-latest 10 | strategy: 11 | matrix: 12 | ruby: [ '2.5', '2.6' ] 13 | name: Ruby ${{ matrix.ruby }} tests 14 | steps: 15 | - uses: actions/checkout@v2 16 | - uses: ruby/setup-ruby@v1 17 | with: 18 | bundler-cache: true 19 | ruby-version: ${{ matrix.ruby }} 20 | - run: bundle install 21 | - run: bundle exec rake -t 22 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2020 David Verhasselt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and 
to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /lib/pcre2/matchdata.rb: -------------------------------------------------------------------------------- 1 | class PCRE2::MatchData 2 | attr :regexp, :pairs, :string 3 | 4 | def initialize(regexp, string, pairs) 5 | @regexp = regexp 6 | @string = string 7 | @pairs = pairs 8 | end 9 | 10 | def [](key) 11 | if !key.is_a?(Numeric) 12 | key = regexp.named_captures[key.to_s].first 13 | end 14 | 15 | if pair = pairs[key] 16 | string_from_pair(*pair) 17 | end 18 | end 19 | 20 | def offset(n) 21 | pairs[n] 22 | end 23 | 24 | def capture_pairs 25 | pairs[1..-1] 26 | end 27 | 28 | def to_a 29 | @to_a ||= pairs.map { |pair| string_from_pair(*pair) } 30 | end 31 | 32 | def captures 33 | to_a[1..-1] 34 | end 35 | 36 | def length 37 | start_of_match - end_of_match 38 | end 39 | 40 | def pre_match 41 | string[0 ... start_of_match] 42 | end 43 | 44 | def post_match 45 | string[end_of_match .. 
-1] 46 | end 47 | 48 | def start_of_match 49 | offset(0)[0] 50 | end 51 | 52 | def end_of_match 53 | offset(0)[1] 54 | end 55 | 56 | private 57 | 58 | def string_from_pair(start, ending) 59 | string.slice(start, ending-start) 60 | end 61 | end 62 | -------------------------------------------------------------------------------- /spec/lib/pcre2/lib_spec.rb: -------------------------------------------------------------------------------- 1 | RSpec.describe PCRE2::Lib do 2 | describe ".get_error_message" do 3 | it "returns an error message" do 4 | result = PCRE2::Lib.get_error_message(PCRE2::PCRE2_ERROR_NOMATCH) 5 | 6 | expect(result).to eq("no match") 7 | end 8 | 9 | it "accepts a MemoryPointer" do 10 | error_code = FFI::MemoryPointer.new(:int, 1) 11 | error_code.write_int(PCRE2::PCRE2_ERROR_NOMATCH) 12 | 13 | result = PCRE2::Lib.get_error_message(error_code) 14 | 15 | expect(result).to eq("no match") 16 | end 17 | end 18 | 19 | describe ".match" do 20 | it "returns 0 when no matches" do 21 | pattern_ptr = PCRE2::Lib.compile_pattern("hello") 22 | result_count, match_data_ptr = PCRE2::Lib.match(pattern_ptr, "goodbye") 23 | 24 | expect(result_count).to eq(0) 25 | end 26 | end 27 | 28 | describe ".compile_pattern" do 29 | errors = { 30 | '(?<>.' 
=> /Error 162: subpattern name expected/, 31 | '(.*' => /Error 114: missing closing parenthesis/, 32 | } 33 | 34 | errors.each do |pattern, error| 35 | it "raises the correct error for '#{pattern}'" do 36 | expect { PCRE2::Lib.compile_pattern(pattern) }.to raise_error(PCRE2::Error, error) 37 | end 38 | end 39 | end 40 | end 41 | -------------------------------------------------------------------------------- /pcre2.gemspec: -------------------------------------------------------------------------------- 1 | require_relative 'lib/pcre2/version' 2 | 3 | Gem::Specification.new do |spec| 4 | spec.name = "pcre2" 5 | spec.version = PCRE2::VERSION 6 | spec.authors = ["David Verhasselt"] 7 | spec.email = ["david@crowdway.com"] 8 | 9 | spec.summary = %q{Use the PCRE2 library inside your Ruby projects} 10 | spec.description = %q{Wraps the PCRE2 library using FFI so it and the advanced functionality it provides can be used in Ruby projects} 11 | spec.homepage = "https://github.com/dv/pcre2" 12 | spec.license = "MIT" 13 | spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0") 14 | 15 | spec.metadata["homepage_uri"] = spec.homepage 16 | spec.metadata["source_code_uri"] = spec.homepage 17 | spec.metadata["changelog_uri"] = spec.homepage 18 | 19 | # Specify which files should be added to the gem when it is released. 20 | # The `git ls-files -z` loads the files in the RubyGem that have been added into git. 
21 | spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do 22 | `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) } 23 | end 24 | spec.bindir = "exe" 25 | spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) } 26 | spec.require_paths = ["lib"] 27 | 28 | spec.add_dependency "ffi" 29 | end 30 | -------------------------------------------------------------------------------- /lib/pcre2/string_utils.rb: -------------------------------------------------------------------------------- 1 | module PCRE2::StringUtils 2 | def scan(string, &block) 3 | return enum_for(:scan, string).to_a if !block_given? 4 | 5 | matches(string) do |matchdata| 6 | if matchdata.captures.any? 7 | yield matchdata.captures 8 | else 9 | yield matchdata[0] 10 | end 11 | end 12 | end 13 | 14 | def split(string, &block) 15 | return enum_for(:split, string).to_a if !block_given? 16 | 17 | previous_position = 0 18 | matches(string) do |matchdata| 19 | beginning, ending = matchdata.offset(0) 20 | 21 | # If zero-length match and the previous_position is equal to the match position, just skip 22 | # it. The next zero-length match will have a different previous_position and generate a split 23 | # which results in the appearance of a "per character split" but without empty parts in the 24 | # beginning. Note that we're also skipping adding capture groups. 25 | if matchdata.length == 0 && previous_position == beginning 26 | next 27 | end 28 | 29 | yield string[previous_position ... beginning] 30 | 31 | matchdata.captures.each do |capture| 32 | yield capture 33 | end 34 | 35 | previous_position = ending 36 | end 37 | 38 | # Also return the ending of the string from the last match 39 | if previous_position < string.length 40 | yield string[previous_position .. 
-1] 41 | end 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /spec/lib/pcre2/matchdata_spec.rb: -------------------------------------------------------------------------------- 1 | RSpec.describe PCRE2::MatchData do 2 | let(:pattern) { "(?hello) (world)" } 3 | let(:string) { "one two three hello world today!" } 4 | let(:re) { PCRE2::Regexp.new(pattern) } 5 | 6 | subject(:matchdata) { re.match(string) } 7 | 8 | describe "#[]" do 9 | it "returns the full match at 0" do 10 | expect(matchdata[0]).to eq("hello world") 11 | end 12 | 13 | it "returns the subpattern match at 1" do 14 | expect(matchdata[1]).to eq("hello") 15 | end 16 | 17 | it "returns the subpattern match at 2" do 18 | expect(matchdata[2]).to eq("world") 19 | end 20 | 21 | it "returns nil for unexisting subpattern" do 22 | expect(matchdata[3]).to be_nil 23 | end 24 | 25 | it "returns the named subpattern match at 'a'" do 26 | expect(matchdata["a"]).to eq("hello") 27 | end 28 | end 29 | 30 | describe "#to_a" do 31 | it "returns an array of all matches" do 32 | expect(matchdata.to_a).to eq(["hello world", "hello", "world"]) 33 | end 34 | end 35 | 36 | describe "#pre_match and #post_match" do 37 | it "returns the correct results" do 38 | expect(matchdata.pre_match).to eq("one two three ") 39 | expect(matchdata.post_match).to eq(" today!") 40 | end 41 | end 42 | 43 | describe "#length" do 44 | it "returns 0 for a zero-length match" do 45 | re = PCRE2::Regexp.new("") 46 | matchdata = re.match("string") 47 | 48 | expect(matchdata.length).to eq(0) 49 | end 50 | end 51 | end 52 | -------------------------------------------------------------------------------- /lib/pcre2/regexp.rb: -------------------------------------------------------------------------------- 1 | module PCRE2 2 | class Regexp 3 | attr :source, :pattern_ptr 4 | 5 | include StringUtils 6 | 7 | # Accepts a String, Regexp or another PCRE2::Regexp 8 | def initialize(pattern, *options) 9 | case 
pattern 10 | when ::Regexp, PCRE2::Regexp 11 | @source = pattern.source 12 | else 13 | @source = pattern 14 | end 15 | 16 | @pattern_ptr = Lib.compile_pattern(source, options) 17 | end 18 | 19 | # Compiles the Regexp into a JIT optimised version. Returns whether it was successful 20 | def jit! 21 | options = PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_SOFT | PCRE2_JIT_PARTIAL_HARD 22 | 23 | Lib.pcre2_jit_compile_8(pattern_ptr, options) == 0 24 | end 25 | 26 | def match(str, pos = nil) 27 | result_count, match_data_ptr = Lib.match(@pattern_ptr, str, position: pos) 28 | 29 | if result_count == 0 30 | nil 31 | else 32 | pairs = PCRE2::Lib.get_ovector_pairs(match_data_ptr, result_count) 33 | 34 | MatchData.new(self, str, pairs) 35 | end 36 | end 37 | 38 | def matches(str, pos = nil, &block) 39 | return enum_for(:matches, str, pos) if !block_given? 40 | 41 | pos ||= 0 42 | while pos < str.length 43 | matchdata = self.match(str, pos) 44 | 45 | if matchdata 46 | yield matchdata 47 | 48 | beginning, ending = matchdata.offset(0) 49 | 50 | if pos == ending # Manually increment position if no change to avoid infinite loops 51 | pos += 1 52 | else 53 | pos = ending 54 | end 55 | else 56 | return 57 | end 58 | end 59 | end 60 | 61 | def named_captures 62 | @named_captures ||= Lib.named_captures(pattern_ptr) 63 | end 64 | 65 | def names 66 | named_captures.keys 67 | end 68 | end 69 | end 70 | -------------------------------------------------------------------------------- /benchmark.rake: -------------------------------------------------------------------------------- 1 | require "benchmark" 2 | require "pcre2" 3 | 4 | desc "Run a benchmark to compare PCRE2 vs Ruby's built-in Regexp" 5 | task :benchmark do 6 | def benchmark!(pattern, string) 7 | task = ->(re) { 8 | pos = 0 9 | 10 | while matchdata = re.match(string, pos) 11 | pos = matchdata.offset(0)[1] + 1 12 | end 13 | } 14 | 15 | GC.disable 16 | Benchmark.bmbm do |benchmark| 17 | ruby_re = Regexp.new(pattern) 18 | pcre2_re = 
PCRE2::Regexp.new(pattern) 19 | pcre2_re_jit = PCRE2::Regexp.new(pattern).tap(&:jit!) 20 | 21 | benchmark.report("Ruby Regexp") do 22 | 100000.times { task.call(ruby_re) } 23 | end 24 | 25 | GC.start 26 | 27 | benchmark.report("PCRE2 Regexp") do 28 | 100000.times { task.call(pcre2_re) } 29 | end 30 | 31 | GC.start 32 | 33 | benchmark.report("PCRE2 Regexp - JIT enhanced") do 34 | 100000.times { task.call(pcre2_re_jit) } 35 | end 36 | end 37 | GC.enable 38 | 39 | puts 40 | puts 41 | puts 42 | end 43 | 44 | puts "Benchmark 1: Small pattern, big string" 45 | puts 46 | 47 | pattern = "hello" 48 | string = "abab" * 1000 49 | string += "hello" 50 | string += "abab" * 1000 51 | 52 | benchmark!(pattern, string) 53 | 54 | 55 | puts "Benchmark 2: Big pattern, big string" 56 | puts 57 | 58 | pattern = "hello" * 50 59 | string = "abab" * 1000 60 | string += "hello" 61 | string += "abab" * 1000 62 | string += pattern 63 | string += "abab" * 1000 64 | 65 | benchmark!(pattern, string) 66 | 67 | 68 | puts "Benchmark 3: Small pattern, small string" 69 | puts 70 | 71 | pattern = "hello" 72 | string = "abababab" + "hello" + "abababab" 73 | 74 | benchmark!(pattern, string) 75 | 76 | 77 | puts "Benchmark 3: Multiple matches" 78 | puts 79 | 80 | pattern = "hello" 81 | string = "" 82 | 83 | 20.times do 84 | string += "abab" * 5 85 | string += "hello" 86 | string += "abab" * 5 87 | end 88 | 89 | benchmark!(pattern, string) 90 | end 91 | -------------------------------------------------------------------------------- /spec/lib/pcre2/string_utils_spec.rb: -------------------------------------------------------------------------------- 1 | RSpec.describe PCRE2::StringUtils do 2 | describe "#scan" do 3 | it "returns all matched strings" do 4 | subject = "and a 1 and a 2 and a 345" 5 | regexp = PCRE2::Regexp.new('\d+') 6 | 7 | result = regexp.scan(subject) 8 | 9 | expect(result).to eq(["1", "2", "345"]) 10 | end 11 | 12 | it "accepts a block which iterates over all matches" do 13 | subject = 
"and a 1 and a 2 and a 345" 14 | regexp = PCRE2::Regexp.new('\d+') 15 | 16 | result = "" 17 | regexp.scan(subject) { |match| result += match } 18 | 19 | expect(result).to eq("12345") 20 | end 21 | 22 | it "returns captures if arity is higher than 1" do 23 | subject = "and a 1 and a 2 and a 345" 24 | regexp = PCRE2::Regexp.new('(and a) (\d+)') 25 | 26 | result = regexp.scan(subject) 27 | 28 | expect(result).to eq([["and a", "1"], ["and a", "2"], ["and a", "345"]]) 29 | end 30 | end 31 | 32 | describe "#split" do 33 | let(:string) { "and a 1 and a 2 and a 345 congrats!" } 34 | 35 | it "splits where the regexp matches" do 36 | regexp = PCRE2::Regexp.new('\d+') 37 | 38 | result = regexp.split(string) 39 | 40 | expect(result).to eq(["and a ", " and a ", " and a ", " congrats!"]) 41 | end 42 | 43 | it "returns captures in the result too" do 44 | regexp = PCRE2::Regexp.new('\d(\d*)') 45 | 46 | result = regexp.split(string) 47 | 48 | expect(result).to eq(["and a ", "", " and a ", "", " and a ", "45", " congrats!"]) 49 | end 50 | 51 | it "splits each character for zero-length matches" do 52 | regexp = PCRE2::Regexp.new('') 53 | 54 | result = regexp.split(string) 55 | 56 | expect(result).to eq(string.chars) 57 | end 58 | 59 | # These are tests that were reverse-engineered from what String#split returns, since the documentation 60 | # or the original source code is not very clear about what the exact specification of `split` should be 61 | # when there are zero-length matches. 
62 | context "edge-cases" do 63 | let(:string) { "abcde" } 64 | 65 | it "has an empty part at the beginning" do 66 | regexp = PCRE2::Regexp.new("a") 67 | 68 | result = regexp.split(string) 69 | 70 | expect(result).to eq(["", "bcde"]) 71 | end 72 | 73 | it "has a zero-length match so split by characters without empty part at the beginning" do 74 | regexp = PCRE2::Regexp.new("") 75 | 76 | result = regexp.split(string) 77 | 78 | expect(result).to eq(["a", "b", "c", "d", "e"]) 79 | end 80 | 81 | it "has a zero-length match in the middle so split into two parts" do 82 | regexp = PCRE2::Regexp.new("(?=c)") 83 | 84 | result = regexp.split(string) 85 | 86 | expect(result).to eq(["ab", "cde"]) 87 | end 88 | 89 | it "has a zero-length match at the start and in the middle but again only split into two parts without empty part at the beginning" do 90 | regexp = PCRE2::Regexp.new("^|(?=c)") 91 | 92 | result = regexp.split(string) 93 | 94 | expect(result).to eq(["ab", "cde"]) 95 | end 96 | 97 | it "has multiple zero-length matches including empty capture groups so split by chars and also include lots of empty results" do 98 | regexp = PCRE2::Regexp.new("()|^|(?=c)") 99 | 100 | result = regexp.split(string) 101 | 102 | expect(result).to eq(["a", "", "b", "", "c", "", "d", "", "e"]) 103 | end 104 | end 105 | end 106 | end 107 | -------------------------------------------------------------------------------- /spec/lib/pcre2/regexp_spec.rb: -------------------------------------------------------------------------------- 1 | RSpec.describe PCRE2::Regexp do 2 | describe "#new" do 3 | # Apparently this does not throw an error 4 | it "returns an error when given a broken pattern", :skip do 5 | pattern = "hell(o" 6 | 7 | expect do 8 | PCRE2::Regexp.new(pattern) 9 | end.to raise_error(/unmatched parenthesis/) 10 | end 11 | 12 | it "accepts a ::Regexp" do 13 | re = PCRE2::Regexp.new(/a/) 14 | 15 | expect(re.source).to eq("a") 16 | end 17 | 18 | it "accepts another PCRE2::Regexp" do 19 | re 
= PCRE2::Regexp.new(PCRE2::Regexp.new("a")) 20 | 21 | expect(re.source).to eq("a") 22 | end 23 | end 24 | 25 | describe "#match" do 26 | let(:regexp) { PCRE2::Regexp.new("hello") } 27 | 28 | it "returns a matchdata when a match" do 29 | subject = "well hello there!" 30 | 31 | result = regexp.match(subject) 32 | 33 | expect(result.offset(0)).to eq([5, 10]) 34 | end 35 | 36 | it "matches very long patterns" do 37 | part = "aa" 38 | pattern = part * 100 39 | subject = part * 99 40 | 41 | regexp = PCRE2::Regexp.new(pattern) 42 | 43 | expect(regexp.match(subject)).to be_nil 44 | expect(regexp.match(subject + part)).not_to be_nil 45 | end 46 | 47 | it "returns nil when no match" do 48 | subject = "goodbye" 49 | 50 | result = regexp.match(subject) 51 | 52 | expect(result).to be_nil 53 | end 54 | 55 | it "starts from a given position" do 56 | subject = "well hello hello hello there!" 57 | # ^ start here 58 | 59 | result = regexp.match(subject, 11) 60 | 61 | expect(result.offset(0)).to eq([11, 16]) 62 | end 63 | end 64 | 65 | describe "#matches" do 66 | let(:string) { "well hello hello hello there!"} 67 | 68 | it "yields all matchdatas" do 69 | regexp = PCRE2::Regexp.new("hello") 70 | 71 | matchdatas = regexp.matches(string).to_a 72 | 73 | expect(matchdatas.length).to eq(3) 74 | expect(matchdatas[0].offset(0)).to eq([5, 10]) 75 | expect(matchdatas[1].offset(0)).to eq([11, 16]) 76 | expect(matchdatas[2].offset(0)).to eq([17, 22]) 77 | end 78 | 79 | it "does not get stuck in an infinte loop with zero-length matches" do 80 | regexp = PCRE2::Regexp.new("") 81 | enum = regexp.matches(string) 82 | 83 | first_pair = enum.next.offset(0) 84 | second_pair = enum.next.offset(0) 85 | 86 | expect(first_pair).not_to eq(second_pair) 87 | end 88 | end 89 | 90 | context "with named captures" do 91 | describe "#named_captures" do 92 | it "returns a list of named subpatterns and positions" do 93 | pattern = '(?\w+)(?\W+)(?\w+)(?aaa)' 94 | re = PCRE2::Regexp.new(pattern, PCRE2::PCRE2_DUPNAMES) 
95 | 96 | expect(re.named_captures).to eq( 97 | { 98 | "a" => [1], 99 | "b" => [2], 100 | "c" => [3, 4] 101 | } 102 | ) 103 | end 104 | end 105 | 106 | describe "#names" do 107 | it "returns names of the named captures" do 108 | pattern = '(?\w+)(?\W+)(?\w+)' 109 | re = PCRE2::Regexp.new(pattern) 110 | 111 | expect(re.names).to eq(["a", "b", "c"]) 112 | end 113 | end 114 | end 115 | 116 | context "with options" do 117 | it "matches case insensitive" do 118 | re = PCRE2::Regexp.new("HELLO") 119 | expect(re.match("hello!")).to be_nil 120 | 121 | re = PCRE2::Regexp.new("HELLO", PCRE2::PCRE2_CASELESS) 122 | expect(re.match("hello!")).not_to be_nil 123 | end 124 | 125 | it "allows duplicate named subpatterns" do\ 126 | pattern = "(?.)(?)" 127 | 128 | expect { PCRE2::Regexp.new(pattern) }.to raise_error(/two named subpatterns have the same name/) 129 | expect { PCRE2::Regexp.new(pattern, PCRE2::PCRE2_DUPNAMES) }.not_to raise_error 130 | end 131 | 132 | it "accepts multiple options" do 133 | re = PCRE2::Regexp.new("HELLO|(?world)(?country)", PCRE2::PCRE2_DUPNAMES, PCRE2::PCRE2_CASELESS) 134 | 135 | expect(re.match("hello!")).not_to be_nil 136 | end 137 | end 138 | 139 | describe "#jit!" do 140 | it "compiles successfully" do 141 | expect(PCRE2::Regexp.new("hello").jit!).to be_truthy 142 | end 143 | end 144 | end 145 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PCRE2 2 | 3 | This library provides a Ruby interface for the PCRE2 library, which supports more advanced regular expression functionality than the built-in Ruby `Regexp`. 4 | 5 | ## Why? 6 | 7 | Ruby's `Regexp` is actually quite fast! For simple Regexps without backtracking (for instance regexp without matches like `.*`), you should probably keep using the Ruby `Regexp`. No extra dependencies and it'll be faster than using an external library, including PCRE2. 
8 | 9 | The main reason I built this was so I could use the [backtracking control verbs](https://www.rexegg.com/backtracking-control-verbs.html#mainverbs) such as `(*SKIP)(*FAIL)` that are not supported by Ruby's `Regexp`. Using these, and other features, `PCRE2` supports some pretty wild and advanced regular expressions which you cannot do with Ruby's `Regexp`. 10 | 11 | `PCRE2` also supports JIT (just-in-time) compilation of the regular expression. From [the manual](https://www.pcre.org/current/doc/html/pcre2jit.html): 12 | > Just-in-time compiling is a heavyweight optimization that can greatly speed up pattern matching. However, it comes at the cost of extra processing before the match is performed, so it is of most benefit when the same pattern is going to be matched many times. This does not necessarily mean many calls of a matching function; if the pattern is not anchored, matching attempts may take place many times at various positions in the subject, even for a single call. Therefore, if the subject string is very long, it may still pay to use JIT even for one-off matches. 13 | 14 | You can enable JIT by calling `regexp.jit!` on the `PCRE2::Regexp` object. Using JIT the `PCRE2` matching can be more than 2X faster than Ruby's built-in. 15 | 16 | ## Installation 17 | 18 | Install the PCRE2 library: 19 | 20 | ```bash 21 | brew install pcre2 22 | ``` 23 | 24 | Add this line to your application's Gemfile: 25 | 26 | ```ruby 27 | gem 'pcre2' 28 | ``` 29 | 30 | And then execute: 31 | 32 | $ bundle install 33 | 34 | Or install it yourself as: 35 | 36 | $ gem install pcre2 37 | 38 | ## Usage 39 | 40 | `PCRE2::Regexp` aims to act as much like Ruby's `Regexp` as possible. It has implemented a subset of the `Regexp` and `MatchData` APIs so it can be used as a drop-in replacement. 41 | 42 | ```ruby 43 | regexp = PCRE2::Regexp.new("hello") 44 | subject = "well hello there!" 
45 | matchdata = regexp.match(subject) 46 | 47 | matchdata.offset(0) # => [5, 10] - start and end of the match 48 | matchdata[0] # => "hello" 49 | 50 | matchdata = regexp.match(subject, 11) # find next match 51 | ``` 52 | 53 | Also some of the utility methods on `String` are reimplemented on `PCRE2::Regexp`: 54 | 55 | ```ruby 56 | regexp = PCRE2::Regexp.new('\d+') 57 | subject = "and a 1 and a 2 and a 345" 58 | 59 | regexp.scan(subject) # => ["1", "2", "345"] 60 | regexp.split(subject) # => ["and a ", " and a ", " and a "] 61 | ``` 62 | 63 | There is one new method not available on `Regexp`: `PCRE2::Regexp#matches` which will loop over all matches of the string, and yield the corresponding `MatchData`: 64 | 65 | ```ruby 66 | string = "well hello hello hello there!" 67 | re = PCRE2::Regexp.new("hello") 68 | 69 | re.matches(string) do |matchdata| 70 | puts "Matchdata found between #{matchdata.offset(0)[0]} and #{matchdata.offset(0)[1]}" 71 | end 72 | ``` 73 | 74 | ## Benchmark 75 | 76 | You can run the benchmark that compares `PCRE2::Regexp` with Ruby's built-in `Regexp` as follows: 77 | 78 | ```bash 79 | bundle exec rake benchmark 80 | ``` 81 | 82 | ## Resources 83 | 84 | - [PCRE2 Library](https://www.pcre.org/current/doc/html/) 85 | - [PCRE2 demo](https://www.pcre.org/current/doc/html/pcre2demo.html) 86 | 87 | ## Development 88 | 89 | After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment. 90 | 91 | To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org). 
92 | 93 | ## Contributing 94 | 95 | Bug reports and pull requests are welcome on GitHub at https://github.com/dv/pcre2. 96 | 97 | ## License 98 | 99 | The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT). 100 | -------------------------------------------------------------------------------- /lib/pcre2/lib.rb: -------------------------------------------------------------------------------- 1 | require "ffi" 2 | 3 | module PCRE2::Lib 4 | RETURN_CODE_NO_ERROR = 100 5 | 6 | extend FFI::Library 7 | 8 | ffi_lib 'pcre2-8' # Able to do 16 or 32 too 9 | 10 | PCRE2_SIZE = typedef :size_t, :PCRE2_SIZE 11 | PCRE2_SPTR = typedef :pointer, :PCRE2_SPTR 12 | PCRE2_UCHAR8 = typedef :uint8_t, :PCRE2_UCHAR8 13 | PCRE2_UCHAR16 = typedef :uint16_t, :PCRE2_UCHAR16 14 | PCRE2_UCHAR32 = typedef :uint32_t, :PCRE2_UCHAR32 15 | 16 | # For 8-bit PCRE 17 | PCRE2_UCHAR = typedef :PCRE2_UCHAR8, :PCRE2_UCHAR 18 | 19 | # int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer, PCRE2_SIZE bufflen); 20 | attach_function :pcre2_get_error_message_8, [ :int, :pointer, :PCRE2_SIZE ], :int 21 | 22 | # pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length, uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext); 23 | attach_function :pcre2_compile_8, [ :PCRE2_SPTR, :PCRE2_SIZE, :uint32_t, :pointer, :pointer, :pointer ], :pointer 24 | attach_function :pcre2_code_free_8, [ :pointer ], :void 25 | 26 | # int pcre2_pattern_info(const pcre2_code *code, uint32_t what, void *where); 27 | attach_function :pcre2_pattern_info_8, [ :pointer, :uint32_t, :pointer ], :int 28 | 29 | # pcre2_match_data *pcre2_match_data_create_from_pattern( const pcre2_code *code, pcre2_general_context *gcontext); 30 | attach_function :pcre2_match_data_create_from_pattern_8, [ :pointer, :pointer ], :pointer 31 | attach_function :pcre2_match_data_free_8, [ :pointer ], :void 32 | 33 | # int pcre2_match(const pcre2_code *code, 
PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, uint32_t options, pcre2_match_data *match_data, pcre2_match_context *mcontext); 34 | attach_function :pcre2_match_8, [ :pointer, :PCRE2_SPTR, :PCRE2_SIZE, :PCRE2_SIZE, :uint32_t, :pointer, :pointer ], :int 35 | 36 | attach_function :pcre2_get_ovector_count_8, [ :pointer ], :uint32_t 37 | attach_function :pcre2_get_ovector_pointer_8, [ :pointer ], :pointer 38 | 39 | # int pcre2_jit_compile(pcre2_code *code, uint32_t options) 40 | attach_function :pcre2_jit_compile_8, [ :pointer, :uint32_t ], :int 41 | 42 | 43 | def self.get_error_message(error_code) 44 | if error_code.kind_of?(FFI::MemoryPointer) 45 | error_code = error_code.read_int 46 | end 47 | 48 | buffer = FFI::MemoryPointer.new(PCRE2_UCHAR, 120) 49 | result = pcre2_get_error_message_8(error_code, buffer, buffer.size) 50 | 51 | case result 52 | when PCRE2::PCRE2_ERROR_BADDATA 53 | "Error number #{error_code} unknown" 54 | when PCRE2::PCRE2_ERROR_NOMEMORY 55 | raise PCRE2::Error, "Buffer of #{buffer.size} is not large enough to contain message" 56 | else 57 | buffer.read_string 58 | end 59 | end 60 | 61 | # Some utility functions to help make the above more palatable 62 | def self.compile_pattern(pattern, options = []) 63 | pattern_string_ptr = FFI::MemoryPointer.from_string(pattern) 64 | error_code_ptr = FFI::MemoryPointer.new(:int, 1) 65 | error_offset_ptr = FFI::MemoryPointer.new(PCRE2_SIZE, 1) 66 | options = options.flatten.inject(0) { |memo, option| memo | option } 67 | 68 | pattern_ptr = PCRE2::Lib.pcre2_compile_8(pattern_string_ptr, pattern.size, options, error_code_ptr, error_offset_ptr, nil) 69 | 70 | if pattern_ptr.null? 
71 | error_code = error_code_ptr.read_int 72 | error_offset = error_offset_ptr.read(PCRE2_SIZE) 73 | 74 | raise PCRE2::Error.from_error_code(error_code, "while compiling pattern #{pattern} @ #{error_offset}") 75 | end 76 | 77 | FFI::AutoPointer.new(pattern_ptr, PCRE2::Lib.method(:pcre2_code_free_8)) 78 | end 79 | 80 | def self.create_match_data_for_pattern(pattern_ptr) 81 | match_data_ptr = PCRE2::Lib.pcre2_match_data_create_from_pattern_8(pattern_ptr, nil) 82 | FFI::AutoPointer.new(match_data_ptr, PCRE2::Lib.method(:pcre2_match_data_free_8)) 83 | end 84 | 85 | def self.match(pattern_ptr, body, position: 0, match_data_ptr: nil) 86 | position ||= 0 87 | match_data_ptr ||= create_match_data_for_pattern(pattern_ptr) 88 | 89 | body_ptr = FFI::MemoryPointer.from_string(body) 90 | 91 | return_code = 92 | PCRE2::Lib.pcre2_match_8( 93 | pattern_ptr, 94 | body_ptr, 95 | body_ptr.size, 96 | position, 97 | 0, 98 | match_data_ptr, 99 | nil 100 | ) 101 | 102 | case return_code 103 | when 0 104 | raise PCRE2::Error, "Not enough memory in MatchData to store all captures" 105 | when PCRE2::PCRE2_ERROR_NOMATCH 106 | result_count = 0 107 | else 108 | if return_code < 0 109 | raise PCRE2::Error.from_error_code(return_code) 110 | else 111 | result_count = return_code 112 | end 113 | end 114 | 115 | [result_count, match_data_ptr] 116 | end 117 | 118 | def self.get_ovector_pairs(match_data_ptr, pair_count) 119 | if pair_count.nil? 
120 | pair_count = PCRE2::Lib.pcre2_get_ovector_count_8(match_data_ptr) 121 | end 122 | 123 | ovector_ptr = PCRE2::Lib.pcre2_get_ovector_pointer_8(match_data_ptr) 124 | type_size = FFI.type_size(:size_t) 125 | 126 | pair_count.times.map do |i| 127 | [ 128 | ovector_ptr.get(:size_t, i*2 * type_size), 129 | ovector_ptr.get(:size_t, (i*2+1) * type_size) 130 | ] 131 | end 132 | end 133 | 134 | def self.named_captures(pattern_ptr) 135 | named_captures_count = FFI::MemoryPointer.new(:uint32_t, 1) 136 | name_entry_size = FFI::MemoryPointer.new(:uint32_t, 1) 137 | name_table_ptr = FFI::MemoryPointer.new(:pointer, 1) 138 | 139 | if PCRE2::Lib.pcre2_pattern_info_8(pattern_ptr, PCRE2::PCRE2_INFO_NAMECOUNT, named_captures_count) != 0 140 | raise "Something went wrong" 141 | end 142 | 143 | if PCRE2::Lib.pcre2_pattern_info_8(pattern_ptr, PCRE2::PCRE2_INFO_NAMEENTRYSIZE, name_entry_size) != 0 144 | raise "Something went wrong" 145 | end 146 | 147 | if PCRE2::Lib.pcre2_pattern_info_8(pattern_ptr, PCRE2::PCRE2_INFO_NAMETABLE, name_table_ptr) != 0 148 | raise "Something went wrong" 149 | end 150 | 151 | named_captures_count = named_captures_count.read_uint32 152 | name_entry_size = name_entry_size.read_uint32 153 | name_table_ptr = name_table_ptr.read_pointer 154 | 155 | names_and_positions = 156 | named_captures_count.times.map do |i| 157 | ovector_position = (name_table_ptr.get_int8(0) << 8) + name_table_ptr.get_int8(1) 158 | match_name = (name_table_ptr+2).read_string_to_null 159 | 160 | name_table_ptr += name_entry_size 161 | 162 | [match_name, ovector_position] 163 | end 164 | 165 | # Convert an array of [name, position] into a Hash of name => [position (, position, ...)], with possible duplicate names 166 | names_and_positions.each_with_object(Hash.new {[]} ) { |(name, position), hash| hash[name] <<= position } 167 | end 168 | end 169 | -------------------------------------------------------------------------------- /lib/pcre2/lib/constants.rb: 
--------------------------------------------------------------------------------
#
# Constants mirroring the `#define` values of the PCRE2 C header (pcre2.h).
#
# They were produced from the header with two search-and-replace passes:
#
#   "#define ([^\W]+) \W* (.*)" -> "\1 = \2"
#   "(0x[^u]+)u" -> "\1"
#
# The values must stay identical to the ones in the header; do not renumber.
#
module PCRE2
  # The following option bits can be passed to pcre2_compile(), pcre2_match(),
  # or pcre2_dfa_match(). PCRE2_NO_UTF_CHECK affects only the function to which it
  # is passed. Put these bits at the most significant end of the options word so
  # others can be added next to them

  PCRE2_ANCHORED = 0x80000000
  PCRE2_NO_UTF_CHECK = 0x40000000
  PCRE2_ENDANCHORED = 0x20000000

  # The following option bits can be passed only to pcre2_compile(). However,
  # they may affect compilation, JIT compilation, and/or interpretive execution.
  # The following tags indicate which:
  # C alters what is compiled by pcre2_compile()
  # J alters what is compiled by pcre2_jit_compile()
  # M is inspected during pcre2_match() execution
  # D is inspected during pcre2_dfa_match() execution

  PCRE2_ALLOW_EMPTY_CLASS = 0x00000001 # C
  PCRE2_ALT_BSUX = 0x00000002 # C
  PCRE2_AUTO_CALLOUT = 0x00000004 # C
  PCRE2_CASELESS = 0x00000008 # C
  PCRE2_DOLLAR_ENDONLY = 0x00000010 # J M D
  PCRE2_DOTALL = 0x00000020 # C
  PCRE2_DUPNAMES = 0x00000040 # C
  PCRE2_EXTENDED = 0x00000080 # C
  PCRE2_FIRSTLINE = 0x00000100 # J M D
  PCRE2_MATCH_UNSET_BACKREF = 0x00000200 # C J M
  PCRE2_MULTILINE = 0x00000400 # C
  PCRE2_NEVER_UCP = 0x00000800 # C
  PCRE2_NEVER_UTF = 0x00001000 # C
  PCRE2_NO_AUTO_CAPTURE = 0x00002000 # C
  PCRE2_NO_AUTO_POSSESS = 0x00004000 # C
  PCRE2_NO_DOTSTAR_ANCHOR = 0x00008000 # C
  PCRE2_NO_START_OPTIMIZE = 0x00010000 # J M D
  PCRE2_UCP = 0x00020000 # C J M D
  PCRE2_UNGREEDY = 0x00040000 # C
  PCRE2_UTF = 0x00080000 # C J M D
  PCRE2_NEVER_BACKSLASH_C = 0x00100000 # C
  PCRE2_ALT_CIRCUMFLEX = 0x00200000 # J M D
  PCRE2_ALT_VERBNAMES = 0x00400000 # C
  PCRE2_USE_OFFSET_LIMIT = 0x00800000 # J M D
  PCRE2_EXTENDED_MORE = 0x01000000 # C
  PCRE2_LITERAL = 0x02000000 # C

  # An additional compile options word is available in the compile context.

  PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES = 0x00000001 # C
  PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL = 0x00000002 # C
  PCRE2_EXTRA_MATCH_WORD = 0x00000004 # C
  PCRE2_EXTRA_MATCH_LINE = 0x00000008 # C
  PCRE2_EXTRA_ESCAPED_CR_IS_LF = 0x00000010 # C
  PCRE2_EXTRA_ALT_BSUX = 0x00000020 # C

  # These are for pcre2_jit_compile().

  PCRE2_JIT_COMPLETE = 0x00000001 # For full matching
  PCRE2_JIT_PARTIAL_SOFT = 0x00000002
  PCRE2_JIT_PARTIAL_HARD = 0x00000004
  PCRE2_JIT_INVALID_UTF = 0x00000100

  # These are for pcre2_match(), pcre2_dfa_match(), pcre2_jit_match(), and
  # pcre2_substitute(). Some are allowed only for one of the functions, and in
  # these cases it is noted below. Note that PCRE2_ANCHORED, PCRE2_ENDANCHORED and
  # PCRE2_NO_UTF_CHECK can also be passed to these functions (though
  # pcre2_jit_match() ignores the latter since it bypasses all sanity checks).

  PCRE2_NOTBOL = 0x00000001
  PCRE2_NOTEOL = 0x00000002
  PCRE2_NOTEMPTY = 0x00000004 # ) These two must be kept
  PCRE2_NOTEMPTY_ATSTART = 0x00000008 # ) adjacent to each other.
  PCRE2_PARTIAL_SOFT = 0x00000010
  PCRE2_PARTIAL_HARD = 0x00000020
  PCRE2_DFA_RESTART = 0x00000040 # pcre2_dfa_match() only
  PCRE2_DFA_SHORTEST = 0x00000080 # pcre2_dfa_match() only
  PCRE2_SUBSTITUTE_GLOBAL = 0x00000100 # pcre2_substitute() only
  PCRE2_SUBSTITUTE_EXTENDED = 0x00000200 # pcre2_substitute() only
  PCRE2_SUBSTITUTE_UNSET_EMPTY = 0x00000400 # pcre2_substitute() only
  PCRE2_SUBSTITUTE_UNKNOWN_UNSET = 0x00000800 # pcre2_substitute() only
  PCRE2_SUBSTITUTE_OVERFLOW_LENGTH = 0x00001000 # pcre2_substitute() only
  PCRE2_NO_JIT = 0x00002000 # Not for pcre2_dfa_match()
  PCRE2_COPY_MATCHED_SUBJECT = 0x00004000

  # Options for pcre2_pattern_convert().

  PCRE2_CONVERT_UTF = 0x00000001
  PCRE2_CONVERT_NO_UTF_CHECK = 0x00000002
  PCRE2_CONVERT_POSIX_BASIC = 0x00000004
  PCRE2_CONVERT_POSIX_EXTENDED = 0x00000008
  PCRE2_CONVERT_GLOB = 0x00000010
  PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR = 0x00000030
  PCRE2_CONVERT_GLOB_NO_STARSTAR = 0x00000050

  # Newline and \R settings, for use in compile contexts. The newline values
  # must be kept in step with values set in config.h and both sets must all be
  # greater than zero.

  PCRE2_NEWLINE_CR = 1
  PCRE2_NEWLINE_LF = 2
  PCRE2_NEWLINE_CRLF = 3
  PCRE2_NEWLINE_ANY = 4
  PCRE2_NEWLINE_ANYCRLF = 5
  PCRE2_NEWLINE_NUL = 6
  PCRE2_BSR_UNICODE = 1
  PCRE2_BSR_ANYCRLF = 2

  # Error codes for pcre2_compile(). Some of these are also used by
  # pcre2_pattern_convert().

  PCRE2_ERROR_END_BACKSLASH = 101
  PCRE2_ERROR_END_BACKSLASH_C = 102
  PCRE2_ERROR_UNKNOWN_ESCAPE = 103
  PCRE2_ERROR_QUANTIFIER_OUT_OF_ORDER = 104
  PCRE2_ERROR_QUANTIFIER_TOO_BIG = 105
  PCRE2_ERROR_MISSING_SQUARE_BRACKET = 106
  PCRE2_ERROR_ESCAPE_INVALID_IN_CLASS = 107
  PCRE2_ERROR_CLASS_RANGE_ORDER = 108
  PCRE2_ERROR_QUANTIFIER_INVALID = 109
  PCRE2_ERROR_INTERNAL_UNEXPECTED_REPEAT = 110
  PCRE2_ERROR_INVALID_AFTER_PARENS_QUERY = 111
  PCRE2_ERROR_POSIX_CLASS_NOT_IN_CLASS = 112
  PCRE2_ERROR_POSIX_NO_SUPPORT_COLLATING = 113
  PCRE2_ERROR_MISSING_CLOSING_PARENTHESIS = 114
  PCRE2_ERROR_BAD_SUBPATTERN_REFERENCE = 115
  PCRE2_ERROR_NULL_PATTERN = 116
  PCRE2_ERROR_BAD_OPTIONS = 117
  PCRE2_ERROR_MISSING_COMMENT_CLOSING = 118
  PCRE2_ERROR_PARENTHESES_NEST_TOO_DEEP = 119
  PCRE2_ERROR_PATTERN_TOO_LARGE = 120
  PCRE2_ERROR_HEAP_FAILED = 121
  PCRE2_ERROR_UNMATCHED_CLOSING_PARENTHESIS = 122
  PCRE2_ERROR_INTERNAL_CODE_OVERFLOW = 123
  PCRE2_ERROR_MISSING_CONDITION_CLOSING = 124
  PCRE2_ERROR_LOOKBEHIND_NOT_FIXED_LENGTH = 125
  PCRE2_ERROR_ZERO_RELATIVE_REFERENCE = 126
  PCRE2_ERROR_TOO_MANY_CONDITION_BRANCHES = 127
  PCRE2_ERROR_CONDITION_ASSERTION_EXPECTED = 128
  PCRE2_ERROR_BAD_RELATIVE_REFERENCE = 129
  PCRE2_ERROR_UNKNOWN_POSIX_CLASS = 130
  PCRE2_ERROR_INTERNAL_STUDY_ERROR = 131
  PCRE2_ERROR_UNICODE_NOT_SUPPORTED = 132
  PCRE2_ERROR_PARENTHESES_STACK_CHECK = 133
  PCRE2_ERROR_CODE_POINT_TOO_BIG = 134
  PCRE2_ERROR_LOOKBEHIND_TOO_COMPLICATED = 135
  PCRE2_ERROR_LOOKBEHIND_INVALID_BACKSLASH_C = 136
  PCRE2_ERROR_UNSUPPORTED_ESCAPE_SEQUENCE = 137
  PCRE2_ERROR_CALLOUT_NUMBER_TOO_BIG = 138
  PCRE2_ERROR_MISSING_CALLOUT_CLOSING = 139
  PCRE2_ERROR_ESCAPE_INVALID_IN_VERB = 140
  PCRE2_ERROR_UNRECOGNIZED_AFTER_QUERY_P = 141
  PCRE2_ERROR_MISSING_NAME_TERMINATOR = 142
  PCRE2_ERROR_DUPLICATE_SUBPATTERN_NAME = 143
  PCRE2_ERROR_INVALID_SUBPATTERN_NAME = 144
  PCRE2_ERROR_UNICODE_PROPERTIES_UNAVAILABLE = 145
  PCRE2_ERROR_MALFORMED_UNICODE_PROPERTY = 146
  PCRE2_ERROR_UNKNOWN_UNICODE_PROPERTY = 147
  PCRE2_ERROR_SUBPATTERN_NAME_TOO_LONG = 148
  PCRE2_ERROR_TOO_MANY_NAMED_SUBPATTERNS = 149
  PCRE2_ERROR_CLASS_INVALID_RANGE = 150
  PCRE2_ERROR_OCTAL_BYTE_TOO_BIG = 151
  PCRE2_ERROR_INTERNAL_OVERRAN_WORKSPACE = 152
  PCRE2_ERROR_INTERNAL_MISSING_SUBPATTERN = 153
  PCRE2_ERROR_DEFINE_TOO_MANY_BRANCHES = 154
  PCRE2_ERROR_BACKSLASH_O_MISSING_BRACE = 155
  PCRE2_ERROR_INTERNAL_UNKNOWN_NEWLINE = 156
  PCRE2_ERROR_BACKSLASH_G_SYNTAX = 157
  PCRE2_ERROR_PARENS_QUERY_R_MISSING_CLOSING = 158

  # Error 159 is obsolete and should now never occur

  PCRE2_ERROR_VERB_ARGUMENT_NOT_ALLOWED = 159
  PCRE2_ERROR_VERB_UNKNOWN = 160
  PCRE2_ERROR_SUBPATTERN_NUMBER_TOO_BIG = 161
  PCRE2_ERROR_SUBPATTERN_NAME_EXPECTED = 162
  PCRE2_ERROR_INTERNAL_PARSED_OVERFLOW = 163
  PCRE2_ERROR_INVALID_OCTAL = 164
  PCRE2_ERROR_SUBPATTERN_NAMES_MISMATCH = 165
  PCRE2_ERROR_MARK_MISSING_ARGUMENT = 166
  PCRE2_ERROR_INVALID_HEXADECIMAL = 167
  PCRE2_ERROR_BACKSLASH_C_SYNTAX = 168
  PCRE2_ERROR_BACKSLASH_K_SYNTAX = 169
  PCRE2_ERROR_INTERNAL_BAD_CODE_LOOKBEHINDS = 170
  PCRE2_ERROR_BACKSLASH_N_IN_CLASS = 171
  PCRE2_ERROR_CALLOUT_STRING_TOO_LONG = 172
  PCRE2_ERROR_UNICODE_DISALLOWED_CODE_POINT = 173
  PCRE2_ERROR_UTF_IS_DISABLED = 174
  PCRE2_ERROR_UCP_IS_DISABLED = 175
  PCRE2_ERROR_VERB_NAME_TOO_LONG = 176
  PCRE2_ERROR_BACKSLASH_U_CODE_POINT_TOO_BIG = 177
  PCRE2_ERROR_MISSING_OCTAL_OR_HEX_DIGITS = 178
  PCRE2_ERROR_VERSION_CONDITION_SYNTAX = 179
  PCRE2_ERROR_INTERNAL_BAD_CODE_AUTO_POSSESS = 180
  PCRE2_ERROR_CALLOUT_NO_STRING_DELIMITER = 181
  PCRE2_ERROR_CALLOUT_BAD_STRING_DELIMITER = 182
  PCRE2_ERROR_BACKSLASH_C_CALLER_DISABLED = 183
  PCRE2_ERROR_QUERY_BARJX_NEST_TOO_DEEP = 184
  PCRE2_ERROR_BACKSLASH_C_LIBRARY_DISABLED = 185
  PCRE2_ERROR_PATTERN_TOO_COMPLICATED = 186
  PCRE2_ERROR_LOOKBEHIND_TOO_LONG = 187
  PCRE2_ERROR_PATTERN_STRING_TOO_LONG = 188
  PCRE2_ERROR_INTERNAL_BAD_CODE = 189
  PCRE2_ERROR_INTERNAL_BAD_CODE_IN_SKIP = 190
  PCRE2_ERROR_NO_SURROGATES_IN_UTF16 = 191
  PCRE2_ERROR_BAD_LITERAL_OPTIONS = 192
  PCRE2_ERROR_SUPPORTED_ONLY_IN_UNICODE = 193
  PCRE2_ERROR_INVALID_HYPHEN_IN_OPTIONS = 194
  PCRE2_ERROR_ALPHA_ASSERTION_UNKNOWN = 195
  PCRE2_ERROR_SCRIPT_RUN_NOT_AVAILABLE = 196

  # "Expected" matching error codes: no match and partial match.

  PCRE2_ERROR_NOMATCH = (-1)
  PCRE2_ERROR_PARTIAL = (-2)

  # Error codes for UTF-8 validity checks

  PCRE2_ERROR_UTF8_ERR1 = (-3)
  PCRE2_ERROR_UTF8_ERR2 = (-4)
  PCRE2_ERROR_UTF8_ERR3 = (-5)
  PCRE2_ERROR_UTF8_ERR4 = (-6)
  PCRE2_ERROR_UTF8_ERR5 = (-7)
  PCRE2_ERROR_UTF8_ERR6 = (-8)
  PCRE2_ERROR_UTF8_ERR7 = (-9)
  PCRE2_ERROR_UTF8_ERR8 = (-10)
  PCRE2_ERROR_UTF8_ERR9 = (-11)
  PCRE2_ERROR_UTF8_ERR10 = (-12)
  PCRE2_ERROR_UTF8_ERR11 = (-13)
  PCRE2_ERROR_UTF8_ERR12 = (-14)
  PCRE2_ERROR_UTF8_ERR13 = (-15)
  PCRE2_ERROR_UTF8_ERR14 = (-16)
  PCRE2_ERROR_UTF8_ERR15 = (-17)
  PCRE2_ERROR_UTF8_ERR16 = (-18)
  PCRE2_ERROR_UTF8_ERR17 = (-19)
  PCRE2_ERROR_UTF8_ERR18 = (-20)
  PCRE2_ERROR_UTF8_ERR19 = (-21)
  PCRE2_ERROR_UTF8_ERR20 = (-22)
  PCRE2_ERROR_UTF8_ERR21 = (-23)

  # Error codes for UTF-16 validity checks

  PCRE2_ERROR_UTF16_ERR1 = (-24)
  PCRE2_ERROR_UTF16_ERR2 = (-25)
  PCRE2_ERROR_UTF16_ERR3 = (-26)

  # Error codes for UTF-32 validity checks

  PCRE2_ERROR_UTF32_ERR1 = (-27)
  PCRE2_ERROR_UTF32_ERR2 = (-28)

  # Miscellaneous error codes for pcre2[_dfa]_match(), substring extraction
  # functions, context functions, and serializing functions. They are in numerical
  # order. Originally they were in alphabetical order too, but now that PCRE2 is
  # released, the numbers must not be changed.

  PCRE2_ERROR_BADDATA = (-29)
  PCRE2_ERROR_MIXEDTABLES = (-30) # Name was changed
  PCRE2_ERROR_BADMAGIC = (-31)
  PCRE2_ERROR_BADMODE = (-32)
  PCRE2_ERROR_BADOFFSET = (-33)
  PCRE2_ERROR_BADOPTION = (-34)
  PCRE2_ERROR_BADREPLACEMENT = (-35)
  PCRE2_ERROR_BADUTFOFFSET = (-36)
  PCRE2_ERROR_CALLOUT = (-37) # Never used by PCRE2 itself
  PCRE2_ERROR_DFA_BADRESTART = (-38)
  PCRE2_ERROR_DFA_RECURSE = (-39)
  PCRE2_ERROR_DFA_UCOND = (-40)
  PCRE2_ERROR_DFA_UFUNC = (-41)
  PCRE2_ERROR_DFA_UITEM = (-42)
  PCRE2_ERROR_DFA_WSSIZE = (-43)
  PCRE2_ERROR_INTERNAL = (-44)
  PCRE2_ERROR_JIT_BADOPTION = (-45)
  PCRE2_ERROR_JIT_STACKLIMIT = (-46)
  PCRE2_ERROR_MATCHLIMIT = (-47)
  PCRE2_ERROR_NOMEMORY = (-48)
  PCRE2_ERROR_NOSUBSTRING = (-49)
  PCRE2_ERROR_NOUNIQUESUBSTRING = (-50)
  PCRE2_ERROR_NULL = (-51)
  PCRE2_ERROR_RECURSELOOP = (-52)
  PCRE2_ERROR_DEPTHLIMIT = (-53)
  PCRE2_ERROR_RECURSIONLIMIT = (-53) # Obsolete synonym
  PCRE2_ERROR_UNAVAILABLE = (-54)
  PCRE2_ERROR_UNSET = (-55)
  PCRE2_ERROR_BADOFFSETLIMIT = (-56)
  PCRE2_ERROR_BADREPESCAPE = (-57)
  PCRE2_ERROR_REPMISSINGBRACE = (-58)
  PCRE2_ERROR_BADSUBSTITUTION = (-59)
  PCRE2_ERROR_BADSUBSPATTERN = (-60)
  PCRE2_ERROR_TOOMANYREPLACE = (-61)
  PCRE2_ERROR_BADSERIALIZEDDATA = (-62)
  PCRE2_ERROR_HEAPLIMIT = (-63)
  PCRE2_ERROR_CONVERT_SYNTAX = (-64)
  PCRE2_ERROR_INTERNAL_DUPMATCH = (-65)

  # Request types for pcre2_pattern_info()

  PCRE2_INFO_ALLOPTIONS = 0
  PCRE2_INFO_ARGOPTIONS = 1
  PCRE2_INFO_BACKREFMAX = 2
  PCRE2_INFO_BSR = 3
  PCRE2_INFO_CAPTURECOUNT = 4
  PCRE2_INFO_FIRSTCODEUNIT = 5
  PCRE2_INFO_FIRSTCODETYPE = 6
  PCRE2_INFO_FIRSTBITMAP = 7
  PCRE2_INFO_HASCRORLF = 8
  PCRE2_INFO_JCHANGED = 9
  PCRE2_INFO_JITSIZE = 10
  PCRE2_INFO_LASTCODEUNIT = 11
  PCRE2_INFO_LASTCODETYPE = 12
  PCRE2_INFO_MATCHEMPTY = 13
  PCRE2_INFO_MATCHLIMIT = 14
  PCRE2_INFO_MAXLOOKBEHIND = 15
  PCRE2_INFO_MINLENGTH = 16
  PCRE2_INFO_NAMECOUNT = 17
  PCRE2_INFO_NAMEENTRYSIZE = 18
  PCRE2_INFO_NAMETABLE = 19
  PCRE2_INFO_NEWLINE = 20
  PCRE2_INFO_DEPTHLIMIT = 21
  PCRE2_INFO_RECURSIONLIMIT = 21 # Obsolete synonym
  PCRE2_INFO_SIZE = 22
  PCRE2_INFO_HASBACKSLASHC = 23
  PCRE2_INFO_FRAMESIZE = 24
  PCRE2_INFO_HEAPLIMIT = 25
  PCRE2_INFO_EXTRAOPTIONS = 26

  # Request types for pcre2_config().

  PCRE2_CONFIG_BSR = 0
  PCRE2_CONFIG_JIT = 1
  PCRE2_CONFIG_JITTARGET = 2
  PCRE2_CONFIG_LINKSIZE = 3
  PCRE2_CONFIG_MATCHLIMIT = 4
  PCRE2_CONFIG_NEWLINE = 5
  PCRE2_CONFIG_PARENSLIMIT = 6
  PCRE2_CONFIG_DEPTHLIMIT = 7
  PCRE2_CONFIG_RECURSIONLIMIT = 7 # Obsolete synonym
  PCRE2_CONFIG_STACKRECURSE = 8 # Obsolete
  PCRE2_CONFIG_UNICODE = 9
  PCRE2_CONFIG_UNICODE_VERSION = 10
  PCRE2_CONFIG_VERSION = 11
  PCRE2_CONFIG_HEAPLIMIT = 12
  PCRE2_CONFIG_NEVER_BACKSLASH_C = 13
  PCRE2_CONFIG_COMPILED_WIDTHS = 14
end
--------------------------------------------------------------------------------