├── .rspec ├── lib ├── rehtml │ ├── version.rb │ ├── scanner.rb │ ├── elements.rb │ ├── builder.rb │ ├── tokenizer.rb │ └── entities.rb └── rehtml.rb ├── Rakefile ├── .travis.yml ├── Gemfile ├── .gitignore ├── spec ├── spec_helper.rb ├── rehtml_scanner_spec.rb ├── rehtml_parser_spec.rb └── rehtml_tokenizer_spec.rb ├── gen_entities.rb ├── rehtml.gemspec ├── LICENSE.txt └── README.md /.rspec: -------------------------------------------------------------------------------- 1 | --format documentation 2 | --color 3 | -------------------------------------------------------------------------------- /lib/rehtml/version.rb: -------------------------------------------------------------------------------- 1 | module REHTML 2 | VERSION = "0.0.1" 3 | end 4 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | require "rspec/core/rake_task" 3 | 4 | RSpec::Core::RakeTask.new(:spec) 5 | 6 | task :default => :spec 7 | 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: ruby 2 | rvm: 3 | - 1.9.3 4 | - 1.8.7 5 | - 2.1.1 6 | - 2.0.0 7 | - jruby-19mode 8 | # - rbx-2.1.1 Gem bundler is not installed 9 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | # Specify your gem's dependencies in rehtml.gemspec 4 | gemspec 5 | 6 | group :test do 7 | gem 'coveralls', :require => false, :platforms => [ :ruby_20 ] 8 | end 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | .bundle 4 | .config 5 
| .yardoc 6 | Gemfile.lock 7 | InstalledFiles 8 | _yardoc 9 | coverage 10 | doc/ 11 | lib/bundler/man 12 | pkg 13 | rdoc 14 | spec/reports 15 | test/tmp 16 | test/version_tmp 17 | tmp 18 | bin 19 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__) 2 | require 'rubygems' 3 | require 'rehtml' 4 | begin 5 | require 'coveralls' 6 | Coveralls.wear! 7 | rescue LoadError 8 | end 9 | 10 | require 'rspec/expectations' 11 | 12 | 13 | -------------------------------------------------------------------------------- /lib/rehtml.rb: -------------------------------------------------------------------------------- 1 | require 'rehtml/version' 2 | require 'rehtml/tokenizer' 3 | require 'rehtml/builder' 4 | 5 | module REHTML 6 | # convert html(string) to REXML::Document 7 | def self.to_rexml(html) 8 | builder = REXMLBuilder.new 9 | builder.parse(Tokenizer.new(html)) 10 | builder.doc 11 | end 12 | end 13 | -------------------------------------------------------------------------------- /lib/rehtml/scanner.rb: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | require 'strscan' 3 | module REHTML 4 | class Scanner < StringScanner 5 | def scan_before_or_eos(regex, move_after=false) 6 | self.scan_before(regex, true, move_after) 7 | end 8 | def scan_before(regex, or_eos=false, move_after=false) 9 | text = self.scan_until(regex) 10 | if text 11 | size = self.matched.size 12 | self.pos -= size unless move_after 13 | return text[0...(-size)] 14 | end 15 | if or_eos 16 | text = self.rest 17 | self.terminate 18 | end 19 | text 20 | end 21 | end 22 | end 23 | -------------------------------------------------------------------------------- /spec/rehtml_scanner_spec.rb: 
-------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | require 'rehtml/scanner' 3 | describe REHTML::Scanner do 4 | describe "scan aabcd" do 5 | let(:scanner){ REHTML::Scanner.new("aabcd") } 6 | it "scan_before" do 7 | expect(scanner.scan_before(/b/)).to eq("aa") 8 | expect(scanner.check(/b/)).to eq("b") 9 | expect(scanner.scan(/b/)).to eq("b") 10 | expect(scanner.scan(/b/)).to eq(nil) 11 | end 12 | it "scan_before_or_eos" do 13 | expect(scanner.scan_before_or_eos(/z/)).to eq("aabcd") 14 | expect(scanner.eos?).to eq(true) 15 | end 16 | it "scan_before_or_eos move_after" do 17 | expect(scanner.scan_before_or_eos(/b/,true)).to eq("aa") 18 | expect(scanner.rest).to eq("cd") 19 | end 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /lib/rehtml/elements.rb: -------------------------------------------------------------------------------- 1 | module REHTML 2 | class Node 3 | end 4 | class Text < Node 5 | attr_reader :value 6 | def initialize(value) 7 | @value = value 8 | end 9 | end 10 | class CData < Text 11 | end 12 | class Tag < Node 13 | attr_reader :name, :attributes 14 | def initialize(name,attributes,empty) 15 | @name = name 16 | @attributes = attributes 17 | @empty = empty 18 | end 19 | def empty? 20 | @empty 21 | end 22 | end 23 | class EndTag < Tag 24 | end 25 | class Instruction < Node 26 | attr_reader :target, :content 27 | def initialize(target,content) 28 | @target = target 29 | @content = content 30 | end 31 | def is_xml_decl? 
32 | target.upcase == 'XML' 33 | end 34 | end 35 | class Comment < Node 36 | attr_reader :string 37 | def initialize(string) 38 | @string = string 39 | end 40 | end 41 | class DocType < Node 42 | def initialize 43 | end 44 | end 45 | end 46 | -------------------------------------------------------------------------------- /gen_entities.rb: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | require 'json' 3 | require 'open-uri' 4 | require 'kconv' 5 | 6 | url = "http://www.w3.org/TR/html5/entities.json" 7 | fname = File.join(File.dirname(__FILE__),"lib/rehtml/entities.rb") 8 | 9 | puts "Generete #{fname} from #{url}" 10 | 11 | # read 12 | json = JSON.parse(open(url).read).delete_if{|k,v| 13 | k !~ /;$/ 14 | } 15 | 16 | # write source 17 | entities = json.map{|k,v| 18 | "\"#{k.gsub(/^&/,'').gsub(/;$/,'')}\" => #{v["codepoints"].inspect}.pack( \"U*\" )" 19 | } 20 | max_size = json.keys.map{|a|a.length}.max 21 | open(fname,"w"){|f| 22 | f.write <<-CODE 23 | module REHTML 24 | module ENTITIES 25 | # generate from #{url} on #{Time.now} 26 | MAP = { 27 | #{entities.join(",\n ")} 28 | } 29 | REGEXP = /\\&(?:([a-zA-Z][a-zA-Z0-9]{1,#{max_size-1}})|#([0-9]{1,7})|#x([0-9a-f]{1,6}));/ 30 | end 31 | end 32 | CODE 33 | } 34 | 35 | # check 36 | require fname 37 | json.keys.map{|m| 38 | puts "#{m} is not match #{REHTML::ENTITIES::REGEXP}" if m !~ REHTML::ENTITIES::REGEXP 39 | } 40 | puts "done." 
41 | -------------------------------------------------------------------------------- /rehtml.gemspec: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | lib = File.expand_path('../lib', __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | require 'rehtml/version' 5 | description = open(File.dirname(__FILE__)+"/README.md").read.gsub(/^.*\n(Pure Ruby)/m,'\1').gsub(/\n##.*/m,"") 6 | 7 | Gem::Specification.new do |spec| 8 | spec.name = "rehtml" 9 | spec.version = REHTML::VERSION 10 | spec.authors = ["nazoking"] 11 | spec.email = ["nazoking@gmail.com"] 12 | spec.summary = description.split(/\n/)[0].strip 13 | spec.description = description 14 | spec.homepage = "https://github.com/nazoking/rehtml" 15 | spec.license = "MIT" 16 | 17 | spec.files = `git ls-files -z`.split("\x0") 18 | spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) } 19 | spec.test_files = spec.files.grep(%r{^(test|spec|features)/}) 20 | spec.require_paths = ["lib"] 21 | 22 | spec.add_development_dependency "bundler", "~> 1.5" 23 | spec.add_development_dependency "rake" 24 | spec.add_development_dependency "rspec" 25 | end 26 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014 nazoking 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or 
substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /spec/rehtml_parser_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | require 'rehtml' 3 | 4 | class ReHTML 5 | def initialize(str) 6 | @str = str 7 | end 8 | def to_s 9 | "parse [#{@str}]" 10 | end 11 | def to_rexml 12 | REHTML.to_rexml(@str).to_s 13 | end 14 | def doc 15 | REHTML.to_rexml(@str) 16 | end 17 | end 18 | def parse(str) 19 | ReHTML.new(str) 20 | end 21 | 22 | describe parse(%[ html a]) do 23 | its(:to_rexml){ should eq(%[ html a]) } 24 | end 25 | describe parse(%[html]) do 26 | its(:to_rexml){ should eq(%[html]) } 27 | end 28 | describe parse(%[htmla]) do 29 | its(:to_rexml){ should eq(%[htmla]) } 30 | its("doc.xml_decl.writethis"){ should be_false } 31 | end 32 | describe parse(%[ a]) do 33 | its(:to_rexml){ should eq(%[ a]) } 34 | its("doc.xml_decl.writethis"){ should be_true } 35 | end 36 | describe parse(%[]) do 37 | its(:to_rexml){ should eq(%[]) } 38 | end 39 | =begin 40 | describe %[index.jsp] do 41 | it{ 42 | doc = REHTML.to_rexml(open(File.join(File.dirname(__FILE__),'files','login.jsp')).read) 43 | formatter = REXML::Formatters::Pretty.new 44 | formatter.write(doc.root, $stdout) 45 | } 46 | end 47 | =end 48 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # REHTML 2 | 3 | [![Version](http://allthebadges.io/nazoking/rehtml/badge_fury.png)](http://allthebadges.io/nazoking/rehtml/badge_fury) 4 | [![Build Status](https://travis-ci.org/nazoking/rehtml.png?branch=master)](https://travis-ci.org/nazoking/rehtml) 5 | [![Coverage Status](https://coveralls.io/repos/nazoking/rehtml/badge.png?branch=master)](https://coveralls.io/r/nazoking/rehtml?branch=master) 6 | [![Code Climate](https://codeclimate.com/github/nazoking/rehtml.png)](https://codeclimate.com/github/nazoking/rehtml) 7 | [![Dependency Status](https://gemnasium.com/nazoking/rehtml.png)](https://gemnasium.com/nazoking/rehtml) 8 | 9 | Pure Ruby HTML parser. 10 | 11 | This library parses HTML and builds a REXML document. 12 | 13 | Nokogiri is very convenient, but its installation is complex because it needs to build a native library, which makes it unsuitable for use with Chef. 14 | 15 | ## Installation 16 | 17 | Add this line to your application's Gemfile: 18 | 19 | gem 'rehtml' 20 | 21 | And then execute: 22 | 23 | $ bundle 24 | 25 | Or install it yourself as: 26 | 27 | $ gem install rehtml 28 | 29 | ## Usage 30 | 31 | ``` 32 | doc = REHTML.to_rexml(open('https://github.com/nazoking/rehtml').read) 33 | ``` 34 | 35 | ## Contributing 36 | 37 | 1. Fork it ( http://github.com/nazoking/rehtml/fork ) 38 | 2. Create your feature branch (`git checkout -b my-new-feature`) 39 | 3. Commit your changes (`git commit -am 'Add some feature'`) 40 | 4. Push to the branch (`git push origin my-new-feature`) 41 | 5.
Create new Pull Request 42 | -------------------------------------------------------------------------------- /lib/rehtml/builder.rb: -------------------------------------------------------------------------------- 1 | require 'rexml/document' 2 | 3 | module REHTML 4 | class REXMLBuilder 5 | EMPTY_TAGS=Set.new %w[area base br col embed hr img input keygen link meta param source track wbr isindex basefont] 6 | CDATA_TAGS=Set.new %w[script style textarea xmp title] 7 | attr_reader :doc 8 | 9 | # build document use tokenizer 10 | def parse(tokenizer) 11 | @doc = REXML::Document.new 12 | @pos = @doc 13 | while node=tokenizer.next 14 | append(node) 15 | end 16 | end 17 | 18 | # append node to document 19 | def append(node) 20 | if node.is_a?(EndTag) 21 | return if empty_tag?(node.name) 22 | po = @pos 23 | while po.parent and po.name != node.name 24 | po = po.parent 25 | end 26 | if po.name == node.name 27 | @pos = po.parent 28 | end 29 | else 30 | rexml = to_rexml(node) 31 | 32 | # if node is second root element, add root element wrap html tag 33 | if rexml.is_a?(REXML::Element) and @pos == @doc and @doc.root 34 | if @doc.root.name != 'html' 35 | html = REXML::Element.new 36 | html.name = "html" 37 | i = @doc.root.index_in_parent-1 38 | while pos = @doc.delete_at(i) 39 | @doc.delete_element(pos) if pos.is_a?(REXML::Element) 40 | html << pos 41 | end 42 | @doc << html 43 | @pos = html 44 | end 45 | @pos = @doc.root 46 | end 47 | @pos << rexml 48 | if rexml.is_a?(REXML::Element) and !empty_tag?(node.name) and !node.empty? 49 | @pos = rexml 50 | end 51 | end 52 | end 53 | 54 | private 55 | 56 | def to_rexml(node) 57 | case node 58 | when Text 59 | REXML::Text.new(node.value, true) 60 | when CData 61 | REXML::CData.new(node.value) 62 | when Instruction 63 | if node.is_xml_decl? and ( @doc.xml_decl.nil? 
or !@doc.xml_decl.writethis ) 64 | begin 65 | return REXML::Document.new("").xml_decl 66 | rescue REXML::ParseException 67 | end 68 | end 69 | REXML::Instruction.new(node.target,node.content) 70 | when DocType 71 | REXML::Comment.new(node.raw) 72 | when Comment 73 | REXML::Comment.new(node.string) 74 | when Tag 75 | if cdata_tag?(@pos.name) 76 | REXML::Text.new(node.raw, true) 77 | else 78 | xml = REXML::Element.new 79 | xml.name = node.name 80 | xml.add_attributes(node.attributes) 81 | xml 82 | end 83 | else 84 | raise "unknown node type #{node}" 85 | end 86 | end 87 | 88 | def empty_tag?(tagname) 89 | EMPTY_TAGS.include?(tagname) 90 | end 91 | 92 | def cdata_tag?(tagname) 93 | CDATA_TAGS.include?(tagname) 94 | end 95 | end 96 | end 97 | -------------------------------------------------------------------------------- /lib/rehtml/tokenizer.rb: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | require 'rehtml/scanner' 3 | require 'rehtml/elements' 4 | require 'rehtml/entities' 5 | 6 | module REHTML 7 | module TokenInfo 8 | attr_reader :raw, :start_pos, :end_pos 9 | def set_token_info(bpos,scanner) 10 | @start_pos=bpos 11 | @end_pos= scanner.pos 12 | @raw = scanner.string[@start_pos...(@end_pos)] 13 | end 14 | end 15 | class Tokenizer 16 | # Create a new Tokenizer for the given text. 17 | def initialize(html) 18 | @scanner = Scanner.new(html) 19 | @bpos = 0 20 | end 21 | 22 | # Return the next token in the sequence, or +nil+ if there are no more tokens in 23 | # the stream. 24 | def next 25 | return nil if @scanner.eos? 26 | add_parse_info(@scanner.check(/<\S/) ? 
scan_element : scan_text) 27 | end 28 | 29 | private 30 | def add_parse_info(node) 31 | node.extend(TokenInfo) 32 | node.set_token_info(@bpos,@scanner) 33 | @bpos = @scanner.pos 34 | node 35 | end 36 | 37 | def scan_text 38 | Text.new(decode("#{@scanner.getch}#{@scanner.scan(/[^<]*/)}")) 39 | end 40 | 41 | # decode html entity 42 | def decode(html) 43 | html.gsub(ENTITIES::REGEXP){ 44 | if $1 45 | if ENTITIES::MAP[$1] 46 | ENTITIES::MAP[$1] 47 | else 48 | $& 49 | end 50 | elsif $2 51 | [$2.to_i(10)].pack('U') 52 | elsif $3 53 | [$3.to_i(16)].pack('U') 54 | else 55 | $& 56 | end 57 | } 58 | end 59 | 60 | def scan_element 61 | if @scanner.scan(//,true) 63 | Comment.new(comment) 64 | elsif @scanner.scan(//,true)) 66 | elsif @scanner.scan(//,true) 70 | Comment.new(comment) 71 | elsif @scanner.scan(/<\?/) # PI or xml decl 72 | scan_pi 73 | else 74 | scan_tag 75 | end 76 | end 77 | 78 | def scan_tag 79 | @scanner.scan(/<(\/)?([^\x20\x09\x0A\x0C\x0D>]*)/) 80 | is_end = @scanner[1] ? true : false 81 | name = @scanner[2] 82 | attrs = {} 83 | loop do 84 | @scanner.skip(/[\x20\x09\x0A\x0C\x0D]/) 85 | attr = @scanner.scan_before_or_eos(/[=>\x20\x09\x0A\x0C\x0D]|\/>/) 86 | matched = @scanner.matched 87 | if matched == '>' || matched.nil? || matched == '/>' 88 | attrs[attr.downcase]="" unless attr.empty? 89 | break 90 | end 91 | @scanner.skip(/[\x20\x09\x0A\x0C\x0D]/) 92 | if @scanner.scan(/=/) 93 | @scanner.skip(/[\x20\x09\x0A\x0C\x0D]/) 94 | if @scanner.scan(/['"]/) 95 | m = Regexp.compile(Regexp.quote(@scanner.matched)) 96 | value = @scanner.scan_before_or_eos(m, true) 97 | else 98 | value = @scanner.scan_before_or_eos(/[>\x20\x09\x0A\x0C\x0D]|\/>/) 99 | end 100 | else 101 | value = "" 102 | end 103 | attrs[attr.downcase]=decode(value) unless attr.empty? 104 | end 105 | empty = !@scanner.scan(/\//).nil? 
106 | @scanner.skip(/>/) 107 | if is_end 108 | EndTag.new(name.downcase,attrs,empty) 109 | else 110 | Tag.new(name.downcase,attrs,empty) 111 | end 112 | end 113 | def scan_pi 114 | # http://www.w3.org/TR/REC-xml/#NT-Name 115 | name = @scanner.scan(/([-:A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD0-9\u00B7\u0300-\u036F\u203F-\u2040]+)/) || "" 116 | body = @scanner.scan_before_or_eos(/\?>/,true) 117 | Instruction.new(name,body) 118 | end 119 | def scan_doctype 120 | # TODO complex doctype 121 | # https://github.com/ruby/ruby/blob/master/lib/rexml/parsers/baseparser.rb#L258 122 | # source = REXML::Source.new(doctype) 123 | # parser = REXML::Parsers::BaseParser.new(soucre) 124 | # while parser.document_status == in_doctype 125 | # parser.pull_event 126 | doctype = @scanner.scan_before_or_eos(/>/,true) 127 | DocType.new 128 | end 129 | end 130 | end 131 | -------------------------------------------------------------------------------- /spec/rehtml_tokenizer_spec.rb: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | require 'spec_helper' 3 | require 'rehtml/tokenizer' 4 | 5 | class TokenizeHelper 6 | def initialize(msg,str=nil); 7 | @msg = str.nil? ? "" : " #{msg}" 8 | @str = str.nil? ? msg : str 9 | end 10 | def to_s; "tokenize#{@msg} {#{@str}}"; end 11 | def first_token; REHTML::Tokenizer.new(@str).next; end 12 | def token_size 13 | t = REHTML::Tokenizer.new(@str) 14 | i = 0 15 | i += 1 until t.next.nil? 16 | i 17 | end 18 | def token(num) 19 | t = REHTML::Tokenizer.new(@str) 20 | num.times{|ii| 21 | token = t.next 22 | raise "token size is #{ii}" if token.nil? 
23 | } 24 | t.next 25 | end 26 | def method_missing(name, *args) 27 | if name.to_s =~ /^token(\d+)$/ 28 | token($1.to_i-1) 29 | else 30 | first_token.send(name, *args) 31 | end 32 | end 33 | end 34 | def tokenize(msg,str=nil); TokenizeHelper.new(msg,str); end 35 | 36 | describe tokenize(%[]) do 37 | its("first_token.raw"){ should eq(%[]) } 38 | its("first_token"){ should be_a(REHTML::Tag) } 39 | its(:name){ should eq("a") } 40 | its(:attributes){ should eq({ 41 | "type"=>"checkbox", 42 | "name"=>"be evil", 43 | "value"=>"yes", 44 | "disabled"=>""}) } 45 | its(:token_size){ should eq(1) } 46 | end 47 | describe tokenize(%[]) do 48 | its(:first_token){ should be_a(REHTML::Instruction) } 49 | its(:first_token){ should be_is_xml_decl } 50 | its(:token_size){ should eq(1) } 51 | end 52 | describe tokenize(%[]) do 53 | its(:token_size){ should eq(1) } 54 | its(:first_token){ should be_a(REHTML::Instruction) } 55 | its(:target){ should eq("php") } 56 | its(:content){ should eq(" hoge") } 57 | it{ should_not be_is_xml_decl } 58 | end 59 | describe tokenize(%[]) do 60 | its(:token_size){ should eq(1) } 61 | its(:first_token){ should be_a(REHTML::Instruction) } 62 | its(:target){ should eq("") } 63 | its(:content){ should eq(" huga") } 64 | it{ should_not be_is_xml_decl } 65 | end 66 | describe tokenize(%{}) do 67 | its(:token_size){ should eq(1) } 68 | its(:first_token){ should be_a(REHTML::Comment) } 69 | its("first_token.string"){ should eq(" comment ") } 70 | end 71 | describe tokenize(%{abc &a; & & − ' }) do 72 | its(:token_size){ should eq(1) } 73 | its(:first_token){ should be_a(REHTML::Text) } 74 | its(:value){ should eq(%[abc &a; & & − ' ]) } 75 | end 76 | describe tokenize(%{}) do 77 | its(:token_size){ should eq(1) } 78 | its(:first_token){ should be_a(REHTML::CData) } 79 | its(:value){ should eq(" cdata ") } 80 | end 81 | describe tokenize("unclosed comment",%[