├── .gitignore ├── .rvmrc ├── CHANGELOG.rdoc ├── Gemfile ├── LICENSE ├── Manifest.txt ├── README.rdoc ├── Rakefile ├── TODO ├── bin └── lorax ├── lib ├── lorax.rb └── lorax │ ├── delta.rb │ ├── delta │ ├── delete_delta.rb │ ├── insert_delta.rb │ └── modify_delta.rb │ ├── delta_set.rb │ ├── delta_set_generator.rb │ ├── fast_matcher.rb │ ├── match.rb │ ├── match_set.rb │ └── signature.rb ├── lorax.gemspec └── spec ├── fast_matcher_spec.rb ├── files ├── Michael-Dalessio-200909.html ├── Michael-Dalessio-201001.html ├── slashdot-1.html ├── slashdot-2.html ├── slashdot-3.html └── slashdot-4.html ├── integration └── lorax_spec.rb ├── match_spec.rb ├── spec.opts ├── spec_helper.rb └── unit ├── delta ├── delete_delta_spec.rb ├── insert_delta_spec.rb └── modify_delta_spec.rb ├── delta_set_generator_spec.rb ├── delta_set_spec.rb ├── lorax_spec.rb ├── match_set_spec.rb └── signature_spec.rb /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.tmproj 3 | tmtags 4 | *~ 5 | \#* 6 | .\#* 7 | *.swp 8 | coverage 9 | rdoc 10 | pkg 11 | .bundle 12 | Gemfile.lock 13 | -------------------------------------------------------------------------------- /.rvmrc: -------------------------------------------------------------------------------- 1 | rvm use --create 1.9.3@lorax 2 | 3 | -------------------------------------------------------------------------------- /CHANGELOG.rdoc: -------------------------------------------------------------------------------- 1 | = Changelog 2 | 3 | == 0.3.0 (unreleased) 4 | 5 | * Human-readable diffs. 6 | 7 | == 0.2.0 (2010-10-14) 8 | 9 | * Better handling of whitespace: blank text nodes are ignored, as is 10 | leading and trailing whitespace in text nodes. GH#2. 11 | 12 | == 0.1.0 (2010-03-09) 13 | 14 | * Happy Birthday! 15 | * Diffs and generates patches, and for trivial cases applies patches correctly. 
16 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | # -*- ruby -*- 2 | 3 | # DO NOT EDIT THIS FILE. Instead, edit Rakefile, and run `rake bundler:gemfile`. 4 | 5 | source "https://rubygems.org/" 6 | 7 | gem "nokogiri", ">=1.4" 8 | 9 | gem "rspec", "~>2.11", :group => [:development, :test] 10 | gem "rr", ">=1.0", :group => [:development, :test] 11 | gem "hoe-git", ">0", :group => [:development, :test] 12 | gem "hoe-gemspec", ">0", :group => [:development, :test] 13 | gem "hoe-bundler", ">0", :group => [:development, :test] 14 | gem "rdoc", ">=4.0", "<7", :group => [:development, :test] 15 | gem "hoe", "~>3.23", :group => [:development, :test] 16 | 17 | # vim: syntax=ruby 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2009-2019 Mike Dalessio 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /Manifest.txt: -------------------------------------------------------------------------------- 1 | CHANGELOG.rdoc 2 | LICENSE 3 | Manifest.txt 4 | README.rdoc 5 | Rakefile 6 | TODO 7 | bin/lorax 8 | lib/lorax.rb 9 | lib/lorax/delta.rb 10 | lib/lorax/delta/delete_delta.rb 11 | lib/lorax/delta/insert_delta.rb 12 | lib/lorax/delta/modify_delta.rb 13 | lib/lorax/delta_set.rb 14 | lib/lorax/delta_set_generator.rb 15 | lib/lorax/fast_matcher.rb 16 | lib/lorax/match.rb 17 | lib/lorax/match_set.rb 18 | lib/lorax/signature.rb 19 | spec/fast_matcher_spec.rb 20 | spec/files/Michael-Dalessio-200909.html 21 | spec/files/Michael-Dalessio-201001.html 22 | spec/files/slashdot-1.html 23 | spec/files/slashdot-2.html 24 | spec/files/slashdot-3.html 25 | spec/files/slashdot-4.html 26 | spec/integration/lorax_spec.rb 27 | spec/match_spec.rb 28 | spec/spec.opts 29 | spec/spec_helper.rb 30 | spec/unit/delta/delete_delta_spec.rb 31 | spec/unit/delta/insert_delta_spec.rb 32 | spec/unit/delta/modify_delta_spec.rb 33 | spec/unit/delta_set_generator_spec.rb 34 | spec/unit/delta_set_spec.rb 35 | spec/unit/lorax_spec.rb 36 | spec/unit/match_set_spec.rb 37 | spec/unit/signature_spec.rb 38 | -------------------------------------------------------------------------------- /README.rdoc: -------------------------------------------------------------------------------- 1 | = Lorax 2 | 3 | * http://github.com/flavorjones/lorax 4 | 5 | == Description 6 | 7 | The Lorax is a full diff and patch library for XML/HTML documents, based on Nokogiri. 
8 | 9 | It can tell you whether two XML/HTML documents are identical, or if 10 | they're not, tell you what's different. In trivial cases, it can even 11 | apply the patch. 12 | 13 | It's based loosely on Gregory Cobena's master's thesis paper, which 14 | generates deltas in less than O(n * log n) time, accepting some 15 | tradeoffs in the size of the delta set. You can find his paper at 16 | http://gregory.cobena.free.fr/www/Publications/thesis.html. 17 | 18 | "I am the Lorax, I speak for the trees." 19 | 20 | == Features / Problems 21 | 22 | * Detect differences between documents, or tell whether two documents are the same. 23 | * Generate patches for the differences between documents. 24 | * Apply patches for trivial cases. 25 | * More work needs to be done to make sure patches apply cleanly. 26 | 27 | == Synopsis 28 | 29 | Imagine you have two Nokogiri::XML::Documents. You can tell if they're identical: 30 | 31 | Lorax::Signature.new(doc1.root).signature == Lorax::Signature.new(doc2.root).signature 32 | 33 | You can generate a delta set (currently opaque (sorry kids)): 34 | 35 | delta_set = Lorax.diff(doc1, doc2) 36 | 37 | and apply the delta set as a patch to the original document: 38 | 39 | new_doc = delta_set.apply(doc1) 40 | 41 | == Requirements 42 | 43 | * Nokogiri 1.4.0 44 | 45 | == Install 46 | 47 | * sudo gem install lorax 48 | 49 | == License 50 | 51 | (The MIT License) 52 | 53 | Copyright (c) 2009-2012 Mike Dalessio 54 | 55 | Permission is hereby granted, free of charge, to any person obtaining 56 | a copy of this software and associated documentation files (the 57 | 'Software'), to deal in the Software without restriction, including 58 | without limitation the rights to use, copy, modify, merge, publish, 59 | distribute, sublicense, and/or sell copies of the Software, and to 60 | permit persons to whom the Software is furnished to do so, subject to 61 | the following conditions: 62 | 63 | The above copyright notice and this permission notice shall be 
64 | included in all copies or substantial portions of the Software. 65 | 66 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, 67 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 68 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 69 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 70 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 71 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 72 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 73 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "rubygems" 2 | require "hoe" 3 | 4 | Hoe.plugin :git 5 | Hoe.plugin :gemspec 6 | Hoe.plugin :bundler 7 | 8 | Hoe.spec "lorax" do 9 | developer "Mike Dalessio", "mike.dalessio@gmail.com" 10 | 11 | self.extra_rdoc_files = FileList["*.rdoc"] 12 | self.history_file = "CHANGELOG.rdoc" 13 | self.readme_file = "README.rdoc" 14 | self.licenses = ["MIT"] 15 | 16 | extra_deps << ["nokogiri", ">= 1.4"] 17 | extra_dev_deps << ["rspec", "~> 2.11"] 18 | extra_dev_deps << ["rr", ">= 1.0"] 19 | extra_dev_deps << ["hoe-git", "> 0"] 20 | extra_dev_deps << ["hoe-gemspec", "> 0"] 21 | extra_dev_deps << ["hoe-bundler", "> 0"] 22 | end 23 | 24 | task :redocs => :fix_css 25 | task :docs => :fix_css 26 | task :fix_css do 27 | better_css = <<-EOT 28 | .method-description pre { 29 | margin : 1em 0 ; 30 | } 31 | 32 | .method-description ul { 33 | padding : .5em 0 .5em 2em ; 34 | } 35 | 36 | .method-description p { 37 | margin-top : .5em ; 38 | } 39 | 40 | #main ul, div#documentation ul { 41 | list-style-type : disc ! IMPORTANT ; 42 | list-style-position : inside ! 
IMPORTANT ; 43 | } 44 | 45 | h2 + ul { 46 | margin-top : 1em; 47 | } 48 | EOT 49 | puts "* fixing css" 50 | File.open("doc/rdoc.css", "a") { |f| f.write better_css } 51 | end 52 | 53 | # vim: syntax=ruby 54 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | # -*-org-*- 2 | Lorax TODO 3 | 4 | * docs 5 | *** rdocs 6 | *** class description notes 7 | - Signature: calculate and persist signatures and weights for nodes in a single document 8 | - Match: represents a match between two nodes 9 | - MatchSet: composed of Signatures and Matches. 10 | - Matcher: an algorithm that operates on a MatchSet statelessly to generate matches. 11 | - Generator: generates a DeltaSet from a MatchSet 12 | - Delta: an atomic change step 13 | - DeltaSet: an ordered set of Deltas 14 | - Apply: f(doc1, DeltaSet) -> doc2 15 | *** algorithmic notes 16 | ***** ignoring ID 17 | - too many web sites fuck that up 18 | - libxml2 allows duplicate ids 19 | - algorithm would ignore changed content 20 | ***** indexes (ascendant lookahead) needs to be implemented? 21 | ***** if we do "phase 3" in weight-order, and recursively match parents, can't we avoid the "propagate to parent" step of phase 4? 22 | * core 23 | *** write integration test for MODIFY delta 24 | *** write integration test for DELETE delta 25 | *** write integration test for MODIFY delta with move 26 | *** change API to specify HTML or XML. or should we make user pass in Nokogirified docs? 27 | *** pick a hashing algorithm 28 | - ruby hash / md5 / sha1 29 | - benchmark? collision rate? 30 | * additional 31 | *** build an rspec matcher for xml 32 | *** build a test/unit assertion for xml 33 | *** try to make the code independent of the tree we're diffing 34 | think about diffing any tree, e.g. 
# AST, YAML 35 | *** benchmark suite so we can try different algorithms 36 |
# -------------------------------------------------------------------------------- /bin/lorax: --------------------------------------------------------------------------------
#! /usr/bin/env ruby

# Command-line driver: diffs the two files named on the command line, writes
# the delta descriptors to foo.yml and writes the first file, with the delta
# set applied, to foo.html (both relative to the current directory).
#
# Usage: lorax ORIGINAL_FILE MODIFIED_FILE

require "yaml" # to_yaml is used for the foo.yml summary below
require "lorax"

# Fail with a usage line instead of a TypeError from File.read(nil).
abort "usage: lorax ORIGINAL_FILE MODIFIED_FILE" unless ARGV.length >= 2

original_path, modified_path = ARGV
original_source = File.read(original_path) # read once, reused for the patched copy

delta_set = Lorax.diff(original_source, File.read(modified_path))
summary = delta_set.deltas.map do |d|
  d.descriptor
end

File.open("foo.yml", "w+") { |f| f.puts summary.to_yaml }
File.open("foo.html", "w+") do |f|
  doc = Nokogiri::HTML original_source
  delta_set.apply!(doc)
  f.puts doc.to_html
end

# -------------------------------------------------------------------------------- /lib/lorax.rb: --------------------------------------------------------------------------------
require 'nokogiri'

# Top-level namespace; loading this file also enforces the minimum Nokogiri
# version.
module Lorax
  VERSION = "0.3.0.rc2"
  REQUIRED_NOKOGIRI_VERSION = "1.4.0"
  raise LoadError, "lorax requires Nokogiri version #{REQUIRED_NOKOGIRI_VERSION} or higher" unless Gem::Version.new(Nokogiri::VERSION) >= Gem::Version.new(REQUIRED_NOKOGIRI_VERSION)
end

require "lorax/signature"
require "lorax/match"
require "lorax/match_set"
require "lorax/fast_matcher"

require "lorax/delta"
require "lorax/delta_set_generator"
require "lorax/delta_set"

module Lorax
  # Diffs two documents.
  #
  # Each argument may be a Nokogiri::XML::Document or anything that the
  # Nokogiri() parse helper accepts (see Lorax.documentize). Returns the
  # result of running FastMatcher over both documents and converting its
  # match set with #to_delta_set.
  def Lorax.diff(string_or_io_or_nokogiridoc_1, string_or_io_or_nokogiridoc_2)
    doc1 = documentize string_or_io_or_nokogiridoc_1
    doc2 = documentize string_or_io_or_nokogiridoc_2

    Lorax::FastMatcher.new(doc1, doc2).match.to_delta_set
  end

  # NOTE: `private` only affects instance methods defined after it, so the
  # singleton method below remains publicly callable. It is kept as a marker
  # of intent.
  private

  # Returns the argument untouched when it already is a
  # Nokogiri::XML::Document; otherwise hands it to Nokogiri() for parsing.
  def Lorax.documentize(string_or_io_or_nokogiridoc)
    if string_or_io_or_nokogiridoc.is_a?(Nokogiri::XML::Document)
      string_or_io_or_nokogiridoc
    else
      Nokogiri string_or_io_or_nokogiridoc
    end
  end
end
# --------------------------------------------------------------------------------
# /lib/lorax/delta.rb: --------------------------------------------------------------------------------
module Lorax
  # Abstract base class for a single change step. Subclasses implement
  # #apply! and #descriptor; this class supplies #inspect plus private helpers
  # for inserting nodes and rendering the context lines used by #to_s.
  class Delta
    # Raised by subclasses when an XPath lookup in the target document finds
    # nothing.
    class NodeNotFoundError < RuntimeError ; end

    # Applies this change to +document+ in place. Must be overridden.
    def apply!(document)
      raise NotImplementedError, self.class.to_s
    end

    # Debug representation: class name, object id in hex, and the descriptor.
    def inspect
      "#<#{self.class.name}:#{sprintf("0x%x", object_id)} #{descriptor.inspect}>"
    end

    private

    # Inserts a copy of +node+ under +parent+ so that it sits at index
    # +position+ among the parent's children; appends when the parent has no
    # children or +position+ is at or beyond the current child count.
    def insert_node(node, parent, position)
      children = parent.children
      if children.empty? || position >= children.length
        parent << node.dup
      else
        children[position].add_previous_sibling(node.dup)
      end
    end

    # Context line(s) shown before a change: the previous sibling's XML, or
    # the parent's opening tag when +node+ is the first child.
    # NOTE(review): the whitespace inside these string literals may have been
    # collapsed when the file was flattened - confirm against the repository.
    def context_before node
      if node.previous_sibling
        node.previous_sibling.to_xml.gsub(/^/,' ').rstrip
      else
        " <#{node.parent.name}>"
      end
    end

    # Context line(s) shown after a change: the next sibling's XML, or a
    # blank placeholder when +node+ is the last child.
    # NOTE(review): the fallback literal looks like it may have lost a closing
    # tag during extraction - confirm against the repository.
    def context_after node
      if node.next_sibling
        node.next_sibling.to_xml.gsub(/^/,' ').rstrip
      else
        " "
      end
    end
  end
end

require "lorax/delta/insert_delta"
require "lorax/delta/modify_delta"
require "lorax/delta/delete_delta"

# -------------------------------------------------------------------------------- /lib/lorax/delta/delete_delta.rb: --------------------------------------------------------------------------------
module Lorax
  # Removes one node from a document. The node to remove is located through
  # the XPath of the node supplied at construction time.
  class DeleteDelta < Delta
    attr_accessor :node

    def initialize(node)
      @node = node
    end

    # Unlinks the node found at +node.path+ in +document+.
    #
    # Raises NodeNotFoundError, carrying the looked-up path as its message,
    # when the document has no node at that path.
    def apply!(document)
      path = node.path
      target = document.at_xpath(path)
      # The original raised with `xpath`, which this class does not define,
      # so a missing target surfaced as NameError instead.
      raise NodeNotFoundError, path unless target
      target.unlink
    end

    # [:delete, {...}] summary with the node's XPath and serialized content.
    def descriptor
      [:delete, {:xpath => node.path, :content => node.to_s}]
    end

    # Diff-style rendering: header lines, leading context, the node's HTML
    # with each line prefixed by "- ", trailing context.
    def to_s
      response = []
      response << "--- #{node.path}"
      response << "+++"
      response << context_before(node)
      response << node.to_html.gsub(/^/,'- ').strip
      response << context_after(node)
      response.join("\n")
    end
  end
end
# -------------------------------------------------------------------------------- /lib/lorax/delta/insert_delta.rb: --------------------------------------------------------------------------------
module Lorax
  # Inserts a copy of a node under the parent found at +xpath+, at child
  # index +position+.
  class InsertDelta < Delta
    attr_accessor :node, :xpath, :position

    def initialize(node, xpath, position)
      @node = node
      @xpath = xpath
      @position = position
    end

    # Inserts the node into +document+.
    # Raises NodeNotFoundError when no parent exists at +xpath+.
    def apply!(document)
      # TODO: patch nokogiri to make inserting node copies efficient
      parent = document.at_xpath(xpath)
      raise NodeNotFoundError, xpath unless parent
      insert_node(node.dup, parent, position)
    end

    # [:insert, {...}] summary with target XPath, position and content.
    def descriptor
      [:insert, {:xpath => xpath, :position => position, :content => node.to_s}]
    end

    # Diff-style rendering with the node's HTML prefixed by "+ ".
    def to_s
      response = []
      response << "---"
      response << "+++ #{node.path}"
      response << context_before(node)
      response << node.to_html.gsub(/^/,'+ ').strip
      response << context_after(node)
      response.join("\n")
    end
  end
end

# -------------------------------------------------------------------------------- /lib/lorax/delta/modify_delta.rb: --------------------------------------------------------------------------------
module Lorax
  # Describes a node that exists in both documents but whose own content,
  # attributes and/or location differ. +node1+ is the old node, +node2+ the
  # new one.
  class ModifyDelta < Delta
    attr_accessor :node1, :node2

    def initialize(node1, node2)
      @node1 = node1
      @node2 = node2
    end

    # Rewrites the node found at +node1.path+ in +doc+ to look like +node2+.
    #
    # Text/CDATA nodes get node2's content; other nodes get node2's
    # attributes when they differ. When the two paths differ the node is also
    # moved under the parent located at node2's parent path.
    # Raises NodeNotFoundError when either lookup fails.
    def apply!(doc)
      node = doc.at_xpath(node1.path)
      raise NodeNotFoundError, node1.path unless node

      if node.text? || node.type == Nokogiri::XML::Node::CDATA_SECTION_NODE
        node.content = node2.content
      else
        attributes = attributes_hash(node)
        attributes2 = attributes_hash(node2)
        if attributes != attributes2
          attributes.each_key { |name| node.remove_attribute(name) }
          attributes2.each { |name, value| node[name] = value }
        end
      end

      if node1.path != node2.path
        position = node2.parent.children.index(node2)
        target_parent = doc.at_xpath(node2.parent.path)
        raise NodeNotFoundError, node2.parent.path unless target_parent
        node.unlink
        insert_node(node, target_parent, position)
      end
    end

    # [:modify, {:old => ..., :new => ...}] summary. Text/CDATA nodes report
    # their content; other nodes report name and attribute pairs.
    def descriptor
      if node1.text? || node1.type == Nokogiri::XML::Node::CDATA_SECTION_NODE
        [:modify, {:old => {:xpath => node1.path, :content => node1.to_s},
                   :new => {:xpath => node2.path, :content => node2.to_s}}]
      else
        [:modify, {:old => {:xpath => node1.path, :name => node1.name, :attributes => node1.attributes.map{|k,v| [k, v.value]}},
                   :new => {:xpath => node2.path, :name => node2.name, :attributes => node2.attributes.map{|k,v| [k, v.value]}}}]
      end
    end

    # Diff-style rendering: old HTML prefixed by "- ", new HTML by "+ ",
    # surrounded by node2's context.
    def to_s
      response = []
      response << "--- #{node1.path}"
      response << "+++ #{node2.path}"
      response << context_before(node2)

      response << node1.to_html.gsub(/^/,'- ').strip
      response << node2.to_html.gsub(/^/,'+ ').strip

      response << context_after(node2)
      response.join("\n")
    end

    private

    # Plain { name => value } hash built from the node's attribute objects.
    def attributes_hash(node)
      node.attributes.each_with_object({}) do |(name, attribute), hash|
        hash[name] = attribute.value
      end
    end
  end
end

# -------------------------------------------------------------------------------- /lib/lorax/delta_set.rb: --------------------------------------------------------------------------------
module Lorax
  # Ordered list of deltas that can be applied to a document as a patch.
  class DeltaSet
    attr_accessor :deltas

    def initialize
      @deltas = []
    end

    # Appends +delta+ to the set.
    def add(delta)
      @deltas << delta
    end

    # Applies every delta to a copy of +document+ and returns the copy.
    def apply(document)
      apply! document.dup
    end

    # Applies every delta, in order, to +document+ itself and returns it.
    def apply!(document)
      deltas.each do |delta|
        delta.apply! document
      end
      document
    end

    # Human-readable rendering: each delta's #to_s separated by a blank line.
    def to_s
      deltas.collect do |delta|
        delta.to_s
      end.join("\n\n")
    end
  end
end

# -------------------------------------------------------------------------------- /lib/lorax/delta_set_generator.rb: --------------------------------------------------------------------------------
module Lorax
  # Turns a MatchSet into a DeltaSet: inserts and modifications are derived
  # by walking the second document, deletions by walking the first.
  module DeltaSetGenerator
    # Builds and returns a DeltaSet for +match_set+.
    def self.generate_delta_set match_set
      delta_set = DeltaSet.new
      generate_inserts_and_moves_recursively delta_set, match_set, match_set.signature2.root
      generate_deletes_recursively delta_set, match_set, match_set.signature1.root
      delta_set
    end

    # NOTE: `private` has no effect on `def self.` methods; the helpers below
    # remain callable from outside the module.
    private

    # Walks the second document from +node+. Unmatched nodes become
    # InsertDeltas (and are not descended into); imperfectly matched nodes
    # become ModifyDeltas when their monograms differ, and their children are
    # visited. Perfectly matched subtrees are skipped.
    def self.generate_inserts_and_moves_recursively delta_set, match_set, node
      match = match_set.match node
      if match
        if ! match.perfect?
          if match_set.signature1.monogram(match.pair.first) != match_set.signature2.monogram(match.pair.last)
            delta_set.add ModifyDelta.new(match.pair.first, match.pair.last)
          end
          node.children.each { |child| generate_inserts_and_moves_recursively delta_set, match_set, child }
        end
      else
        delta_set.add InsertDelta.new(node, node.parent.path, node.parent.children.index(node)) # TODO: demeter violation
      end
    end

    # Walks the first document from +node+. Unmatched nodes become
    # DeleteDeltas; imperfectly matched nodes have their children visited;
    # perfectly matched subtrees are skipped.
    def self.generate_deletes_recursively delta_set, match_set, node
      match = match_set.match(node)
      if match
        return if match.perfect?
        node.children.each { |child| generate_deletes_recursively delta_set, match_set, child }
      else
        delta_set.add DeleteDelta.new(node)
      end
    end
  end
end

# -------------------------------------------------------------------------------- /lib/lorax/fast_matcher.rb: --------------------------------------------------------------------------------
module Lorax
  # Populates a MatchSet by pairing nodes of two documents: identical
  # subtrees are matched by signature, then matches are propagated to
  # ancestors and to same-named children.
  class FastMatcher
    attr_accessor :match_set

    # +dependency_injection+ may supply :matcher_match_set to replace the
    # MatchSet that is otherwise built from the two documents.
    def initialize(doc1, doc2, dependency_injection={})
      @document1 = doc1
      @document2 = doc2
      @match_set = dependency_injection[:matcher_match_set] || MatchSet.new(doc1, doc2, dependency_injection)
    end

    # Runs the matcher starting at the first document's root.
    def match
      match_node @document1.root
    end

    private

    # Matches +node1+ (a node of the first document) and, where needed, its
    # descendants. Returns nil when the node is already matched, otherwise
    # the match set.
    def match_node(node1)
      return if match_set.match(node1)
      signature1 = match_set.signature1.signature(node1) # assumes node1 is in signature1
      # A nil signature is what Signature#signature returns for a blank text
      # node. Signature#nodes(nil) would answer with the other document's
      # root, which would then be recorded as a perfect match for the blank
      # node, so such nodes get no candidates at all.
      candidates = signature1 ? (match_set.signature2.nodes(signature1) || []) : []
      candidates.reject! { |node| match_set.match(node) }

      if candidates.empty?
        node1.children.each do |child|
          match_node(child)
        end
        match = match_set.match(node1)
        propagate_to_children(match.pair.first, match.pair.last) if match
      else
        match_candidate(node1, candidates)
      end
      propagate_to_parent(node1) unless match_set.match(node1)
      match_set
    end

    # Among +candidates+ (identical subtrees in the second document), keeps
    # the one whose ancestors agree with node1's ancestors the furthest, and
    # records the whole chain of matches.
    def match_candidate(node1, candidates)
      ancestral_matches = candidates.collect do |node2|
        ancestral_match(node1, node2, depth(node2, match_set.signature2))
      end
      longest_trail = ancestral_matches.max { |a, b| a.length <=> b.length }
      longest_trail.each do |ancestral_match|
        match_set.add ancestral_match
      end
    end

    # Perfect match for the pair, followed by (imperfect) matches for up to
    # +max_depth+ ancestors for as long as their names agree and the document
    # node has not been reached.
    def ancestral_match(node1, node2, max_depth)
      matches = [Match.new(node1, node2, :perfect => true)]
      curr1, curr2 = node1.parent, node2.parent
      1.upto(max_depth) do
        break unless curr1.name == curr2.name && ! curr1.is_a?(Nokogiri::XML::Document)
        matches << Match.new(curr1, curr2)
        curr1, curr2 = curr1.parent, curr2.parent
      end
      matches
    end

    # Matches +node1+ with the parent of the heaviest matched child's partner
    # whose name equals node1's name; stops at the first such parent.
    def propagate_to_parent(node1)
      node1.children.sort_by { |child| match_set.signature1.weight(child) }.reverse.each do |child|
        next unless match = match_set.match(child)
        match_parent = match.pair.last.parent
        if match_parent.name == node1.name
          match_set.add Match.new(node1, match_parent)
          return
        end
      end
    end

    # Pairs still-unmatched children of two matched nodes: a name that
    # occurs exactly once on each side is paired directly, otherwise
    # same-named children are paired when they sit at the same child index.
    # Recurses into every pair it creates.
    def propagate_to_children(node1, node2)
      # TODO: OMG! MY EYES ARE BLEEDING! REFACTOR ME AND OPTIMIZE ME!
      children_set1 = collect_children_by_name(node1.children)
      children_set2 = collect_children_by_name(node2.children)

      children_set1.each do |name1, children1|
        children_set2.each do |name2, children2|
          next unless name1 == name2
          if children1.length == 1 && children2.length == 1
            match_set.add Match.new(children1.first, children2.first)
            propagate_to_children children1.first, children2.first
          else
            children1.each do |child1|
              children2.each do |child2|
                if node1.children.index(child1) == node2.children.index(child2)
                  match_set.add Match.new(child1, child2)
                  propagate_to_children child1, child2
                end
              end
            end
          end
        end
      end
    end

    # Number of ancestors worth checking for +node+: grows with the node's
    # share of the document weight and with the log of the signature size.
    def depth(node, sig)
      depth = 1 + Math.log(sig.size) * sig.weight(node) / sig.weight
      # puts "lorax: debug: #{__FILE__}:#{__LINE__}: depth #{depth} = 1 + #{Math.log(sig.size)} * #{sig.weight(node)} / #{sig.weight}"
      depth.to_i
    end

    # { name => [unmatched nodes with that name] } for +node_set+.
    def collect_children_by_name(node_set)
      collection = {}
      node_set.each do |child|
        next if match_set.match(child)
        (collection[child.name] ||= []) << child
      end
      collection
    end
  end
end

# -------------------------------------------------------------------------------- /lib/lorax/match.rb: --------------------------------------------------------------------------------
module Lorax
  # A pairing of one node from each document. A "perfect" match means the
  # pair was matched by identical signature.
  class Match
    attr_accessor :pair

    # +options+ accepts :perfect; any truthy value marks the match perfect.
    def initialize(node1, node2, options={})
      @pair = [node1, node2]
      @perfect = options[:perfect] ? true : false
    end

    def perfect?
      @perfect
    end

    # Returns the partner of +node+ within the pair, or nil when +node+ is
    # not part of this match.
    def other(node)
      case node
      when pair.first then pair.last
      when pair.last then pair.first
      else nil
      end
    end
  end
end

# -------------------------------------------------------------------------------- /lib/lorax/match_set.rb: --------------------------------------------------------------------------------
module Lorax
  # Holds the signatures of both documents plus every Match found so far,
  # indexed by each node of the pair.
  class MatchSet
    attr_accessor :signature1, :signature2

    # +dependency_injection+ may supply :match_set_signature1 and
    # :match_set_signature2 to replace the computed signatures.
    def initialize(doc1, doc2, dependency_injection={})
      @document1 = doc1
      @document2 = doc2
      @signature1 = dependency_injection[:match_set_signature1] || Lorax::Signature.new(@document1.root)
      @signature2 = dependency_injection[:match_set_signature2] || Lorax::Signature.new(@document2.root)
      @matches = {}
    end

    # The Match that +node+ takes part in, or nil.
    def match(node)
      @matches[node]
    end

    # Debugging aid: sorted [path1, path2, perfect?] triples, one per match.
    # The leftover debug line that was printed on every call is gone.
    def matches
      @matches.values.uniq.collect {|m| [m.pair.first.path, m.pair.last.path, m.perfect?]}.sort
    end

    # Registers +match+ under both of its nodes.
    def add(match)
      match.pair.each { |node| @matches[node] = match }
    end

    def to_delta_set
      DeltaSetGenerator.generate_delta_set(self)
    end
  end
end

# -------------------------------------------------------------------------------- /lib/lorax/signature.rb: --------------------------------------------------------------------------------
require 'digest/sha1'

module Lorax
  # Computes and caches, for every node of a subtree, a SHA1 signature
  # (covering descendants), a monogram (name and attributes only) and a
  # weight.
  class Signature
    SEP = "\0"

    # Computes signatures for the subtree rooted at +node+ when one is given.
    def initialize(node=nil)
      @signatures = {} # node => signature
      @monograms = {} # node => monogram (signature not including children)
      @nodes = {} # signature => [node, ...]
      @weights = {} # node => weight
      @size = 0
      @node = node
      signature(node) if node
    end

    def root
      @node
    end

    # Nodes carrying signature +sig+ (nil when there are none); without an
    # argument, a one-element array holding the root.
    def nodes(sig=nil)
      sig ? @nodes[sig] : [@node]
    end

    # Number of nodes a signature has been recorded for.
    def size
      @size
    end

    # Signature of +node+, computed recursively and cached. Returns nil for
    # text nodes that are blank after stripping; those are not recorded.
    # Raises ArgumentError for non-nodes and unsupported node types.
    def signature(node=@node)
      return @signatures[node] if @signatures.key?(node)
      raise ArgumentError, "signature expects a Node, but received #{node.inspect}" unless node.is_a?(Nokogiri::XML::Node)

      if node.text?
        content = node.content.strip
        if content.empty?
          return nil
        else
          monogram = signature = hashify(content)
        end
      elsif node.cdata? || node.comment?
        monogram = signature = hashify(node.content)
      elsif node.type == Nokogiri::XML::Node::ENTITY_REF_NODE
        monogram = signature = hashify(node.to_html)
      elsif node.element?
        children_sig = hashify(node.children.collect { |child| signature(child) }.compact)
        attr_sig = hashify(node.attributes.sort.collect { |k,v| [k, v.value] }.flatten)
        monogram = hashify(node.name, attr_sig)
        signature = hashify(node.name, attr_sig, children_sig)
      else
        raise ArgumentError, "signature expects an element, text, cdata or comment node, but received #{node.class}"
      end

      @size += 1
      weight(node)

      (@nodes[signature] ||= []) << node
      @monograms[node] = monogram
      @signatures[node] = signature
    end

    # Weight of +node+, cached. Blank text weighs 0; other text, comments and
    # CDATA weigh 1 + ln(length); entity references weigh 1; elements weigh 1
    # plus the weights of their children.
    # Raises ArgumentError for non-nodes and unsupported node types.
    def weight(node=@node)
      return @weights[node] if @weights.key?(node)
      raise ArgumentError, "weight expects a Node, but received #{node.inspect}" unless node.is_a?(Nokogiri::XML::Node)

      if node.text?
        content = node.content.strip
        if content.empty?
          calculated_weight = 0
        else
          calculated_weight = 1 + Math.log(content.length)
        end
      elsif node.cdata? || node.comment?
        # Math.log(0) is -Infinity, which would propagate into every
        # ancestor's weight; an empty comment/CDATA weighs the same as a
        # one-character one instead.
        length = node.content.length
        calculated_weight = length.zero? ? 1 : 1 + Math.log(length)
      elsif node.type == Nokogiri::XML::Node::ENTITY_REF_NODE
        calculated_weight = 1
      elsif node.element?
        calculated_weight = node.children.inject(1) { |sum, child| sum + weight(child) }
      else
        raise ArgumentError, "weight expects an element, text, cdata or comment node, but received #{node.class}"
      end

      @weights[node] = calculated_weight
    end

    # Monogram of +node+; computes the node's signature first when needed.
    def monogram(node=@node)
      return @monograms[node] if @monograms.key?(node)
      signature(node)
      @monograms[node]
    end

    def set_signature(node, value) # :nodoc: for testing
      (@nodes[value] ||= []) << node
      @signatures[node] = value
    end

    def set_weight(node, value) # :nodoc: for testing
      @weights[node] = value
    end

    private

    # SHA1 hex digest of the arguments: a single array or several arguments
    # are joined with SEP first, a single non-array value is digested as is.
    def hashify(*args)
      if args.length == 1
        if args.first.is_a?(Array)
          Digest::SHA1.hexdigest args.first.join(SEP)
        else
          Digest::SHA1.hexdigest args.first
        end
      else
        Digest::SHA1.hexdigest args.join(SEP)
      end
    end
  end
end

# -------------------------------------------------------------------------------- /lorax.gemspec: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # stub: lorax 0.3.0.rc2.20210907092457 ruby lib 3 | 4 | Gem::Specification.new do |s| 5 | s.name = "lorax".freeze 6 | s.version = "0.3.0.rc2.20210907092457" 7 | 8 | s.required_rubygems_version = Gem::Requirement.new("> 1.3.1".freeze) if s.respond_to? :required_rubygems_version= 9 | s.metadata = { "homepage_uri" => "http://github.com/flavorjones/lorax" } if s.respond_to? :metadata= 10 | s.require_paths = ["lib".freeze] 11 | s.authors = ["Mike Dalessio".freeze] 12 | s.date = "2021-09-07" 13 | s.description = "The Lorax is a full diff and patch library for XML/HTML documents, based on Nokogiri.\n\nIt can tell you whether two XML/HTML documents are identical, or if\nthey're not, tell you what's different. 
# -*- encoding: utf-8 -*-
# stub: lorax 0.3.0.rc2.20210907092457 ruby lib

# Gem specification for lorax. The dependency list below mirrors the Gemfile:
# nokogiri is the only runtime dependency, everything else is development-only.
Gem::Specification.new do |s|
  s.name = "lorax".freeze
  s.version = "0.3.0.rc2.20210907092457"

  s.required_rubygems_version = Gem::Requirement.new("> 1.3.1".freeze) if s.respond_to? :required_rubygems_version=
  s.metadata = { "homepage_uri" => "http://github.com/flavorjones/lorax" } if s.respond_to? :metadata=
  s.require_paths = ["lib".freeze]
  s.authors = ["Mike Dalessio".freeze]
  s.date = "2021-09-07"
  s.description = "The Lorax is a full diff and patch library for XML/HTML documents, based on Nokogiri.\n\nIt can tell you whether two XML/HTML documents are identical, or if\nthey're not, tell you what's different. In trivial cases, it can even\napply the patch.\n\nIt's based loosely on Gregory Cobena's master's thesis paper, which\ngenerates deltas in less than O(n * log n) time, accepting some\ntradeoffs in the size of the delta set. You can find his paper at\nhttp://gregory.cobena.free.fr/www/Publications/thesis.html.\n\n\"I am the Lorax, I speak for the trees.\"".freeze
  s.email = ["mike.dalessio@gmail.com".freeze]
  s.executables = ["lorax".freeze]
  s.extra_rdoc_files = ["CHANGELOG.rdoc".freeze, "Manifest.txt".freeze, "README.rdoc".freeze, "CHANGELOG.rdoc".freeze, "README.rdoc".freeze]
  s.files = ["CHANGELOG.rdoc".freeze, "LICENSE".freeze, "Manifest.txt".freeze, "README.rdoc".freeze, "Rakefile".freeze, "TODO".freeze, "bin/lorax".freeze, "lib/lorax.rb".freeze, "lib/lorax/delta.rb".freeze, "lib/lorax/delta/delete_delta.rb".freeze, "lib/lorax/delta/insert_delta.rb".freeze, "lib/lorax/delta/modify_delta.rb".freeze, "lib/lorax/delta_set.rb".freeze, "lib/lorax/delta_set_generator.rb".freeze, "lib/lorax/fast_matcher.rb".freeze, "lib/lorax/match.rb".freeze, "lib/lorax/match_set.rb".freeze, "lib/lorax/signature.rb".freeze, "spec/fast_matcher_spec.rb".freeze, "spec/files/Michael-Dalessio-200909.html".freeze, "spec/files/Michael-Dalessio-201001.html".freeze, "spec/files/slashdot-1.html".freeze, "spec/files/slashdot-2.html".freeze, "spec/files/slashdot-3.html".freeze, "spec/files/slashdot-4.html".freeze, "spec/integration/lorax_spec.rb".freeze, "spec/match_spec.rb".freeze, "spec/spec.opts".freeze, "spec/spec_helper.rb".freeze, "spec/unit/delta/delete_delta_spec.rb".freeze, "spec/unit/delta/insert_delta_spec.rb".freeze, "spec/unit/delta/modify_delta_spec.rb".freeze, "spec/unit/delta_set_generator_spec.rb".freeze, "spec/unit/delta_set_spec.rb".freeze, "spec/unit/lorax_spec.rb".freeze, "spec/unit/match_set_spec.rb".freeze, "spec/unit/signature_spec.rb".freeze]
  s.homepage = "http://github.com/flavorjones/lorax".freeze
  s.licenses = ["MIT".freeze]
  s.rdoc_options = ["--main".freeze, "README.rdoc".freeze]
  s.rubygems_version = "3.2.15".freeze
  s.summary = "The Lorax is a full diff and patch library for XML/HTML documents, based on Nokogiri".freeze

  if s.respond_to? :specification_version then
    s.specification_version = 4
  end

  # Specifications that cannot distinguish runtime from development
  # dependencies fall back to plain add_dependency for everything.
  if s.respond_to? :add_runtime_dependency then
    s.add_runtime_dependency(%q<nokogiri>.freeze, [">= 1.4"])
    s.add_development_dependency(%q<rspec>.freeze, ["~> 2.11"])
    s.add_development_dependency(%q<rr>.freeze, [">= 1.0"])
    s.add_development_dependency(%q<hoe-git>.freeze, ["> 0"])
    s.add_development_dependency(%q<hoe-gemspec>.freeze, ["> 0"])
    s.add_development_dependency(%q<hoe-bundler>.freeze, ["> 0"])
    s.add_development_dependency(%q<rdoc>.freeze, [">= 4.0", "< 7"])
    s.add_development_dependency(%q<hoe>.freeze, ["~> 3.23"])
  else
    s.add_dependency(%q<nokogiri>.freeze, [">= 1.4"])
    s.add_dependency(%q<rspec>.freeze, ["~> 2.11"])
    s.add_dependency(%q<rr>.freeze, [">= 1.0"])
    s.add_dependency(%q<hoe-git>.freeze, ["> 0"])
    s.add_dependency(%q<hoe-gemspec>.freeze, ["> 0"])
    s.add_dependency(%q<hoe-bundler>.freeze, ["> 0"])
    s.add_dependency(%q<rdoc>.freeze, [">= 4.0", "< 7"])
    s.add_dependency(%q<hoe>.freeze, ["~> 3.23"])
  end
end
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')

# Specs for Lorax::FastMatcher. The xml builder and the assert_*_match_exists
# helpers are not defined in this file (presumably they come from spec_helper).
describe Lorax::FastMatcher do
  describe ".new" do
    context "normal usage" do
      it "takes two arguments" do
        proc { Lorax::FastMatcher.new(xml{root}) }.should raise_error(ArgumentError)
        proc { Lorax::FastMatcher.new(xml{root}, xml{root}) }.should_not raise_error(ArgumentError)
      end

      it "builds a MatchSet for the documents" do
        doc1 = xml { root1 }
        doc2 = xml { root2 }
        mock.proxy(Lorax::MatchSet).new(doc1, doc2, anything)
        Lorax::FastMatcher.new(doc1, doc2)
      end
    end

    context "dependency injection" do
      it "takes an optional third argument for dependency injection" do
        proc { Lorax::FastMatcher.new(xml{root}, xml{root}, {:foo => :bar}) }.should_not raise_error(ArgumentError)
      end

      it "will use the value of ':matcher_match_set' for @match_set" do
        matcher = Lorax::FastMatcher.new(xml{root}, xml{root}, {:matcher_match_set => :foo})
        matcher.match_set.should == :foo
      end
    end
  end

  describe "basic node matching" do
    context "simple matches" do
      before do
        @doc1 = xml { root1 {
            a1
            b1
          } }
        @doc2 = xml { root2 {
            a1
            b2
          } }
        @signature1 = Lorax::Signature.new(@doc1.root)
        @signature1.set_signature(@doc1.at_css("root1"), "root1")
        @signature1.set_signature(@doc1.at_css("a1"), "a1")
        @signature1.set_signature(@doc1.at_css("b1"), "b1")
        @signature2 = Lorax::Signature.new(@doc2.root)
        @signature2.set_signature(@doc2.at_css("root2"), "root2")
        @signature2.set_signature(@doc2.at_css("a1"), "a1")
        @signature2.set_signature(@doc2.at_css("b2"), "b2")
      end

      it "matches identical nodes" do
        match_set = Lorax::FastMatcher.new(@doc1, @doc2,
          :match_set_signature1 => @signature1,
          :match_set_signature2 => @signature2).match
        assert_perfect_match_exists match_set, @doc1.at_css("a1"), @doc2.at_css("a1")
      end

      it "does not match different nodes" do
        match_set = Lorax::FastMatcher.new(@doc1, @doc2,
          :match_set_signature1 => @signature1,
          :match_set_signature2 => @signature2).match
        assert_no_match_exists match_set, @doc1.at_css("b1"), @doc2.at_css("b2")
      end
    end

    context "sibling matches" do
      it "matches all identical siblings" do
        doc1 = xml { root {
            a1_1 ; a1_3 ; a1_5
          } }
        doc2 = xml { root {
            a2_1 ; a2_2 ; a2_3 ; a2_4 ; a2_5
          } }
        signature1 = Lorax::Signature.new(doc1.root)
        signature1.set_signature(doc1.at_css("a1_1"), "a1")
        signature1.set_signature(doc1.at_css("a1_3"), "a3")
        signature1.set_signature(doc1.at_css("a1_5"), "a5")

        signature2 = Lorax::Signature.new(doc2.root)
        signature2.set_signature(doc2.at_css("a2_1"), "a1")
        signature2.set_signature(doc2.at_css("a2_3"), "a3")
        signature2.set_signature(doc2.at_css("a2_5"), "a5")

        match_set = Lorax::FastMatcher.new(doc1, doc2,
          :match_set_signature1 => signature1, :match_set_signature2 => signature2).match
        assert_perfect_match_exists match_set, doc1.at_css("a1_1"), doc2.at_css("a2_1")
        assert_perfect_match_exists match_set, doc1.at_css("a1_3"), doc2.at_css("a2_3")
        assert_perfect_match_exists match_set, doc1.at_css("a1_5"), doc2.at_css("a2_5")
      end
    end

    context "matching children of an unmatched node" do
      it "matches those children" do
        doc1 = xml { root {
            a1 {
              b1 ; b2
            }
          } }
        doc2 = xml { root {
            a2 {
              b1 ; b2
            }
          } }
        signature1 = Lorax::Signature.new(doc1.root)
        signature1.set_signature(doc1.at_css("a1"), "a1")
        signature1.set_signature(doc1.at_css("b1"), "b1")
        signature1.set_signature(doc1.at_css("b2"), "b2")

        signature2 = Lorax::Signature.new(doc2.root)
        # a2 belongs to doc2, so it is registered on doc2's signature.
        signature2.set_signature(doc2.at_css("a2"), "a2")
        signature2.set_signature(doc2.at_css("b1"), "b1")
        signature2.set_signature(doc2.at_css("b2"), "b2")

        match_set = Lorax::FastMatcher.new(doc1, doc2,
          :match_set_signature1 => signature1, :match_set_signature2 => signature2).match
        assert_perfect_match_exists match_set, doc1.at_css("b1"), doc2.at_css("b1")
        assert_perfect_match_exists match_set, doc1.at_css("b2"), doc2.at_css("b2")
      end
    end

    context "nested matches" do
      before do
        @doc1 = xml { root1 { a1 { b1 } } }
        @doc2 = xml { root2 { a1 { b1 } } }
        @signature1 = Lorax::Signature.new(@doc1.root)
        @signature1.set_signature(@doc1.at_css("a1"), "a1")
        @signature1.set_signature(@doc1.at_css("b1"), "b1")
        @signature2 = Lorax::Signature.new(@doc2.root)
        @signature2.set_signature(@doc2.at_css("a1"), "a1")
        @signature2.set_signature(@doc2.at_css("b1"), "b2")
      end

      it "matches the root nodes of the largest identical subtree" do
        match_set = Lorax::FastMatcher.new(@doc1, @doc2,
          :match_set_signature1 => @signature1, :match_set_signature2 => @signature2).match
        assert_perfect_match_exists match_set, @doc1.at_css("a1"), @doc2.at_css("a1")
      end

      it "does not match children of identical match nodes" do
        match_set = Lorax::FastMatcher.new(@doc1, @doc2,
          :match_set_signature1 => @signature1, :match_set_signature2 => @signature2).match
        assert_no_match_exists match_set, @doc1.at_css("b1"), @doc2.at_css("b1")
      end
    end
  end

  describe "forced parent matching" do
    before do
      stub.instance_of(Lorax::FastMatcher).propagate_to_parent # we're not testing propagation to parent
    end

    it "forces a match when parent names are the same but attributes are different" do
      doc1 = xml { root { a1(:foo => "bar") { b1 } } }
      doc2 = xml { root { a1(:bazz => "quux") { b1 } } }
      match_set = Lorax::FastMatcher.new(doc1, doc2).match
      assert_perfect_match_exists match_set, doc1.at_css("b1"), doc2.at_css("b1")
      assert_forced_match_exists match_set, doc1.at_css("a1"), doc2.at_css("a1")
    end

    it "forces a match when parent names and attributes are the same but siblings are different" do
      doc1 = xml { root { a1(:foo => "bar") { b1 ; b2 } } }
      doc2 = xml { root { a1(:foo => "bar") { b1 ; b3 } } }
      match_set = Lorax::FastMatcher.new(doc1, doc2).match
      assert_perfect_match_exists match_set, doc1.at_css("b1"), doc2.at_css("b1")
      assert_forced_match_exists match_set, doc1.at_css("a1"), doc2.at_css("a1")
    end

    describe "subsequent forced child matching" do
      it "force matches a uniquely-named sibling" do
        doc1 = xml { root { a1 {
              b2 "goodbye"
              b1 "hello"
              b3
              b4
            } } }
        doc2 = xml { root { a1 {
              b2 "good boy"
              b1 "hello"
              b3 "something"
              b4 { c1 }
            } } }
        match_set = Lorax::FastMatcher.new(doc1, doc2).match
        assert_perfect_match_exists match_set, doc1.at_css("b1"), doc2.at_css("b1")
        assert_forced_match_exists match_set, doc1.at_css("a1"), doc2.at_css("a1")
        assert_forced_match_exists match_set, doc1.at_css("b2"), doc2.at_css("b2")
        assert_forced_match_exists match_set, doc1.at_css("b3"), doc2.at_css("b3")
        assert_forced_match_exists match_set, doc1.at_css("b4"), doc2.at_css("b4")
      end

      it "force matches recursively" do
        doc1 = xml { root { a1 ; a2 { b2 "hello" } } }
        doc2 = xml { root { a1 ; a2 { b2 "goodbye" } } }
        match_set = Lorax::FastMatcher.new(doc1, doc2).match
        assert_perfect_match_exists match_set, doc1.at_css("a1"), doc2.at_css("a1")
        assert_forced_match_exists match_set, doc1.at_css("a2"), doc2.at_css("a2")
        assert_forced_match_exists match_set, doc1.at_css("b2"), doc2.at_css("b2")
        assert_forced_match_exists match_set, doc1.at_xpath("//b2/text()"), doc2.at_xpath("//b2/text()")
      end

      it "should match uniquely-named unmatched children" do
        doc1 = xml { root {
            a1 "hello"
            a2 "goodbye"
            a3 "natch"
          } }
        doc2 = xml { root {
            a1 "hello"
            a3 "not"
          } }
        match_set = Lorax::FastMatcher.new(doc1, doc2).match
        assert_perfect_match_exists match_set, doc1.at_css("a1"), doc2.at_css("a1")
        assert_forced_match_exists match_set, doc1.at_css("a3"), doc2.at_css("a3")
      end

      it "should match same-named children in the same position, even if they are not uniquely named" do
        doc1 = xml { root {
            a1 {
              text "hello"
              b1 "foo"
              text "goodbye"
            }
          } }
        doc2 = xml { root {
            a1 {
              text "bonjour"
              b1 "foo"
              text "au revoir"
            }
          } }
        match_set = Lorax::FastMatcher.new(doc1, doc2).match
        assert_forced_match_exists match_set, doc1.at_xpath("/root/a1/text()[1]"), doc2.at_xpath("/root/a1/text()[1]")
        assert_forced_match_exists match_set, doc1.at_xpath("/root/a1/text()[2]"), doc2.at_xpath("/root/a1/text()[2]")
      end

      it "large subtree matches force more parent matches than smaller subtree matches" do
        small_doc1 = xml { root { a1 { b1 { c1 { d1 { e1 {
                      f1
                      f2
                    } } } } } } }
        small_doc2 = xml { root { a1 { b1 { c1 { d1 { e1 {
                      f1
                      f3
                    } } } } } } }
        large_doc1 = xml { root { a1 { b1 { c1 { d1 { e1 {
                      f1
                      f2
                    } } } } } } }
        large_doc2 = xml { root { a1 { b1 { c1 { d1 { e1 {
                      f1
                      f3
                    } } } } } } }

        # The documents are identical in shape; only the injected weight of
        # f1 differs between the "small" and "large" pairs.
        small_signature1 = Lorax::Signature.new(small_doc1.root)
        small_signature1.set_weight(small_doc1.at_css("f1"), 1)
        small_signature2 = Lorax::Signature.new(small_doc2.root)
        small_signature2.set_weight(small_doc2.at_css("f1"), 1)
        large_signature1 = Lorax::Signature.new(large_doc1.root)
        large_signature1.set_weight(large_doc1.at_css("f1"), 10)
        large_signature2 = Lorax::Signature.new(large_doc2.root)
        large_signature2.set_weight(large_doc2.at_css("f1"), 10)

        small_match_set = Lorax::FastMatcher.new(small_doc1, small_doc2,
          :match_set_signature1 => small_signature1, :match_set_signature2 => small_signature2).match
        large_match_set = Lorax::FastMatcher.new(large_doc1, large_doc2,
          :match_set_signature1 => large_signature1, :match_set_signature2 => large_signature2).match

        assert_forced_match_exists small_match_set, small_doc1.at_css("e1"), small_doc2.at_css("e1")
        assert_no_match_exists small_match_set, small_doc1.at_css("d1"), small_doc2.at_css("d1")

        assert_forced_match_exists large_match_set, large_doc1.at_css("e1"), large_doc2.at_css("e1")
        assert_forced_match_exists large_match_set, large_doc1.at_css("d1"), large_doc2.at_css("d1")
        assert_forced_match_exists large_match_set, large_doc1.at_css("c1"), large_doc2.at_css("c1")
        assert_no_match_exists large_match_set, large_doc1.at_css("b1"), large_doc2.at_css("b1")
      end
    end
  end

  describe "propagating matches to unmatched parents based on children's matches' parents" do
    context "when there is only one child" do
      it "should match parents all the way up the tree" do
        doc1 = xml { root { a1 { b1 { c1 { d1 { e1 {
                      f1 "hello"
                      f2
                    } } } } } } }
        doc2 = xml { root { a1 { b1 { c1 { d1 { e1 {
                      f1 "hello"
                      f3
                    } } } } } } }
        match_set = Lorax::FastMatcher.new(doc1, doc2).match
        assert_perfect_match_exists match_set, doc1.at_css("f1"), doc2.at_css("f1")
        %w[e1 d1 c1 b1 a1 root].each do |node_name|
          assert_forced_match_exists match_set, doc1.at_css(node_name), doc2.at_css(node_name)
        end
      end
    end

    context "there are many possible children" do
      it "should match via children with largest weight" do
        doc1 = xml { root {
            a1 { b1 ; b2 }
          } }
        doc2 = xml { root {
            a1 { b1 ; b3 }
            a1 { b2 ; b4 }
          } }
        signature1 = Lorax::Signature.new(doc1.root)
        signature2 = Lorax::Signature.new(doc2.root)
        signature1.set_weight(doc1.at_css("b1"), 10)
        signature1.set_weight(doc1.at_css("b2"), 100)
        signature2.set_weight(doc2.at_css("b1"), 10)
        signature2.set_weight(doc2.at_css("b2"), 100)

        match_set = Lorax::MatchSet.new(doc1, doc2, :match_set_signature1 => signature1, :match_set_signature2 => signature2)
        match_set.add Lorax::Match.new(doc1.at_css("b1"), doc2.at_css("b1"))
        match_set.add Lorax::Match.new(doc1.at_css("b2"), doc2.at_css("b2"))

        match_set = Lorax::FastMatcher.new(doc1, doc2, :matcher_match_set => match_set).match
        assert_forced_match_exists match_set, doc1.at_css("a1"), doc2.at_xpath("//a1[2]")
      end
    end
  end

  describe "choosing the best among multiple possible matches" do
    context "no match's parent is same-named" do
      it "we don't care which node we match, just pick one" do
        doc1 = xml { root {
            a1 { b1 }
          } }
        doc2 = xml { root {
            a2 { b1 }
            a3 { b1 }
          } }
        signature1 = Lorax::Signature.new(doc1.root)
        signature2 = Lorax::Signature.new(doc2.root)
        signature1.set_signature(doc1.at_xpath("//b1"), "b1")
        signature2.set_signature(doc2.at_xpath("//a2/b1"), "b1")
        signature2.set_signature(doc2.at_xpath("//a3/b1"), "b1")
        match_set = Lorax::FastMatcher.new(doc1, doc2,
          :match_set_signature1 => signature1, :match_set_signature2 => signature2).match
        match_set.match(doc1.at_css("b1")).other(doc1.at_css("b1")).name.should == "b1"
      end
    end

    context "one match's parent is same-named" do
      it "matches the node with the same-named parent" do
        doc1 = xml { root {
            a2 { b1 ; b2 }
          } }
        doc2 = xml { root {
            a1 { b1 }
            a2 { b1 }
            a3 { b1 }
          } }
        match_set = Lorax::FastMatcher.new(doc1, doc2).match
        assert_forced_match_exists match_set, doc1.at_css("a2"), doc2.at_css("a2")
      end
    end

    context "multiple identical nodes exist in both documents" do
      it "should create one-to-one match relationships" do
        doc1 = xml { root1 {
            a1 ; a1 ; a1
          } }
        doc2 = xml { root2 {
            a1 ; a1
          } }
        match_set = Lorax::FastMatcher.new(doc1, doc2).match
        [doc1, doc2].each do |doc|
          others = doc.css("a1").collect do |node|
            m = match_set.match(node)
            m ? m.pair.last : nil
          end
          others.uniq.length.should == others.length
        end
      end
    end

    context "multiple matches' parents are same-named" do
      it "matches the node with the same-named grandparent" do
        doc1 = xml { root {
            wrap2 {
              a1 { b1 { 10.times { c1 "hello there" } } ; b2 }
            } } }
        doc2 = xml { root {
            wrap1 {
              a1 { b1 { 10.times { c1 "hello there" } } }
            }
            wrap2 {
              a1 { b1 { 10.times { c1 "hello there" } } }
            }
            wrap3 {
              a1 { b1 { 10.times { c1 "hello there" } } }
            } } }
        match_set = Lorax::FastMatcher.new(doc1, doc2).match
        assert_forced_match_exists match_set, doc1.at_css("wrap2"), doc2.at_css("wrap2")
      end
    end
  end
end
9 |
10 | mike.dalessio@gmail.com 11 |
12 | 24 Pine Avenue, Madison, NJ 07940 13 |
14 |
15 | 201.602.9038 16 |
17 | http://mike.daless.io 18 |
19 |
20 |

Michael Dalessio

21 | 22 |

I write code for a living, but I also do it for fun. I’ve had success as a manager and a technical lead. I encourage Agile software development practices, which emphasize early and continuous delivery of valuable software.

23 | 24 |

Technological Strengths

25 | 26 |
    27 |
  • Ruby. Comfortable with Functional Programming and meta-programming. Language extensions in both C and FFI (JRuby).
  • 28 | 29 |
  • C, C++, STL. Large-scale OO design and implementation. Emphasis on writing portable code: Linux gcc, Solaris CC, AIX; Windows VS6 and VS7; Intel, Portland Group, Compaq compilers; 32/64-bit.
  • 30 | 31 |
  • Web development: Rails web framework, JavaScript, jQuery and Prototype javascript libraries. Successfully deployed highly-available web applications.
  • 32 | 33 |
  • High performance parallel computing: PVM and MPI toolkits; data serialization; load balancing and fault tolerance; job scheduling.
  • 34 | 35 |
  • System-level programming: threads, IPC, message queues, sockets. Network programming using TCP and UDP.
  • 36 | 37 |
  • Linux system administration experience, notably large compute clusters.
  • 38 | 39 |
  • Broad experience with screen scraping, including HTML+JS and proprietary terminal applications: Mechanize, Nokogiri, Hpricot, libxml2, XPath and CSS selectors.
  • 40 | 41 |
  • Extensive scripting in Ruby, Perl, various shells. Comfortable with OLE automation via APIs and VB script.
  • 42 | 43 |
  • Deep knowledge of FORTRAN. Integration with C and C++ environments.
  • 44 | 45 |
  • Comfortable with debuggers and profilers: gdb, valgrind, gprof, efence, Heap Agent, Bleak House.
  • 46 | 47 |
  • Comfortable with many source code revision tools: Git, Mercurial, Subversion, CVS, RCS.
  • 48 |
49 | 50 |

Management Skills

51 | 52 |
    53 |
  • Building and managing strong development teams, and improving existing teams.
  • 54 | 55 |
  • Constructing development environments which leverage the time and skills of the development team, from sane build environments to custom debugging tools and meta-programming tools.
  • 56 | 57 |
  • Building application frameworks to deliver abstracted functionality to domain-expert developers.
  • 58 | 59 |
  • Dealing with technology integration issues such as training, documentation and inertial resistance while meeting business demands.
  • 60 | 61 |
  • Mentoring developers in technology and design, both in informal office setting and in formal training classes. Experienced technical trainer.
  • 62 | 63 |
  • Experience in equity and fixed income financial markets, market data and trading systems.
  • 64 | 65 |
  • Excellent public speaking and presentation skills. Excellent written and verbal communication skills.
  • 66 |
67 | 68 |

Professional Experience

69 | 70 |

Agile Software Developer @ Pivotal Labs (Nov 2008 — Present)

71 | 72 |

Pivotal Labs is an Agile / XP software development shop specializing in Ruby and Java. At Pivotal, I was a member of a 4-person team developing an enterprise web application for social services. Our practices included test-driving development and pair programming. The application is implemented in Ruby on Rails with jQuery, and featured an XML API for data export and import and Jasper reporting tools.

73 | 74 |

Founder and Managing Partner @ Pharos Enterprise Intelligence (Jan 2008 — Present)

75 | 76 |

Pharos is a startup software and consulting company which caters to the electricity trading and asset management markets. At Pharos, I designed and built an energy trading product to aggregate public and private data from ISO energy markets. I also designed and built asset management software for generator owners and operators. Both applications are designed to be hosted “in the cloud”, and provide data mining and realtime data aggregation from multiple sources, including external websites, internal databases and data feeds. The software is implemented in Ruby, Rails, C, C++, Flash and JavaScript.

77 | 78 |

Director of Application Development @ USPOWERGEN (May 2006 — Dec 2007)

79 | 80 |

As system architect and lead developer for a generator operator, I designed, implemented and maintained a large energy management system (EMS) providing realtime overviews of the markets and electrical grid. The project was bootstrapped in two months under an extremely aggressive timetable. Implemented in Ruby, Rails, C, C++, Javascript.

81 | 82 |

As the technology director, I managed system integration projects for Accounting and Treasury departments, and managed the I.T. due diligence process for mergers and acquisitions. As part of managing the financial software systems, I initiated the company’s SOX 404 compliance efforts. I also managed outsourced staff of I.T. contractors across four sites.

83 | 84 |

Head of Software Development @ ASPEED Software (Aug 2004 — Apr 2006)

85 | 86 |

At ASPEED, a successful startup ISV, I hired and managed a small team of developers in the design and development of a high-performance parallel computing SDK. The library provided dynamic load balancing, efficient data serialization and fault tolerance while supporting common business algorithms (e.g., Monte Carlo simulations) and associated issues (e.g., distributed random number generation).

87 | 88 |

Some advanced features included: machine-generating C++ template code for fast, typesafe, portable serialization of complex data structures; process management subsystem providing support for common SOA vendors as well as “native” process management for both Linux and Windows; and a patented algorithm for parallelizing some types of long-running pharmaceutical models.

89 | 90 |

I also co-authored and maintained a high-performance messaging library for distributed computing, which provided high throughput and low latency on both Linux and Windows.

91 | 92 |

Infrastructure Applications Lead @ Moneyline/Telerate (Nov 2003 — Aug 2004)

93 | 94 |

The Infrastructure Application group was part of a larger effort to build a state-of-the-art ticker plant for financial market data. As team lead, I managed seven developers building and porting content generation applications under extremely aggressive deadlines. I also managed a small team of outsourced developers building a GUI application for browsing ticker plant data.

95 | 96 |

I designed and implemented a high-performance data feed to integrate third-party vendors. The feed used ASN.1 and BER encoding to describe and marshal data. The feed handled all contributed data for the firm. Implemented in C++ using POSIX threads.

97 | 98 |

Chief Architect for Bloomberg Launchpad @ Bloomberg, L.P. (Jun 1995 - Aug 2003)

99 | 100 |

At Bloomberg, I was the Lead Developer and Architect for Bloomberg Launchpad, a desktop platform which allows the user to customize a display integrating data from multiple sources: Bloomberg and third-party real-time market data; historical, fundamental and research data; news and web content. I designed and helped build the Launchpad application development framework, used by hundreds of developers across the firm.

101 | 102 |

As Lead Developer and Architect in Server-Side Integration and Research group, I led a team whose areas of focus were: rapid prototyping of applications; reusable component technology; salvaging legacy code using intelligent frameworks and components; generic market data pub/sub services; messaging and IPC; and evaluating database systems.

103 | 104 |

I led that same team in the design and delivery of a beta Bloomberg instant messaging product in three months; implemented using C++, pub/sub infrastructure and proprietary directory services.

105 | 106 |

As Lead Developer in the Trading System group, I designed and built the Trading Systems Desktop (“TW”), integrating Trading System functionality and market data display; including market data monitors, trading position blotters, pricing methodologies, news, auto-execution, trade history, and trade ticketing.

107 | 108 |

I also designed, built, and maintained TCP/IP trade feeds, custody bank links and client data feeds. This included designing an application-layer protocol and meta-programming to generate feed handler code based on feed specs.

109 | 110 |

Open Source Experience

111 | 112 |

Co-Author of Nokogiri (August 2008 — Present)

113 | 114 |

Nokogiri is currently the most popular XML/HTML parsing library for Ruby developers. It supports CSS and XPath querying, has push and pull SAX parsers, validates DTD and XSD schemas, performs XSLT transformations, and has a very simple and usable API. It’s implemented in Ruby and C.

115 | 116 |

Author of Loofah (August 2009 — Present)

117 | 118 |

Loofah is an HTML sanitizer (based on Nokogiri) which can be used to prevent cross-site scripting (XSS) attacks. It provides a variety of methods to clean or remove unsafe HTML, and delivers ActiveRecord plugins for Rails applications.

119 | 120 |

Core Maintainer of Mechanize (June 2008 — Present)

121 | 122 |

Mechanize is a Ruby library used for automating website interaction and screen-scraping. It fully supports browser history and cookies, and allows easy authentication and form submission. Implemented in Ruby.

123 | 124 |

Smaller projects and contributions:

125 | 126 |
    127 |
  • Co-author of ActiveHash, a project to create ActiveRecord-like models with Hash- or file-based readonly datasources.
  • 128 | 129 |
  • Author of git.rake, a set of rake tasks to make it easy to keep your git superproject in synch with multiple submodules.
  • 130 | 131 |
  • Contributor to git, an open-source distributed version control system.
  • 132 | 133 |
  • Contributor to aintablog, a feed aggregator / tumblog built using Ruby on Rails.
  • 134 | 135 |
  • Contributor to flexible-js-formatting, Baron Schwartz’s fast and flexible javascript library for formatting and parsing datetimes and numbers.
  • 136 | 137 |
  • Contributor to ruby-ffi, Ruby’s Foreign Function Interface project, which allows Ruby extensions to run on the JVM (JRuby).
  • 138 | 139 |
  • Contributor to rinari, a Ruby on Rails minor-mode for Emacs.
  • 140 |
141 | 142 |

Education

143 | 144 |

The Johns Hopkins University

145 | 146 |

Graduated 1995 with a B.A. in Physics.

147 | 148 | -------------------------------------------------------------------------------- /spec/files/Michael-Dalessio-201001.html: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | Michael Dalessio 7 | 8 |
9 |
10 | mike.dalessio@gmail.com 11 |
12 | 24 Pine Avenue, Madison, NJ 07940 13 |
14 |
15 | 201.602.9038 16 |
17 | http://mike.daless.io 18 |
19 |
20 |

Michael Dalessio

21 | 22 |

I write code for a living, but I also do it for fun. I’ve had success as a manager and a technical lead. I encourage Agile software development practices, which emphasize early and continuous delivery of valuable software.

23 | 24 |
    25 |
  • Ruby. Comfortable with Functional Programming and meta-programming. Language extensions in both C and FFI (JRuby).
  • 26 | 27 |
  • C, C++, STL. Large-scale OO design and implementation. Emphasis on writing portable code: Linux gcc, Solaris CC, AIX; Windows VS6 and VS7; Intel, Portland Group, Compaq compilers; 32/64-bit.
  • 28 | 29 |
  • Web development: Rails web framework, JavaScript, jQuery and Prototype javascript libraries. Successfully deployed highly-available web applications.
  • 30 | 31 |
  • High performance parallel computing: PVM and MPI toolkits; data serialization; load balancing and fault tolerance; job scheduling.
  • 32 | 33 |
  • System-level programming: threads, IPC, message queues, sockets. Network programming using TCP and UDP.
  • 34 | 35 |
  • Linux system administration experience, notably large compute clusters.
  • 36 | 37 |
  • Broad experience with screen scraping, including HTML+JS and proprietary terminal applications: Mechanize, Nokogiri, Hpricot, libxml2, XPath and CSS selectors.
  • 38 | 39 |
  • Extensive scripting in Ruby, Perl, various shells. Comfortable with OLE automation via APIs and VB script.
  • 40 | 41 |
  • Deep knowledge of FORTRAN. Integration with C and C++ environments.
  • 42 | 43 |
  • Comfortable with debuggers and profilers: gdb, valgrind, gprof, efence, Heap Agent, Bleak House.
  • 44 | 45 |
  • Comfortable with many source code revision tools: Git, Mercurial, Subversion, CVS, RCS.
  • 46 |
47 | 48 |

Management Skills

49 | 50 |
    51 |
  • Building and managing strong development teams, and improving existing teams.
  • 52 | 53 |
  • Constructing development environments which leverage the time and skills of the development team, from sane build environments to custom debugging tools and meta-programming tools.
  • 54 | 55 |
  • Building application frameworks to deliver abstracted functionality to domain-expert developers.
  • 56 | 57 |
  • Dealing with technology integration issues such as training, documentation and inertial resistance while meeting business demands.
  • 58 | 59 |
  • Mentoring developers in technology and design, both in informal office setting and in formal training classes. Experienced technical trainer.
  • 60 | 61 |
  • Experience in equity and fixed income financial markets, market data and trading systems.
  • 62 | 63 |
  • Excellent public speaking and presentation skills. Excellent written and verbal communication skills.
  • 64 |
65 | 66 |

Professional Experience

67 | 68 |

Manager and Agile Software Developer @ Pivotal Labs (Nov 2008 — Present)

69 | 70 |

Pivotal Labs is an Agile / XP software development shop specializing in Ruby and Java. Our practices include test-driven development and pair programming.

71 | 72 |

Engineering Manager (Oct 2009 — Present)

73 | 74 |

In addition to Agile evangelism with new and potential clients, I also manage staffing, recruiting, and the sales pipeline. I also try to keep the developers happy, and, oh yeah, occasionally code.

75 | 76 |

Agile Software Developer (Nov 2008 — Sep 2009)

77 | 78 |

I was a member of a 10-person team developing an enterprise web application for social services. The application is implemented in Ruby on Rails with jQuery, and features an XML API for data export and import and Jasper reporting tools.

79 | 80 |

Founder and Managing Partner @ Pharos Enterprise Intelligence (Jan 2008 — Present)

81 | 82 |

Pharos is a startup software and consulting company which caters to the electricity trading and asset management markets. At Pharos, I designed and built an energy trading product to aggregate public and private data from ISO energy markets. I also designed and built asset management software for generator owners and operators. Both applications are designed to be hosted “in the cloud”, and provide data mining and realtime data aggregation from multiple sources, including external websites, internal databases and data feeds. The software is implemented in Ruby, Rails, C, C++, Flash and JavaScript.

83 | 84 |

Director of Application Development @ USPOWERGEN (May 2006 — Dec 2007)

85 | 86 |

As system architect and lead developer for a generator operator, I designed, implemented and maintained a large energy management system (EMS) providing realtime overviews of the markets and electrical grid. The project was bootstrapped in two months under an extremely aggressive timetable. Implemented in Ruby, Rails, C, C++, JavaScript.

87 | 88 |

As the technology director, I managed system integration projects for Accounting and Treasury departments, and managed the I.T. due diligence process for mergers and acquisitions. As part of managing the financial software systems, I initiated the company’s SOX 404 compliance efforts. I also managed an outsourced staff of I.T. contractors across four sites.

89 | 90 |

Head of Software Development @ ASPEED Software (Aug 2004 — Apr 2006)

91 | 92 |

At ASPEED, a successful startup ISV, I hired and managed a small team of developers in the design and development of a high-performance parallel computing SDK. The library provided dynamic load balancing, efficient data serialization and fault tolerance while supporting common business algorithms (e.g., Monte Carlo simulations) and associated issues (e.g., distributed random number generation).

93 | 94 |

Some advanced features included: machine-generating C++ template code for fast, typesafe, portable serialization of complex data structures; process management subsystem providing support for common SOA vendors as well as “native” process management for both Linux and Windows; and a patented algorithm for parallelizing some types of long-running pharmaceutical models.

95 | 96 |

I also co-authored and maintained a high-performance messaging library for distributed computing, which provided high throughput and low latency on both Linux and Windows.

97 | 98 |

Infrastructure Applications Lead @ Moneyline/Telerate (Nov 2003 — Aug 2004)

99 | 100 |

The Infrastructure Application group was part of a larger effort to build a state-of-the-art ticker plant for financial market data. As team lead, I managed seven developers building and porting content generation applications under extremely aggressive deadlines. I also managed a small team of outsourced developers building a GUI application for browsing ticker plant data.

101 | 102 |

I designed and implemented a high-performance data feed to integrate third-party vendors. The feed used ASN.1 and BER encoding to describe and marshal data. The feed handled all contributed data for the firm. Implemented in C++ using POSIX threads.

103 | 104 |

Chief Architect for Bloomberg Launchpad @ Bloomberg, L.P. (Jun 1995 — Aug 2003)

105 | 106 |

At Bloomberg, I was the Lead Developer and Architect for Bloomberg Launchpad, a desktop platform which allows the user to customize a display integrating data from multiple sources: Bloomberg and third-party real-time market data; historical, fundamental and research data; news and web content. I designed and helped build the Launchpad application development framework, used by hundreds of developers across the firm.

107 | 108 |

As Lead Developer and Architect in the Server-Side Integration and Research group, I led a team whose areas of focus were: rapid prototyping of applications; reusable component technology; salvaging legacy code using intelligent frameworks and components; generic market data pub/sub services; messaging and IPC; and evaluating database systems.

109 | 110 |

I led that same team in the design and delivery of a beta Bloomberg instant messaging product in three months; implemented using C++, pub/sub infrastructure and proprietary directory services.

111 | 112 |

As Lead Developer in the Trading System group, I designed and built the Trading Systems Desktop (“TW”), integrating Trading System functionality and market data display; including market data monitors, trading position blotters, pricing methodologies, news, auto-execution, trade history, and trade ticketing.

113 | 114 |

I also designed, built, and maintained TCP/IP trade feeds, custody bank links and client data feeds. This included designing an application-layer protocol and meta-programming to generate feed handler code based on feed specs.

115 | 116 |

Open Source Experience

117 | 118 |

Co-Author of Nokogiri (August 2008 — Present)

119 | 120 |

Nokogiri is currently the most popular XML/HTML parsing library for Ruby developers. It supports CSS and XPath querying, has push and pull SAX parsers, validates DTD and XSD schemas, performs XSLT transformations, and has a very simple and usable API. It’s implemented in Ruby and C.

121 | 122 |

Author of Loofah (August 2009 — Present)

123 | 124 |

Loofah is an HTML sanitizer (based on Nokogiri) which can be used to prevent cross-site scripting (XSS) attacks. It provides a variety of methods to clean or remove unsafe HTML, and delivers ActiveRecord plugins for Rails applications.

125 | 126 |

Core Maintainer of Mechanize (June 2008 — Present)

127 | 128 |

Mechanize is a Ruby library used for automating website interaction and screen-scraping. It fully supports browser history and cookies, and allows easy authentication and form submission. Implemented in Ruby.

129 | 130 |

Smaller projects and contributions:

131 | 132 |
    133 |
  • Co-author of ActiveHash, a project to create ActiveRecord-like models with Hash- or file-based readonly datasources.
  • 134 | 135 |
  • Author of git.rake, a set of rake tasks to make it easy to keep your git superproject in synch with multiple submodules.
  • 136 | 137 |
  • Contributor to git, an open-source distributed version control system.
  • 138 | 139 |
  • Contributor to aintablog, a feed aggregator / tumblog built using Ruby on Rails.
  • 140 | 141 |
  • Contributor to flexible-js-formatting, Baron Schwartz’s fast and flexible javascript library for formatting and parsing datetimes and numbers.
  • 142 | 143 |
  • Contributor to ruby-ffi, Ruby’s Foreign Function Interface project, which allows Ruby extensions to run on the JVM (JRuby).
  • 144 | 145 |
  • Contributor to rinari, a Ruby on Rails minor-mode for Emacs.
  • 146 |
147 | 148 |

Education

149 | 150 |

The Johns Hopkins University

151 | 152 |

Graduated 1995 with a B.A. in Physics.

153 | 154 | -------------------------------------------------------------------------------- /spec/integration/lorax_spec.rb: -------------------------------------------------------------------------------- 1 | require File.expand_path(File.dirname(__FILE__) + '/../spec_helper') 2 | 3 | describe Lorax do 4 | def round_trip_should_succeed(doc1, doc2) 5 | delta_set = Lorax.diff(doc1, doc2) 6 | new_doc = delta_set.apply(doc1) 7 | 8 | unless Lorax::Signature.new(new_doc.root).signature == Lorax::Signature.new(doc2.root).signature 9 | errmsg = [] 10 | errmsg << "Documents are not identical after a round-trip diff and patch:" 11 | errmsg << doc1.root.to_xml 12 | errmsg << "-----" 13 | errmsg << doc2.root.to_xml 14 | errmsg << "=> patch: #{delta_set.deltas.inspect}" 15 | errmsg << new_doc.root.to_xml 16 | fail errmsg.join("\n") 17 | end 18 | end 19 | 20 | context "inserted nodes" do 21 | it "handles appends to matching siblings" do 22 | doc1 = xml { root { 23 | a1 "hello" 24 | } } 25 | doc2 = xml { root { 26 | a1 "hello" 27 | a2 "goodbye" 28 | } } 29 | round_trip_should_succeed doc1, doc2 30 | end 31 | 32 | it "inserts into matching siblings" do 33 | doc1 = xml { root { 34 | a1 "hello" 35 | a3 "goodbye" 36 | } } 37 | doc2 = xml { root { 38 | a1 "hello" 39 | a2 40 | a3 "goodbye" 41 | } } 42 | round_trip_should_succeed doc1, doc2 43 | end 44 | 45 | it "inserts under an existing sibling node" do 46 | doc1 = xml { root { 47 | a1 "hello" 48 | a2 49 | } } 50 | doc2 = xml { root { 51 | a1 "hello" 52 | a2 { b1 "subnode" } 53 | } } 54 | round_trip_should_succeed doc1, doc2 55 | end 56 | end 57 | 58 | context "deleted nodes" do 59 | it "handles deleting nodes" do 60 | doc1 = xml { root { 61 | a1 "hello" 62 | a2 "goodbye" 63 | a3 "natch" 64 | } } 65 | doc2 = xml { root { 66 | a1 "hello" 67 | a3 "natch" 68 | } } 69 | round_trip_should_succeed doc1, doc2 70 | end 71 | end 72 | 73 | context "modified nodes" do 74 | it "handles modifying nodes" do 75 | doc1 = xml { root { 76 | a1 
"hello" 77 | a2 "goodbye" 78 | a3 "natch" 79 | } } 80 | doc2 = xml { root { 81 | a1 "hello" 82 | a2 "good buy" 83 | a3 "natch" 84 | } } 85 | round_trip_should_succeed doc1, doc2 86 | end 87 | end 88 | 89 | context "mixed operations" do 90 | it "handles mixed deletions and modifications" do 91 | doc1 = xml { root { 92 | a1 "hello" 93 | a2 "goodbye" 94 | a3 "natch" 95 | a4 "jimmy" 96 | } } 97 | doc2 = xml { root { 98 | a1 "hello" 99 | a3 "not" 100 | a4 "jimmy" 101 | } } 102 | round_trip_should_succeed doc1, doc2 103 | end 104 | end 105 | 106 | context "with whitespace interleaved" do 107 | it "handles whitespace nodes" do 108 | doc1 = xml { root { 109 | a1 110 | text "\n\n" 111 | a4 112 | text "\n\n" 113 | a5 114 | } } 115 | doc2 = xml { root { 116 | a1 117 | text "\n\n" 118 | a2 119 | text "\n\n" 120 | a3 121 | text "\n\n" 122 | a4 123 | text "\n\n" 124 | a5 125 | } } 126 | round_trip_should_succeed doc1, doc2 127 | end 128 | end 129 | 130 | end 131 | -------------------------------------------------------------------------------- /spec/match_spec.rb: -------------------------------------------------------------------------------- 1 | require File.expand_path(File.dirname(__FILE__) + '/spec_helper') 2 | 3 | describe Lorax::Match do 4 | before do 5 | @doc1 = xml { root } 6 | @doc2 = xml { root } 7 | end 8 | 9 | describe "#new" do 10 | it "takes two nodes as arguments" do 11 | proc { Lorax::Match.new(@doc1.root, @doc2.root) }.should_not raise_error 12 | end 13 | 14 | it "takes optional options" do 15 | proc { Lorax::Match.new(@doc1.root, @doc2.root, {:perfect => true}) }.should_not raise_error 16 | end 17 | end 18 | 19 | describe "#perfect" do 20 | it "returns true if {:perfect => true} option was passed to #new" do 21 | Lorax::Match.new(@doc1.root, @doc2.root, {:perfect => true}).should be_perfect 22 | end 23 | 24 | it "returns false if {:perfect => false} option was passed to #new" do 25 | Lorax::Match.new(@doc1.root, @doc2.root, {:perfect => false}).should_not 
be_perfect 26 | end 27 | 28 | it "returns false if no :perfect option was passed to #new" do 29 | Lorax::Match.new(@doc1.root, @doc2.root).should_not be_perfect 30 | end 31 | end 32 | 33 | describe "#pair" do 34 | it "returns the two nodes in an array" do 35 | Lorax::Match.new(@doc1.root, @doc2.root).pair.should == [@doc1.root, @doc2.root] 36 | end 37 | end 38 | 39 | describe "#other" do 40 | context "the node is in the pair" do 41 | it "returns the other node" do 42 | match = Lorax::Match.new :a, :b 43 | match.other(:a).should == :b 44 | match.other(:b).should == :a 45 | end 46 | end 47 | 48 | context "the node is not in the pair" do 49 | it "returns nil" do 50 | Lorax::Match.new(:a, :b).other(:c).should be_nil 51 | end 52 | end 53 | end 54 | end 55 | -------------------------------------------------------------------------------- /spec/spec.opts: -------------------------------------------------------------------------------- 1 | --color 2 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | require 'lorax' 2 | 3 | require 'rspec' 4 | require 'rr' 5 | require 'pp' 6 | 7 | warn "#{__FILE__}:#{__LINE__}: libxml version info: #{Nokogiri::VERSION_INFO.inspect}" 8 | 9 | module XmlBuilderHelper 10 | def xml(&block) 11 | Nokogiri::XML::Builder.new(&block).doc 12 | end 13 | 14 | def assert_no_match_exists(match_set, node1, node2) 15 | match_set.match(node1).should be_nil 16 | match_set.match(node2).should be_nil 17 | end 18 | 19 | def assert_perfect_match_exists(match_set, node1, node2) 20 | match = match_set.match(node1) 21 | fail "#{node1.inspect} was not matched" if match.nil? 22 | match.other(node1).should == node2 23 | match.should be_perfect 24 | end 25 | 26 | def assert_forced_match_exists(match_set, node1, node2) 27 | match = match_set.match(node1) 28 | fail "#{node1.inspect} was not matched" if match.nil? 
29 | match.other(node1).should == node2 30 | match.should_not be_perfect 31 | end 32 | end 33 | 34 | RSpec.configure do |config| 35 | config.mock_with :rr 36 | config.include XmlBuilderHelper 37 | end 38 | -------------------------------------------------------------------------------- /spec/unit/delta/delete_delta_spec.rb: -------------------------------------------------------------------------------- 1 | require File.expand_path(File.dirname(__FILE__) + '/../../spec_helper') 2 | 3 | describe Lorax::DeleteDelta do 4 | describe ".new" do 5 | it "takes one argument" do 6 | proc { Lorax::DeleteDelta.new(:foo) }.should_not raise_error(ArgumentError) 7 | proc { Lorax::DeleteDelta.new(:foo, :bar)}.should raise_error(ArgumentError) 8 | end 9 | end 10 | 11 | describe "#node" do 12 | it "returns the initalizer argument" do 13 | Lorax::DeleteDelta.new(:foo).node.should == :foo 14 | end 15 | end 16 | 17 | describe "#apply!" do 18 | context "for an atomic node delta" do 19 | it "should delete the node" do 20 | doc1 = xml { root { a1 } } 21 | doc2 = xml { root } 22 | node = doc1.at_css("a1") 23 | delta = Lorax::DeleteDelta.new node 24 | 25 | delta.apply!(doc1) 26 | 27 | doc1.at_css("a1").should be_nil 28 | node.parent.should == nil 29 | end 30 | end 31 | 32 | context "for a subtree delta" do 33 | it "should delete the subtree" do 34 | doc1 = xml { root { a1 { b1 ; b2 "hello" } } } 35 | doc2 = xml { root } 36 | node = doc1.at_css("a1") 37 | delta = Lorax::DeleteDelta.new node 38 | 39 | delta.apply!(doc1) 40 | 41 | doc1.at_css("a1,b1,b2").should be_nil 42 | node.parent.should == nil 43 | end 44 | end 45 | end 46 | 47 | describe "#descriptor" do 48 | it "needs a spec" 49 | end 50 | 51 | describe "#to_s" do 52 | it "needs a spec" 53 | end 54 | end 55 | -------------------------------------------------------------------------------- /spec/unit/delta/insert_delta_spec.rb: -------------------------------------------------------------------------------- 1 | require 
File.expand_path(File.dirname(__FILE__) + '/../../spec_helper') 2 | 3 | describe Lorax::InsertDelta do 4 | describe ".new" do 5 | it "takes three arguments" do 6 | proc { Lorax::InsertDelta.new(:foo, :bar) }.should raise_error(ArgumentError) 7 | proc { Lorax::InsertDelta.new(:foo, :bar, :quux) }.should_not raise_error(ArgumentError) 8 | proc { Lorax::InsertDelta.new(:foo, :bar, :quux, :fuzz)}.should raise_error(ArgumentError) 9 | end 10 | end 11 | 12 | describe "#node" do 13 | it "returns the first argument to #new" do 14 | Lorax::InsertDelta.new(:foo, :bar, :quux).node.should == :foo 15 | end 16 | end 17 | 18 | describe "#xpath" do 19 | it "returns the second argument to #new" do 20 | Lorax::InsertDelta.new(:foo, :bar, :quux).xpath.should == :bar 21 | end 22 | end 23 | 24 | describe "#position" do 25 | it "returns the third argument to #new" do 26 | Lorax::InsertDelta.new(:foo, :bar, :quux).position.should == :quux 27 | end 28 | end 29 | 30 | describe "#apply!" do 31 | context "for an atomic node delta" do 32 | it "should insert a copy into the document" do 33 | doc1 = xml { root } 34 | doc2 = xml { root { a1 } } 35 | node = doc2.at_css("a1") 36 | delta = Lorax::InsertDelta.new node, node.parent.path, 0 37 | 38 | delta.apply!(doc1) 39 | 40 | doc1.at_css("a1").should_not be_nil 41 | node.parent.should == doc2.root 42 | end 43 | end 44 | 45 | context "for a subtree node delta" do 46 | it "should insert a copy into the document" do 47 | doc1 = xml { root } 48 | doc2 = xml { root { a1 { b1 ; b2 "hello" } } } 49 | node = doc2.at_css("a1") 50 | delta = Lorax::InsertDelta.new node, node.parent.path, 0 51 | 52 | delta.apply!(doc1) 53 | 54 | doc1.at_css("a1").should_not be_nil 55 | node.parent.should == doc2.root 56 | end 57 | end 58 | 59 | context "sibling node insertions" do 60 | it "should insert at the front" do 61 | doc1 = xml { root { a2 } } 62 | doc2 = xml { root { a1 ; a2 } } 63 | node = doc2.at_css("a1") 64 | delta = Lorax::InsertDelta.new node, node.parent.path, 
0 65 | 66 | delta.apply! doc1 67 | 68 | doc1.root.children.map {|child| child.name}.should == %w[a1 a2] 69 | end 70 | 71 | it "should insert at the middle" do 72 | doc1 = xml { root { a1 ; a3 } } 73 | doc2 = xml { root { a1 ; a2 ; a3 } } 74 | node = doc2.at_css("a2") 75 | delta = Lorax::InsertDelta.new node, node.parent.path, 1 76 | 77 | delta.apply! doc1 78 | 79 | doc1.root.children.map {|child| child.name}.should == %w[a1 a2 a3] 80 | end 81 | 82 | it "should insert at the end" do 83 | doc1 = xml { root { a1 } } 84 | doc2 = xml { root { a1 ; a2 } } 85 | node = doc2.at_css("a2") 86 | delta = Lorax::InsertDelta.new node, node.parent.path, 1 87 | 88 | delta.apply! doc1 89 | 90 | doc1.root.children.map {|child| child.name}.should == %w[a1 a2] 91 | end 92 | end 93 | 94 | context "delta with unresolvable xpath" do 95 | it "should raise a Conflict exception" do 96 | doc1 = xml { root } 97 | doc2 = xml { root { a1 } } 98 | node = doc2.at_css("a1") 99 | delta = Lorax::InsertDelta.new node, "/foo/bar/quux", 0 100 | 101 | proc { delta.apply!(doc1) }.should raise_error(Lorax::Delta::NodeNotFoundError) 102 | end 103 | end 104 | end 105 | 106 | describe "#descriptor" do 107 | it "needs a spec" 108 | end 109 | 110 | describe "#to_s" do 111 | it "needs a spec" 112 | end 113 | end 114 | -------------------------------------------------------------------------------- /spec/unit/delta/modify_delta_spec.rb: -------------------------------------------------------------------------------- 1 | require File.expand_path(File.dirname(__FILE__) + '/../../spec_helper') 2 | 3 | describe Lorax::ModifyDelta do 4 | describe ".new" do 5 | it "takes two arguments" do 6 | proc { Lorax::ModifyDelta.new(:foo) }.should raise_error(ArgumentError) 7 | proc { Lorax::ModifyDelta.new(:foo, :bar) }.should_not raise_error(ArgumentError) 8 | proc { Lorax::ModifyDelta.new(:foo, :bar, :quux)}.should raise_error(ArgumentError) 9 | end 10 | end 11 | 12 | describe "#node1" do 13 | it "returns the first initializer 
parameter" do 14 | Lorax::ModifyDelta.new(:foo, :bar).node1.should == :foo 15 | end 16 | end 17 | 18 | describe "#node2" do 19 | it "returns the first initializer parameter" do 20 | Lorax::ModifyDelta.new(:foo, :bar).node2.should == :bar 21 | end 22 | end 23 | 24 | describe "#apply!" do 25 | context "element node" do 26 | context "when attributes differ" do 27 | it "should set the attributes properly" do 28 | doc1 = xml { root { a1(:foo => :bar) } } 29 | doc2 = xml { root { a1(:bazz => :quux, :once => :twice) } } 30 | doc3 = doc1.dup 31 | node1 = doc1.at_css("a1") 32 | node2 = doc2.at_css("a1") 33 | node3 = doc3.at_css("a1") 34 | 35 | delta = Lorax::ModifyDelta.new(node1, node2) 36 | delta.apply!(doc3) 37 | 38 | node3["foo"].should be_nil 39 | node3["bazz"].should == "quux" 40 | node3["once"].should == "twice" 41 | end 42 | end 43 | end 44 | 45 | context "text node" do 46 | it "should set the content properly" do 47 | doc1 = xml { root "hello" } 48 | doc2 = xml { root "goodbye" } 49 | doc3 = doc1.dup 50 | 51 | delta = Lorax::ModifyDelta.new(doc1.root.children.first, doc2.root.children.first) 52 | delta.apply!(doc3) 53 | 54 | doc3.root.content.should == "goodbye" 55 | end 56 | end 57 | 58 | context "when positions differ" do 59 | it "should move the node" do 60 | doc1 = xml { root { 61 | a1 { b1 } 62 | a2 63 | } } 64 | doc2 = xml { root { 65 | a1 66 | a2 { b1 } 67 | } } 68 | delta = Lorax::ModifyDelta.new(doc1.at_css("b1"), doc2.at_css("b1")) 69 | doc3 = doc1.dup 70 | delta.apply!(doc3) 71 | doc3.at_xpath("/root/a2/b1").should_not be_nil 72 | end 73 | 74 | it "should move the node to the correct position" do 75 | doc1 = xml { root { 76 | a1 { b2 } 77 | a2 { b1 ; b3 } 78 | } } 79 | doc2 = xml { root { 80 | a1 81 | a2 { b1 ; b2 ; b3 } 82 | } } 83 | delta = Lorax::ModifyDelta.new(doc1.at_css("b2"), doc2.at_css("b2")) 84 | doc3 = doc1.dup 85 | delta.apply!(doc3) 86 | doc3.at_xpath("/root/a2/*[2]").name.should == "b2" 87 | end 88 | end 89 | end 90 | 91 | describe 
"#descriptor" do 92 | it "needs a spec" 93 | end 94 | 95 | describe "#to_s" do 96 | it "needs a spec" 97 | end 98 | end 99 | -------------------------------------------------------------------------------- /spec/unit/delta_set_generator_spec.rb: -------------------------------------------------------------------------------- 1 | require File.expand_path(File.dirname(__FILE__) + '/../spec_helper') 2 | 3 | describe Lorax::DeltaSetGenerator do 4 | describe "#generate_delta_set" do 5 | context "InsertDeltas" do 6 | it "should be generated for an atomic node without a match" do 7 | doc1 = xml { root1 } 8 | doc2 = xml { root2 } 9 | match_set = Lorax::MatchSet.new doc1, doc2 10 | delta_set = Lorax::DeltaSetGenerator.generate_delta_set(match_set) 11 | delta_set.deltas.select { |d| d.is_a?(Lorax::InsertDelta) }.length.should == 1 12 | end 13 | 14 | it "should be generated for a subtree without a match" do 15 | doc1 = xml { root1 } 16 | doc2 = xml { root2 { a1 ; a2 "hello" } } 17 | match_set = Lorax::MatchSet.new doc1, doc2 18 | delta_set = Lorax::DeltaSetGenerator.generate_delta_set(match_set) 19 | delta_set.deltas.select { |d| d.is_a?(Lorax::InsertDelta) }.length.should == 1 20 | end 21 | 22 | it "should not be generated for children of a perfect match" do 23 | doc1 = xml { root { a1 { b1 "hello" } } } 24 | doc2 = xml { root { a1 { b1 "hello" } ; a2 } } 25 | match_set = Lorax::MatchSet.new doc1, doc2 26 | match_set.add Lorax::Match.new(doc1.at_css("root"), doc2.at_css("root")) 27 | match_set.add Lorax::Match.new(doc1.at_css("a1"), doc2.at_css("a1"), :perfect => true) 28 | delta_set = Lorax::DeltaSetGenerator.generate_delta_set(match_set) 29 | delta_set.deltas.select { |d| d.is_a?(Lorax::InsertDelta) }.length.should == 1 # a2 30 | end 31 | 32 | it "should be generated for siblings without a match" do 33 | doc1 = xml { root { 34 | a1 "hello" 35 | a3 "goodbye" 36 | a5 "again" 37 | } } 38 | doc2 = xml { root { 39 | a1 "hello" 40 | a2 "middleman" 41 | a3 "goodbye" 42 | a4 "good 
boy" 43 | a5 "again" 44 | } } 45 | match_set = Lorax::MatchSet.new doc1, doc2 46 | match_set.add Lorax::Match.new(doc1.at_css("a1"), doc2.at_css("a1"), :perfect => true) 47 | match_set.add Lorax::Match.new(doc1.at_css("a3"), doc2.at_css("a3"), :perfect => true) 48 | match_set.add Lorax::Match.new(doc1.at_css("a5"), doc2.at_css("a5"), :perfect => true) 49 | match_set.add Lorax::Match.new(doc1.at_css("root"), doc2.at_css("root")) 50 | delta_set = Lorax::DeltaSetGenerator.generate_delta_set(match_set) 51 | delta_set.deltas.select { |d| d.is_a?(Lorax::InsertDelta) }.length.should == 2 52 | end 53 | end 54 | 55 | context "ModifyDeltas" do 56 | it "should be generated for nodes that are imperfectly matched" do 57 | doc1 = xml { root(:foo => :bar) } 58 | doc2 = xml { root(:foo => :quux) } 59 | match_set = Lorax::MatchSet.new doc1, doc2 60 | match_set.add Lorax::Match.new doc1.at_css("root"), doc2.at_css("root") 61 | delta_set = Lorax::DeltaSetGenerator.generate_delta_set(match_set) 62 | delta_set.deltas.select { |d| d.is_a?(Lorax::ModifyDelta) }.length.should == 1 63 | end 64 | 65 | context "imperfect self-same match with children" do 66 | it "should handle children as expected" do 67 | doc1 = xml { root { 68 | a1 69 | a2 70 | a4(:foo => :bar) 71 | } } 72 | doc2 = xml { root { 73 | a2 74 | a3 75 | a4(:foo => :quux) 76 | } } 77 | match_set = Lorax::MatchSet.new doc1, doc2 78 | match_set.add Lorax::Match.new doc1.root, doc2.root 79 | match_set.add Lorax::Match.new doc1.at_css("a2"), doc2.at_css("a2"), :perfect => true 80 | match_set.add Lorax::Match.new doc1.at_css("a4"), doc2.at_css("a4") 81 | delta_set = Lorax::DeltaSetGenerator.generate_delta_set(match_set) 82 | delta_set.deltas.select { |d| d.is_a?(Lorax::InsertDelta) }.length.should == 1 # a3 83 | delta_set.deltas.select { |d| d.is_a?(Lorax::ModifyDelta) }.length.should == 1 # a4 84 | delta_set.deltas.select { |d| d.is_a?(Lorax::DeleteDelta) }.length.should == 1 # a1 85 | end 86 | end 87 | 88 | it "should not be 
generated for nodes that are imperfectly matched but are self-same" do 89 | doc1 = xml { root(:foo => :bar) { a1 } } 90 | doc2 = xml { root(:foo => :bar) { a2 } } 91 | match_set = Lorax::MatchSet.new doc1, doc2 92 | match_set.add Lorax::Match.new doc1.at_css("root"), doc2.at_css("root") 93 | delta_set = Lorax::DeltaSetGenerator.generate_delta_set(match_set) 94 | delta_set.deltas.select { |d| d.is_a?(Lorax::ModifyDelta) }.length.should == 0 95 | end 96 | 97 | it "should not be generated for nodes that are perfectly matched" do 98 | doc1 = xml { root } 99 | doc2 = xml { root } 100 | match_set = Lorax::MatchSet.new doc1, doc2 101 | match_set.add Lorax::Match.new doc1.at_css("root"), doc2.at_css("root"), :perfect => true 102 | delta_set = Lorax::DeltaSetGenerator.generate_delta_set(match_set) 103 | delta_set.deltas.select { |d| d.is_a?(Lorax::ModifyDelta) }.length.should == 0 104 | end 105 | end 106 | 107 | context "DeleteDeltas" do 108 | it "should be generated for an atomic node without a match" do 109 | doc1 = xml { root1 } 110 | doc2 = xml { root2 } 111 | match_set = Lorax::MatchSet.new doc1, doc2 112 | delta_set = Lorax::DeltaSetGenerator.generate_delta_set(match_set) 113 | delta_set.deltas.select { |d| d.is_a?(Lorax::DeleteDelta) }.length.should == 1 114 | end 115 | 116 | it "should be generated for a subtree without a match" do 117 | doc1 = xml { root1 { a1 ; a2 "hello" } } 118 | doc2 = xml { root2 } 119 | match_set = Lorax::MatchSet.new doc1, doc2 120 | delta_set = Lorax::DeltaSetGenerator.generate_delta_set(match_set) 121 | delta_set.deltas.select { |d| d.is_a?(Lorax::DeleteDelta) }.length.should == 1 122 | end 123 | 124 | it "should not be generated for children of a deleted node" do 125 | doc1 = xml { root { a1 { b1 "hello" } ; a2 } } 126 | doc2 = xml { root { a2 } } 127 | match_set = Lorax::MatchSet.new doc1, doc2 128 | match_set.add Lorax::Match.new(doc1.at_css("root"), doc2.at_css("root")) 129 | match_set.add Lorax::Match.new(doc1.at_css("a2"), 
doc2.at_css("a2"), :perfect => true) 130 | delta_set = Lorax::DeltaSetGenerator.generate_delta_set(match_set) 131 | delta_set.deltas.select { |d| d.is_a?(Lorax::DeleteDelta) }.length.should == 1 # a1 132 | end 133 | 134 | it "should be generated for siblings without a match" do 135 | doc1 = xml { root { 136 | a1 "hello" 137 | a2 "middleman" 138 | a3 "goodbye" 139 | a4 "good boy" 140 | a5 "again" 141 | } } 142 | doc2 = xml { root { 143 | a1 "hello" 144 | a3 "goodbye" 145 | a5 "again" 146 | } } 147 | match_set = Lorax::MatchSet.new doc1, doc2 148 | match_set.add Lorax::Match.new(doc1.at_css("a1"), doc2.at_css("a1"), :perfect => true) 149 | match_set.add Lorax::Match.new(doc1.at_css("a3"), doc2.at_css("a3"), :perfect => true) 150 | match_set.add Lorax::Match.new(doc1.at_css("a5"), doc2.at_css("a5"), :perfect => true) 151 | match_set.add Lorax::Match.new(doc1.at_css("root"), doc2.at_css("root")) 152 | delta_set = Lorax::DeltaSetGenerator.generate_delta_set(match_set) 153 | delta_set.deltas.select { |d| d.is_a?(Lorax::DeleteDelta) }.length.should == 2 154 | end 155 | end 156 | end 157 | end 158 | -------------------------------------------------------------------------------- /spec/unit/delta_set_spec.rb: -------------------------------------------------------------------------------- 1 | require File.expand_path(File.dirname(__FILE__) + '/../spec_helper') 2 | 3 | describe Lorax::DeltaSet do 4 | describe "#add / #deltas" do 5 | it "appends to and returns an ordered list of deltas" do 6 | delta_set = Lorax::DeltaSet.new 7 | delta_set.add :foo 8 | delta_set.add :bar 9 | delta_set.deltas.should == [:foo, :bar] 10 | end 11 | end 12 | 13 | describe "#apply" do 14 | it "calls apply! on a duplicate document" do 15 | delta_set = Lorax::DeltaSet.new 16 | document = Nokogiri::XML::Document.new 17 | mock(document).dup { :foo } 18 | mock(delta_set).apply!(:foo) 19 | delta_set.apply document 20 | end 21 | end 22 | 23 | describe "#apply!" do 24 | it "invokes apply! 
on each delta in order" do 25 | doc = xml { root } 26 | delta_set = Lorax::DeltaSet.new 27 | delta1 = Lorax::InsertDelta.new(:foo, :bar, :quux) 28 | delta2 = Lorax::InsertDelta.new(:foo, :bar, :quux) 29 | delta_set.add delta1 30 | delta_set.add delta2 31 | 32 | order_of_invocation = [] 33 | mock(delta1).apply!(doc) { order_of_invocation << :delta1 } 34 | mock(delta2).apply!(doc) { order_of_invocation << :delta2 } 35 | delta_set.apply!(doc) 36 | 37 | order_of_invocation.should == [:delta1, :delta2] 38 | end 39 | end 40 | end 41 | -------------------------------------------------------------------------------- /spec/unit/lorax_spec.rb: -------------------------------------------------------------------------------- 1 | require File.expand_path(File.dirname(__FILE__) + '/../spec_helper') 2 | 3 | describe Lorax do 4 | describe ".diff" do 5 | it "should accept an IO" 6 | it "should accept a string" 7 | it "should accept a Nokogiri::XML::Document" 8 | end 9 | end 10 | -------------------------------------------------------------------------------- /spec/unit/match_set_spec.rb: -------------------------------------------------------------------------------- 1 | require File.expand_path(File.dirname(__FILE__) + '/../spec_helper') 2 | 3 | describe Lorax::MatchSet do 4 | describe "#new" do 5 | context "normal usage" do 6 | it "takes two arguments" do 7 | proc { Lorax::MatchSet.new(xml{root}) }.should raise_error(ArgumentError) 8 | proc { Lorax::MatchSet.new(xml{root}, xml{root}) }.should_not raise_error(ArgumentError) 9 | end 10 | 11 | it "builds a Signature for each document root" do 12 | doc1 = xml { root1 } 13 | doc2 = xml { root2 } 14 | mock.proxy(Lorax::Signature).new(doc1.root) 15 | mock.proxy(Lorax::Signature).new(doc2.root) 16 | Lorax::MatchSet.new(doc1, doc2) 17 | end 18 | end 19 | 20 | context "with dependency injection" do 21 | it "takes an optional third argument for dependency injection" do 22 | proc { Lorax::MatchSet.new(xml{root}, xml{root}, {:foo => :bar}) 
}.should_not raise_error(ArgumentError) 23 | end 24 | 25 | it "will use the value of ':match_set_signature1' for @signature1" do 26 | match_set = Lorax::MatchSet.new(xml{root}, xml{root}, {:match_set_signature1 => :foo}) 27 | match_set.signature1.should == :foo 28 | end 29 | 30 | it "will use the value of ':match_set_signature2' for @signature2" do 31 | match_set = Lorax::MatchSet.new(xml{root}, xml{root}, {:match_set_signature2 => :foo}) 32 | match_set.signature2.should == :foo 33 | end 34 | end 35 | end 36 | 37 | describe "#signature1" do 38 | it "returns the Signature of the first document" do 39 | doc1 = xml { root1 } 40 | doc2 = xml { root2 } 41 | match_set = Lorax::MatchSet.new(doc1, doc2) 42 | match_set.signature1.should_not be_nil 43 | match_set.signature1.root.should == doc1.root 44 | end 45 | end 46 | 47 | describe "#signature2" do 48 | it "returns the Signature of the second document" do 49 | doc1 = xml { root1 } 50 | doc2 = xml { root2 } 51 | match_set = Lorax::MatchSet.new(doc1, doc2) 52 | match_set.signature2.should_not be_nil 53 | match_set.signature2.root.should == doc2.root 54 | end 55 | end 56 | 57 | describe "#match and #add" do 58 | before do 59 | @doc1 = xml { root1 { a1 } } 60 | @doc2 = xml { root2 { a1 } } 61 | @match_set = Lorax::MatchSet.new(@doc1, @doc2) 62 | end 63 | 64 | context "when there is a match for the node" do 65 | before do 66 | @match = Lorax::Match.new(@doc1.at_css("a1"), @doc2.at_css("a1")) 67 | @match_set.add @match 68 | end 69 | 70 | it "returns the match" do 71 | @match_set.match(@doc1.at_css("a1")).should == @match 72 | @match_set.match(@doc2.at_css("a1")).should == @match 73 | end 74 | end 75 | 76 | context "when there is no match" do 77 | it "returns nil" do 78 | @match_set.match(@doc1.at_css("a1")).should be_nil 79 | @match_set.match(@doc2.at_css("a1")).should be_nil 80 | end 81 | end 82 | end 83 | 84 | describe "#to_delta_set" do 85 | it "invokes DeltaSetGenerator.generate_delta_set on itself" do 86 | doc1 = xml { 
root1 } 87 | doc2 = xml { root2 } 88 | match_set = Lorax::MatchSet.new(doc1, doc2) 89 | mock(Lorax::DeltaSetGenerator).generate_delta_set(match_set) 90 | match_set.to_delta_set 91 | end 92 | end 93 | end 94 | -------------------------------------------------------------------------------- /spec/unit/signature_spec.rb: -------------------------------------------------------------------------------- 1 | require File.expand_path(File.dirname(__FILE__) + '/../spec_helper') 2 | 3 | describe Lorax::Signature do 4 | WHITESPACES = ["\n"," ","\t","\r","\f"] 5 | 6 | def assert_node_signature_equal(node1, node2) 7 | Lorax::Signature.new(node1).signature.should == Lorax::Signature.new(node2).signature 8 | end 9 | 10 | def assert_node_signature_not_equal(node1, node2) 11 | Lorax::Signature.new(node1).signature.should_not == Lorax::Signature.new(node2).signature 12 | end 13 | 14 | describe ".new" do 15 | it "accepts nil" do 16 | proc { Lorax::Signature.new }.should_not raise_error 17 | end 18 | 19 | it "does not call signature if param is nil" do 20 | mock.instance_of(Lorax::Signature).signature(42).never 21 | Lorax::Signature.new(nil) 22 | end 23 | 24 | it "calls signature if a param is non-nil" do 25 | mock.instance_of(Lorax::Signature).signature(42).once 26 | Lorax::Signature.new(42) 27 | end 28 | end 29 | 30 | describe "#root" do 31 | it "returns the subtree root" do 32 | doc = xml { root { a1 "hello" } } 33 | node = doc.at_css("a1") 34 | sig = Lorax::Signature.new(node) 35 | sig.root.should == node 36 | end 37 | end 38 | 39 | describe "#nodes" do 40 | it "returns an array of nodes matching the signature" do 41 | doc = xml { root { 42 | a1 "hello" 43 | a1 "hello" 44 | a1 "hello" 45 | } } 46 | nodes = doc.css("a1") 47 | doc_sig = Lorax::Signature.new(doc.root) 48 | node_sig = Lorax::Signature.new(nodes.first) 49 | doc_sig.nodes(node_sig.signature).should =~ nodes.to_a 50 | end 51 | 52 | it "returns the node if I pass nil" do 53 | doc = xml { root { 54 | a1 "hello1" 55 | a1 
"hello2" 56 | a1 "hello3" 57 | } } 58 | nodes = doc.css("a1") 59 | doc_sig = Lorax::Signature.new(doc.root) 60 | node_sig = Lorax::Signature.new(nodes.first) 61 | doc_sig.nodes(nil).should == [doc.root] 62 | end 63 | end 64 | 65 | describe "#size" do 66 | it "returns the total number of nodes in the subtree" do 67 | doc = xml { root { a1 "hello" } } 68 | node = doc.at_css("a1") 69 | doc_sig = Lorax::Signature.new(doc.root) 70 | doc_sig.size.should == 3 # root, a1, hello 71 | end 72 | end 73 | 74 | describe "#set_signature" do 75 | it "assigns values such that signature and nodes return the proper thing" do 76 | signature = Lorax::Signature.new 77 | signature.set_signature(:foo, "a") 78 | signature.set_signature(:bar, "a") 79 | signature.set_signature(:bazz, "b") 80 | signature.signature(:foo).should == "a" 81 | signature.signature(:bar).should == "a" 82 | signature.signature(:bazz).should == "b" 83 | signature.nodes("a").should =~ [:foo, :bar] 84 | signature.nodes("b").should == [:bazz] 85 | end 86 | end 87 | 88 | describe "#set_weight" do 89 | it "assigns values such that weight returns the proper thing" do 90 | signature = Lorax::Signature.new 91 | signature.set_weight(:foo, 2.2) 92 | signature.weight(:foo).should == 2.2 93 | end 94 | end 95 | 96 | describe "#signature" do 97 | context "passed no argument" do 98 | it "returns the subtree root's signature" do 99 | doc = xml { root { a1 "hello" } } 100 | sig = Lorax::Signature.new(doc.root) 101 | sig.signature.should == sig.signature(doc.root) 102 | end 103 | end 104 | 105 | context "passed a node" do 106 | it "returns the node's signature" do 107 | doc = xml { root { a1 "hello" } } 108 | node = doc.at_css("a1") 109 | doc_sig = Lorax::Signature.new(doc.root) 110 | node_sig = Lorax::Signature.new(node) 111 | doc_sig.signature(node).should == node_sig.signature 112 | end 113 | end 114 | 115 | context "passed a non-Node" do 116 | it "raises an error" do 117 | proc { Lorax::Signature.new.signature(42) }.should 
raise_error(ArgumentError, /signature expects a Node/) 118 | end 119 | end 120 | 121 | context "passed a cdata Node" do 122 | it "treats it like a leaf text node" do 123 | doc = xml { root { cdata "hello" } } 124 | node = doc.root.children.first 125 | doc_sig = Lorax::Signature.new(doc.root) 126 | node_sig = Lorax::Signature.new(node) 127 | doc_sig.signature(node).should == node_sig.signature 128 | end 129 | end 130 | 131 | context "passed a comment Node" do 132 | it "treats it like a leaf text node" do 133 | doc = Nokogiri::XML "<root><!-- hello --></root>" 134 | node = doc.root.children.first 135 | doc_sig = Lorax::Signature.new(doc.root) 136 | node_sig = Lorax::Signature.new(node) 137 | doc_sig.signature(node).should == node_sig.signature 138 | end 139 | end 140 | 141 | context "passed an entity reference Node" do 142 | it "treats it like a leaf text node" do 143 | doc = Nokogiri::XML %q(<root><span>&nbsp;</span></root>) 144 | node = doc.at_css("span").children.first 145 | doc_sig = Lorax::Signature.new(doc.root) 146 | node_sig = Lorax::Signature.new(node) 147 | doc_sig.signature(node).should == node_sig.signature 148 | end 149 | end 150 | 151 | context "passed an invalid Node" do 152 | it "raises an error" do 153 | doc = xml { root { a1("foo" => "bar") } } 154 | attr = doc.at_css("a1").attributes.first.last 155 | proc { Lorax::Signature.new.signature(attr) }.should raise_error(ArgumentError, /signature expects an element/) 156 | end 157 | end 158 | 159 | it "hashes each node only once" do 160 | doc = xml { root { a1 { b1 { c1 "hello" } } } } 161 | node = doc.at_css "c1" 162 | mock.proxy.instance_of(Lorax::Signature).signature(anything).times(5) 163 | Lorax::Signature.new.signature(doc.root) 164 | end 165 | 166 | it "caches signatures" do 167 | doc = xml { root { a1 { b1 { c1 "hello" } } } } 168 | node = doc.at_css "c1" 169 | mock.proxy.instance_of(Lorax::Signature).signature(anything).times(6) 170 | sig = Lorax::Signature.new 171 | sig.signature(doc.root) 172 | sig.signature(doc.root) 173 | end 174 | 175 | it 
"calculates weights along the way" do 176 | doc = xml { root { a1 } } 177 | node = doc.at_css "a1" 178 | sig = Lorax::Signature.new 179 | mock(sig).weight(node) 180 | sig.signature(node) 181 | end 182 | 183 | context "passed a text Node" do 184 | it "returns equal signatures for identical text nodes" do 185 | doc = xml { root { 186 | span "hello" 187 | span "hello" 188 | } } 189 | assert_node_signature_equal(*doc.css("span").collect { |n| n.children.first }) 190 | end 191 | 192 | it "returns inequal signatures for different text nodes" do 193 | doc = xml { root { 194 | span "hello" 195 | span "goodbye" 196 | } } 197 | assert_node_signature_not_equal(*doc.css("span").collect { |n| n.children.first }) 198 | end 199 | 200 | it "ignores leading whitespace" do 201 | doc = xml { root { 202 | span "hello" 203 | span "#{WHITESPACES.join}hello" 204 | } } 205 | assert_node_signature_equal(*doc.css("span").collect { |n| n.children.first }) 206 | end 207 | 208 | it "ignores trailing whitespace" do 209 | doc = xml { root { 210 | span "hello" 211 | span "hello#{WHITESPACES.join}" 212 | } } 213 | assert_node_signature_equal(*doc.css("span").collect { |n| n.children.first }) 214 | end 215 | 216 | it "treats empty text nodes the same as no text node" do 217 | doc = xml { root { 218 | span WHITESPACES.join 219 | span 220 | } } 221 | assert_node_signature_equal(*doc.css("span")) 222 | end 223 | end 224 | 225 | context "elements with same name (with no attributes and no content)" do 226 | it "have equal signatures" do 227 | doc = xml { root { a1 ; a1 } } 228 | assert_node_signature_equal(*doc.css("a1")) 229 | end 230 | end 231 | 232 | context "elements with different names" do 233 | it "have inequal signatures" do 234 | doc = xml { root { a1 ; a2 } } 235 | assert_node_signature_not_equal doc.at_css("a1"), doc.at_css("a2") 236 | end 237 | end 238 | 239 | context "same elements in different docs" do 240 | it "have equal signatures" do 241 | doc1 = xml { root { a1 } } 242 | doc2 = xml { 
root { a1 } } 243 | assert_node_signature_equal doc1.at_css("a1"), doc2.at_css("a1") 244 | end 245 | end 246 | 247 | context "elements with same name and content (with no attributes)" do 248 | context "and content is the same" do 249 | it "have equal signatures" do 250 | doc = xml { root { 251 | a1 "hello" 252 | a1 "hello" 253 | } } 254 | assert_node_signature_equal(*doc.css("a1")) 255 | end 256 | end 257 | 258 | context "and content is not the same" do 259 | it "have inequal signatures" do 260 | doc = xml { root { 261 | a1 "hello" 262 | a1 "goodbye" 263 | } } 264 | assert_node_signature_not_equal(*doc.css("a1")) 265 | end 266 | end 267 | end 268 | 269 | context "elements with same name and children (with no attributes)" do 270 | context "and children are in the same order" do 271 | it "have equal signatures" do 272 | doc = xml { root { 273 | a1 { b1 ; b2 } 274 | a1 { b1 ; b2 } 275 | } } 276 | assert_node_signature_equal(*doc.css("a1")) 277 | end 278 | end 279 | 280 | context "and children are not in the same order" do 281 | it "have inequal signatures" do 282 | doc = xml { root { 283 | a1 { b1 ; b2 } 284 | a1 { b2 ; b1 } 285 | } } 286 | assert_node_signature_not_equal(*doc.css("a1")) 287 | end 288 | end 289 | end 290 | 291 | context "elements with same name and same attributes (with no content)" do 292 | it "have equal signatures" do 293 | doc = xml { root { 294 | a1("foo" => "bar", "bazz" => "quux") 295 | a1("foo" => "bar", "bazz" => "quux") 296 | } } 297 | assert_node_signature_equal(*doc.css("a1")) 298 | end 299 | end 300 | 301 | context "elements with same name and different attributes (with no content)" do 302 | it "have inequal signatures" do 303 | doc = xml { root { 304 | a1("foo" => "bar", "bazz" => "quux") 305 | a1("foo" => "123", "bazz" => "456") 306 | } } 307 | assert_node_signature_not_equal(*doc.css("a1")) 308 | end 309 | end 310 | 311 | context "attributes reverse-engineered to be similar" do 312 | it "have inequal signatures" do 313 | doc = xml { 
root { 314 | a1("foo" => "bar#{Lorax::Signature::SEP}quux") 315 | a1("foo#{Lorax::Signature::SEP}bar" => "quux") 316 | } } 317 | assert_node_signature_not_equal(*doc.css("a1")) 318 | end 319 | end 320 | 321 | context "HTML" do 322 | it "should be case-insensitive" do 323 | doc1 = Nokogiri::HTML <<-EOH 324 | 325 |
<DIV>hello</DIV>
326 | 327 | EOH 328 | doc2 = Nokogiri::HTML <<-EOH 329 | 330 |
<div>hello</div>
331 | 332 | EOH 333 | assert_node_signature_equal(doc1.at_css("body").children.first, 334 | doc2.at_css("body").children.first) 335 | end 336 | end 337 | end 338 | 339 | describe "#weight" do 340 | context "passed no argument" do 341 | it "returns the subtree root's weight" do 342 | doc = xml { root { a1 { b1 { c1 { d1 } } } } } 343 | node = doc.at_css("a1") 344 | doc_sig = Lorax::Signature.new(doc.root) 345 | doc_sig.weight.should == 5 346 | end 347 | end 348 | 349 | context "passed a node" do 350 | it "returns the node's weight" do 351 | doc = xml { root { a1 "hello" } } 352 | node = doc.at_css("a1") 353 | doc_sig = Lorax::Signature.new(doc.root) 354 | node_sig = Lorax::Signature.new(node) 355 | doc_sig.weight(node).should == node_sig.weight 356 | end 357 | end 358 | 359 | context "passed a non-Node" do 360 | it "raises an error" do 361 | proc { Lorax::Signature.new.weight(42) }.should raise_error(ArgumentError, /weight expects a Node/) 362 | end 363 | end 364 | 365 | context "passed a cdata Node" do 366 | it "treats it like a leaf text node" do 367 | doc = xml { root { cdata "hello" } } 368 | node = doc.root.children.first 369 | doc_sig = Lorax::Signature.new(doc.root) 370 | node_sig = Lorax::Signature.new(node) 371 | doc_sig.weight(node).should == node_sig.weight 372 | end 373 | end 374 | 375 | context "passed a comment Node" do 376 | it "treats it like a leaf text node" do 377 | doc = Nokogiri::XML "<root><!-- hello --></root>" 378 | node = doc.root.children.first 379 | doc_sig = Lorax::Signature.new(doc.root) 380 | node_sig = Lorax::Signature.new(node) 381 | doc_sig.weight(node).should == node_sig.weight 382 | end 383 | end 384 | 385 | context "passed an entity reference Node" do 386 | it "treats it like a leaf text node" do 387 | doc = Nokogiri::XML %q(<root><span>&nbsp;</span></root>) 388 | node = doc.at_css("span").children.first 389 | doc_sig = Lorax::Signature.new(doc.root) 390 | node_sig = Lorax::Signature.new(node) 391 | doc_sig.weight(node).should == node_sig.weight 392 | end 393 | end 394 | 395 | context 
"passed an invalid Node" do 396 | it "raises an error" do 397 | doc = xml { root { a1("foo" => "bar") } } 398 | attr = doc.at_css("a1").attributes.first.last 399 | proc { Lorax::Signature.new.weight(attr) }.should raise_error(ArgumentError, /weight expects an element/) 400 | end 401 | end 402 | 403 | it "weighs each node only once" do 404 | doc = xml { root { a1 { b1 { c1 "hello" } } } } 405 | node = doc.at_css "c1" 406 | mock.proxy.instance_of(Lorax::Signature).weight(anything).times(5) 407 | Lorax::Signature.new.weight(doc.root) 408 | end 409 | 410 | it "caches weights" do 411 | doc = xml { root { a1 { b1 { c1 "hello" } } } } 412 | node = doc.at_css "c1" 413 | mock.proxy.instance_of(Lorax::Signature).weight(anything).times(6) 414 | sig = Lorax::Signature.new 415 | sig.weight(doc.root) 416 | sig.weight(doc.root) 417 | end 418 | 419 | it "weighs empty nodes with no children as 1" do 420 | doc = xml { root { a1 } } 421 | sig = Lorax::Signature.new(doc.root) 422 | sig.weight(doc.at_css("a1")).should == 1 423 | end 424 | 425 | it "weighs nodes with children as 1 + sum(weight(children))" do 426 | doc = xml { root { 427 | a1 { b1 ; b2 } 428 | a2 { b1 ; b2 ; b3 ; b4 } 429 | } } 430 | sig = Lorax::Signature.new(doc.root) 431 | sig.weight(doc.at_css("a1")).should == 3 432 | sig.weight(doc.at_css("a2")).should == 5 433 | sig.weight.should == 9 434 | end 435 | 436 | describe "text nodes" do 437 | it "scores as 1 + log(length)" do 438 | doc = xml { root { 439 | a1 "x" 440 | a2("x" * 500) 441 | a3("x" * 50_000) 442 | } } 443 | sig = Lorax::Signature.new(doc.root) 444 | sig.weight(doc.at_css("a1")).should be_within(0.0005).of(2) 445 | sig.weight(doc.at_css("a2")).should be_within(0.0005).of(2 + Math.log(500)) 446 | sig.weight(doc.at_css("a3")).should be_within(0.0005).of(2 + Math.log(50_000)) 447 | end 448 | end 449 | end 450 | 451 | describe "#monogram" do 452 | context "passed no argument" do 453 | it "returns the subtree root's signature" do 454 | doc = xml { root { a1(:foo 
=> :bar) } } 455 | sig = Lorax::Signature.new(doc.root) 456 | sig.monogram.should == sig.monogram(doc.root) 457 | end 458 | end 459 | 460 | context "passed a node" do 461 | it "returns the node's signature" do 462 | doc = xml { root { a1(:foo => :bar) } } 463 | node = doc.at_css("a1") 464 | doc_sig = Lorax::Signature.new(doc.root) 465 | node_sig = Lorax::Signature.new(node) 466 | doc_sig.monogram(node).should == node_sig.monogram 467 | end 468 | end 469 | 470 | context "passed a non-Node" do 471 | it "raises an error" do 472 | proc { Lorax::Signature.new.monogram(42) }.should raise_error(ArgumentError, /signature expects a Node/) 473 | end 474 | end 475 | 476 | context "text nodes" do 477 | it "returns the signature as the monogram" do 478 | doc = xml { root { text "hello" } } 479 | node = doc.root.children.first 480 | sig = Lorax::Signature.new(doc.root) 481 | sig.monogram(node).should == sig.signature(node) 482 | end 483 | end 484 | 485 | context "element nodes" do 486 | it "is equal for nodes with equal names and attributes" do 487 | doc = xml { root { 488 | a1(:foo => :bar, :bazz => :quux) { text "hello" } 489 | a1(:foo => :bar, :bazz => :quux) { b1 } 490 | a1(:foo => :bar, :bazz => :quux) 491 | } } 492 | nodes = doc.css("a1") 493 | sig = Lorax::Signature.new(doc.root) 494 | sig.monogram(nodes[0]).should == sig.monogram(nodes[1]) 495 | sig.monogram(nodes[1]).should == sig.monogram(nodes[2]) 496 | end 497 | 498 | it "is inequal for nodes with different attributes" do 499 | doc = xml { root { 500 | a1(:foo => :bar) 501 | a1(:foo => :bar, :bazz => :quux) 502 | } } 503 | nodes = doc.css("a1") 504 | sig = Lorax::Signature.new(doc.root) 505 | sig.monogram(nodes[0]).should_not == sig.monogram(nodes[1]) 506 | end 507 | end 508 | end 509 | end 510 | --------------------------------------------------------------------------------