├── .rspec
├── lib
├── sax-machine
│ ├── version.rb
│ ├── config
│ │ ├── sax_ancestor.rb
│ │ ├── sax_attribute.rb
│ │ ├── sax_element_value.rb
│ │ ├── sax_collection.rb
│ │ └── sax_element.rb
│ ├── handlers
│ │ ├── sax_nokogiri_handler.rb
│ │ ├── sax_oga_handler.rb
│ │ ├── sax_ox_handler.rb
│ │ └── sax_abstract_handler.rb
│ ├── sax_configure.rb
│ ├── sax_config.rb
│ └── sax_document.rb
└── sax-machine.rb
├── .gitignore
├── Rakefile
├── Guardfile
├── Gemfile
├── .travis.yml
├── spec
├── spec_helper.rb
├── sax-machine
│ ├── sax_activerecord_spec.rb
│ ├── sax_include_spec.rb
│ ├── sax_configure_spec.rb
│ └── sax_document_spec.rb
└── fixtures
│ ├── atom-content.html
│ └── atom.xml
├── sax-machine.gemspec
├── HISTORY.md
└── README.md
/.rspec:
--------------------------------------------------------------------------------
1 | --color
2 | --format progress
3 |
--------------------------------------------------------------------------------
/lib/sax-machine/version.rb:
--------------------------------------------------------------------------------
module SAXMachine
  # Gem version string; sax-machine.gemspec reads it for `s.version`.
  # Frozen so the shared constant cannot be mutated in place.
  VERSION = "1.3.2".freeze
end
4 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | .bundle
3 | *.gem
4 | Gemfile.lock
5 | .rvmrc
6 | .DS_STORE
7 | pkg/
8 | coverage/
9 | .ruby-version
10 | .ruby-gemset
11 |
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env rake
2 | require 'bundler/gem_tasks'
3 | require 'rspec/core/rake_task'
4 |
5 | RSpec::Core::RakeTask.new(:spec)
6 | task test: :spec
7 | task default: :test
8 |
--------------------------------------------------------------------------------
/Guardfile:
--------------------------------------------------------------------------------
1 | guard "rspec", version: 2 do
2 | watch(%r{^spec/.+_spec\.rb$})
3 | watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
4 | watch("spec/spec_helper.rb") { "spec" }
5 | end
6 |
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
# Fetch gems over TLS; the plain-HTTP endpoint allows tampering in transit.
source "https://rubygems.org"

# Pull in whatever sax-machine.gemspec declares.
gemspec

group :development, :test do
  gem 'rake'
  gem 'guard-rspec'
  # Coverage tooling is only installed on MRI.
  gem 'simplecov', require: false, platforms: [:mri]
  gem 'coveralls', require: false, platforms: [:mri]

  gem 'activerecord', '~> 4.1'
  # All three parser backends are installed so the suite can run against each.
  gem 'nokogiri', '~> 1.6'
  gem 'ox', '>= 2.1.2', platforms: [:mri, :rbx]
  gem 'oga', '>= 0.3.4'
end
16 |
--------------------------------------------------------------------------------
/lib/sax-machine/config/sax_ancestor.rb:
--------------------------------------------------------------------------------
module SAXMachine
  class SAXConfig
    # Describes one `ancestor` declaration: the attribute of a parsed element
    # that receives the enclosing object once the element's tag is closed.
    class AncestorConfig
      attr_reader :name, :setter

      # name    - declared ancestor name (stored as a String).
      # options - Hash; :as names the attribute to write. When :as is absent
      #           the declared name is used instead.
      def initialize(name, options)
        @name = name.to_s
        @as = options[:as]
        # Build the writer from #column so a missing :as yields "<name>="
        # rather than the unusable method name "=".
        @setter = "#{column}="
      end

      # Returns the attribute this config writes to: :as when given,
      # otherwise the declared name as a Symbol.
      def column
        @as || @name.to_sym
      end
    end
  end
end
18 |
--------------------------------------------------------------------------------
/lib/sax-machine/config/sax_attribute.rb:
--------------------------------------------------------------------------------
module SAXMachine
  class SAXConfig
    # Config for an `attribute` declaration; the value is read from the
    # attribute hash of the element being opened.
    class AttributeConfig < ElementValueConfig
      # Attribute configs never describe collections.
      def collection?
        false
      end

      # True when the attribute hash has @name either as a key or as a value.
      def attrs_match?(attrs)
        return true if attrs.key?(@name)

        attrs.value?(@name)
      end
      alias has_value_and_attrs_match? attrs_match?

      # Returns the attribute value stored under @name, or nil when absent.
      def value_from_attrs(attrs)
        attrs.fetch(@name) { nil }
      end
    end
  end
end
19 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: ruby
2 |
3 | rvm:
4 | - 1.9.3
5 | - 2.0
6 | - 2.1
7 | - 2.2
8 | - jruby-1.7
9 | - rbx-2
10 | - ruby-head
11 | - jruby-head
12 |
13 | sudo: false
14 |
15 | env:
16 | matrix:
17 | - HANDLER="nokogiri"
18 | - HANDLER="ox"
19 | - HANDLER="oga"
20 |
21 | matrix:
22 | exclude:
23 | - env: HANDLER="ox"
24 | rvm: jruby-1.7
25 | - env: HANDLER="ox"
26 | rvm: jruby-head
27 | allow_failures:
28 | - env: HANDLER="oga"
29 | rvm: jruby-1.7
30 | - rvm: rbx-2
31 | - rvm: ruby-head
32 | - rvm: jruby-head
33 |
--------------------------------------------------------------------------------
/lib/sax-machine/config/sax_element_value.rb:
--------------------------------------------------------------------------------
module SAXMachine
  class SAXConfig
    # Config for a `value` declaration (the text content of the element
    # itself); also the base class of AttributeConfig.
    class ElementValueConfig
      attr_reader :name, :setter, :data_class

      # name    - declared name (stored as a String).
      # options - Hash with optional :as (attribute to write), :required and
      #           :class (target data class).
      def initialize(name, options)
        @name = name.to_s
        @as = options[:as]
        # Build the writer from #column so a missing :as yields "<name>="
        # rather than the unusable method name "=".
        @setter = "#{column}="
        @required = options[:required]
        @data_class = options[:class]
      end

      # Returns the attribute this config writes to: :as when given,
      # otherwise the declared name as a Symbol.
      def column
        @as || @name.to_sym
      end

      # Returns true only when :required was given a truthy value.
      def required?
        !!@required
      end
    end
  end
end
24 |
--------------------------------------------------------------------------------
/spec/spec_helper.rb:
--------------------------------------------------------------------------------
# Coverage reporting is best-effort: when simplecov/coveralls cannot be
# loaded the suite simply runs without it.
begin
  require 'simplecov'
  require 'coveralls'

  coverage_formatters = [
    SimpleCov::Formatter::HTMLFormatter,
    Coveralls::SimpleCov::Formatter
  ]
  SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[*coverage_formatters]

  SimpleCov.start { add_filter '/spec/' }
rescue LoadError
end

require File.expand_path('../lib/sax-machine', File.dirname(__FILE__))

# HANDLER=nokogiri|ox|oga selects the parser backend under test.
requested_handler = ENV['HANDLER']
SAXMachine.handler = requested_handler.to_sym if requested_handler

RSpec.configure do |rspec|
  rspec.filter_run :focus
  rspec.run_all_when_everything_filtered = true
end
23 |
--------------------------------------------------------------------------------
/spec/sax-machine/sax_activerecord_spec.rb:
--------------------------------------------------------------------------------
1 | require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2 | require 'active_record'
3 |
describe "SAXMachine ActiveRecord integration" do
  before do
    # Configure an ActiveRecord model through SAXMachine.configure rather
    # than by including SAXMachine in it.
    class MySaxModel < ActiveRecord::Base
      SAXMachine.configure(MySaxModel) do |c|
        c.element :title
      end
    end
  end

  after do
    # Remove the constant so every example starts from a fresh class.
    Object.send(:remove_const, :MySaxModel)
  end

  it "parses document" do
    # The input must be real XML: without a <title> element nothing is
    # assigned and the expectation below cannot hold.
    document = MySaxModel.parse("<xml><title>My Title</title></xml>")
    expect(document.title).to eq("My Title")
  end
end
22 |
--------------------------------------------------------------------------------
/lib/sax-machine.rb:
--------------------------------------------------------------------------------
1 | require "sax-machine/version"
2 | require "sax-machine/sax_document"
3 | require "sax-machine/sax_configure"
4 | require "sax-machine/sax_config"
5 |
module SAXMachine
  # Returns the Symbol naming the active parser backend, or nil when none
  # has been selected yet.
  #
  # The value lives in a module-level instance variable. A class variable
  # here would be shared with every class that includes SAXMachine, so a
  # user class defining its own @@handler would overwrite it.
  def self.handler
    @handler ||= nil
  end

  # Selects the parser backend.
  #
  # handler - backend name (e.g. :nokogiri, :ox, :oga). A falsy value is
  #           ignored and the current backend is kept.
  #
  # Raises LoadError when the handler file or its library cannot be loaded;
  # the previous backend stays selected in that case.
  def self.handler=(handler)
    if handler
      require "sax-machine/handlers/sax_#{handler}_handler"
      @handler = handler
    end
  end
end
18 |
# Probe the optional backends in order; the first one that loads is kept.
[:ox, :oga].find do |candidate|
  begin
    SAXMachine.handler = candidate
    true
  rescue LoadError
    false
  end
end

# Neither optional backend could be loaded: fall back to Nokogiri.
SAXMachine.handler = :nokogiri if SAXMachine.handler.nil?
32 |
--------------------------------------------------------------------------------
/lib/sax-machine/config/sax_collection.rb:
--------------------------------------------------------------------------------
module SAXMachine
  class SAXConfig
    # Config for an `elements` declaration that maps each matching tag to an
    # instance of a data class.
    class CollectionConfig
      attr_reader :name

      # name    - tag name (stored as a String).
      # options - Hash with :class (data class), :as (accessor name) and
      #           :with (attribute constraints, defaults to {}).
      def initialize(name, options)
        @with  = options.fetch(:with, {})
        @as    = options[:as].to_s
        @class = options[:class]
        @name  = name.to_s
      end

      # Returns the configured class, or the tag name when none was given.
      def data_class
        @class ? @class : @name
      end

      # True when every :with constraint matches (via ===) the attribute
      # stored under the stringified constraint key.
      def attrs_match?(attrs)
        @with.each do |key, expected|
          return false unless expected === attrs[key.to_s]
        end
        true
      end

      # Name of the reader that returns the collection on the parent object.
      def accessor
        as
      end

      protected

      attr_reader :as
    end
  end
end
34 |
--------------------------------------------------------------------------------
/lib/sax-machine/handlers/sax_nokogiri_handler.rb:
--------------------------------------------------------------------------------
1 | require 'sax-machine/handlers/sax_abstract_handler'
2 | require 'nokogiri'
3 |
module SAXMachine
  # Nokogiri backend: a SAX document whose callbacks are the shared
  # SAXAbstractHandler implementations.
  class SAXNokogiriHandler < Nokogiri::XML::SAX::Document
    include SAXAbstractHandler

    # Nokogiri callback name => SAXAbstractHandler implementation.
    {
      initialize:    :_initialize,
      characters:    :_characters,
      cdata_block:   :_characters,
      start_element: :_start_element,
      end_element:   :_end_element,
      error:         :_error,
      warning:       :_warning
    }.each do |callback, implementation|
      alias_method callback, implementation
    end

    # Parses xml_input with this object as the SAX document, with entity
    # replacement switched on.
    def sax_parse(xml_input)
      Nokogiri::XML::SAX::Parser.new(self).parse(xml_input) do |ctx|
        ctx.replace_entities = true
      end
    end
  end
end
24 |
--------------------------------------------------------------------------------
/sax-machine.gemspec:
--------------------------------------------------------------------------------
1 | # -*- encoding: utf-8 -*-
2 | require File.expand_path("../lib/sax-machine/version", __FILE__)
3 |
4 | Gem::Specification.new do |s|
5 | s.name = "sax-machine"
6 | s.version = SAXMachine::VERSION
7 |
8 | s.authors = ["Paul Dix", "Julien Kirch", "Ezekiel Templin", "Dmitry Krasnoukhov"]
9 | s.email = %q{paul@pauldix.net}
10 | s.homepage = %q{http://github.com/pauldix/sax-machine}
11 | s.summary = %q{Declarative SAX Parsing with Nokogiri, Ox or Oga}
12 | s.license = %q{MIT}
13 |
14 | s.files = `git ls-files`.split("\n")
15 | s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
16 | s.require_paths = ["lib"]
17 | s.platform = Gem::Platform::RUBY
18 |
19 | s.add_development_dependency "rspec", "~> 3.0"
20 | end
21 |
--------------------------------------------------------------------------------
/lib/sax-machine/handlers/sax_oga_handler.rb:
--------------------------------------------------------------------------------
1 | require 'sax-machine/handlers/sax_abstract_handler'
2 | require 'oga'
3 |
module SAXMachine
  # Oga backend: translates Oga's SAX callbacks into the shared
  # SAXAbstractHandler implementations.
  class SAXOgaHandler
    include SAXAbstractHandler

    # Text and CDATA are both accumulated as character data.
    alias_method :on_text, :_characters
    alias_method :on_cdata, :_characters

    def initialize(*args)
      _initialize(*args)
    end

    # Parses xml_input; parser errors raised by Oga are routed to #on_error
    # instead of propagating.
    def sax_parse(xml_input)
      begin
        Oga.sax_parse_xml(self, xml_input)
      rescue LL::ParserError => parse_failure
        on_error(parse_failure.message)
      end
    end

    def on_element(namespace, name, attrs)
      qualified = node_name(namespace, name)
      _start_element(qualified, attrs)
    end

    def after_element(namespace, name)
      qualified = node_name(namespace, name)
      _end_element(qualified)
    end

    # Joins all arguments with a space and reports them as a single error.
    def on_error(*args)
      _error(args.join(" "))
    end

    private

    # Returns "namespace:name", or just name when there is no namespace.
    def node_name(namespace, name)
      return name unless namespace

      "#{namespace}:#{name}"
    end
  end
end
40 |
--------------------------------------------------------------------------------
/lib/sax-machine/sax_configure.rb:
--------------------------------------------------------------------------------
module SAXMachine
  # Adds SAX parsing to clazz without including SAXMachine in clazz itself.
  #
  # An anonymous subclass of clazz includes SAXMachine and is yielded so the
  # caller can declare elements on it. Accessors are created on clazz, and
  # clazz gains a `parse` class method that delegates to the subclass.
  #
  # clazz - the class to configure.
  def self.configure(clazz)
    extended_clazz = Class.new(clazz)
    extended_clazz.send(:include, SAXMachine)

    # override create_attr to create attributes on the original class
    #
    # The block (a value modifier passed to element/attribute/...) has to be
    # accepted here; otherwise it is silently dropped and a plain writer is
    # generated.
    def extended_clazz.create_attr(real_name, &block)
      superclass.send(:attr_reader, real_name) unless superclass.method_defined?(real_name)
      return if superclass.method_defined?("#{real_name}=")

      if block
        superclass.send(:define_method, "#{real_name}=") do |value|
          instance_variable_set("@#{real_name}", instance_exec(value, &block))
        end
      else
        superclass.send(:attr_writer, real_name)
      end
    end

    yield(extended_clazz)

    clazz.extend LightWeightSaxMachine
    clazz.sax_config = extended_clazz.sax_config

    (class << clazz;self;end).send(:define_method, :parse) do |xml_input|
      extended_clazz.parse(xml_input)
    end
  end

  # Class-level API given to classes set up through SAXMachine.configure.
  module LightWeightSaxMachine
    attr_writer :sax_config

    def sax_config
      @sax_config ||= SAXConfig.new
    end

    # Copies this class's configuration into each subclass. `super` keeps
    # other `inherited` hooks in the chain (e.g. a framework base class)
    # working.
    def inherited(subclass)
      super
      subclass.sax_config.send(:initialize_copy, self.sax_config)
    end
  end
end
34 |
--------------------------------------------------------------------------------
/lib/sax-machine/handlers/sax_ox_handler.rb:
--------------------------------------------------------------------------------
1 | require 'sax-machine/handlers/sax_abstract_handler'
2 | require 'ox'
3 |
module SAXMachine
  # Ox backend: collects the tag name and attributes Ox reports one by one
  # and forwards them to the shared SAXAbstractHandler implementations.
  class SAXOxHandler < Ox::Sax
    include SAXAbstractHandler

    alias_method :end_element, :_end_element

    def initialize(*args)
      _initialize(*args)
      _reset_element
    end

    def sax_parse(xml_input)
      # Ox requires input to be streamable
      source = xml_input.is_a?(String) ? StringIO.new(xml_input) : xml_input

      Ox.sax_parse(self, source,
        symbolize: false,
        convert_special: true,
        skip: :skip_return,
      )
    end

    # Remembers the tag name until its attributes have been reported.
    def start_element(name)
      @element = name
    end

    # Stores one attribute of the tag being opened.
    def attr(name, str)
      @attrs[name] = str
    end

    # All attributes have been seen: emit the start event, then clear state.
    def attrs_done
      _start_element(@element, @attrs)
      _reset_element
    end

    # Forwards non-empty text; nil and empty strings are skipped.
    def text(value)
      return unless value
      return if value.empty?

      _characters(value)
    end

    alias_method :cdata, :text

    def error(message, line, column)
      _error("#{message} on line #{line} column #{column}")
    end

    private

    def _reset_element
      @element = ""
      @attrs = {}
    end
  end
end
57 |
--------------------------------------------------------------------------------
/spec/sax-machine/sax_include_spec.rb:
--------------------------------------------------------------------------------
1 | require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2 |
describe "SAXMachine inheritance" do
  before do
    class A
      include SAXMachine
      element :title
    end

    class B < A
      element :b
    end

    class C < B
      element :c
    end

    # The input must be real XML: each class only picks up the elements it
    # (or an ancestor) declared, so the tags have to be present.
    xml = "<top><title>Test</title><b>Matched!</b><c>And Again</c></top>"
    @a = A.new
    @a.parse xml
    @b = B.new
    @b.parse xml
    @c = C.new
    @c.parse xml
  end

  after do
    # Remove the constants so every example starts from fresh classes.
    Object.send(:remove_const, :A)
    Object.send(:remove_const, :B)
    Object.send(:remove_const, :C)
  end

  it { expect(@a).to be_a(A) }
  it { expect(@a).not_to be_a(B) }
  it { expect(@a).to be_a(SAXMachine) }
  it { expect(@a.title).to eq("Test") }
  it { expect(@b).to be_a(A) }
  it { expect(@b).to be_a(B) }
  it { expect(@b).to be_a(SAXMachine) }
  it { expect(@b.title).to eq("Test") }
  it { expect(@b.b).to eq("Matched!") }
  it { expect(@c).to be_a(A) }
  it { expect(@c).to be_a(B) }
  it { expect(@c).to be_a(C) }
  it { expect(@c).to be_a(SAXMachine) }
  it { expect(@c.title).to eq("Test") }
  it { expect(@c.b).to eq("Matched!") }
  it { expect(@c.c).to eq("And Again") }
end
50 |
--------------------------------------------------------------------------------
/spec/sax-machine/sax_configure_spec.rb:
--------------------------------------------------------------------------------
1 | require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2 |
describe "SAXMachine configure" do
  before do
    class A
      SAXMachine.configure(A) do |c|
        c.element :title
      end
    end

    class B < A
      SAXMachine.configure(B) do |c|
        c.element :b
      end
    end

    class C < B
      SAXMachine.configure(C) do |c|
        c.element :c
      end
    end

    # The input must be real XML: each class only picks up the elements it
    # (or an ancestor) declared, so the tags have to be present.
    xml = "<top><title>Test</title><b>Matched!</b><c>And Again</c></top>"
    @a = A.parse xml
    @b = B.parse xml
    @c = C.parse xml
  end

  after do
    # Remove the constants so every example starts from fresh classes.
    Object.send(:remove_const, :A)
    Object.send(:remove_const, :B)
    Object.send(:remove_const, :C)
  end

  it { expect(@a).to be_a(A) }
  it { expect(@a).not_to be_a(B) }
  it { expect(@a).to be_a(SAXMachine) }
  it { expect(@a.title).to eq("Test") }
  it { expect(@b).to be_a(A) }
  it { expect(@b).to be_a(B) }
  it { expect(@b).to be_a(SAXMachine) }
  it { expect(@b.title).to eq("Test") }
  it { expect(@b.b).to eq("Matched!") }
  it { expect(@c).to be_a(A) }
  it { expect(@c).to be_a(B) }
  it { expect(@c).to be_a(C) }
  it { expect(@c).to be_a(SAXMachine) }
  it { expect(@c.title).to eq("Test") }
  it { expect(@c.b).to eq("Matched!") }
  it { expect(@c.c).to eq("And Again") }
end
52 |
--------------------------------------------------------------------------------
/lib/sax-machine/config/sax_element.rb:
--------------------------------------------------------------------------------
module SAXMachine
  class SAXConfig
    # Config for one `element` declaration (or an `elements` declaration
    # without a :class).
    class ElementConfig
      attr_reader :name, :as, :setter, :data_class, :collection, :default

      # name    - tag name (stored as a String).
      # options - Hash; reads :with, :value, :as, :collection, :default,
      #           :class and :required.
      def initialize(name, options)
        @name = name.to_s
        @with = options.fetch(:with, {})

        # :value names the attribute to read; nil means "use the tag's text".
        @value = options.has_key?(:value) ? options[:value].to_s : nil

        @as = options[:as]
        @collection = options[:collection]
        @default = options[:default]

        # Collections append through add_<as>; scalars assign through <as>=.
        @setter = @collection ? "add_#{options[:as]}" : "#{@as}="

        @data_class = options[:class]
        @required = options[:required]
      end

      # True when the value is taken from an attribute (:value was given).
      def value_configured?
        !@value.nil?
      end

      def to_s
        "name: #{@name} dataclass: #{@data_class} setter: #{@setter} required: #{@required} value: #{@value} as:#{@as} collection: #{@collection} with: #{@with}"
      end

      # Attribute this config writes to: :as, or the tag name as a Symbol.
      def column
        @as || @name.to_sym
      end

      def required?
        @required ? true : false
      end

      # Returns the attribute named by :value, or nil when it is absent.
      def value_from_attrs(attrs)
        attrs.fetch(@value) { nil }
      end

      # True when every :with constraint matches (via ===) the attribute
      # stored under the stringified constraint key.
      def attrs_match?(attrs)
        @with.each do |key, expected|
          return false unless expected === attrs[key.to_s]
        end
        true
      end

      # True when :value is configured and the :with constraints match.
      def has_value_and_attrs_match?(attrs)
        return false if @value.nil?

        attrs_match?(attrs)
      end

      def collection?
        @collection ? true : false
      end
    end
  end
end
66 |
--------------------------------------------------------------------------------
/lib/sax-machine/sax_config.rb:
--------------------------------------------------------------------------------
1 | require "sax-machine/config/sax_element_value"
2 | require "sax-machine/config/sax_attribute"
3 | require "sax-machine/config/sax_element"
4 | require "sax-machine/config/sax_collection"
5 | require "sax-machine/config/sax_ancestor"
6 |
module SAXMachine
  # Holds everything declared on one SAXMachine class: element, attribute,
  # element-value, collection and ancestor configs.
  class SAXConfig
    attr_accessor :top_level_elements, :top_level_attributes, :top_level_element_value, :collection_elements, :ancestors

    def initialize
      # Element and collection configs are indexed by tag name; a missing
      # key defaults to an empty array.
      @top_level_elements = new_config_index
      @top_level_attributes = []
      @top_level_element_value = []
      @collection_elements = new_config_index
      @ancestors = []
    end

    # Returns all element configs as one flat Array.
    def columns
      @top_level_elements.values.flatten
    end

    # Copies the declarations of sax_config into this config.
    #
    # The per-tag arrays are duplicated as well. Cloning only the outer Hash
    # would leave parent and child sharing the same arrays, so a declaration
    # added to a subclass for an already-known tag would leak into the parent.
    def initialize_copy(sax_config)
      super

      @top_level_elements = copy_config_index(sax_config.top_level_elements)
      @top_level_attributes = sax_config.top_level_attributes.clone
      @top_level_element_value = sax_config.top_level_element_value.clone
      @collection_elements = copy_config_index(sax_config.collection_elements)
      @ancestors = sax_config.ancestors.clone
    end

    def add_top_level_element(name, options)
      @top_level_elements[name.to_s] << ElementConfig.new(name, options)
    end

    # The config name is taken from options[:name] (removed from options);
    # the name argument itself is not used.
    def add_top_level_attribute(name, options)
      @top_level_attributes << AttributeConfig.new(options.delete(:name), options)
    end

    # The config name is taken from options[:name] (removed from options);
    # the name argument itself is not used.
    def add_top_level_element_value(name, options)
      @top_level_element_value << ElementValueConfig.new(options.delete(:name), options)
    end

    def add_collection_element(name, options)
      @collection_elements[name.to_s] << CollectionConfig.new(name, options)
    end

    def add_ancestor(name, options)
      @ancestors << AncestorConfig.new(name, options)
    end

    # Returns the first collection config for tag name whose constraints
    # match attrs, or nil.
    #
    # The key check keeps this lookup read-only: indexing directly would run
    # the default proc and add an empty entry for every tag name looked up.
    def collection_config(name, attrs)
      return unless @collection_elements.has_key?(name.to_s)

      @collection_elements[name.to_s].detect { |cc| cc.attrs_match?(attrs) }
    end

    # Returns the attribute configs that match attrs.
    def attribute_configs_for_element(attrs)
      @top_level_attributes.select { |aa| aa.attrs_match?(attrs) }
    end

    def element_values_for_element
      @top_level_element_value
    end

    # Returns the element configs for tag name that read their value from an
    # attribute and whose constraints match attrs (possibly empty).
    def element_configs_for_attribute(name, attrs)
      return [] unless @top_level_elements.has_key?(name.to_s)

      @top_level_elements[name.to_s].select { |ec| ec.has_value_and_attrs_match?(attrs) }
    end

    # Returns the first element config for tag name whose constraints match
    # attrs, or nil.
    def element_config_for_tag(name, attrs)
      return unless @top_level_elements.has_key?(name.to_s)

      @top_level_elements[name.to_s].detect { |ec| ec.attrs_match?(attrs) }
    end

    private

    # Builds an empty tag-name index whose missing keys default to [].
    def new_config_index
      Hash.new { |hash, key| hash[key] = [] }
    end

    # Copies a tag-name index one level deep: new Hash, new Array per tag,
    # same config objects.
    def copy_config_index(source)
      copy = new_config_index
      source.each { |tag, configs| copy[tag] = configs.dup }
      copy
    end
  end
end
79 |
--------------------------------------------------------------------------------
/HISTORY.md:
--------------------------------------------------------------------------------
1 | # HEAD
2 |
3 | # 1.3.2
4 |
5 | * Compatibility with Oga 0.3
6 |
7 | # 1.3.1
8 |
9 | * Allow default value to be `false` [[#66](https://github.com/pauldix/sax-machine/pull/66)]
10 | * Support adding class to an attribute [[#68](https://github.com/pauldix/sax-machine/pull/68)]
11 | * Adjust Ox handler to skip empty text/cdata values
12 |
13 | # 1.3.0
14 |
15 | * Improve block modifiers to support all config options
16 | * Make block modifiers run in instance context
17 | * Make all handlers support IO as a input
18 |
19 | # 1.2.0
20 |
21 | * Add support for blocks as value modifiers [[#61](https://github.com/pauldix/sax-machine/pull/61)]
22 |
23 | # 1.1.1
24 |
25 | * Fix Nokogiri autoloading [[#60](https://github.com/pauldix/sax-machine/pull/60)]
26 |
27 | # 1.1.0
28 |
29 | * Option to use Oga as a SAX handler
30 |
31 | # 1.0.3
32 |
33 | * Remove missed `nokogiri` reference [[#54](https://github.com/pauldix/sax-machine/pull/54)]
34 | * Add support for `Symbol` data type conversion [[#57](https://github.com/pauldix/sax-machine/pull/57)]
35 | * Add specs for multiple elements with the same alias [[#53](https://github.com/pauldix/sax-machine/pull/53)]
36 | * Various code and documentation enhancements
37 |
38 | # 1.0.2
39 |
40 | * Make sure SAXConfig getters do not modify internal vars. Prevent race conditions
41 |
42 | # 1.0.1
43 |
44 | * Improve normalize_name performance
45 |
46 | # 1.0.0
47 |
48 | * Make `nokogiri` dependency optional
49 | * Add :default argument for elements [[#51](https://github.com/pauldix/sax-machine/pull/51)]
50 |
51 | # 0.3.0
52 |
53 | * Option to use Ox as a SAX handler instead of Nokogiri [[#49](https://github.com/pauldix/sax-machine/pull/49)]
54 | * Bump RSpec to 3.0, convert existing specs
55 |
56 | # 0.2.1
57 |
58 | * Turn on replace_entities on Nokogiri parser [[#40](https://github.com/pauldix/sax-machine/pull/40)]
59 | * Provide mass assignment through initialize method [[#38](https://github.com/pauldix/sax-machine/pull/38)]
60 | * Bump nokogiri (~> 1.6) and rspec, drop growl dependency
61 | * Update 'with' option to allow pattern matching in addition to string matching
62 |
63 | # 0.2.0.rc1
64 |
65 | * Try to reduce the number of instances of respond_to? in the code by
66 | pulling common uses of it out to methods. [[#32](https://github.com/pauldix/sax-machine/pull/32)]
67 | * The parse stack is now composed of simple objects instead of it being
68 | an array of arrays. [[#32](https://github.com/pauldix/sax-machine/pull/32)]
69 | * Now using an identifier for an empty buffer instead of empty string. [[#32](https://github.com/pauldix/sax-machine/pull/32)]
70 | * Clean up several variables that were not being used. [[#32](https://github.com/pauldix/sax-machine/pull/32)]
71 | * Encapsulate stack so it's not being exposed as part of the API. [[#32](https://github.com/pauldix/sax-machine/pull/32)]
72 | * `cdata_block` is now an alias instead of delegating to characters. [[#32](https://github.com/pauldix/sax-machine/pull/32)]
73 |
74 | # 0.1.0
75 |
76 | * Rename parent to ancestor
77 | * Add SAXMachine.configure
78 |
--------------------------------------------------------------------------------
/spec/fixtures/atom-content.html:
--------------------------------------------------------------------------------
1 |
2 |
In my previous post about the speed of serializing data, I concluded that Marshal was the quickest way to get things done. So I set about using Marshal to store some data in an ActiveRecord object. Things worked great at first, but on some test data I got this error: marshal data too short. Luckily, Bryan Helmkamp had helpfully pointed out that there were sometimes problems with storing marshaled data in the database. He said it was best to base64 encode the marshal dump before storing.
3 |
4 |
I was curious why it was working on some things and not others. It turns out that some types of data being marshaled were causing the error to pop up. Here's the test data I used in my specs:
5 |
{ :foo => 3, :bar => 2 } # hash with symbols for keys and integer values [3, 2.1, 4, 8] # array with integer and float values
6 |
Everything worked when I switched the array values to all integers so it seems that floats were causing the problem. However, in the interest of keeping everything working regardless of data types, I base64 encoded before going into the database and decoded on the way out.
7 |
8 |
I also ran the benchmarks again to determine what impact this would have on speed. Here are the results for 100 iterations on a 10k element array and a 10k element hash with and without base64 encode/decode:
9 |
user system total real array marshal 0.200000 0.010000 0.210000 ( 0.214018) (without Base64) array marshal 0.220000 0.010000 0.230000 ( 0.250260)
As you can see the difference in speed is pretty negligible. I assume that the error has to do with AR cleaning the stuff that gets inserted into the database, but I'm not really sure. In the end it's just easier to use Base64.encode64 when serializing data into a text field in ActiveRecord using Marshal.
11 |
12 |
I've also read people posting about this error when using the database session store. I can only assume that it's because they were trying to store either way too much data in their session (too much for a regular text field) or they were storing float values or some other data type that would cause this to pop up. Hopefully this helps.
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/lib/sax-machine/sax_document.rb:
--------------------------------------------------------------------------------
module SAXMachine
  # Including SAXMachine gives the class the declaration DSL (ClassMethods)
  # and the attribute-hash constructor (InstanceMethods).
  def self.included(base)
    base.send(:include, InstanceMethods)
    base.extend(ClassMethods)
  end

  # Parses xml_input into this object using the backend selected by
  # SAXMachine.handler.
  #
  # xml_input  - the document, passed unchanged to the handler's sax_parse.
  # on_error   - optional callable handed to the handler.
  # on_warning - optional callable handed to the handler.
  #
  # Returns self.
  def parse(xml_input, on_error = nil, on_warning = nil)
    # :nokogiri => SAXNokogiriHandler, :ox => SAXOxHandler, ...
    handler_klass = SAXMachine.const_get("SAX#{SAXMachine.handler.capitalize}Handler")

    handler = handler_klass.new(self, on_error, on_warning)
    handler.sax_parse(xml_input)

    self
  end

  module InstanceMethods
    # attributes - Hash of attribute name => value, assigned through the
    #              matching writers. Element defaults are then applied to
    #              every element whose reader still returns nil.
    def initialize(attributes = {})
      attributes.each do |name, value|
        send("#{name}=", value)
      end

      self.class.sax_config.top_level_elements.each do |_, configs|
        configs.each do |config|
          next if config.default.nil?
          next unless send(config.as).nil?

          send(config.setter, config.default)
        end
      end
    end
  end

  module ClassMethods
    # Copies this class's declarations into each subclass. `super` keeps
    # other `inherited` hooks in the chain (e.g. a framework base class)
    # working.
    def inherited(subclass)
      super
      subclass.sax_config.send(:initialize_copy, self.sax_config)
    end

    # Builds a new instance and parses into it; see SAXMachine#parse.
    def parse(*args)
      new.parse(*args)
    end

    # Declares a single-valued element. options[:as] defaults to name; an
    # optional block transforms the value inside the generated writer.
    def element(name, options = {}, &block)
      real_name = (options[:as] ||= name).to_s
      sax_config.add_top_level_element(name, options)
      create_attr(real_name, &block)
    end

    # Declares an attribute of the object's own element.
    def attribute(name, options = {}, &block)
      real_name = (options[:as] ||= name).to_s
      sax_config.add_top_level_attribute(self.class.to_s, options.merge(name: name))
      create_attr(real_name, &block)
    end

    # Declares the text value of the object's own element.
    def value(name, options = {}, &block)
      real_name = (options[:as] ||= name).to_s
      sax_config.add_top_level_element_value(self.class.to_s, options.merge(name: name))
      create_attr(real_name, &block)
    end

    # Declares an attribute that receives the enclosing object.
    def ancestor(name, options = {}, &block)
      real_name = (options[:as] ||= name).to_s
      sax_config.add_ancestor(name, options)
      create_attr(real_name, &block)
    end

    # Declares a repeated element collected into an Array.
    #
    # With :class each occurrence becomes an instance of that class. Without
    # it, values are appended through a generated add_<name> method (run
    # through the block first when one is given).
    def elements(name, options = {}, &block)
      real_name = (options[:as] ||= name).to_s

      if options[:class]
        sax_config.add_collection_element(name, options)
      else
        if block_given?
          define_method("add_#{real_name}") do |value|
            send(real_name).send(:<<, instance_exec(value, &block))
          end
        else
          define_method("add_#{real_name}") do |value|
            send(real_name).send(:<<, value)
          end
        end

        sax_config.add_top_level_element(name, options.merge(collection: true))
      end

      # The reader creates the Array on first use unless the class already
      # defines one.
      if !method_defined?(real_name)
        class_eval <<-SRC
          def #{real_name}
            @#{real_name} ||= []
          end
        SRC
      end

      attr_writer(options[:as]) unless method_defined?("#{options[:as]}=")
    end

    def columns
      sax_config.columns
    end

    # Returns the first element config whose column equals sym, or nil.
    def column(sym)
      columns.find { |c| c.column == sym }
    end

    def data_class(sym)
      column(sym).data_class
    end

    def required?(sym)
      column(sym).required?
    end

    def column_names
      columns.map { |e| e.column }
    end

    def sax_config
      @sax_config ||= SAXConfig.new
    end

    # we only want to insert the getter and setter if they haven't defined it from elsewhere.
    # this is how we allow custom parsing behavior. So you could define the setter
    # and have it parse the string into a date or whatever.
    def create_attr(real_name, &block)
      attr_reader(real_name) unless method_defined?(real_name)

      if !method_defined?("#{real_name}=")
        if block_given?
          define_method("#{real_name}=") do |value|
            instance_variable_set("@#{real_name}", instance_exec(value, &block))
          end
        else
          attr_writer(real_name)
        end
      end
    end
  end
end
138 |
--------------------------------------------------------------------------------
/lib/sax-machine/handlers/sax_abstract_handler.rb:
--------------------------------------------------------------------------------
1 | require 'time'
2 |
3 | module SAXMachine
4 | module SAXAbstractHandler
5 | NO_BUFFER = :no_buffer
6 |
# One entry of the parse stack: the object being filled, the config that
# opened it (nil when not given) and the text buffered so far.
class StackNode < Struct.new(:object, :config, :buffer)
  def initialize(object, config = nil, buffer = NO_BUFFER)
    super(object, config, buffer)
  end
end
14 |
# Backend hook: each concrete handler overrides this to run its parser over
# xml_input. The default always raises NotImplementedError.
def sax_parse(xml_input)
  fail NotImplementedError
end
18 |
# Shared constructor body for all handlers.
#
# object     - the object to parse into; becomes the bottom of the stack.
# on_error   - optional error callback, stored as given.
# on_warning - optional warning callback, stored as given.
def _initialize(object, on_error = nil, on_warning = nil)
  root_node = StackNode.new(object)

  @parsed_configs = {}
  @stack = [root_node]
  @on_error = on_error
  @on_warning = on_warning
end
25 |
# Appends character data to the buffer of the node on top of the stack.
# The first chunk is stored as a copy so later appends never modify the
# string handed in by the parser.
def _characters(data)
  current = stack.last
  return current.buffer = data.dup if current.buffer == NO_BUFFER

  current.buffer << data
end
35 |
# Handles an opening tag reported by the parser backend.
#
# name  - tag name; passed through normalize_name (defined outside this view).
# attrs - tag attributes; converted with Hash[] before use.
#
# Does nothing when the object on top of the stack has no SAX config.
# Otherwise it may push up to one new StackNode (a collection item or an
# element placeholder) and assigns attribute-sourced element values.
36 | def _start_element(name, attrs = [])
37 | name = normalize_name(name)
38 | node = stack.last
39 | object = node.object
40 |
41 | sax_config = sax_config_for(object)
42 |
43 | if sax_config
44 | attrs = Hash[attrs]
45 |
# A matching collection config starts a new item: from here on `object` and
# `sax_config` refer to the freshly created item, not to its parent.
46 | if collection_config = sax_config.collection_config(name, attrs)
47 | object = collection_config.data_class.new
48 | sax_config = sax_config_for(object)
49 |
50 | stack.push(StackNode.new(object, collection_config))
51 |
52 | set_attributes_on(object, attrs)
53 | end
54 |
# Elements declared with :value read their data from this tag's attributes,
# so they are assigned now rather than when the tag closes.
55 | sax_config.element_configs_for_attribute(name, attrs).each do |ec|
56 | unless parsed_config?(object, ec)
57 | value = data_class_value(ec.data_class, ec.value_from_attrs(attrs))
58 | object.send(ec.setter, value)
59 | mark_as_parsed(object, ec)
60 | end
61 | end
62 |
# Plain element (no collection matched): push a placeholder node whose
# buffer will collect the tag's text.
63 | if !collection_config && element_config = sax_config.element_config_for_tag(name, attrs)
64 | new_object =
65 | case element_config.data_class.to_s
66 | when "Integer" then 0
67 | when "Float" then 0.0
68 | when "Symbol" then nil
69 | when "Time" then Time.at(0)
# No data class configured: the placeholder is the current object itself.
70 | when "" then object
71 | else
72 | element_config.data_class.new
73 | end
74 |
75 | stack.push(StackNode.new(new_object, element_config))
76 |
77 | set_attributes_on(new_object, attrs)
78 | end
79 | end
80 | end
81 |
82 | def _end_element(name)
83 | name = normalize_name(name)
84 |
85 | start_tag = stack[-2]
86 | close_tag = stack[-1]
87 |
88 | return unless start_tag && close_tag
89 |
90 | object = start_tag.object
91 | element = close_tag.object
92 | config = close_tag.config
93 | value = close_tag.buffer
94 |
95 | return unless config.name == name
96 |
97 | unless parsed_config?(object, config)
98 | if (element_value_config = element_values_for(config))
99 | element_value_config.each { |evc| element.send(evc.setter, value) }
100 | end
101 |
102 | if config.respond_to?(:accessor)
103 | subconfig = sax_config_for(element)
104 |
105 | if econf = subconfig.element_config_for_tag(name, [])
106 | element.send(econf.setter, value) unless econf.value_configured?
107 | end
108 |
109 | object.send(config.accessor) << element
110 | else
111 | value = data_class_value(config.data_class, value) || element
112 | object.send(config.setter, value) if value != NO_BUFFER
113 | mark_as_parsed(object, config)
114 | end
115 |
116 | # try to set the ancestor
117 | if (sax_config = sax_config_for(element))
118 | sax_config.ancestors.each do |ancestor|
119 | element.send(ancestor.setter, object)
120 | end
121 | end
122 | end
123 |
124 | stack.pop
125 | end
126 |
127 | def _error(string)
128 | if @on_error
129 | @on_error.call(string)
130 | end
131 | end
132 |
133 | def _warning(string)
134 | if @on_warning
135 | @on_warning.call(string)
136 | end
137 | end
138 |
139 | private
140 |
141 | def mark_as_parsed(object, element_config)
142 | unless element_config.collection?
143 | @parsed_configs[[object.object_id, element_config.object_id]] = true
144 | end
145 | end
146 |
147 | def parsed_config?(object, element_config)
148 | @parsed_configs[[object.object_id, element_config.object_id]]
149 | end
150 |
151 | def sax_config_for(object)
152 | if object.class.respond_to?(:sax_config)
153 | object.class.sax_config
154 | end
155 | end
156 |
157 | def element_values_for(config)
158 | if config.data_class.respond_to?(:sax_config)
159 | config.data_class.sax_config.element_values_for_element
160 | end
161 | end
162 |
163 | def normalize_name(name)
164 | name.to_s.tr("-", "_")
165 | end
166 |
167 | def set_attributes_on(object, attributes)
168 | config = sax_config_for(object)
169 |
170 | if config
171 | config.attribute_configs_for_element(attributes).each do |ac|
172 | value = data_class_value(ac.data_class, ac.value_from_attrs(attributes))
173 | object.send(ac.setter, value)
174 | end
175 | end
176 | end
177 |
178 | def data_class_value(data_class, value)
179 | case data_class.to_s
180 | when "String" then value != NO_BUFFER ? value.to_s : value
181 | when "Integer" then value != NO_BUFFER ? value.to_i : value
182 | when "Float" then value != NO_BUFFER ? value.to_s.gsub(",",".").to_f : value
183 | when "Symbol" then
184 | if value != NO_BUFFER
185 | value.to_s.empty? ? nil : value.to_s.downcase.to_sym
186 | else
187 | value
188 | end
189 | # Assumes that time elements will be string-based and are not
190 | # something else, e.g. seconds since epoch
191 | when "Time" then value != NO_BUFFER ? Time.parse(value.to_s) : value
192 | when "" then value
193 | end
194 | end
195 |
196 | def stack
197 | @stack
198 | end
199 | end
200 | end
201 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SAX Machine
2 |
3 | ## Status
4 |
5 | [Gem Version](http://badge.fury.io/rb/sax-machine)
6 | [Build Status](http://travis-ci.org/pauldix/sax-machine?branch=master)
7 | [Coverage Status](https://coveralls.io/r/pauldix/sax-machine?branch=master)
8 | [Code Climate](https://codeclimate.com/github/pauldix/sax-machine)
9 | [Dependencies](https://gemnasium.com/pauldix/sax-machine)
10 |
11 | ## Description
12 |
13 | A declarative SAX parsing library backed by Nokogiri, Ox or Oga.
14 |
15 | ## Installation
16 |
17 | Add this line to your application's Gemfile:
18 |
19 | ```ruby
20 | gem 'sax-machine'
21 | ```
22 |
23 | And then execute:
24 |
25 | ```bash
26 | $ bundle
27 | ```
28 |
29 | ## Usage
30 |
31 | SAX Machine can use `nokogiri`, `ox` or `oga` as its XML SAX handler.
32 |
33 | To use **Nokogiri** add this line to your Gemfile:
34 |
35 | ```ruby
36 | gem 'nokogiri', '~> 1.6'
37 | ```
38 |
39 | To use **Ox** add this line to your Gemfile:
40 |
41 | ```ruby
42 | gem 'ox', '>= 2.1.2'
43 | ```
44 |
45 | To use **Oga** add this line to your Gemfile:
46 |
47 | ```ruby
48 | gem 'oga', '>= 0.2.0'
49 | ```
50 |
51 | You can also specify which handler to use manually, like this:
52 |
53 | ```ruby
54 | SAXMachine.handler = :nokogiri
55 | ```
56 |
57 | ## Examples
58 |
59 | Include `SAXMachine` in any class and define properties to parse:
60 |
61 | ```ruby
62 | class AtomContent
63 | include SAXMachine
64 | attribute :type
65 | value :text
66 | end
67 |
68 | class AtomEntry
69 | include SAXMachine
70 | element :title
71 | # The :as argument makes this available through entry.author instead of .name
72 | element :name, as: :author
73 | element "feedburner:origLink", as: :url
74 | # The :default argument specifies default value for element when it's missing
75 | element :summary, class: String, default: "No summary available"
76 | element :content, class: AtomContent
77 | element :published
78 | ancestor :ancestor
79 | end
80 |
81 | class Atom
82 | include SAXMachine
83 | # Use block to modify the returned value
84 | # Blocks work with pretty much everything,
85 | # except for `elements` with `class` attribute
86 | element :title do |title|
87 | title.strip
88 | end
89 | # The :with argument means that you only match a link tag
90 | # that has an attribute of type: "text/html"
91 | element :link, value: :href, as: :url, with: {
92 | type: "text/html"
93 | }
94 | # The :value argument means that instead of setting the value
95 | # to the text between the tag, it sets it to the attribute value of :href
96 | element :link, value: :href, as: :feed_url, with: {
97 | type: "application/atom+xml"
98 | }
99 | elements :entry, as: :entries, class: AtomEntry
100 | end
101 | ```
102 |
103 | Then parse any XML with your class:
104 |
105 | ```ruby
106 | feed = Atom.parse(xml_text)
107 |
108 | feed.title # Whatever the title of the blog is
109 | feed.url # The main URL of the blog
110 | feed.feed_url # The URL of the blog feed
111 |
112 | feed.entries.first.title # Title of the first entry
113 | feed.entries.first.author # The author of the first entry
114 | feed.entries.first.url # Permalink on the blog for this entry
115 | feed.entries.first.summary # Returns "No summary available" if summary is missing
116 | feed.entries.first.ancestor # The Atom ancestor
117 | feed.entries.first.content # Instance of AtomContent
118 | feed.entries.first.content.text # Entry content text
119 | ```
120 |
121 | You can also use the elements method without specifying a class:
122 |
123 | ```ruby
124 | class ServiceResponse
125 | include SAXMachine
126 | elements :message, as: :messages
127 | end
128 |
129 | response = ServiceResponse.parse("
130 | <response>
131 | <message>hi</message>
132 | <message>world</message>
133 | </response>
134 | ")
135 | response.messages.first # hi
136 | response.messages.last # world
137 | ```
138 |
139 | To limit conflicts in the class used for mapping, you can use the alternate
140 | `SAXMachine.configure` syntax:
141 |
142 | ```ruby
143 | class X < ActiveRecord::Base
144 | # This way no element, elements or ancestor method will be added to X
145 | SAXMachine.configure(X) do |c|
146 | c.element :title
147 | end
148 | end
149 | ```
150 |
151 | Multiple elements can be mapped to the same alias:
152 |
153 | ```ruby
154 | class RSSEntry
155 | include SAXMachine
156 | # ...
157 | element :pubDate, as: :published
158 | element :pubdate, as: :published
159 | element :"dc:date", as: :published
160 | element :"dc:Date", as: :published
161 | element :"dcterms:created", as: :published
162 | end
163 | ```
164 |
165 | If more than one of these elements exists in the source, the value from the *last one* is used. The order of
166 | the `element` declarations in the code is unimportant. The order they are encountered while parsing the
167 | document determines the value assigned to the alias.
168 |
169 | If an element is defined in the source but is blank (e.g., `<pubDate></pubDate>`), it is ignored, and the non-empty one is picked.
170 |
171 | ## Contributing
172 |
173 | 1. Fork it
174 | 2. Create your feature branch (`git checkout -b my-new-feature`)
175 | 3. Commit your changes (`git commit -am 'Add some feature'`)
176 | 4. Push to the branch (`git push origin my-new-feature`)
177 | 5. Create new Pull Request
178 |
179 | ## LICENSE
180 |
181 | The MIT License
182 |
183 | Copyright (c) 2009-2014:
184 |
185 | * [Paul Dix](http://www.pauldix.net)
186 | * [Julien Kirch](http://www.archiloque.net)
187 | * [Ezekiel Templin](http://zeke.templ.in)
188 | * [Dmitry Krasnoukhov](http://krasnoukhov.com)
189 |
190 | Permission is hereby granted, free of charge, to any person obtaining
191 | a copy of this software and associated documentation files (the
192 | 'Software'), to deal in the Software without restriction, including
193 | without limitation the rights to use, copy, modify, merge, publish,
194 | distribute, sublicense, and/or sell copies of the Software, and to
195 | permit persons to whom the Software is furnished to do so, subject to
196 | the following conditions:
197 |
198 | The above copyright notice and this permission notice shall be
199 | included in all copies or substantial portions of the Software.
200 |
201 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
202 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
203 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
204 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
205 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
206 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
207 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
208 |
--------------------------------------------------------------------------------
/spec/fixtures/atom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | Paul Dix Explains Nothing
4 |
5 |
6 | tag:typepad.com,2003:weblog-108605
7 | 2008-09-04T16:07:19-04:00
8 | Entrepreneurship, programming, software development, politics, NYC, and random thoughts.
9 | TypePad
10 |
11 | Marshal data too short error with ActiveRecord
12 |
13 |
14 | tag:typepad.com,2003:post-55147740
15 | 2008-09-04T16:07:19-04:00
16 | 2008-11-17T14:40:06-05:00
17 | In my previous post about the speed of serializing data, I concluded that Marshal was the quickest way to get things done. So I set about using Marshal to store some data in an ActiveRecord object. Things worked great at...
18 |
19 | Paul Dix
20 |
21 |
22 |
23 |
24 |
25 | <div xmlns="http://www.w3.org/1999/xhtml"><p>In my previous <a href="http://www.pauldix.net/2008/08/serializing-dat.html">post about the speed of serializing data</a>, I concluded that Marshal was the quickest way to get things done. So I set about using Marshal to store some data in an ActiveRecord object. Things worked great at first, but on some test data I got this error: marshal data too short. Luckily, <a href="http://www.brynary.com/">Bryan Helmkamp</a> had helpfully pointed out that there were sometimes problems with storing marshaled data in the database. He said it was best to base64 encode the marshal dump before storing.</p>
26 |
27 | <p>I was curious why it was working on some things and not others. It turns out that some types of data being marshaled were causing the error to pop up. Here's the test data I used in my specs:</p>
28 | <pre>{ :foo => 3, :bar => 2 } # hash with symbols for keys and integer values<br />[3, 2.1, 4, 8] # array with integer and float values</pre>
29 | <p>Everything worked when I switched the array values to all integers so it seems that floats were causing the problem. However, in the interest of keeping everything working regardless of data types, I base64 encoded before going into the database and decoded on the way out.</p>
30 |
31 | <p>I also ran the benchmarks again to determine what impact this would have on speed. Here are the results for 100 iterations on a 10k element array and a 10k element hash with and without base64 encode/decode:</p>
32 | <pre> user system total real<br />array marshal 0.200000 0.010000 0.210000 ( 0.214018) (without Base64)<br />array marshal 0.220000 0.010000 0.230000 ( 0.250260)<br /><br />hash marshal 1.830000 0.040000 1.870000 ( 1.892874) (without Base64)<br />hash marshal 2.040000 0.100000 2.140000 ( 2.170405)</pre>
33 | <p>As you can see the difference in speed is pretty negligible. I assume that the error has to do with AR cleaning the stuff that gets inserted into the database, but I'm not really sure. In the end it's just easier to use Base64.encode64 when serializing data into a text field in ActiveRecord using Marshal.</p>
34 |
35 | <p>I've also read people posting about this error when using the database session store. I can only assume that it's because they were trying to store either way too much data in their session (too much for a regular text field) or they were storing float values or some other data type that would cause this to pop up. Hopefully this helps.</p></div>
36 | <div class="feedflare">
37 | <a href="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?a=rWfWO"><img src="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?i=rWfWO" border="0"></img></a> <a href="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?a=RaCqo"><img src="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?i=RaCqo" border="0"></img></a> <a href="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?a=1CBLo"><img src="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?i=1CBLo" border="0"></img></a>
38 | </div><img src="http://feeds.feedburner.com/~r/PaulDixExplainsNothing/~4/383536354" height="1" width="1"/>
39 |
40 |
41 | http://www.pauldix.net/2008/09/marshal-data-to.html?param1=1¶m2=2
42 |
43 | Serializing data speed comparison: Marshal vs. JSON vs. Eval vs. YAML
44 |
45 |
46 | tag:typepad.com,2003:post-54766774
47 | 2008-08-27T14:31:41-04:00
48 | 2008-10-14T01:26:31-04:00
49 | Last night at the NYC Ruby hackfest, I got into a discussion about serializing data. Brian mentioned the Marshal library to me, which for some reason had completely escaped my attention until last night. He said it was wicked fast...
50 |
51 | Paul Dix
52 |
53 |
54 |
55 |
56 |
57 | <div xmlns="http://www.w3.org/1999/xhtml"><p>Last night at the <a href="http://nycruby.org">NYC Ruby hackfest</a>, I got into a discussion about serializing data. Brian mentioned the Marshal library to me, which for some reason had completely escaped my attention until last night. He said it was wicked fast so we decided to run a quick benchmark comparison.</p>
58 | <p>The test data is designed to roughly approximate what my <a href="http://www.pauldix.net/2008/08/storing-many-cl.html">stored classifier data</a> will look like. The different methods we decided to benchmark were Marshal, json, eval, and yaml. With each one we took the in-memory object and serialized it and then read it back in. With eval we had to convert the object to ruby code to serialize it then run eval against that. Here are the results for 100 iterations on a 10k element array and a hash with 10k key/value pairs run on my Macbook Pro 2.4 GHz Core 2 Duo:</p>
59 | <pre> user system total real<br />array marshal 0.210000 0.010000 0.220000 ( 0.220701)<br />array json 2.180000 0.050000 2.230000 ( 2.288489)<br />array eval 2.090000 0.060000 2.150000 ( 2.240443)<br />array yaml 26.650000 0.350000 27.000000 ( 27.810609)<br /><br />hash marshal 2.000000 0.050000 2.050000 ( 2.114950)<br />hash json 3.700000 0.060000 3.760000 ( 3.881716)<br />hash eval 5.370000 0.140000 5.510000 ( 6.117947)<br />hash yaml 68.220000 0.870000 69.090000 ( 72.370784)</pre>
60 | <p>The order in which I tested them is pretty much the order in which they ranked for speed. Marshal was amazingly fast. JSON and eval came out roughly equal on the array with eval trailing quite a bit for the hash. Yaml was just slow as all hell. A note on the json: I used the 1.1.3 library which uses c to parse. I assume it would be quite a bit slower if I used the pure ruby implementation. Here's <a href="http://gist.github.com/7549">a gist of the benchmark code</a> if you're curious and want to run it yourself.</p>
61 |
62 |
63 |
64 | <p>If you're serializing user data, be super careful about using eval. It's probably best to avoid it completely. Finally, just for fun I took yaml out (it was too slow) and ran the benchmark again with 1k iterations:</p>
65 | <pre> user system total real<br />array marshal 2.080000 0.110000 2.190000 ( 2.242235)<br />array json 21.860000 0.500000 22.360000 ( 23.052403)<br />array eval 20.730000 0.570000 21.300000 ( 21.992454)<br /><br />hash marshal 19.510000 0.500000 20.010000 ( 20.794111)<br />hash json 39.770000 0.670000 40.440000 ( 41.689297)<br />hash eval 51.410000 1.290000 52.700000 ( 54.155711)</pre></div>
66 | <div class="feedflare">
67 | <a href="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?a=zombO"><img src="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?i=zombO" border="0"></img></a> <a href="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?a=T3kqo"><img src="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?i=T3kqo" border="0"></img></a> <a href="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?a=aI6Oo"><img src="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?i=aI6Oo" border="0"></img></a>
68 | </div><img src="http://feeds.feedburner.com/~r/PaulDixExplainsNothing/~4/376401099" height="1" width="1"/>
69 |
70 |
71 | http://www.pauldix.net/2008/08/serializing-dat.html
72 |
73 | Gotcha with cache_fu and permalinks
74 |
75 |
76 | tag:typepad.com,2003:post-54411628
77 | 2008-08-19T14:26:24-04:00
78 | 2008-11-20T13:58:38-05:00
79 | This is an issue I had recently in a project with cache_fu. Models that I found and cached based on permalinks weren't expiring the cache correctly when getting updated. Here's an example scenario. Say you have a blog with posts....
80 |
81 | Paul Dix
82 |
83 |
84 |
85 |
86 |
87 | <div xmlns="http://www.w3.org/1999/xhtml"><p>This is an issue I had recently in a project with <a href="http://errtheblog.com/posts/57-kickin-ass-w-cachefu">cache_fu</a>. Models that I found and cached based on permalinks weren't expiring the cache correctly when getting updated. Here's an example scenario.</p>
88 |
89 | <p>Say you have a blog with posts. However, instead of using a url like http://paulscoolblog.com/posts/23 you want something that's more search engine friendly and readable for the user. So you use a permalink (maybe using the <a href="http://github.com/github/permalink_fu/tree/master">permalink_fu plugin</a>) that's auto-generated based on the title of the post. This post would have a url that looks something like http://paulscoolblog.com/posts/gotcha-with-cache_fu-and-permalinks.</p>
90 |
91 | <p>In your controller's show method you'd probably find the post like this:</p>
92 | <pre>@post = Post.find_by_permalink(params[:permalink])</pre>
93 | <p>However, you'd want to do the caching thing so you'd actually do this:</p>
94 | <pre>@post = Post.cached(:find_by_permalink, :with => params[:permalink])</pre>
95 | <p>The problem that I ran into, which is probably obvious to anyone familiar with cache_fu, was that when updating the post, it wouldn't expire the cache. That part of the post model looks like this:</p>
96 | <pre>class Post < ActiveRecord::Base<br /> before_save :expire_cache<br /> ...<br />end</pre>
97 | <p>Do you see it? The issue is that when expire_cache gets called on the object, it expires the key <strong>Post:23</strong> from the cache (assuming 23 was the id of the post). However, when the post was cached using the cached(:find_by_permalink ...) method, it put the post object into the cache with a key of <strong>Post:find_by_permalink:gotcha-with-cache_fu-and-permalinks</strong>.</p>
98 | <p>Luckily, it's a fairly simple fix. If you have a model that is commonly accessed through permalinks, just write your own cache expiry method that looks for both keys and expires them.</p></div>
99 | <div class="feedflare">
100 | <a href="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?a=V1ojO"><img src="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?i=V1ojO" border="0"></img></a> <a href="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?a=eu6Zo"><img src="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?i=eu6Zo" border="0"></img></a> <a href="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?a=ddUho"><img src="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?i=ddUho" border="0"></img></a>
101 | </div><img src="http://feeds.feedburner.com/~r/PaulDixExplainsNothing/~4/369250462" height="1" width="1"/>
102 |
103 |
104 | http://www.pauldix.net/2008/08/gotcha-with-cac.html
105 |
106 | Non-greedy mode in regex
107 |
108 |
109 | tag:typepad.com,2003:post-54227244
110 | 2008-08-15T09:32:11-04:00
111 | 2008-08-27T09:33:15-04:00
112 | I was writing a regular expression yesterday and this popped up. It's just a quick note about greedy vs. non-greedy mode in regular expression matching. Say I have a regular expression that looks something like this: /(\[.*\])/ In English that...
113 |
114 | Paul Dix
115 |
116 |
117 |
118 |
119 | <p>I was writing a regular expression yesterday and this popped up. It's just a quick note about greedy vs. non-greedy mode in regular expression matching. Say I have a regular expression that looks something like this:</p>
120 | <pre>/(\[.*\])/</pre>
121 | <p>In English that says something roughly like: find an opening bracket [ with 0 or more of any character followed by a closing bracket. The backslashes are to escape the brackets and the parenthesis specify grouping so we can later access that matched text.</p>
122 |
123 | <p>The greedy mode comes up with the 0 or more characters part of the match (the .* part of the expression). The default mode of greedy means that the parser will gobble up as many characters as it can and match the very last closing bracket. So if you have text like this:</p>
124 |
125 | <pre>a = [:foo, :bar]<br>b = [:hello, :world]</pre>
126 | <p>The resulting grouped match would be this:</p>
127 | <pre>[:foo, :bar]<br>b = [:hello, :world]</pre>
128 | <p>If you just wanted the [:foo, :bar] part, the solution is to parse in non-greedy mode. This means that it will match on the first closing bracket it sees. The modified regular expression looks like this:</p>
129 | <pre>/(\[.*?\])/</pre>
130 | <p>I love the regular expression engine in Ruby. It's one of the best things it ripped off from Perl. The one thing I don't like is the magic global variable that it places matched groups into. You can access that first match through the $1 variable. If you're unfamiliar with regular expressions, a good place to start is the <a href="http://www.amazon.com/Programming-Perl-3rd-Larry-Wall/dp/0596000278/ref=pd_bbs_sr_1?ie=UTF8&s=books&qid=1218806755&sr=8-1">Camel book</a>. It's about Perl, but the way they work is very similar. I actually haven't seen good coverage of regexes in a Ruby book.</p><div class="feedflare">
131 | <a href="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?a=OkVmO"><img src="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?i=OkVmO" border="0"></img></a> <a href="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?a=iRpWo"><img src="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?i=iRpWo" border="0"></img></a> <a href="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?a=pjRCo"><img src="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?i=pjRCo" border="0"></img></a>
132 | </div><img src="http://feeds.feedburner.com/~r/PaulDixExplainsNothing/~4/365673983" height="1" width="1"/>
133 |
134 |
135 | http://www.pauldix.net/2008/08/non-greedy-mode.html
136 |
137 | Storing many classification models
138 |
139 |
140 | tag:typepad.com,2003:post-53888232
141 | 2008-08-07T12:01:38-04:00
142 | 2008-08-27T16:58:18-04:00
143 | One of the things I need to do in Filterly is keep many trained classifiers. These are the machine learning models that determine if a blog post is on topic (Filterly separates information by topic). At the very least I...
144 |
145 | Paul Dix
146 |
147 |
148 |
149 |
150 | <p>One of the things I need to do in <a href="http://filterly.com/">Filterly</a> is keep many trained <a href="http://en.wikipedia.org/wiki/Statistical_classification">classifiers</a>. These are the machine learning models that determine if a blog post is on topic (Filterly separates information by topic). At the very least I need one per topic in the system. If I want to do something like <a href="http://en.wikipedia.org/wiki/Boosting">boosting</a> then I need even more. The issue I'm wrestling with is how to store this data. I'll outline a specific approach and what the storage needs are.</p>
151 |
152 | <p>Let's say I go with boosting and 10 <a href="http://en.wikipedia.org/wiki/Perceptron">perceptrons</a>. I'll also limit my feature space to the 10,000 most statistically significant features. So the storage for each perceptron is a 10k element array. However, I'll also have to keep another data structure to store what the 10k features are and their position in the array. In code I use a hash for this where the feature name is the key and the value is its position. I just need to store one of these hashes per topic.</p>
153 |
154 | <p>That's not really a huge amount of data. I'm more concerned about the best way to store it. I don't think this kind of thing maps well to a relational database. I don't need to store the features individually. Generally when I'm running the thing I'll want the whole perceptron and feature set in memory for quick access. For now I'm just using a big text field and serializing each using JSON.</p>
155 |
156 | <p>I don't really like this approach. The whole serializing into the database seems really inelegant. Combined with the time that it takes to parse these things. Each time I want to see if a new post is on topic I'd need to load up the classifier and parse the 10 10k arrays and the 10k key hash. I could keep each classifier running as a service, but then I've got a pretty heavy process running for each topic.</p>
157 |
158 | <p>I guess I'll just use the stupid easy solution for the time being and worry about performance later. Anyone have thoughts on the best approach?</p><div class="feedflare">
159 | <a href="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?a=DUT8O"><img src="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?i=DUT8O" border="0"></img></a> <a href="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?a=ZGjFo"><img src="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?i=ZGjFo" border="0"></img></a> <a href="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?a=pH3Vo"><img src="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?i=pH3Vo" border="0"></img></a>
160 | </div><img src="http://feeds.feedburner.com/~r/PaulDixExplainsNothing/~4/358530158" height="1" width="1"/>
161 |
162 |
163 | http://www.pauldix.net/2008/08/storing-many-cl.html
164 |
165 |
166 |
--------------------------------------------------------------------------------
/spec/sax-machine/sax_document_spec.rb:
--------------------------------------------------------------------------------
1 | require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2 |
3 | describe "SAXMachine" do
4 | describe "element" do
5 | describe "when parsing a single element" do
6 | before do
7 | @klass = Class.new do
8 | include SAXMachine
9 | element :title
10 | ancestor :body
11 | value :something, required: false
12 | attribute :anything, required: true
13 | end
14 | end
15 |
16 | it "provides mass assignment through initialize method" do
17 | document = @klass.new(title: "Title")
18 | expect(document.title).to eq("Title")
19 | end
20 |
21 | it "provides an accessor" do
22 | document = @klass.new
23 | document.title = "Title"
24 | expect(document.title).to eq("Title")
25 | end
26 |
27 | it "does not overwrites the getter is there is already one present" do
28 | @klass = Class.new do
29 | def title
30 | "#{@title} ***"
31 | end
32 |
33 | include SAXMachine
34 | element :title
35 | end
36 |
37 | document = @klass.new
38 | document.title = "Title"
39 | expect(document.title).to eq("Title ***")
40 | end
41 |
42 | it "does not overwrites the setter if there is already one present" do
43 | @klass = Class.new do
44 | def title=(val)
45 | @title = "#{val} **"
46 | end
47 |
48 | include SAXMachine
49 | element :title
50 | end
51 |
52 | document = @klass.new
53 | document.title = "Title"
54 | expect(document.title).to eq("Title **")
55 | end
56 |
57 | it "does not overwrites the accessor when the element is not present" do
58 | document = @klass.new
59 | document.title = "Title"
60 | document.parse("")
61 | expect(document.title).to eq("Title")
62 | end
63 |
64 | it "overwrites the value when the element is present" do
65 | document = @klass.new
66 | document.title = "Old title"
67 | document.parse("New title")
68 | expect(document.title).to eq("New title")
69 | end
70 |
71 | it "saves the element text into an accessor" do
72 | document = @klass.parse("My Title")
73 | expect(document.title).to eq("My Title")
74 | end
75 |
76 | it "keeps the document encoding for elements" do
77 | data = "My Title"
78 | data.encode!("utf-8")
79 |
80 | document = @klass.parse(data)
81 | expect(document.title.encoding).to eq(data.encoding)
82 | end
83 |
84 | it "saves cdata into an accessor" do
85 | document = @klass.parse("")
86 | expect(document.title).to eq("A Title")
87 | end
88 |
89 | it "saves the element text into an accessor when there are multiple elements" do
90 | document = @klass.parse("My Titlebar")
91 | expect(document.title).to eq("My Title")
92 | end
93 |
94 | it "saves the first element text when there are multiple of the same element" do
95 | document = @klass.parse("My Titlebar")
96 | expect(document.title).to eq("My Title")
97 | end
98 |
99 | describe "the introspection" do # reflection over what the class declared: column names and the sax_config collections
100 | it "allows to get column names" do
101 | expect(@klass.column_names).to match_array([:title])
102 | end
103 |
104 | it "allows to get elements" do
105 | expect(@klass.sax_config.top_level_elements.values.flatten.map(&:to_s)).to \
106 | match_array(["name: title dataclass: setter: title= required: value: as:title collection: with: {}"]) # compared via each config object's to_s
107 | end
108 |
109 | it "allows to get ancestors" do # NOTE(review): :body is presumably declared on @klass in a hook outside this excerpt
110 | expect(@klass.sax_config.ancestors.map(&:column)).to \
111 | match_array([:body])
112 | end
113 |
114 | it "allows to get values" do # NOTE(review): :something is presumably declared on @klass outside this excerpt
115 | expect(@klass.sax_config.top_level_element_value.map(&:column)).to \
116 | match_array([:something])
117 | expect(@klass.sax_config.top_level_element_value.map(&:required?)).to \
118 | match_array([false])
119 | end
120 |
121 | it "allows to get attributes" do # NOTE(review): :anything is presumably declared on @klass outside this excerpt
122 | expect(@klass.sax_config.top_level_attributes.map(&:column)).to \
123 | match_array([:anything])
124 | expect(@klass.sax_config.top_level_attributes.map(&:required?)).to \
125 | match_array([true])
126 | expect(@klass.sax_config.top_level_attributes.map(&:collection?)).to \
127 | match_array([false])
128 | end
129 | end
130 |
131 | describe "the class attribute" do # covers the class: option for String, Integer, Float, Symbol and Time, on elements and attributes
132 | before(:each) do
133 | @klass = Class.new do # anonymous class assigned to @klass for this group
134 | include SAXMachine
135 | element :date, class: DateTime
136 | end
137 |
138 | @document = @klass.new
139 | @document.date = Time.now.iso8601
140 | end
141 |
142 | it "is available" do # the declared class can be read back with data_class
143 | expect(@klass.data_class(:date)).to eq(DateTime)
144 | end
145 |
146 | describe "string" do
147 | before do
148 | class TestString
149 | include SAXMachine
150 | element :number, class: String
151 | end
152 |
153 | class TestStringAttribute
154 | include SAXMachine
155 | attribute :sub_number, class: String
156 | end
157 |
158 | class TestStringWithAttribute
159 | include SAXMachine
160 | element :number, class: TestStringAttribute # nested parser class supplies the attribute
161 | end
162 | end
163 |
164 | it "is handled in an element" do
165 | document = TestString.parse("5.5")
166 | expect(document.number).to eq("5.5")
167 | end
168 |
169 | it "is handled in an attribute" do # NOTE(review): input is empty in this listing; markup presumably stripped - confirm
170 | document = TestStringWithAttribute.parse("")
171 | expect(document.number.sub_number).to eq("5.5")
172 | end
173 | end
174 |
175 | describe "integer" do
176 | before do
177 | class TestInteger
178 | include SAXMachine
179 | element :number, class: Integer
180 | end
181 |
182 | class TestIntegerAttribute
183 | include SAXMachine
184 | attribute :sub_number, class: Integer
185 | end
186 |
187 | class TestIntegerWithAttribute
188 | include SAXMachine
189 | element :number, class: TestIntegerAttribute
190 | end
191 |
192 | class IntegerInsideAttribute
193 | include SAXMachine
194 | element :number, value: :int_attr, as: :int_attr, class: Integer # class: combined with value:/as:, read back through #int_attr
195 | end
196 | end
197 |
198 | it "is handled in an element" do
199 | document = TestInteger.parse("5")
200 | expect(document.number).to eq(5)
201 | end
202 |
203 | it "is handled in an attribute" do
204 | document = TestIntegerWithAttribute.parse("")
205 | expect(document.number.sub_number).to eq(5)
206 | end
207 |
208 | it "is handled in an attribute with value option" do
209 | document = IntegerInsideAttribute.parse("")
210 | expect(document.int_attr).to eq(2)
211 | end
212 | end
213 |
214 | describe "float" do
215 | before do
216 | class TestFloat
217 | include SAXMachine
218 | element :number, class: Float
219 | end
220 |
221 | class TestFloatAttribute
222 | include SAXMachine
223 | attribute :sub_number, class: Float
224 | end
225 |
226 | class TestFloatWithAttribute
227 | include SAXMachine
228 | element :number, class: TestFloatAttribute
229 | end
230 | end
231 |
232 | it "is handled in an element with '.' delimiter" do
233 | document = TestFloat.parse("5.5")
234 | expect(document.number).to eq(5.5)
235 | end
236 |
237 | it "is handled in an element with ',' delimiter" do # a comma decimal separator is expected to give the same Float as a dot
238 | document = TestFloat.parse("5,5")
239 | expect(document.number).to eq(5.5)
240 | end
241 |
242 | it "is handled in an attribute" do
243 | document = TestFloatWithAttribute.parse("5.5")
244 | expect(document.number.sub_number).to eq(5.5)
245 | end
246 | end
247 |
248 | describe "symbol" do
249 | before do
250 | class TestSymbol
251 | include SAXMachine
252 | element :symbol, class: Symbol
253 | end
254 |
255 | class TestSymbolAttribute
256 | include SAXMachine
257 | attribute :sub_symbol, class: Symbol
258 | end
259 |
260 | class TestSymbolWithAttribute
261 | include SAXMachine
262 | element :symbol, class: TestSymbolAttribute
263 | end
264 | end
265 |
266 | it "is handled in an element" do
267 | document = TestSymbol.parse("MY_SYMBOL_VALUE")
268 | expect(document.symbol).to eq(:my_symbol_value) # expected symbol is the lower-cased form of the input text
269 | end
270 |
271 | it "is handled in an attribute" do
272 | document = TestSymbolWithAttribute.parse("")
273 | expect(document.symbol.sub_symbol).to eq(:my_symbol_value)
274 | end
275 | end
276 |
277 | describe "time" do
278 | before do
279 | class TestTime
280 | include SAXMachine
281 | element :time, class: Time
282 | end
283 |
284 | class TestTimeAttribute
285 | include SAXMachine
286 | attribute :sub_time, class: Time
287 | end
288 |
289 | class TestTimeWithAttribute
290 | include SAXMachine
291 | element :time, class: TestTimeAttribute
292 | end
293 | end
294 |
295 | it "is handled in an element" do # NOTE(review): the timestamp input is missing from this listing - confirm against the repository
296 | document = TestTime.parse("")
297 | expect(document.time).to eq(Time.utc(1994, 2, 4, 6, 20, 0, 0))
298 | end
299 |
300 | it "is handled in an attribute" do
301 | document = TestTimeWithAttribute.parse("")
302 | expect(document.time.sub_time).to eq(Time.utc(1994, 2, 4, 6, 20, 0, 0))
303 | end
304 | end
305 | end
306 |
307 | describe "the default attribute" do # the default: option, including a falsy default
308 | it "is available" do
309 | @klass = Class.new do
310 | include SAXMachine
311 | element :number, class: Integer, default: 0
312 | end
313 |
314 | document = @klass.parse("number") # both parses in this example are expected to fall back to the default 0
315 | expect(document.number).to eq(0)
316 |
317 | document = @klass.parse("")
318 | expect(document.number).to eq(0)
319 | end
320 |
321 | it "can be a Boolean" do # a default of false must come back as false itself
322 | @klass = Class.new do
323 | include SAXMachine
324 | element(:bool, default: false) { |v| !!v } # block coerces the raw value to a strict true/false
325 | end
326 |
327 | document = @klass.parse("bool")
328 | expect(document.bool).to be false
329 |
330 | document = @klass.parse("")
331 | expect(document.bool).to be false
332 |
333 | document = @klass.parse("1")
334 | expect(document.bool).to be true
335 | end
336 | end
337 |
338 | describe "the required attribute" do # required: true is reported back by the class-level required? query
339 | it "is available" do
340 | @klass = Class.new do
341 | include SAXMachine
342 | element :date, required: true
343 | end
344 | expect(@klass.required?(:date)).to be_truthy
345 | end
346 | end
347 |
348 | describe "the block" do # blocks given to element/elements/attribute/value/ancestor transform the value that ends up in the accessor
349 | before do
350 | class ElementBlockParser
351 | include SAXMachine
352 |
353 | ancestor :parent do |parent| # stores the parent's class name instead of the parent object
354 | parent.class.to_s
355 | end
356 |
357 | value :text do |text|
358 | text.downcase
359 | end
360 | end
361 |
362 | class BlockParser
363 | include SAXMachine
364 |
365 | element :title do |title|
366 | "#{title}!!!"
367 | end
368 |
369 | element :scope do |scope| # the body below also calls title, which the "block context" example expects to resolve on the document
370 | "#{title} #{scope}"
371 | end
372 |
373 | attribute :id do |id|
374 | id.to_i
375 | end
376 |
377 | element :nested, class: ElementBlockParser
378 | elements :message, as: :messages do |message| # block is applied to each collected entry, per the last example
379 | "#{message}!"
380 | end
381 | end
382 | end
383 |
384 | it "has instance as a block context" do
385 | document = BlockParser.parse("SAXsomething")
386 | expect(document.scope).to eq("SAX!!! something")
387 | end
388 |
389 | it "uses block for element" do
390 | document = BlockParser.parse("SAX")
391 | expect(document.title).to eq("SAX!!!")
392 | end
393 |
394 | it 'uses block for attribute' do
395 | document = BlockParser.parse("SAX")
396 | expect(document.id).to eq(345)
397 | end
398 |
399 | it "uses block for value" do
400 | document = BlockParser.parse("tEst")
401 | expect(document.nested.text).to eq("test")
402 | end
403 |
404 | it "uses block for ancestor" do
405 | document = BlockParser.parse("SAX")
406 | expect(document.nested.parent).to eq("BlockParser")
407 | end
408 |
409 | it "uses block for elements" do
410 | document = BlockParser.parse("hiworld")
411 | expect(document.messages).to eq(["hi!", "world!"])
412 | end
413 | end
414 | end
415 |
416 | describe "when parsing multiple elements" do
417 | before do
418 | @klass = Class.new do
419 | include SAXMachine
420 | element :title
421 | element :name
422 | end
423 | end
424 |
425 | it "saves the element text for a second tag" do
426 | document = @klass.parse("My TitlePaul")
427 | expect(document.name).to eq("Paul")
428 | expect(document.title).to eq("My Title")
429 | end
430 |
431 | it "does not overwrites the getter is there is already one present" do
432 | @klass = Class.new do
433 | def items
434 | []
435 | end
436 |
437 | include SAXMachine
438 | elements :items
439 | end
440 |
441 | document = @klass.new
442 | document.items = [1, 2, 3, 4]
443 | expect(document.items).to eq([])
444 | end
445 |
446 | it "does not overwrites the setter if there is already one present" do
447 | @klass = Class.new do
448 | def items=(val)
449 | @items = [1, *val]
450 | end
451 |
452 | include SAXMachine
453 | elements :items
454 | end
455 |
456 | document = @klass.new
457 | document.items = [2, 3]
458 | expect(document.items).to eq([1, 2, 3])
459 | end
460 | end
461 |
462 | describe "when using options for parsing elements" do
463 | describe "using the 'as' option" do # as: renames the accessor; the element name stays :description
464 | before do
465 | @klass = Class.new do
466 | include SAXMachine
467 | element :description, as: :summary
468 | end
469 | end
470 |
471 | it "provides an accessor using the 'as' name" do
472 | document = @klass.new
473 | document.summary = "a small summary"
474 | expect(document.summary).to eq("a small summary")
475 | end
476 |
477 | it "saves the element text into the 'as' accessor" do
478 | document = @klass.parse("here is a description")
479 | expect(document.summary).to eq("here is a description")
480 | end
481 | end
482 |
483 | describe "using the :with option" do # with: restricts an element declaration to tags whose attributes match
484 | describe "and the :value option" do
485 | before do
486 | @klass = Class.new do
487 | include SAXMachine
488 | element :link, value: :href, with: { foo: "bar" } # take the href attribute, only where foo matches "bar"
489 | end
490 | end
491 |
492 | it "saves the value of a matching element" do # NOTE(review): tag markup appears stripped from the inputs in this group - confirm
493 | document = @klass.parse("asdf")
494 | expect(document.link).to eq("test")
495 | end
496 |
497 | it "saves the value of the first matching element" do
498 | document = @klass.parse("")
499 | expect(document.link).to eq("first")
500 | end
501 |
502 | describe "and the :as option" do
503 | before do
504 | @klass = Class.new do
505 | include SAXMachine
506 | element :link, value: :href, as: :url, with: { foo: "bar" }
507 | element :link, value: :href, as: :second_url, with: { asdf: "jkl" } # same tag name, different filter and accessor
508 | end
509 | end
510 |
511 | it "saves the value of the first matching element" do
512 | document = @klass.parse("")
513 | expect(document.url).to eq("first")
514 | expect(document.second_url).to eq("second")
515 | end
516 | end
517 | end
518 |
519 | describe "with only one element" do
520 | before do
521 | @klass = Class.new do
522 | include SAXMachine
523 | element :link, with: { foo: "bar" }
524 | end
525 | end
526 |
527 | it "saves the text of an element that has matching attributes" do
528 | document = @klass.parse("match")
529 | expect(document.link).to eq("match")
530 | end
531 |
532 | it "does not saves the text of an element that doesn't have matching attributes" do
533 | document = @klass.parse("no match")
534 | expect(document.link).to be_nil
535 | end
536 |
537 | it "saves the text of an element that has matching attributes when it is the second of that type" do
538 | document = @klass.parse("no matchmatch")
539 | expect(document.link).to eq("match")
540 | end
541 |
542 | it "saves the text of an element that has matching attributes plus a few more" do
543 | document = @klass.parse("no matchmatch")
544 | expect(document.link).to eq("match")
545 | end
546 | end
547 |
548 | describe "with multiple elements of same tag" do
549 | before do
550 | @klass = Class.new do
551 | include SAXMachine
552 | element :link, as: :first, with: { foo: "bar" }
553 | element :link, as: :second, with: { asdf: "jkl" }
554 | end
555 | end
556 |
557 | it "matches the first element" do
558 | document = @klass.parse("no matchfirst matchno match")
559 | expect(document.first).to eq("first match")
560 | end
561 |
562 | it "matches the second element" do
563 | document = @klass.parse("no matchfirst matchsecond matchhi")
564 | expect(document.second).to eq("second match")
565 | end
566 | end
567 |
568 | describe "with only one element as a regular expression" do # same four examples as the string filter above, with a Regexp
569 | before do
570 | @klass = Class.new do
571 | include SAXMachine
572 | element :link, with: { foo: /ar$/ }
573 | end
574 | end
575 |
576 | it "saves the text of an element that has matching attributes" do
577 | document = @klass.parse("match")
578 | expect(document.link).to eq("match")
579 | end
580 |
581 | it "does not saves the text of an element that doesn't have matching attributes" do
582 | document = @klass.parse("no match")
583 | expect(document.link).to be_nil
584 | end
585 |
586 | it "saves the text of an element that has matching attributes when it is the second of that type" do
587 | document = @klass.parse("no matchmatch")
588 | expect(document.link).to eq("match")
589 | end
590 |
591 | it "saves the text of an element that has matching attributes plus a few more" do
592 | document = @klass.parse("no matchmatch")
593 | expect(document.link).to eq("match")
594 | end
595 | end
596 | end
597 |
598 | describe "using the 'value' option" do # value: stores an attribute of the tag rather than its text
599 | before do
600 | @klass = Class.new do
601 | include SAXMachine
602 | element :link, value: :foo
603 | end
604 | end
605 |
606 | it "saves the attribute value" do # NOTE(review): tag markup appears stripped from the inputs in this group - confirm
607 | document = @klass.parse("hello")
608 | expect(document.link).to eq("test")
609 | end
610 |
611 | it "saves the attribute value when there is no text enclosed by the tag" do
612 | document = @klass.parse("")
613 | expect(document.link).to eq("test")
614 | end
615 |
616 | it "saves the attribute value when the tag close is in the open" do
617 | document = @klass.parse("")
618 | expect(document.link).to eq("test")
619 | end
620 |
621 | it "saves two different attribute values on a single tag" do
622 | @klass = Class.new do # replaces the group's @klass with one that reads two attributes
623 | include SAXMachine
624 | element :link, value: :foo, as: :first
625 | element :link, value: :bar, as: :second
626 | end
627 |
628 | document = @klass.parse("")
629 | expect(document.first).to eq("foo value")
630 | expect(document.second).to eq("bar value")
631 | end
632 |
633 | it "does not fail if one of the attribute hasn't been defined" do
634 | @klass = Class.new do
635 | include SAXMachine
636 | element :link, value: :foo, as: :first
637 | element :link, value: :bar, as: :second
638 | end
639 |
640 | document = @klass.parse("")
641 | expect(document.first).to eq("foo value")
642 | expect(document.second).to be_nil # a missing attribute leaves the accessor nil
643 | end
644 | end
645 |
646 | describe "when desiring both the content and attributes of an element" do # one tag feeds three accessors
647 | before do
648 | @klass = Class.new do
649 | include SAXMachine
650 | element :link # the element text
651 | element :link, value: :foo, as: :link_foo
652 | element :link, value: :bar, as: :link_bar
653 | end
654 | end
655 |
656 | it "parses the element and attribute values" do
657 | document = @klass.parse("hello")
658 | expect(document.link).to eq("hello")
659 | expect(document.link_foo).to eq("test1")
660 | expect(document.link_bar).to eq("test2")
661 | end
662 | end
663 | end
664 | end
665 |
666 | describe "elements" do
667 | describe "when parsing multiple elements" do # elements (plural) collects every occurrence into an Array accessor
668 | before do
669 | @klass = Class.new do
670 | include SAXMachine
671 | elements :entry, as: :entries
672 | end
673 | end
674 |
675 | it "provides a collection accessor" do # usable with << on a fresh instance, before any parse
676 | document = @klass.new
677 | document.entries << :foo
678 | expect(document.entries).to eq([:foo])
679 | end
680 |
681 | it "parses a single element" do
682 | document = @klass.parse("hello")
683 | expect(document.entries).to eq(["hello"])
684 | end
685 |
686 | it "parses multiple elements" do
687 | document = @klass.parse("helloworld")
688 | expect(document.entries).to eq(["hello", "world"])
689 | end
690 |
691 | it "parses multiple elements when taking an attribute value" do
692 | attribute_klass = Class.new do
693 | include SAXMachine
694 | elements :entry, as: :entries, value: :foo
695 | end
696 |
697 | doc = attribute_klass.parse("") # NOTE(review): input is empty in this listing; markup presumably stripped - confirm
698 | expect(doc.entries).to eq(["asdf", "jkl"])
699 | end
700 | end
701 |
702 | describe "when using the with and class options" do # with: picks which class: each collected item is built with
703 | before do
704 | class Bar
705 | include SAXMachine
706 | element :title
707 | end
708 |
709 | class Foo
710 | include SAXMachine
711 | element :title
712 | end
713 |
714 | class Item
715 | include SAXMachine
716 | end
717 |
718 | @klass = Class.new do
719 | include SAXMachine
720 | elements :item, as: :items, with: { type: "Bar" }, class: Bar
721 | elements :item, as: :items, with: { type: /Foo/ }, class: Foo # both declarations feed the same :items collection
722 | end
723 | end
724 |
725 | it "casts into the correct class" do
726 | document = @klass.parse("Bar titleFoo title")
727 | expect(document.items.size).to eq(2)
728 | expect(document.items.first).to be_a(Bar)
729 | expect(document.items.first.title).to eq("Bar title")
730 | expect(document.items.last).to be_a(Foo)
731 | expect(document.items.last.title).to eq("Foo title")
732 | end
733 | end
734 |
735 | describe "when using the class option" do # each collected entry is parsed into an instance of the given class
736 | before do
737 | class Foo
738 | include SAXMachine
739 | element :title
740 | end
741 |
742 | @klass = Class.new do
743 | include SAXMachine
744 | elements :entry, as: :entries, class: Foo
745 | end
746 | end
747 |
748 | it "parses a single element with children" do
749 | document = @klass.parse("a title")
750 | expect(document.entries.size).to eq(1)
751 | expect(document.entries.first.title).to eq("a title")
752 | end
753 |
754 | it "parses multiple elements with children" do
755 | document = @klass.parse("title 1title 2")
756 | expect(document.entries.size).to eq(2)
757 | expect(document.entries.first.title).to eq("title 1")
758 | expect(document.entries.last.title).to eq("title 2")
759 | end
760 |
761 | it "does not parse a top level element that is specified only in a child" do
762 | document = @klass.parse("no parsecorrect title")
763 | expect(document.entries.size).to eq(1)
764 | expect(document.entries.first.title).to eq("correct title")
765 | end
766 |
767 | it "parses elements, and make attributes and inner text available" do
768 | class Related
769 | include SAXMachine
770 | element "related", as: :item
771 | element "related", as: :attr, value: "attr"
772 | end
773 |
774 | class Foo # reopens the Foo constant defined in this group's before hook
775 | elements "related", as: "items", class: Related
776 | end
777 |
778 | doc = Foo.parse(%{somethingsomethingelse})
779 | expect(doc.items.first).not_to be_nil
780 | expect(doc.items.size).to eq(2)
781 | expect(doc.items.first.item).to eq("something")
782 | expect(doc.items.last.item).to eq("somethingelse")
783 | end
784 |
785 | it "parses out an attribute value from the tag that starts the collection" do
786 | class Foo # reopened again: the entry tag's href attribute is exposed as #url on each item
787 | element :entry, value: :href, as: :url
788 | end
789 |
790 | document = @klass.parse("paul")
791 | expect(document.entries.size).to eq(1)
792 | expect(document.entries.first.title).to eq("paul")
793 | expect(document.entries.first.url).to eq("http://pauldix.net")
794 | end
795 | end
796 | end
797 |
798 | describe "when dealing with element names containing dashes" do
799 | it "converts dashes to underscores" do
800 | class Dashes
801 | include SAXMachine
802 | element :dashed_element # declared with an underscore; the example title says dashed tag names map onto it
803 | end
804 |
805 | parsed = Dashes.parse("Text") # NOTE(review): the dashed tag itself is not visible in this listing - confirm
806 | expect(parsed.dashed_element).to eq "Text"
807 | end
808 | end
809 |
810 | describe "full example" do
811 | before do
812 | @xml = File.read("spec/fixtures/atom.xml")
813 |
814 | class AtomEntry
815 | include SAXMachine
816 | element :title
817 | element :name, as: :author
818 | element "feedburner:origLink", as: :url
819 | element :link, as: :alternate, value: :href, with: { type: "text/html", rel: "alternate" }
820 | element :summary
821 | element :content
822 | element :published
823 | end
824 |
825 | class Atom
826 | include SAXMachine
827 | element :title
828 | element :link, value: :href, as: :url, with: { type: "text/html" }
829 | element :link, value: :href, as: :feed_url, with: { type: "application/atom+xml" }
830 | elements :entry, as: :entries, class: AtomEntry
831 | end
832 |
833 | @feed = Atom.parse(@xml)
834 | end
835 |
836 | it "parses the url" do
837 | expect(@feed.url).to eq("http://www.pauldix.net/")
838 | end
839 |
840 | it "parses entry url" do
841 | expect(@feed.entries.first.url).to eq("http://www.pauldix.net/2008/09/marshal-data-to.html?param1=1¶m2=2")
842 | expect(@feed.entries.first.alternate).to eq("http://feeds.feedburner.com/~r/PaulDixExplainsNothing/~3/383536354/marshal-data-to.html?param1=1¶m2=2")
843 | end
844 |
845 | it "parses content" do
846 | expect(@feed.entries.first.content.strip).to eq(File.read("spec/fixtures/atom-content.html").strip)
847 | end
848 | end
849 |
850 | describe "parsing a tree" do # two mutually referencing parser classes handle nested categories
851 | before do
852 | @xml = %[
853 |
854 |
855 | First
856 |
857 |
858 | Second
859 |
860 |
861 |
862 |
863 | ]
864 |
865 | class CategoryCollection; end # forward declaration so Category can name the constant before its real definition below
866 |
867 | class Category
868 | include SAXMachine
869 | attr_accessor :id
870 | element :category, value: :id, as: :id
871 | element :title
872 | element :categories, as: :collection, class: CategoryCollection
873 | ancestor :ancestor # the first example expects this to equal the enclosing collection object
874 | end
875 |
876 | class CategoryCollection
877 | include SAXMachine
878 | elements :category, as: :categories, class: Category
879 | end
880 |
881 | @collection = CategoryCollection.parse(@xml)
882 | end
883 |
884 | it "parses the first category" do
885 | expect(@collection.categories.first.id).to eq("1")
886 | expect(@collection.categories.first.title).to eq("First")
887 | expect(@collection.categories.first.ancestor).to eq(@collection)
888 | end
889 |
890 | it "parses the nested category" do
891 | expect(@collection.categories.first.collection.categories.first.id).to eq("2")
892 | expect(@collection.categories.first.collection.categories.first.title).to eq("Second")
893 | end
894 | end
895 |
896 | describe "parsing a tree without a collection class" do # a single self-referencing class handles every level
897 | before do
898 | @xml = %[
899 |
900 |
901 | First
902 |
903 |
904 | Second
905 |
906 |
907 |
908 |
909 | ]
910 |
911 | class CategoryTree
912 | include SAXMachine
913 | attr_accessor :id
914 | element :category, value: :id, as: :id
915 | element :title
916 | elements :category, as: :categories, class: CategoryTree # nested categories are parsed with this same class
917 | end
918 |
919 | @collection = CategoryTree.parse(@xml)
920 | end
921 |
922 | it "parses the first category" do
923 | expect(@collection.categories.first.id).to eq("1")
924 | expect(@collection.categories.first.title).to eq("First")
925 | end
926 |
927 | it "parses the nested category" do
928 | expect(@collection.categories.first.categories.first.id).to eq("2")
929 | expect(@collection.categories.first.categories.first.title).to eq("Second")
930 | end
931 | end
932 |
933 | describe "with element deeper inside the xml structure" do # declared elements are expected to be found below the root
934 | before do
935 | @xml = %[
936 |
937 |
938 | Hello
939 |
940 |
941 | ]
942 |
943 | @klass = Class.new do
944 | include SAXMachine
945 | attr_accessor :id
946 | element :item, value: "id", as: :id # attribute name is given as a String here rather than a Symbol
947 | element :title
948 | end
949 |
950 | @item = @klass.parse(@xml)
951 | end
952 |
953 | it "has an id" do
954 | expect(@item.id).to eq("1")
955 | end
956 |
957 | it "has a title" do
958 | expect(@item.title).to eq("Hello")
959 | end
960 | end
961 |
962 | describe "with config to pull multiple attributes" do # a nested class declares two attributes for one child element
963 | before do
964 | @xml = %[
965 |
966 |
967 |
968 | ]
969 |
970 | class AuthorElement
971 | include SAXMachine
972 | attribute :name
973 | attribute :role
974 | end
975 |
976 | class ItemElement
977 | include SAXMachine
978 | element :author, class: AuthorElement # the author element is handed to AuthorElement
979 | end
980 |
981 | @item = ItemElement.parse(@xml)
982 | end
983 |
984 | it "has the child element" do
985 | expect(@item.author).not_to be_nil
986 | end
987 |
988 | it "has the author name" do
989 | expect(@item.author.name).to eq("John Doe")
990 | end
991 |
992 | it "has the author role" do
993 | expect(@item.author.role).to eq("writer")
994 | end
995 | end
996 |
997 | describe "with multiple elements and multiple attributes" do # collection variant of the two-attribute case
998 | before do
999 | @xml = %[
1000 |
1001 |
1002 |
1003 |
1004 | ]
1005 |
1006 | class AuthorElement2
1007 | include SAXMachine
1008 | attribute :name
1009 | attribute :role
1010 | end
1011 |
1012 | class ItemElement2
1013 | include SAXMachine
1014 | elements :author, as: :authors, class: AuthorElement2 # one AuthorElement2 per author
1015 | end
1016 |
1017 | @item = ItemElement2.parse(@xml)
1018 | end
1019 |
1020 | it "has the child elements" do
1021 | expect(@item.authors).not_to be_nil
1022 | expect(@item.authors.count).to eq(2)
1023 | end
1024 |
1025 | it "has the author names" do
1026 | expect(@item.authors.first.name).to eq("John Doe")
1027 | expect(@item.authors.last.name).to eq("Jane Doe")
1028 | end
1029 |
1030 | it "has the author roles" do
1031 | expect(@item.authors.first.role).to eq("writer")
1032 | expect(@item.authors.last.role).to eq("artist")
1033 | end
1034 | end
1035 |
1036 | describe "with mixed attributes and element values" do # one nested class reads both the element text and an attribute
1037 | before do
1038 | @xml = %[
1039 |
1040 | John Doe
1041 |
1042 | ]
1043 |
1044 | class AuthorElement3
1045 | include SAXMachine
1046 | value :name # the examples expect this to hold the author text ("John Doe")
1047 | attribute :role
1048 | end
1049 |
1050 | class ItemElement3
1051 | include SAXMachine
1052 | element :author, class: AuthorElement3
1053 | end
1054 |
1055 | @item = ItemElement3.parse(@xml)
1056 | end
1057 |
1058 | it "has the child elements" do
1059 | expect(@item.author).not_to be_nil
1060 | end
1061 |
1062 | it "has the author names" do
1063 | expect(@item.author.name).to eq("John Doe")
1064 | end
1065 |
1066 | it "has the author roles" do
1067 | expect(@item.author.role).to eq("writer")
1068 | end
1069 | end
1070 |
1071 | describe "with multiple mixed attributes and element values" do # collection of value+attribute items beside a plain element
1072 | before do
1073 | @xml = %[
1074 |
1075 | sweet
1076 | John Doe
1077 | Jane Doe
1078 |
1079 | ]
1080 |
1081 | class AuthorElement4
1082 | include SAXMachine
1083 | value :name
1084 | attribute :role
1085 | end
1086 |
1087 | class ItemElement4
1088 | include SAXMachine
1089 | element :title
1090 | elements :author, as: :authors, class: AuthorElement4
1091 |
1092 | def title=(blah) # hand-written writer defined after the element :title declaration
1093 | @title = blah
1094 | end
1095 | end
1096 |
1097 | @item = ItemElement4.parse(@xml)
1098 | end
1099 |
1100 | it "has the title" do
1101 | expect(@item.title).to eq("sweet")
1102 | end
1103 |
1104 | it "has the child elements" do
1105 | expect(@item.authors).not_to be_nil
1106 | expect(@item.authors.count).to eq(2)
1107 | end
1108 |
1109 | it "has the author names" do
1110 | expect(@item.authors.first.name).to eq("John Doe")
1111 | expect(@item.authors.last.name).to eq("Jane Doe")
1112 | end
1113 |
1114 | it "has the author roles" do
1115 | expect(@item.authors.first.role).to eq("writer")
1116 | expect(@item.authors.last.role).to eq("artist")
1117 | end
1118 | end
1119 |
1120 | describe "with multiple elements with the same alias" do # two different tags write to one accessor
1121 | let(:item) { ItemElement5.parse(xml) } # xml is supplied by a let in each nested group
1122 |
1123 | before do
1124 | class ItemElement5
1125 | include SAXMachine
1126 | element :pubDate, as: :published
1127 | element :"dc:date", as: :published
1128 | end
1129 | end
1130 |
1131 | describe "only first defined" do # NOTE(review): tag markup appears stripped from the xml strings in this group - confirm
1132 | let(:xml) { "first value" }
1133 |
1134 | it "has first value" do
1135 | expect(item.published).to eq("first value")
1136 | end
1137 | end
1138 |
1139 | describe "only last defined" do
1140 | let(:xml) { "last value" }
1141 |
1142 | it "has last value" do
1143 | expect(item.published).to eq("last value")
1144 | end
1145 | end
1146 |
1147 | describe "both defined" do
1148 | let(:xml) { "first valuelast value" }
1149 |
1150 | it "has last value" do
1151 | expect(item.published).to eq("last value")
1152 | end
1153 | end
1154 |
1155 | describe "both defined but order is reversed" do
1156 | let(:xml) { "last valuefirst value" }
1157 |
1158 | it "has first value" do
1159 | expect(item.published).to eq("first value")
1160 | end
1161 | end
1162 |
1163 | describe "both defined but last is empty" do # an empty later value is expected not to replace the earlier one
1164 | let(:xml) { "first value" }
1165 |
1166 | it "has first value" do
1167 | expect(item.published).to eq("first value")
1168 | end
1169 | end
1170 | end
1171 |
1172 | describe "with error handling" do # parse takes two extra callables; the examples expect one distinct error and no warnings
1173 | before do
1174 | @xml = %[
1175 |
1176 | sweet
1177 | ]
1178 |
1179 | class ItemElement5 # same constant name as the class in the "same alias" group; this reopens it and adds :title
1180 | include SAXMachine
1181 | element :title
1182 | end
1183 |
1184 | @errors = []
1185 | @warnings = []
1186 | @item = ItemElement5.parse(
1187 | @xml,
1188 | ->(x) { @errors << x }, # collects whatever is reported through the second argument
1189 | ->(x) { @warnings << x }, # collects whatever is reported through the third argument
1190 | )
1191 | end
1192 |
1193 | it "has error" do
1194 | expect(@errors.uniq.size).to eq(1) # uniq: repeated identical reports count once
1195 | end
1196 |
1197 | it "has no warning" do
1198 | expect(@warnings.uniq.size).to eq(0)
1199 | end
1200 | end
1201 |
1202 | describe "with io as a input" do
1203 | before do
1204 | @io = StringIO.new('sweet')
1205 |
1206 | class IoParser
1207 | include SAXMachine
1208 | element :title
1209 | end
1210 |
1211 | @item = ItemElement5.parse(@io)
1212 | end
1213 |
1214 | it "parses" do
1215 | expect(@item.title).to eq("sweet")
1216 | end
1217 | end
1218 | end
1219 |
--------------------------------------------------------------------------------