├── .gitignore ├── .travis.yml ├── CHANGELOG.md ├── Gemfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── jsoner.gemspec ├── lib ├── jsoner.rb └── jsoner │ ├── table.rb │ ├── table_factory.rb │ └── version.rb └── spec ├── fixtures ├── json.rb ├── table.html ├── table.rb ├── table_extend.html └── table_extend.rb ├── jsoner ├── table_factory_spec.rb └── table_spec.rb └── jsoner_spec.rb /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | .bundle 4 | .config 5 | .yardoc 6 | Gemfile.lock 7 | InstalledFiles 8 | _yardoc 9 | coverage 10 | doc/ 11 | lib/bundler/man 12 | pkg 13 | rdoc 14 | spec/reports 15 | test/tmp 16 | test/version_tmp 17 | tmp 18 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | rvm: 2 | - 1.9.3 3 | - 2.0.0 4 | 5 | script: "bundle exec rspec spec/" 6 | 7 | notifications: 8 | email: 9 | - simlegate@163.com 10 | 11 | branches: 12 | only: 13 | - master 14 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## v0.0.2 2 | * Filter other elements beyond table element 3 | * Get first table element when having more than two in parsed HTML 4 | 5 | ## v0.0.3 6 | * Parse HTML file including HTML Table 7 | * Fixing file is empty or have no table element 8 | 9 | ## v0.0.4 10 | * Add rspec test of checking integrity of HTML table 11 | * Supply exception message of NotFullTable 12 | * Parse Link including HTML Table 13 | 14 | ## v0.1.0 15 | * Support Ruby 2.0.0 16 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'http://rubygems.org' 2 | 3 | gemspec 4 | 
-------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013 simlegate 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Jsoner 2 | 3 | Serialize HTML tables into JSON in Ruby. 
4 | [![Gem Version](https://badge.fury.io/rb/jsoner.png)](http://badge.fury.io/rb/jsoner) [![Build Status](https://travis-ci.org/simlegate/jsoner.png?branch=master)](https://travis-ci.org/simlegate/jsoner) 5 | 6 | ## Installation 7 | 8 | Add this line to your application's Gemfile: 9 | 10 | gem 'jsoner' 11 | 12 | And then execute: 13 | 14 | $ bundle 15 | 16 | Or install it yourself as: 17 | 18 | $ gem install jsoner 19 | 20 | ## Usage 21 | 22 | ```ruby 23 | html = <<-eohtml 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 |
First NameLast NamePoints
JillSmith50
EveJackson94
JohnDoe80
AdamJohnson67
50 | eohtml 51 | 52 | # Convert the HTML table into JSON 53 | 54 | json = Jsoner.parse(html) 55 | 56 | # output json => 57 | # 58 | # [ {"First Name"=>"Jill", "Last Name"=>"Smith", "Points"=>"50"}, 59 | # {"First Name"=>"Eve", "Last Name"=>"Jackson", "Points"=>"94"}, 60 | # {"First Name"=>"John", "Last Name"=>"Doe", "Points"=>"80"}, 61 | # {"First Name"=>"Adam", "Last Name"=>"Johnson", "Points"=>"67"} ] 62 | 63 | ``` 64 | Or, `Jsoner` can parse an HTML file that contains a table: 65 | 66 | ```ruby 67 | require 'open-uri' 68 | # table.html must exist, and the path to it must be correct. 69 | 70 | table = Jsoner.parse(open('table.html')) 71 | 72 | # output json => 73 | # 74 | # [ {"First Name"=>"Jill", "Last Name"=>"Smith", "Points"=>"50"}, 75 | # {"First Name"=>"Eve", "Last Name"=>"Jackson", "Points"=>"94"}, 76 | # {"First Name"=>"John", "Last Name"=>"Doe", "Points"=>"80"}, 77 | # {"First Name"=>"Adam", "Last Name"=>"Johnson", "Points"=>"67"} ] 78 | 79 | ``` 80 | 81 | Or you can parse a page fetched from a URL: 82 | 83 | ```ruby 84 | require 'open-uri' 85 | 86 | table = Jsoner.parse(open('http://www.w3school.com.cn/tiy/t.asp?f=html_table_header')) 87 | 88 | # output json => 89 | # 90 | # [{"姓名"=>"Bill Gates", "电话"=>"555 77 855"}] 91 | ``` 92 | ## THANKS 93 | 94 | [table-to-json](https://github.com/lightswitch05/table-to-json) written by [@lightswitch05](https://github.com/lightswitch05) in JavaScript. 95 | 96 | ## Contributing 97 | 98 | 1. Fork it 99 | 2. Create your feature branch (`git checkout -b my-new-feature`) 100 | 3. Commit your changes (`git commit -am 'Add some feature'`) 101 | 4. Push to the branch (`git push origin my-new-feature`) 102 | 5. 
Create new Pull Request 103 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | -------------------------------------------------------------------------------- /jsoner.gemspec: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | lib = File.expand_path('../lib', __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | require 'jsoner/version' 5 | 6 | Gem::Specification.new do |spec| 7 | spec.name = "jsoner" 8 | spec.version = Jsoner::VERSION 9 | spec.authors = ["simlegate"] 10 | spec.email = ["simlegate@163.com"] 11 | spec.description = %q{Serialize HTML tables into JSON in Ruby} 12 | spec.summary = %q{Serialize HTML tables into JSON in Ruby} 13 | spec.homepage = "https://github.com/simlegate/jsoner" 14 | spec.license = "MIT" 15 | 16 | spec.files = `git ls-files`.split($/) 17 | spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) } 18 | spec.test_files = spec.files.grep(%r{^(test|spec|features)/}) 19 | spec.require_paths = ["lib"] 20 | 21 | spec.add_development_dependency "bundler", "~> 1.3" 22 | spec.add_development_dependency "rake" 23 | spec.add_development_dependency "nokogiri", "~> 1.6.0" 24 | spec.add_development_dependency "rspec", "~> 2.14.1" 25 | end 26 | -------------------------------------------------------------------------------- /lib/jsoner.rb: -------------------------------------------------------------------------------- 1 | require "jsoner/version" 2 | require "jsoner/table" 3 | require "jsoner/table_factory" 4 | 5 | require 'nokogiri' 6 | 7 | module Jsoner 8 | 9 | class NotFullTable < StandardError 10 | def message 11 | 'Incomplete HTML Table' 12 | end 13 | end 14 | 15 | class << self 16 | 17 | def parse(html) 18 | # html = filter(html) 19 | if factory = TableFactory.check(Nokogiri::HTML.parse(html)) 20 | 
Table.new(factory).to_json 21 | end 22 | end 23 | 24 | def filter html 25 | if File.file? html 26 | File.open(html) { |file| file.read } 27 | else 28 | html 29 | end 30 | end 31 | 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /lib/jsoner/table.rb: -------------------------------------------------------------------------------- 1 | require 'json' 2 | 3 | module Jsoner 4 | class Table 5 | 6 | def initialize factory 7 | @table = factory.create 8 | end 9 | 10 | def row_number 11 | @table[:body].count 12 | end 13 | 14 | def to_json 15 | convert.to_json 16 | end 17 | 18 | # 19 | # convert Hash from factory into anthor Hash will be serialized into JSON 20 | # 21 | # Example: 22 | # table = { :header => ["First Name", "Last Name", "Points"] 23 | # :body => [["Jill", "Smith", "50"], 24 | # ["Eve", "Jackson", "94"], 25 | # ["John", "Doe", "80"], 26 | # ["Adam", "Johnson", "67"]] } 27 | # 28 | # Output: 29 | # table == [{"First Name"=>"Jill", "Last Name"=>"Smith", "Points"=>"50"}, 30 | # {"First Name"=>"Eve", "Last Name"=>"Jackson", "Points"=>"94"}, 31 | # {"First Name"=>"John", "Last Name"=>"Doe", "Points"=>"80"}, 32 | # {"First Name"=>"Adam", "Last Name"=>"Johnson", "Points"=>"67"}] 33 | def convert 34 | (0...row_number).map do |index| 35 | 36 | # 37 | # Combine two Arrays into Hash 38 | # a = ["a", "b", "c"] 39 | # b = ["d", "e", "f"] 40 | # Hash[a.zip(b)] 41 | # => {"a" => "d", "b" => "e", "b" => "f" } 42 | Hash[@table[:header].zip(@table[:body][index])] 43 | end 44 | end 45 | 46 | private :convert 47 | end 48 | end 49 | -------------------------------------------------------------------------------- /lib/jsoner/table_factory.rb: -------------------------------------------------------------------------------- 1 | module Jsoner 2 | # 3 | # build Hash below from doc parsed by Nokogiki 4 | # table = { :header => ["First Name", "Last Name", "Score"] 5 | # :body => [["Jill", "Smith", "50"], 6 | # ["Eve", "Jackson", 
"94"], 7 | # ["John", "Doe", "80"], 8 | # ["Adam", "Johnson", "67"] 9 | # ] 10 | # } 11 | class TableFactory 12 | 13 | def initialize doc 14 | @table_rows = remove_housing(doc).search('tr') 15 | end 16 | 17 | def create 18 | { :header => build_header, :body => build_body } 19 | end 20 | 21 | def build_header 22 | @table_rows[0].search('th').map(&:content) 23 | end 24 | 25 | def build_body 26 | row_number = @table_rows.count - 1 27 | (1..row_number).map do |row| 28 | @table_rows[row].search('td').map(&:content) 29 | end 30 | end 31 | 32 | # 33 | # check whether table is full 34 | # a full table should include +th+, +td+, +tr+ elements 35 | def self.check doc 36 | table = doc.search('table').first 37 | unless table.search('tr').empty? || table.search('td').empty? || table.search('th').empty? 38 | TableFactory.new doc 39 | else 40 | raise Jsoner::NotFullTable 41 | end 42 | end 43 | 44 | private 45 | 46 | # 47 | # Remove other elements beside table and keep it clean 48 | # Or keep one table when including more than two table elements in parsed HTML 49 | # 50 | # Example: 51 | # 52 | # 53 | # // other elements 54 | #
55 | # 56 | # // other elements 57 | #
58 | # 59 | # 60 | # Output: 61 | # 62 | # // other elements 63 | #
64 | def remove_housing doc 65 | doc.search('table').first 66 | end 67 | end 68 | end 69 | -------------------------------------------------------------------------------- /lib/jsoner/version.rb: -------------------------------------------------------------------------------- 1 | module Jsoner 2 | VERSION = "0.0.4" 3 | end 4 | -------------------------------------------------------------------------------- /spec/fixtures/json.rb: -------------------------------------------------------------------------------- 1 | require 'json' 2 | def pre_json 3 | [ {"First Name"=>"Jill", "Last Name"=>"Smith", "Points"=>"50"}, 4 | {"First Name"=>"Eve", "Last Name"=>"Jackson", "Points"=>"94"}, 5 | {"First Name"=>"John", "Last Name"=>"Doe", "Points"=>"80"}, 6 | {"First Name"=>"Adam", "Last Name"=>"Johnson", "Points"=>"67"} ] 7 | end 8 | 9 | def json 10 | pre_json.to_json 11 | end 12 | -------------------------------------------------------------------------------- /spec/fixtures/table.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 |
First NameLast NamePoints
JillSmith50
EveJackson94
JohnDoe80
AdamJohnson67
27 | -------------------------------------------------------------------------------- /spec/fixtures/table.rb: -------------------------------------------------------------------------------- 1 | def table_str 2 | <<-eohtml 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 |
First NameLast NamePoints
JillSmith50
EveJackson94
JohnDoe80
AdamJohnson67
29 | eohtml 30 | end 31 | -------------------------------------------------------------------------------- /spec/fixtures/table_extend.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 |
JillSmith50
EveJackson94
JohnDoe80
AdamJohnson67
21 | -------------------------------------------------------------------------------- /spec/fixtures/table_extend.rb: -------------------------------------------------------------------------------- 1 | def table_extend_str 2 | <<-eohtml 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 |
First NameLast NamePoints
JillSmith50
EveJackson94
JohnDoe80
AdamJohnson67
30 | 31 |
32 | 33 | eohtml 34 | end 35 | 36 | def no_td_table 37 | <<-eohtml 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 |
First NameLast NamePoints
46 | eohtml 47 | end 48 | 49 | def no_tr_table 50 | <<-eohtml 51 | 52 |
53 | eohtml 54 | end 55 | 56 | def no_th_table 57 | <<-eohtml 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 |
JillSmith50
EveJackson94
JohnDoe80
AdamJohnson67
78 | eohtml 79 | end 80 | 81 | -------------------------------------------------------------------------------- /spec/jsoner/table_factory_spec.rb: -------------------------------------------------------------------------------- 1 | Dir["#{File.dirname(__FILE__)}/fixtures/*.rb"].each {|file| require file } 2 | require "#{File.dirname(__FILE__)}/../../lib/jsoner" 3 | 4 | describe 'build Hash below from doc parsed by Nokogiki' do 5 | 6 | before :each do 7 | @obj = Nokogiri::HTML.parse(table_str) 8 | @factory = Jsoner::TableFactory.new(@obj) 9 | end 10 | 11 | it "@table_rows should be instance of NodeSet" do 12 | @factory.instance_variable_get(:@table_rows).should be_instance_of Nokogiri::XML::NodeSet 13 | end 14 | 15 | context "Building header" do 16 | it "should return Array" do 17 | @factory.build_header.should be_instance_of Array 18 | end 19 | 20 | it "should match data from fixtures" do 21 | @factory.build_header.should == ["First Name", "Last Name", "Points"] 22 | end 23 | end 24 | 25 | context "Building body" do 26 | it "should return Double Dimensional Array" do 27 | @factory.build_body.should be_instance_of Array 28 | @factory.build_body.each{ |column| column.should be_instance_of Array } 29 | end 30 | end 31 | 32 | context "Building full hash defined table" do 33 | it "should have _header_ key" do 34 | @factory.create.should have_key :header 35 | end 36 | 37 | it "should have _body_ key" do 38 | @factory.create.should have_key :header 39 | end 40 | 41 | it "data mapped by _header_ key should match fixtures" do 42 | @factory.create[:header].should == ["First Name", "Last Name", "Points"] 43 | end 44 | 45 | it "data mapped by _body_ key should match fixtures" do 46 | @factory.create[:body].should == [["Jill", "Smith", "50"], 47 | ["Eve", "Jackson", "94"], 48 | ["John", "Doe", "80"], 49 | ["Adam", "Johnson", "67"]] 50 | end 51 | 52 | # TODO testing when having no header in table 53 | end 54 | 55 | context "check integrity of HTML table" do 56 | 57 | it "when 
having no tr element" do 58 | doc = Nokogiri::HTML.parse(no_tr_table) 59 | expect{ Jsoner::TableFactory.check(doc) }.to raise_error Jsoner::NotFullTable 60 | end 61 | 62 | it "when having no td element" do 63 | doc = Nokogiri::HTML.parse(no_td_table) 64 | expect{ Jsoner::TableFactory.check(doc) }.to raise_error Jsoner::NotFullTable 65 | end 66 | 67 | it "when having no th element" do 68 | doc = Nokogiri::HTML.parse(no_th_table) 69 | expect{ Jsoner::TableFactory.check(doc) }.to raise_error Jsoner::NotFullTable 70 | end 71 | 72 | it "should return factory if integrity" do 73 | doc = Nokogiri::HTML.parse(table_str) 74 | Jsoner::TableFactory.check(doc).should be_instance_of Jsoner::TableFactory 75 | end 76 | end 77 | 78 | context "filter HTML" do 79 | 80 | before :each do 81 | @ext_obj = Nokogiri::HTML.parse(table_extend_str) 82 | @ext_factory = Jsoner::TableFactory.new(@ext_obj) 83 | end 84 | 85 | it "@ext_obj should have two tables" do 86 | @ext_obj.search('table').count.should == 2 87 | end 88 | 89 | it "second table have no descendant nodes" do 90 | @ext_obj.search('table').last.search('descendant').should be_empty 91 | end 92 | 93 | it "first table should have _th_ elements" do 94 | @ext_obj.search('table').first.search('th').count.should == 3 95 | end 96 | 97 | it "should get first table after search table default" do 98 | @ext_factory.instance_variable_get(:@table_rows).search('tr').count.should ==5 99 | end 100 | end 101 | end 102 | -------------------------------------------------------------------------------- /spec/jsoner/table_spec.rb: -------------------------------------------------------------------------------- 1 | Dir["#{File.dirname(__FILE__)}/fixtures/*.rb"].each {|file| require file } 2 | require "#{File.dirname(__FILE__)}/../../lib/jsoner" 3 | 4 | describe "parse structure of table" do 5 | 6 | before :each do 7 | obj = Nokogiri::HTML.parse(table_str) 8 | factory = Jsoner::TableFactory.new(obj) 9 | @table = Jsoner::Table.new(factory) 10 | end 11 | 
12 | context "the number of row beside header in table" do 13 | it "should == 4 " do 14 | @table.row_number.should == 4 15 | end 16 | end 17 | 18 | context "convert Hash from factory" do 19 | it "_convert_ should private method" do 20 | expect{ @table.convert }.to raise_error NoMethodError 21 | end 22 | 23 | it "should be instance of Array" do 24 | @table.send(:convert).should be_instance_of Array 25 | end 26 | 27 | it "everyone should be instance of Hash in Array" do 28 | @table.send(:convert).map{ |h| h.should be_instance_of Hash} 29 | end 30 | 31 | it "everyone should have keys including _First Name_,_Last Name_,_Score_ in Array" do 32 | @table.send(:convert).map{ |h| h.should have_key "First Name"} 33 | @table.send(:convert).map{ |h| h.should have_key "Last Name"} 34 | @table.send(:convert).map{ |h| h.should have_key "Points"} 35 | end 36 | 37 | it "match full data" do 38 | @table.send(:convert).should == pre_json 39 | end 40 | end 41 | end 42 | -------------------------------------------------------------------------------- /spec/jsoner_spec.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | Dir["#{File.dirname(__FILE__)}/fixtures/*.rb"].each {|file| require file } 3 | require "#{File.dirname(__FILE__)}/../lib/jsoner" 4 | 5 | require 'open-uri' 6 | require 'json' 7 | 8 | describe Jsoner do 9 | 10 | it "should match data from json of fixtures" do 11 | Jsoner.parse(table_str).should == json 12 | end 13 | 14 | it "should parse file including table" do 15 | Jsoner.parse(open("#{File.dirname(__FILE__)}/fixtures/table.html")).should == json 16 | end 17 | 18 | it "should raise error when having no full table in HTML file" do 19 | expect{ Jsoner.parse(open("#{File.dirname(__FILE__)}/fixtures/table_extend.html"))}.to raise_error Jsoner::NotFullTable 20 | end 21 | 22 | # http://www.w3school.com.cn/tiy/t.asp?f=html_table_headers 23 | it "should parse Link include table" do 24 | json = 
Jsoner.parse(open("http://www.w3school.com.cn/tiy/t.asp?f=html_table_headers")) 25 | JSON.parse(json).should == [{"姓名"=>"Bill Gates", "电话"=>"555 77 855"}] 26 | end 27 | 28 | context "data parsed" do 29 | it "it is HTML file" do 30 | Jsoner.filter(open("#{File.dirname(__FILE__)}/fixtures/table.html")).should == table_str 31 | end 32 | 33 | it "it is String" do 34 | Jsoner.filter("
").should == "
" 35 | end 36 | end 37 | end 38 | --------------------------------------------------------------------------------