├── .gitignore ├── .travis.yml ├── LICENSE ├── Makefile ├── README.md ├── bench ├── .gitignore ├── Gemfile ├── Gemfile.lock ├── README.md ├── build.sh ├── run.sh ├── shard.lock ├── shard.yml ├── test-libxml.cr ├── test-libxml.rb ├── test-libxml2.cr ├── test-libxml2.rb ├── test-myhtml.cr ├── test-myhtml2.cr └── xtime.rb ├── examples ├── example1.cr ├── example2.cr └── example3.cr ├── shard.yml ├── spec ├── modest_spec.cr └── spec_helper.cr └── src ├── ext └── Makefile ├── modest.cr └── modest ├── extend_myhtml.cr ├── finder.cr ├── lib.cr └── mycss.cr /.gitignore: -------------------------------------------------------------------------------- 1 | /doc/ 2 | /libs/ 3 | /lib/ 4 | /.crystal/ 5 | /.shards/ 6 | modest-c 7 | 8 | 9 | # Libraries don't need dependency lock 10 | # Dependencies will be locked in application that uses them 11 | /shard.lock 12 | 13 | bin_* 14 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: crystal 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Konstantin Makarchev 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Compiler and build flags; both may be overridden from the environment or the command line. 2 | CRYSTAL ?= crystal 3 | CRYSTALFLAGS ?= --release 4 | 5 | .PHONY: all package spec 6 | all: bin_example1 bin_example2 bin_example3 7 | package: src/ext/modest-c/lib/libmodest_static.a 8 | 9 | # Fetch shard dependencies with the configured compiler. 10 | libs: 11 | $(CRYSTAL) deps 12 | 13 | bin_example1: src/*.cr src/**/*.cr examples/example1.cr package 14 | $(CRYSTAL) build examples/example1.cr $(CRYSTALFLAGS) -o $@ 15 | 16 | bin_example2: src/*.cr src/**/*.cr examples/example2.cr package 17 | $(CRYSTAL) build examples/example2.cr $(CRYSTALFLAGS) -o $@ 18 | 19 | bin_example3: src/*.cr src/**/*.cr examples/example3.cr package 20 | $(CRYSTAL) build examples/example3.cr $(CRYSTALFLAGS) -o $@ 21 | 22 | # Build the bundled C library; $(MAKE) lets the sub-make inherit the parent's flags. 23 | src/ext/modest-c/lib/libmodest_static.a: 24 | cd src/ext && $(MAKE) package 25 | 26 | # Run the specs with the configured compiler. 27 | spec: 28 | $(CRYSTAL) spec 29 | 30 | .PHONY: clean 31 | clean: 32 | rm -f bin_* src/ext/modest-c/lib/libmodest_static.a 33 | rm -rf src/ext/modest-c 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## WARNING, this shard obsolete and moved to [myhtml](https://github.com/kostya/myhtml) directly, use [myhtml](https://github.com/kostya/myhtml) >= 1.0.0 2 | 3 | # modest 4 | 5 | CSS selectors for HTML5 Parser [myhtml](https://github.com/kostya/myhtml) (Crystal
wrapper for https://github.com/lexborisov/Modest). 6 | 7 | ## Installation 8 | 9 | 10 | Add this to your application's `shard.yml`: 11 | 12 | ```yaml 13 | dependencies: 14 | modest: 15 | github: kostya/modest 16 | ``` 17 | 18 | ## Usage of CSS Selectors with myhtml parser 19 | 20 | ```crystal 21 | require "modest" 22 | 23 | page = <<-PAGE 24 | 25 |

bla

26 | 27 | PAGE 28 | 29 | myhtml = Myhtml::Parser.new(page) 30 | 31 | # css select from the root! scope (equal with myhtml.root!.css("...")) 32 | iterator = myhtml.css("div.aaa p#bbb a.ccc") # => Iterator(Myhtml::Node), methods: .each, .to_a, ... 33 | 34 | iterator.each do |node| 35 | p node.tag_id # MyHTML_TAG_A 36 | p node.tag_name # "a" 37 | p node.tag_sym # :a 38 | p node.attributes["href"]? # "http://..." 39 | p node.inner_text # "bla" 40 | puts node.to_html # bla 41 | end 42 | 43 | # css select from node scope 44 | if p_node = myhtml.css("div.aaa p#bbb").first? 45 | p_node.css("a.ccc").each do |node| 46 | p node.tag_sym # :a 47 | end 48 | end 49 | 50 | ``` 51 | 52 | ## Example 2 53 | 54 | ```crystal 55 | require "modest" 56 | 57 | html = <<-PAGE 58 |
59 |

60 |

61 |

62 | link1 63 | link2 64 |

65 |

66 |

67 |

68 |

69 |
70 | PAGE 71 | 72 | parser = Myhtml::Parser.new(html) 73 | 74 | # select all p nodes which id like `*p*` 75 | p parser.css("p[id*=p]").map(&.attribute_by("id")).to_a # => ["p1", "p2", "p3", "p4", "p5", "p6"] 76 | 77 | # select all nodes with class "jo" 78 | p parser.css("p.jo").map(&.attribute_by("id")).to_a # => ["p2", "p4", "p6"] 79 | p parser.css(".jo").map(&.attribute_by("id")).to_a # => ["p2", "p4", "p6"] 80 | 81 | # select odd child tag inside div, which not contain a 82 | p parser.css("div > :nth-child(2n+1):not(:has(a))").map(&.attribute_by("id")).to_a # => ["p1", "p4", "p6"] 83 | 84 | # all elements with class=jo inside last div tag 85 | p parser.css("div").to_a.last.css(".jo").map(&.attribute_by("id")).to_a # => ["p4", "p6"] 86 | 87 | # a element with href ends like .png 88 | p parser.css(%q{a[href$=".png"]}).map(&.attribute_by("id")).to_a # => ["a2"] 89 | 90 | # find all a tags inside

, which href contain `html` 91 | p parser.css(%q{p[id=p3] > a[href*="html"]}).map(&.attribute_by("id")).to_a # => ["a1"] 92 | 93 | # find all a tags inside

, which href contain `html` or ends_with `.png` 94 | p parser.css(%q{p[id=p3] > a:matches([href *= "html"], [href $= ".png"])}).map(&.attribute_by("id")).to_a # => ["a1", "a2"] 95 | 96 | # create finder and use it in many places, this is faster, than create it many times 97 | finder = Modest::Finder.new(".jo") 98 | p parser.css(finder).map(&.attribute_by("id")).to_a # => ["p2", "p4", "p6"] 99 | ``` 100 | 101 | ## Example 3 102 | ```crystal 103 | require "modest" 104 | 105 | html = <<-PAGE 106 | 107 | 108 | 109 |
Hello
110 | 111 | 112 | 113 | 114 | 115 |
123other
foocolumns
barare
xyzignored
116 | 117 | PAGE 118 | 119 | parser = Myhtml::Parser.new(html) 120 | 121 | p parser.css("#t2 tr td:first-child").map(&.inner_text).to_a # => ["123", "foo", "bar", "xyz"] 122 | p parser.css("#t2 tr td:first-child").map(&.to_html).to_a # => ["123", "foo", "bar", "xyz"] 123 | ``` 124 | 125 | ## Benchmark 126 | 127 | Comparison with Nokogiri (LibXML) and Crystagiri (LibXML): parse the Google results page 1000 times. Code: https://github.com/kostya/modest/tree/master/bench 128 | 129 | ```crystal 130 | require "modest" 131 | page = File.read("./google.html") 132 | s = 0 133 | links = [] of String 134 | 1000.times do 135 | myhtml = Myhtml::Parser.new(page) 136 | links = myhtml.css("div.g h3.r a").map(&.attribute_by("href")).to_a 137 | s += links.size 138 | myhtml.free 139 | end 140 | p links.last 141 | p s 142 | ``` 143 | 144 | Parse + Selectors 145 | 146 | | Lang | Package | Time, s | Memory, MiB | 147 | | -------- | ------------------ | ------- | ----------- | 148 | | Crystal | modest(myhtml) | 2.52 | 7.7 | 149 | | Crystal | Crystagiri(LibXML) | 19.89 | 14.3 | 150 | | Ruby 2.2 | Nokogiri(LibXML) | 45.05 | 136.2 | 151 | 152 | Selectors Only (files with suffix 2) 153 | 154 | | Lang | Package | Time, s | Memory, MiB | 155 | | -------- | ------------------ | ------- | ----------- | 156 | | Crystal | modest(myhtml) | 0.18 | 4.6 | 157 | | Crystal | Crystagiri(LibXML) | 12.30 | 6.6 | 158 | | Ruby 2.2 | Nokogiri(LibXML) | 28.06 | 68.8 | 159 | 160 | 161 | ## CSS Selectors rules 162 | https://drafts.csswg.org/selectors-4/ 163 | -------------------------------------------------------------------------------- /bench/.gitignore: -------------------------------------------------------------------------------- 1 | .shards 2 | lib 3 | *.html 4 | -------------------------------------------------------------------------------- /bench/Gemfile: -------------------------------------------------------------------------------- 1 | source :rubygems 2 | 3 | gem 'nokogiri' 4 |
-------------------------------------------------------------------------------- /bench/Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: http://rubygems.org/ 3 | specs: 4 | mini_portile2 (2.2.0) 5 | nokogiri (1.8.0) 6 | mini_portile2 (~> 2.2.0) 7 | 8 | PLATFORMS 9 | ruby 10 | 11 | DEPENDENCIES 12 | nokogiri 13 | 14 | BUNDLED WITH 15 | 1.13.6 16 | -------------------------------------------------------------------------------- /bench/README.md: -------------------------------------------------------------------------------- 1 | To compile all: `sh build.sh` 2 | 3 | To run all: `sh run.sh` -------------------------------------------------------------------------------- /bench/build.sh: -------------------------------------------------------------------------------- 1 | curl -s 'https://www.google.ru/search?client=opera&q=html+parsers&sourceid=opera&ie=UTF-8&oe=UTF-8&num=100' -A 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET4.0C)' > google.html 2 | bundle 3 | crystal deps 4 | crystal build test-libxml.cr --release -o bin_test_libxml --no-debug 5 | crystal build test-myhtml.cr --release -o bin_test_myhtml --no-debug 6 | crystal build test-libxml2.cr --release -o bin_test_libxml2 --no-debug 7 | crystal build test-myhtml2.cr --release -o bin_test_myhtml2 --no-debug 8 | -------------------------------------------------------------------------------- /bench/run.sh: -------------------------------------------------------------------------------- 1 | echo "============= Parsing + Selectors ==================" 2 | echo "Crystagiri(LibXML)" 3 | ./xtime.rb ./bin_test_libxml 4 | echo "modest(myhtml)" 5 | ./xtime.rb ./bin_test_myhtml 6 | echo "Nokogiri(LibXML)" 7 | ./xtime.rb ruby test-libxml.rb 8 | 9 | echo "============= Selectors Only ==================" 10 | echo "Crystagiri(LibXML)" 11 | ./xtime.rb ./bin_test_libxml2 12 | echo "modest(myhtml)" 13 | ./xtime.rb ./bin_test_myhtml2 14 | echo 
"Nokogiri(LibXML)" 15 | ./xtime.rb ruby test-libxml2.rb 16 | -------------------------------------------------------------------------------- /bench/shard.lock: -------------------------------------------------------------------------------- 1 | version: 1.0 2 | shards: 3 | crystagiri: 4 | github: madeindjs/crystagiri 5 | version: 0.3.2 6 | 7 | modest: 8 | github: kostya/modest 9 | version: 0.13 10 | 11 | myhtml: 12 | github: kostya/myhtml 13 | version: 0.26 14 | 15 | -------------------------------------------------------------------------------- /bench/shard.yml: -------------------------------------------------------------------------------- 1 | name: bench-html-parsers 2 | version: 0.1.0 3 | 4 | authors: 5 | - Konstantin Makarchev 6 | 7 | targets: 8 | bla: 9 | main: src/bla.cr 10 | 11 | crystal: 0.21 12 | 13 | dependencies: 14 | myhtml: 15 | github: kostya/myhtml 16 | modest: 17 | github: kostya/modest 18 | crystagiri: 19 | github: madeindjs/crystagiri 20 | 21 | license: MIT 22 | -------------------------------------------------------------------------------- /bench/test-libxml.cr: -------------------------------------------------------------------------------- 1 | require "crystagiri" 2 | 3 | page = File.read("./google.html") 4 | 5 | s = 0 6 | links = Array(String).new(initial_capacity: 100) 7 | 1000.times do 8 | links.clear 9 | doc = Crystagiri::HTML.new page 10 | doc.css("div.g h3.r a") { |tag| links << tag.node["href"].not_nil! 
} 11 | s += links.size 12 | end 13 | p links.last 14 | p s 15 | -------------------------------------------------------------------------------- /bench/test-libxml.rb: -------------------------------------------------------------------------------- 1 | require "bundler/setup" 2 | require "nokogiri" 3 | 4 | page = File.read("./google.html") 5 | 6 | s = 0 7 | links = [] 8 | 1000.times do 9 | doc = Nokogiri::HTML(page) 10 | links = doc.css("div.g h3.r a").map { |link| link["href"] } 11 | s += links.size 12 | end 13 | p links.last 14 | p s 15 | -------------------------------------------------------------------------------- /bench/test-libxml2.cr: -------------------------------------------------------------------------------- 1 | require "crystagiri" 2 | 3 | page = File.read("./google.html") 4 | 5 | s = 0 6 | links = Array(String).new(initial_capacity: 100) 7 | doc = Crystagiri::HTML.new page 8 | 1000.times do 9 | links.clear 10 | doc.css("div.g h3.r a") { |tag| links << tag.node["href"].not_nil! 
} 11 | s += links.size 12 | end 13 | p s 14 | -------------------------------------------------------------------------------- /bench/test-libxml2.rb: -------------------------------------------------------------------------------- 1 | require "bundler/setup" 2 | require "nokogiri" 3 | 4 | page = File.read("./google.html") 5 | 6 | s = 0 7 | doc = Nokogiri::HTML(page) 8 | 1000.times do 9 | links = doc.css("div.g h3.r a").map { |link| link["href"] } 10 | s += links.size 11 | end 12 | p s 13 | -------------------------------------------------------------------------------- /bench/test-myhtml.cr: -------------------------------------------------------------------------------- 1 | require "modest" 2 | 3 | page = File.read("./google.html") 4 | 5 | s = 0 6 | links = [] of String 7 | 1000.times do 8 | myhtml = Myhtml::Parser.new(page) 9 | links = myhtml.css("div.g h3.r a").map(&.attribute_by("href")).to_a 10 | s += links.size 11 | myhtml.free 12 | end 13 | p links.last 14 | p s 15 | -------------------------------------------------------------------------------- /bench/test-myhtml2.cr: -------------------------------------------------------------------------------- 1 | require "modest" 2 | 3 | page = File.read("./google.html") 4 | 5 | s = 0 6 | links = [] of String 7 | myhtml = Myhtml::Parser.new(page) 8 | 1000.times do 9 | links = myhtml.css("div.g h3.r a").map(&.attribute_by("href")).to_a 10 | s += links.size 11 | end 12 | p s 13 | -------------------------------------------------------------------------------- /bench/xtime.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | def mem(pid); `ps p #{pid} -o rss`.split.last.to_i; end 3 | t = Time.now 4 | pid = Process.spawn(*ARGV.to_a) 5 | mm = 0 6 | 7 | Thread.new do 8 | mm = mem(pid) 9 | while true 10 | sleep 0.1 11 | m = mem(pid) 12 | mm = m if m > mm 13 | end 14 | end 15 | 16 | Process.waitall 17 | STDERR.puts "%.2fs, %.1fMb" % [Time.now - t, mm / 1024.0] 18 | 
19 | -------------------------------------------------------------------------------- /examples/example1.cr: -------------------------------------------------------------------------------- 1 | require "../src/modest" 2 | 3 | page = <<-PAGE 4 | 5 |

bla

6 | 7 | PAGE 8 | 9 | myhtml = Myhtml::Parser.new(page) 10 | 11 | # css select from the root! scope (equal with myhtml.root!.css("...")) 12 | iterator = myhtml.css("div.aaa p#bbb a.ccc") # => Iterator(Myhtml::Node), methods: .each, .to_a, ... 13 | 14 | iterator.each do |node| 15 | p node.tag_id # MyHTML_TAG_A 16 | p node.tag_name # "a" 17 | p node.tag_sym # :a 18 | p node.attributes["href"]? # "http://..." 19 | p node.inner_text # "bla" 20 | puts node.to_html # bla 21 | end 22 | 23 | # css select from node scope 24 | if p_node = myhtml.css("div.aaa p#bbb").first? 25 | p_node.css("a.ccc").each do |node| 26 | p node.tag_sym # :a 27 | end 28 | end 29 | -------------------------------------------------------------------------------- /examples/example2.cr: -------------------------------------------------------------------------------- 1 | require "../src/modest" 2 | 3 | html = <<-PAGE 4 |
5 |

6 |

7 |

8 | link1 9 | link2 10 |

11 |

12 |

13 |

14 |

15 |
16 | PAGE 17 | 18 | parser = Myhtml::Parser.new(html) 19 | 20 | # select all p nodes which id like `*p*` 21 | p parser.css("p[id*=p]").map(&.attribute_by("id")).to_a # => ["p1", "p2", "p3", "p4", "p5", "p6"] 22 | 23 | # select all nodes with class "jo" 24 | p parser.css("p.jo").map(&.attribute_by("id")).to_a # => ["p2", "p4", "p6"] 25 | p parser.css(".jo").map(&.attribute_by("id")).to_a # => ["p2", "p4", "p6"] 26 | 27 | # select odd child tag inside div, which not contain a 28 | p parser.css("div > :nth-child(2n+1):not(:has(a))").map(&.attribute_by("id")).to_a # => ["p1", "p4", "p6"] 29 | 30 | # all elements with class=jo inside last div tag 31 | p parser.css("div").to_a.last.css(".jo").map(&.attribute_by("id")).to_a # => ["p4", "p6"] 32 | 33 | # a element with href ends like .png 34 | p parser.css(%q{a[href$=".png"]}).map(&.attribute_by("id")).to_a # => ["a2"] 35 | 36 | # find all a tags inside

, which href contain `html` 37 | p parser.css(%q{p[id=p3] > a[href*="html"]}).map(&.attribute_by("id")).to_a # => ["a1"] 38 | 39 | # find all a tags inside

, which href contain `html` or ends_with `.png` 40 | p parser.css(%q{p[id=p3] > a:matches([href *= "html"], [href $= ".png"])}).map(&.attribute_by("id")).to_a # => ["a1", "a2"] 41 | 42 | # create finder and use it in many places, this is faster, than create it many times 43 | finder = Modest::Finder.new(".jo") 44 | p parser.css(finder).map(&.attribute_by("id")).to_a # => ["p2", "p4", "p6"] 45 | -------------------------------------------------------------------------------- /examples/example3.cr: -------------------------------------------------------------------------------- 1 | require "../src/modest" 2 | 3 | html = <<-PAGE 4 | 5 | 6 | 7 |
Hello
8 | 9 | 10 | 11 | 12 | 13 |
123other
foocolumns
barare
xyzignored
14 | 15 | PAGE 16 | 17 | parser = Myhtml::Parser.new(html) 18 | 19 | p parser.css("#t2 tr td:first-child").map(&.inner_text).to_a # => ["123", "foo", "bar", "xyz"] 20 | p parser.css("#t2 tr td:first-child").map(&.to_html).to_a # => ["123", "foo", "bar", "xyz"] 21 | -------------------------------------------------------------------------------- /shard.yml: -------------------------------------------------------------------------------- 1 | name: modest 2 | version: 0.18 3 | 4 | authors: 5 | - Konstantin Makarchev 6 | 7 | dependencies: 8 | myhtml: 9 | github: kostya/myhtml 10 | version: "< 1.0" 11 | 12 | scripts: 13 | postinstall: cd src/ext && make package 14 | 15 | license: MIT 16 | -------------------------------------------------------------------------------- /spec/modest_spec.cr: -------------------------------------------------------------------------------- 1 | require "./spec_helper" 2 | 3 | describe Modest do 4 | it "direct use Finder" do 5 | html = "

link

" 6 | selector = "div > :nth-child(2n+1):not(:has(a))" 7 | 8 | parser = Myhtml::Parser.new(html) 9 | finder = Modest::Finder.new(selector) 10 | nodes = finder.find(parser.html!).to_a 11 | 12 | nodes.size.should eq 2 13 | 14 | n1, n2 = nodes 15 | 16 | n1.tag_name.should eq "p" 17 | n1.attribute_by("id").should eq "p1" 18 | 19 | n2.tag_name.should eq "p" 20 | n2.attribute_by("id").should eq "p5" 21 | end 22 | 23 | it "css for root! node" do 24 | html = "

link

" 25 | 26 | parser = Myhtml::Parser.new(html) 27 | nodes = parser.root!.css("div > :nth-child(2n+1):not(:has(a))").to_a 28 | 29 | nodes.size.should eq 2 30 | 31 | n1, n2 = nodes 32 | 33 | n1.tag_name.should eq "p" 34 | n1.attribute_by("id").should eq "p1" 35 | 36 | n2.tag_name.should eq "p" 37 | n2.attribute_by("id").should eq "p5" 38 | end 39 | 40 | it "another rule" do 41 | html = "

link

" 42 | 43 | parser = Myhtml::Parser.new(html) 44 | parser.root!.css(".jo").to_a.map(&.attribute_by("id")).should eq %w(p2 p4 p6) 45 | end 46 | 47 | it "another rule for parser itself" do 48 | html = "

link

" 49 | 50 | parser = Myhtml::Parser.new(html) 51 | parser.css(".jo").to_a.map(&.attribute_by("id")).should eq %w(p2 p4 p6) 52 | end 53 | 54 | it "work for another scope node" do 55 | html = "

link

" 56 | 57 | parser = Myhtml::Parser.new(html) 58 | parser.nodes(:div).to_a.last.css(".jo").to_a.map(&.attribute_by("id")).should eq %w(p4 p6) 59 | parser.nodes(:div).to_a.first.css(".jo").to_a.map(&.attribute_by("id")).should eq %w(p2 p4 p6) 60 | end 61 | 62 | context "build finder" do 63 | it "for parser" do 64 | html = "

link

" 65 | 66 | parser = Myhtml::Parser.new(html) 67 | finder = Modest::Finder.new(".jo") 68 | 69 | 10.times do 70 | parser.root!.css(finder).to_a.map(&.attribute_by("id")).should eq %w(p2 p4 p6) 71 | end 72 | 73 | finder.inspect.should eq "Modest::Finder(rule: `.jo`)" 74 | end 75 | 76 | it "for parser" do 77 | html = "

link

" 78 | 79 | parser = Myhtml::Parser.new(html) 80 | finder = Modest::Finder.new(".jo") 81 | 82 | 10.times do 83 | parser.css(finder).to_a.map(&.attribute_by("id")).should eq %w(p2 p4 p6) 84 | end 85 | end 86 | 87 | it "for root node" do 88 | html = "

link

" 89 | 90 | parser = Myhtml::Parser.new(html) 91 | finder = Modest::Finder.new(".jo") 92 | 93 | 10.times do 94 | parser.root!.css(finder).to_a.map(&.attribute_by("id")).should eq %w(p2 p4 p6) 95 | end 96 | end 97 | end 98 | 99 | it "should not raise on empty selector" do 100 | html = "

link

" 101 | 102 | parser = Myhtml::Parser.new(html) 103 | finder = Modest::Finder.new("") 104 | parser.css(finder).to_a.size.should eq 0 105 | end 106 | 107 | it "integration test" do 108 | html = <<-PAGE 109 |
110 |

111 |

112 |

113 | link1 114 | link2 115 |

116 |

117 |

118 |

119 |

120 |
121 | PAGE 122 | 123 | parser = Myhtml::Parser.new(html) 124 | 125 | # select all p nodes which id like `*p*` 126 | parser.css("p[id*=p]").map(&.attribute_by("id")).to_a.should eq ["p1", "p2", "p3", "p4", "p5", "p6"] 127 | 128 | # select all nodes with class "jo" 129 | parser.css("p.jo").map(&.attribute_by("id")).to_a.should eq ["p2", "p4", "p6"] 130 | parser.css(".jo").map(&.attribute_by("id")).to_a.should eq ["p2", "p4", "p6"] 131 | 132 | # select odd child tag inside div, which not contain a 133 | parser.css("div > :nth-child(2n+1):not(:has(a))").map(&.attribute_by("id")).to_a.should eq ["p1", "p4", "p6"] 134 | 135 | # all elements with class=jo inside last div tag 136 | parser.css("div").to_a.last.css(".jo").map(&.attribute_by("id")).to_a.should eq ["p4", "p6"] 137 | 138 | # a element with href ends like .png 139 | parser.css(%q{a[href$=".png"]}).map(&.attribute_by("id")).to_a.should eq ["a2"] 140 | 141 | # find all a tags inside

, which href contain `html` 142 | parser.css(%q{p[id=p3] > a[href*="html"]}).map(&.attribute_by("id")).to_a.should eq ["a1"] 143 | 144 | # find all a tags inside

, which href contain `html` or ends_with `.png` 145 | parser.css(%q{p[id=p3] > a:matches([href *= "html"], [href $= ".png"])}).map(&.attribute_by("id")).to_a.should eq ["a1", "a2"] 146 | 147 | # create finder and use it in many places 148 | finder = Modest::Finder.new(".jo") 149 | parser.css(finder).map(&.attribute_by("id")).to_a.should eq ["p2", "p4", "p6"] 150 | end 151 | 152 | it "integration test2" do 153 | html = <<-PAGE 154 | 155 | 156 | 157 |
Hello
158 | 159 | 160 | 161 | 162 | 163 |
123other
foocolumns
barare
xyzignored
164 | 165 | PAGE 166 | 167 | parser = Myhtml::Parser.new(html) 168 | parser.css("#t2 tr td:first-child").map(&.inner_text).to_a.should eq ["123", "foo", "bar", "xyz"] 169 | parser.css("#t2 tr td:first-child").map(&.to_html).to_a.should eq ["123", "foo", "bar", "xyz"] 170 | 171 | res = [] of String 172 | parser.css("#t2 tr").each do |node| 173 | res << node.css("td:first-child").first.inner_text 174 | end 175 | res.join('|').should eq "123|foo|bar|xyz" 176 | end 177 | 178 | it "not sigfaulting on more than 1024 elements" do 179 | str = "" + "

ooo
" * 20000 + "" 180 | parser = Myhtml::Parser.new(str) 181 | 182 | c = 0 183 | x = 0 184 | parser.css("div").each do |node| 185 | x += 1 186 | c += 1 if node.attribute_by("class") == "A" 187 | end 188 | x.should eq 20000 189 | c.should eq 20000 190 | end 191 | 192 | it "bug in css" do 193 | parser = Myhtml::Parser.new(%q{
bla
}) 194 | parser.css("div.jjjj").to_a.size.should eq 0 195 | end 196 | 197 | it "css with yield" do 198 | parser = Myhtml::Parser.new(%q{
bla
}) 199 | parser.css("div.jjjj") { |col| col.to_a.size }.should eq 1 200 | end 201 | end 202 | -------------------------------------------------------------------------------- /spec/spec_helper.cr: -------------------------------------------------------------------------------- 1 | require "spec" 2 | require "../src/modest" 3 | -------------------------------------------------------------------------------- /src/ext/Makefile: -------------------------------------------------------------------------------- 1 | # Modest commit that the ./modest-c rule checks out. 2 | REV := bcc92e72aeda7791d14f66c3c61067086ed89229 3 | 4 | .PHONY: package 5 | package: ./modest-c/lib/libmodest_static.a 6 | 7 | # Clone the Modest sources and pin them to $(REV). 8 | ./modest-c: 9 | git clone https://github.com/lexborisov/Modest.git ./modest-c 10 | cd modest-c && git reset --hard $(REV) 11 | 12 | # Build the static archive; $(MAKE) lets the sub-make inherit the parent's flags. 13 | ./modest-c/lib/libmodest_static.a: ./modest-c 14 | cd modest-c && $(MAKE) static MyHTML_BUILD_SHARED=OFF MyCORE_BUILD_WITHOUT_THREADS=YES PROJECT_OPTIMIZATION_LEVEL=-O3 15 | 16 | .PHONY: clean 17 | clean: 18 | rm -rf modest-c 19 | -------------------------------------------------------------------------------- /src/modest.cr: -------------------------------------------------------------------------------- 1 | require "myhtml" 2 | require "./modest/*" 3 | 4 | module Modest 5 | VERSION = "0.18" 6 | end 7 | -------------------------------------------------------------------------------- /src/modest/extend_myhtml.cr: -------------------------------------------------------------------------------- 1 | require "myhtml" 2 | 3 | struct Myhtml::Parser 4 | delegate :css, to: root!
5 | end 6 | 7 | struct Myhtml::Node 8 | # Selects with a one-off finder built from *rule*; the finder is freed before returning. 9 | def css(rule : String) 10 | f = Modest::Finder.new(rule) 11 | css(f) 12 | ensure 13 | f.try &.free 14 | end 15 | 16 | # Selects with a prebuilt *finder*, which is left untouched so it can be reused. 17 | def css(finder : Modest::Finder) 18 | finder.find(self) 19 | end 20 | 21 | # Block form: yields the result of css(arg) and frees it once the block returns or raises. 22 | def css(arg) 23 | collection = css(arg) 24 | yield collection 25 | ensure 26 | collection.try &.free 27 | end 28 | end 29 | -------------------------------------------------------------------------------- /src/modest/finder.cr: -------------------------------------------------------------------------------- 1 | # CSS selector parsed once from a rule string and runnable against many nodes. 2 | # 3 | # Holds native finder, mycss and selectors-list handles; they are destroyed by `free`, which `finalize` also calls. 4 | class Modest::Finder 5 | @finder : LibModest::ModestFinderT* 6 | 7 | # Parses *rule* into a selectors list. 8 | # 9 | # Raises `Myhtml::Error` when the parse status is not OK; the native handles are freed first. 10 | def initialize(@rule : String) 11 | @finalized = false 12 | @finder = LibModest.finder_create_simple 13 | @css = Mycss.new 14 | @selectors = Modest::LibMyCss.entry_selectors(@css.raw_entry) 15 | @list = LibMyCss.selectors_parse(@selectors, Myhtml::Lib::MyEncodingList::MyENCODING_UTF_8, rule.to_unsafe, rule.bytesize, out status) 16 | if status != LibMyCss::MycssStatusT::MyCSS_STATUS_OK 17 | free 18 | raise Myhtml::Error.new("finder selectors_parse #{status}") 19 | end 20 | end 21 | 22 | # Runs the selector with *scope_node* as the scope and wraps the resulting collection in a `Myhtml::Iterator`. 23 | # 24 | # Raises `Myhtml::Error` if this finder was already freed, instead of handing destroyed pointers to the C library. 25 | def find(scope_node : Myhtml::Node) 26 | raise Myhtml::Error.new("finder already freed") if @finalized 27 | 28 | col = Pointer(Myhtml::Lib::MyhtmlCollectionT).new(0) 29 | # NOTE(review): the returned status is deliberately not checked here; the spec expects an empty rule to yield no nodes without raising. 30 | LibModest.finder_by_selectors_list(@finder, scope_node.node, @list, pointerof(col)) 31 | Myhtml::Iterator.new(scope_node.tree, col) 32 | end 33 | 34 | # Destroys the selectors list, the native finder and the mycss wrapper; repeated calls do nothing. 35 | def free 36 | unless @finalized 37 | @finalized = true 38 | LibMyCss.selectors_list_destroy(@selectors, @list, true) 39 | LibModest.finder_destroy(@finder, true) 40 | @css.free 41 | end 42 | end 43 | 44 | # Delegates to `free`, so the handles are released even when `free` was never called explicitly. 45 | def finalize 46 | free 47 | end 48 | 49 | def inspect(io) 50 | io << "Modest::Finder(rule: `" 51 | io << @rule 52 | io << "`)" 53 | end 54 | end 55 | -------------------------------------------------------------------------------- /src/modest/lib.cr: -------------------------------------------------------------------------------- 1 | module Modest 2 | @[Link(ldflags: "#{__DIR__}/../ext/modest-c/lib/libmodest_static.a")] 3 | lib LibMyCss 4 |
enum MycssStatusT 5 | MyCSS_STATUS_OK = 0x000000 6 | MyCSS_STATUS_ERROR_MEMORY_ALLOCATION = 0x010001 7 | MyCSS_STATUS_ERROR_TOKENIZER_STATE_ALLOCATION = 0x010020 8 | MyCSS_STATUS_ERROR_TOKENIZER_INCOMING_BUFFER_ADD = 0x010021 9 | MyCSS_STATUS_ERROR_TOKENIZER_TOKEN_ALLOCATION = 0x010022 10 | MyCSS_STATUS_ERROR_INCOMING_BUFFER_INIT = 0x010030 11 | MyCSS_STATUS_ERROR_ENTRY_INCOMING_BUFFER_CREATE = 0x010039 12 | MyCSS_STATUS_ERROR_ENTRY_INCOMING_BUFFER_INIT = 0x010040 13 | MyCSS_STATUS_ERROR_ENTRY_TOKEN_INCOMING_BUFFER_INIT = 0x010041 14 | MyCSS_STATUS_ERROR_ENTRY_TOKEN_NODE_ADD = 0x010042 15 | MyCSS_STATUS_ERROR_SELECTORS_CREATE = 0x010100 16 | MyCSS_STATUS_ERROR_SELECTORS_ENTRIES_CREATE = 0x010101 17 | MyCSS_STATUS_ERROR_SELECTORS_ENTRIES_INIT = 0x010102 18 | MyCSS_STATUS_ERROR_SELECTORS_ENTRIES_NODE_ADD = 0x010103 19 | MyCSS_STATUS_ERROR_SELECTORS_LIST_CREATE = 0x010104 20 | MyCSS_STATUS_ERROR_SELECTORS_LIST_INIT = 0x010105 21 | MyCSS_STATUS_ERROR_SELECTORS_LIST_ADD_NODE = 0x010106 22 | MyCSS_STATUS_ERROR_NAMESPACE_CREATE = 0x010200 23 | MyCSS_STATUS_ERROR_NAMESPACE_INIT = 0x010201 24 | MyCSS_STATUS_ERROR_NAMESPACE_ENTRIES_CREATE = 0x010202 25 | MyCSS_STATUS_ERROR_NAMESPACE_ENTRIES_INIT = 0x010203 26 | MyCSS_STATUS_ERROR_NAMESPACE_NODE_ADD = 0x010204 27 | MyCSS_STATUS_ERROR_MEDIA_CREATE = 0x010404 28 | MyCSS_STATUS_ERROR_STRING_CREATE = 0x010501 29 | MyCSS_STATUS_ERROR_STRING_INIT = 0x010502 30 | MyCSS_STATUS_ERROR_STRING_NODE_INIT = 0x010503 31 | MyCSS_STATUS_ERROR_AN_PLUS_B_CREATE = 0x010600 32 | MyCSS_STATUS_ERROR_AN_PLUS_B_INIT = 0x010601 33 | MyCSS_STATUS_ERROR_DECLARATION_CREATE = 0x010700 34 | MyCSS_STATUS_ERROR_DECLARATION_INIT = 0x010701 35 | MyCSS_STATUS_ERROR_DECLARATION_ENTRY_CREATE = 0x010702 36 | MyCSS_STATUS_ERROR_DECLARATION_ENTRY_INIT = 0x010703 37 | MyCSS_STATUS_ERROR_PARSER_LIST_CREATE = 0x010800 38 | end 39 | 40 | type MycssT = Void* 41 | type MycssEntryT = Void* 42 | type MysccSelectorsListT = Void* 43 | type MysccSelectorsT = Void* 44 | 45 | 
fun create = mycss_create : MycssT* 46 | fun init = mycss_init(mycss : MycssT*) : MycssStatusT 47 | fun entry_create = mycss_entry_create : MycssEntryT* 48 | fun entry_init = mycss_entry_init(mycss : MycssT*, entry : MycssEntryT*) : MycssStatusT 49 | fun selectors_parse = mycss_selectors_parse(selectors : MysccSelectorsT*, encoding : Myhtml::Lib::MyEncodingList, 50 | data : UInt8*, data_size : LibC::SizeT, out_status : MycssStatusT*) : MysccSelectorsListT* 51 | fun selectors_list_destroy = mycss_selectors_list_destroy(selectors : MysccSelectorsT*, selector_list : MysccSelectorsListT*, self_destroy : Bool) : MysccSelectorsListT* 52 | 53 | fun entry_selectors = mycss_entry_selectors(entry : MycssEntryT*) : MysccSelectorsT* 54 | fun destroy = mycss_destroy(mycss : MycssT*, self_destroy : Bool) : MycssT* 55 | fun entry_destroy = mycss_entry_destroy(entry : MycssEntryT*, self_destroy : Bool) : MycssEntryT* 56 | end 57 | 58 | # The static archive linked below has to be built first: cd src/ext && make 59 | @[Link(ldflags: "#{__DIR__}/../ext/modest-c/lib/libmodest_static.a")] 60 | lib LibModest 61 | # modest: handle type taken by the finder_* functions below 62 | type ModestFinderT = Void* 63 | 64 | enum ModestStatusT 65 | MODEST_STATUS_OK = 0x000000 66 | MODEST_STATUS_ERROR = 0x020000 67 | MODEST_STATUS_ERROR_MEMORY_ALLOCATION = 0x020001 68 | end 69 | 70 | fun finder_create_simple = modest_finder_create_simple : ModestFinderT* 71 | fun finder_destroy = modest_finder_destroy(finder : ModestFinderT*, self_destroy : Bool) : ModestFinderT* 72 | 73 | fun finder_by_selectors_list = modest_finder_by_selectors_list(finder : ModestFinderT*, 74 | scope_node : Myhtml::Lib::MyhtmlTreeNodeT*, 75 | sel_list : LibMyCss::MysccSelectorsListT*, 76 | collection : Myhtml::Lib::MyhtmlCollectionT**) : ModestStatusT 77 | end 78 | end 79 | -------------------------------------------------------------------------------- /src/modest/mycss.cr: -------------------------------------------------------------------------------- 1 | class Modest::Mycss 2 | getter raw_mycss, raw_entry 3 | 4 | def initialize 5
| @finalized = false 6 | ok = LibMyCss::MycssStatusT::MyCSS_STATUS_OK 7 | 8 | @raw_mycss = LibMyCss.create 9 | init_status = LibMyCss.init(@raw_mycss) 10 | unless init_status == ok 11 | # No entry has been created yet, so only the mycss object is destroyed here. 12 | LibMyCss.destroy(@raw_mycss, true) 13 | raise Myhtml::Error.new("mycss init error #{init_status}") 14 | end 15 | 16 | @raw_entry = LibMyCss.entry_create 17 | entry_status = LibMyCss.entry_init(@raw_mycss, @raw_entry) 18 | unless entry_status == ok 19 | # Both handles exist at this point, so the regular cleanup applies. 20 | free 21 | raise Myhtml::Error.new("mycss entry_init error #{entry_status}") 22 | end 23 | end 24 | 25 | # Destroys the entry and then the mycss object; later calls do nothing. 26 | def free 27 | return if @finalized 28 | 29 | @finalized = true 30 | LibMyCss.entry_destroy(@raw_entry, true) 31 | LibMyCss.destroy(@raw_mycss, true) 32 | end 33 | 34 | # Delegates to `free`. 35 | def finalize 36 | free 37 | end 38 | end 39 | --------------------------------------------------------------------------------