├── .gitignore
├── .travis.yml
├── LICENSE
├── Makefile
├── README.md
├── bench
├── .gitignore
├── Gemfile
├── Gemfile.lock
├── README.md
├── build.sh
├── run.sh
├── shard.lock
├── shard.yml
├── test-libxml.cr
├── test-libxml.rb
├── test-libxml2.cr
├── test-libxml2.rb
├── test-myhtml.cr
├── test-myhtml2.cr
└── xtime.rb
├── examples
├── example1.cr
├── example2.cr
└── example3.cr
├── shard.yml
├── spec
├── modest_spec.cr
└── spec_helper.cr
└── src
├── ext
└── Makefile
├── modest.cr
└── modest
├── extend_myhtml.cr
├── finder.cr
├── lib.cr
└── mycss.cr
/.gitignore:
--------------------------------------------------------------------------------
1 | /doc/
2 | /libs/
3 | /lib/
4 | /.crystal/
5 | /.shards/
6 | modest-c
7 |
8 |
9 | # Libraries don't need dependency lock
10 | # Dependencies will be locked in application that uses them
11 | /shard.lock
12 |
13 | bin_*
14 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: crystal
2 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2016 Konstantin Makarchev
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Build driver for the example binaries, the bundled Modest static library and the specs.
2 | # CRYSTAL and CRYSTALFLAGS can be overridden from the environment or the command line.
3 | CRYSTAL ?= crystal
4 | CRYSTALFLAGS ?= --release
5 |
6 | # Static archive produced by src/ext/Makefile.
7 | MODEST_LIB := src/ext/modest-c/lib/libmodest_static.a
8 |
9 | .PHONY: all package spec
10 | all: bin_example1 bin_example2 bin_example3
11 | package: $(MODEST_LIB)
12 |
13 | libs:
14 | 	$(CRYSTAL) deps
15 |
16 | # The examples depend on the archive file itself, not on the phony `package`
17 | # target: a phony prerequisite is always out of date and forces a rebuild on every run.
18 | bin_example1: src/*.cr src/**/*.cr examples/example1.cr $(MODEST_LIB)
19 | 	$(CRYSTAL) build examples/example1.cr $(CRYSTALFLAGS) -o $@
20 |
21 | bin_example2: src/*.cr src/**/*.cr examples/example2.cr $(MODEST_LIB)
22 | 	$(CRYSTAL) build examples/example2.cr $(CRYSTALFLAGS) -o $@
23 |
24 | bin_example3: src/*.cr src/**/*.cr examples/example3.cr $(MODEST_LIB)
25 | 	$(CRYSTAL) build examples/example3.cr $(CRYSTALFLAGS) -o $@
26 |
27 | # $(MAKE) instead of a bare `make` so the sub-make inherits the parent's flags.
28 | $(MODEST_LIB):
29 | 	cd src/ext && $(MAKE) package
30 |
31 | spec:
32 | 	$(CRYSTAL) spec
33 |
34 | .PHONY: clean
35 | clean:
36 | 	rm -f bin_* $(MODEST_LIB)
37 | 	rm -rf src/ext/modest-c
38 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## WARNING: this shard is obsolete; its functionality has moved into [myhtml](https://github.com/kostya/myhtml) directly. Use [myhtml](https://github.com/kostya/myhtml) >= 1.0.0 instead.
2 |
3 | # modest
4 |
5 | CSS selectors for HTML5 Parser [myhtml](https://github.com/kostya/myhtml) (Crystal wrapper for https://github.com/lexborisov/Modest).
6 |
7 | ## Installation
8 |
9 |
10 | Add this to your application's `shard.yml`:
11 |
12 | ```yaml
13 | dependencies:
14 | modest:
15 | github: kostya/modest
16 | ```
17 |
18 | ## Usage of CSS Selectors with myhtml parser
19 |
20 | ```crystal
21 | require "modest"
22 |
23 | page = <<-PAGE
24 |
25 |
26 |
27 | PAGE
28 |
29 | myhtml = Myhtml::Parser.new(page)
30 |
31 | # css select from the root! scope (equal with myhtml.root!.css("..."))
32 | iterator = myhtml.css("div.aaa p#bbb a.ccc") # => Iterator(Myhtml::Node), methods: .each, .to_a, ...
33 |
34 | iterator.each do |node|
35 | p node.tag_id # MyHTML_TAG_A
36 | p node.tag_name # "a"
37 | p node.tag_sym # :a
38 | p node.attributes["href"]? # "http://..."
39 | p node.inner_text # "bla"
40 | puts node.to_html # bla
41 | end
42 |
43 | # css select from node scope
44 | if p_node = myhtml.css("div.aaa p#bbb").first?
45 | p_node.css("a.ccc").each do |node|
46 | p node.tag_sym # :a
47 | end
48 | end
49 |
50 | ```
51 |
52 | ## Example 2
53 |
54 | ```crystal
55 | require "modest"
56 |
57 | html = <<-PAGE
58 |
70 | PAGE
71 |
72 | parser = Myhtml::Parser.new(html)
73 |
74 | # select all p nodes which id like `*p*`
75 | p parser.css("p[id*=p]").map(&.attribute_by("id")).to_a # => ["p1", "p2", "p3", "p4", "p5", "p6"]
76 |
77 | # select all nodes with class "jo"
78 | p parser.css("p.jo").map(&.attribute_by("id")).to_a # => ["p2", "p4", "p6"]
79 | p parser.css(".jo").map(&.attribute_by("id")).to_a # => ["p2", "p4", "p6"]
80 |
81 | # select odd child tag inside div, which not contain a
82 | p parser.css("div > :nth-child(2n+1):not(:has(a))").map(&.attribute_by("id")).to_a # => ["p1", "p4", "p6"]
83 |
84 | # all elements with class=jo inside last div tag
85 | p parser.css("div").to_a.last.css(".jo").map(&.attribute_by("id")).to_a # => ["p4", "p6"]
86 |
87 | # a element with href ends like .png
88 | p parser.css(%q{a[href$=".png"]}).map(&.attribute_by("id")).to_a # => ["a2"]
89 |
90 | # find all a tags inside <p id=p3>, whose href contains `html`
91 | p parser.css(%q{p[id=p3] > a[href*="html"]}).map(&.attribute_by("id")).to_a # => ["a1"]
92 |
93 | # find all a tags inside <p id=p3>, whose href contains `html` or ends with `.png`
94 | p parser.css(%q{p[id=p3] > a:matches([href *= "html"], [href $= ".png"])}).map(&.attribute_by("id")).to_a # => ["a1", "a2"]
95 |
96 | # create finder and use it in many places, this is faster, than create it many times
97 | finder = Modest::Finder.new(".jo")
98 | p parser.css(finder).map(&.attribute_by("id")).to_a # => ["p2", "p4", "p6"]
99 | ```
100 |
101 | ## Example 3
102 | ```crystal
103 | require "modest"
104 |
105 | html = <<-PAGE
106 |
107 |
110 |
111 | 123 | other |
112 | foo | columns |
113 | bar | are |
114 | xyz | ignored |
115 |
116 |
117 | PAGE
118 |
119 | parser = Myhtml::Parser.new(html)
120 |
121 | p parser.css("#t2 tr td:first-child").map(&.inner_text).to_a # => ["123", "foo", "bar", "xyz"]
122 | p parser.css("#t2 tr td:first-child").map(&.to_html).to_a # => ["123 | ", "foo | ", "bar | ", "xyz | "]
123 | ```
124 |
125 | ## Benchmark
126 |
127 | Comparison with Nokogiri (libxml) and Crystagiri (libxml): each benchmark processes a Google results page 1000 times. Code: https://github.com/kostya/modest/tree/master/bench
128 |
129 | ```crystal
130 | require "modest"
131 | page = File.read("./google.html")
132 | s = 0
133 | links = [] of String
134 | 1000.times do
135 | myhtml = Myhtml::Parser.new(page)
136 | links = myhtml.css("div.g h3.r a").map(&.attribute_by("href")).to_a
137 | s += links.size
138 | myhtml.free
139 | end
140 | p links.last
141 | p s
142 | ```
143 |
144 | Parse + Selectors
145 |
146 | | Lang | Package | Time, s | Memory, MiB |
147 | | -------- | ------------------ | ------- | ----------- |
148 | | Crystal | modest(myhtml) | 2.52 | 7.7 |
149 | | Crystal | Crystagiri(LibXML) | 19.89 | 14.3 |
150 | | Ruby 2.2 | Nokogiri(LibXML) | 45.05 | 136.2 |
151 |
152 | Selectors Only (files with suffix 2)
153 |
154 | | Lang | Package | Time, s | Memory, MiB |
155 | | -------- | ------------------ | ------- | ----------- |
156 | | Crystal | modest(myhtml) | 0.18 | 4.6 |
157 | | Crystal | Crystagiri(LibXML) | 12.30 | 6.6 |
158 | | Ruby 2.2 | Nokogiri(LibXML) | 28.06 | 68.8 |
159 |
160 |
161 | ## CSS Selectors rules
162 | https://drafts.csswg.org/selectors-4/
163 |
--------------------------------------------------------------------------------
/bench/.gitignore:
--------------------------------------------------------------------------------
1 | .shards
2 | lib
3 | *.html
4 |
--------------------------------------------------------------------------------
/bench/Gemfile:
--------------------------------------------------------------------------------
1 | source :rubygems
2 |
3 | gem 'nokogiri'
4 |
--------------------------------------------------------------------------------
/bench/Gemfile.lock:
--------------------------------------------------------------------------------
1 | GEM
2 | remote: http://rubygems.org/
3 | specs:
4 | mini_portile2 (2.2.0)
5 | nokogiri (1.8.0)
6 | mini_portile2 (~> 2.2.0)
7 |
8 | PLATFORMS
9 | ruby
10 |
11 | DEPENDENCIES
12 | nokogiri
13 |
14 | BUNDLED WITH
15 | 1.13.6
16 |
--------------------------------------------------------------------------------
/bench/README.md:
--------------------------------------------------------------------------------
1 | To compile all: `sh build.sh`
2 |
3 | To run all: `sh run.sh`
--------------------------------------------------------------------------------
/bench/build.sh:
--------------------------------------------------------------------------------
1 | curl -s 'https://www.google.ru/search?client=opera&q=html+parsers&sourceid=opera&ie=UTF-8&oe=UTF-8&num=100' -A 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET4.0C)' > google.html # download the page that every benchmark reads as ./google.html
2 | bundle # install the gems listed in the Gemfile
3 | crystal deps # fetch the shards listed in shard.yml
4 | crystal build test-libxml.cr --release -o bin_test_libxml --no-debug # the bin_* names below are the ones run.sh executes
5 | crystal build test-myhtml.cr --release -o bin_test_myhtml --no-debug
6 | crystal build test-libxml2.cr --release -o bin_test_libxml2 --no-debug
7 | crystal build test-myhtml2.cr --release -o bin_test_myhtml2 --no-debug
8 |
--------------------------------------------------------------------------------
/bench/run.sh:
--------------------------------------------------------------------------------
1 | echo "============= Parsing + Selectors ==================" # first group: the scripts without the "2" suffix
2 | echo "Crystagiri(LibXML)"
3 | ./xtime.rb ./bin_test_libxml # xtime.rb prints elapsed time and peak memory to stderr
4 | echo "modest(myhtml)"
5 | ./xtime.rb ./bin_test_myhtml
6 | echo "Nokogiri(LibXML)"
7 | ./xtime.rb ruby test-libxml.rb
8 |
9 | echo "============= Selectors Only ==================" # second group: the "2" scripts, which parse once before the loop
10 | echo "Crystagiri(LibXML)"
11 | ./xtime.rb ./bin_test_libxml2
12 | echo "modest(myhtml)"
13 | ./xtime.rb ./bin_test_myhtml2
14 | echo "Nokogiri(LibXML)"
15 | ./xtime.rb ruby test-libxml2.rb
16 |
--------------------------------------------------------------------------------
/bench/shard.lock:
--------------------------------------------------------------------------------
1 | version: 1.0
2 | shards:
3 | crystagiri:
4 | github: madeindjs/crystagiri
5 | version: 0.3.2
6 |
7 | modest:
8 | github: kostya/modest
9 | version: 0.13
10 |
11 | myhtml:
12 | github: kostya/myhtml
13 | version: 0.26
14 |
15 |
--------------------------------------------------------------------------------
/bench/shard.yml:
--------------------------------------------------------------------------------
1 | name: bench-html-parsers
2 | version: 0.1.0
3 |
4 | authors:
5 | - Konstantin Makarchev
6 |
7 | targets:
8 | bla:
9 | main: src/bla.cr
10 |
11 | crystal: 0.21
12 |
13 | dependencies:
14 | myhtml:
15 | github: kostya/myhtml
16 | modest:
17 | github: kostya/modest
18 | crystagiri:
19 | github: madeindjs/crystagiri
20 |
21 | license: MIT
22 |
--------------------------------------------------------------------------------
/bench/test-libxml.cr:
--------------------------------------------------------------------------------
1 | require "crystagiri" # benchmark: Crystagiri parse + CSS select, repeated 1000 times
2 |
3 | page = File.read("./google.html") # written by build.sh
4 |
5 | s = 0 # running total of matched links over all iterations
6 | links = Array(String).new(initial_capacity: 100)
7 | 1000.times do
8 | links.clear # the same array is reused on every iteration
9 | doc = Crystagiri::HTML.new page # the page is re-parsed on every iteration
10 | doc.css("div.g h3.r a") { |tag| links << tag.node["href"].not_nil! }
11 | s += links.size
12 | end
13 | p links.last # last link from the final iteration
14 | p s
15 |
--------------------------------------------------------------------------------
/bench/test-libxml.rb:
--------------------------------------------------------------------------------
1 | require "bundler/setup" # benchmark: Nokogiri parse + CSS select, repeated 1000 times
2 | require "nokogiri"
3 |
4 | page = File.read("./google.html") # written by build.sh
5 |
6 | s = 0 # running total of matched links over all iterations
7 | links = []
8 | 1000.times do
9 | doc = Nokogiri::HTML(page) # the page is re-parsed on every iteration
10 | links = doc.css("div.g h3.r a").map { |link| link["href"] }
11 | s += links.size
12 | end
13 | p links.last # last link from the final iteration
14 | p s
15 |
--------------------------------------------------------------------------------
/bench/test-libxml2.cr:
--------------------------------------------------------------------------------
1 | require "crystagiri" # benchmark: Crystagiri CSS select only, repeated 1000 times
2 |
3 | page = File.read("./google.html") # written by build.sh
4 |
5 | s = 0 # running total of matched links over all iterations
6 | links = Array(String).new(initial_capacity: 100)
7 | doc = Crystagiri::HTML.new page # parsed once, outside the loop
8 | 1000.times do
9 | links.clear # the same array is reused on every iteration
10 | doc.css("div.g h3.r a") { |tag| links << tag.node["href"].not_nil! }
11 | s += links.size
12 | end
13 | p s
14 |
--------------------------------------------------------------------------------
/bench/test-libxml2.rb:
--------------------------------------------------------------------------------
1 | require "bundler/setup" # benchmark: Nokogiri CSS select only, repeated 1000 times
2 | require "nokogiri"
3 |
4 | page = File.read("./google.html") # written by build.sh
5 |
6 | s = 0 # running total of matched links over all iterations
7 | doc = Nokogiri::HTML(page) # parsed once, outside the loop
8 | 1000.times do
9 | links = doc.css("div.g h3.r a").map { |link| link["href"] }
10 | s += links.size
11 | end
12 | p s
13 |
--------------------------------------------------------------------------------
/bench/test-myhtml.cr:
--------------------------------------------------------------------------------
1 | require "modest" # benchmark: myhtml parse + modest CSS select, repeated 1000 times
2 |
3 | page = File.read("./google.html") # written by build.sh
4 |
5 | s = 0 # running total of matched links over all iterations
6 | links = [] of String
7 | 1000.times do
8 | myhtml = Myhtml::Parser.new(page) # the page is re-parsed on every iteration
9 | links = myhtml.css("div.g h3.r a").map(&.attribute_by("href")).to_a
10 | s += links.size
11 | myhtml.free # release the parser explicitly at the end of each iteration
12 | end
13 | p links.last # last link from the final iteration
14 | p s
15 |
--------------------------------------------------------------------------------
/bench/test-myhtml2.cr:
--------------------------------------------------------------------------------
1 | require "modest" # benchmark: modest CSS select only, repeated 1000 times
2 |
3 | page = File.read("./google.html") # written by build.sh
4 |
5 | s = 0 # running total of matched links over all iterations
6 | links = [] of String
7 | myhtml = Myhtml::Parser.new(page) # parsed once, outside the loop
8 | 1000.times do
9 | links = myhtml.css("div.g h3.r a").map(&.attribute_by("href")).to_a
10 | s += links.size
11 | end
12 | p s
13 |
--------------------------------------------------------------------------------
/bench/xtime.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | def mem(pid); `ps p #{pid} -o rss`.split.last.to_i; end
3 | t = Time.now
4 | pid = Process.spawn(*ARGV.to_a)
5 | mm = 0
6 |
7 | Thread.new do
8 | mm = mem(pid)
9 | while true
10 | sleep 0.1
11 | m = mem(pid)
12 | mm = m if m > mm
13 | end
14 | end
15 |
16 | Process.waitall
17 | STDERR.puts "%.2fs, %.1fMb" % [Time.now - t, mm / 1024.0]
18 |
19 |
--------------------------------------------------------------------------------
/examples/example1.cr:
--------------------------------------------------------------------------------
1 | require "../src/modest"
2 |
3 | page = <<-PAGE
4 |
5 |
6 |
7 | PAGE
8 |
9 | myhtml = Myhtml::Parser.new(page)
10 |
11 | # css select from the root! scope (equal with myhtml.root!.css("..."))
12 | iterator = myhtml.css("div.aaa p#bbb a.ccc") # => Iterator(Myhtml::Node), methods: .each, .to_a, ...
13 |
14 | iterator.each do |node|
15 | p node.tag_id # MyHTML_TAG_A
16 | p node.tag_name # "a"
17 | p node.tag_sym # :a
18 | p node.attributes["href"]? # "http://..."
19 | p node.inner_text # "bla"
20 | puts node.to_html # bla
21 | end
22 |
23 | # css select from node scope
24 | if p_node = myhtml.css("div.aaa p#bbb").first?
25 | p_node.css("a.ccc").each do |node|
26 | p node.tag_sym # :a
27 | end
28 | end
29 |
--------------------------------------------------------------------------------
/examples/example2.cr:
--------------------------------------------------------------------------------
1 | require "../src/modest"
2 |
3 | html = <<-PAGE
4 |
16 | PAGE
17 |
18 | parser = Myhtml::Parser.new(html)
19 |
20 | # select all p nodes which id like `*p*`
21 | p parser.css("p[id*=p]").map(&.attribute_by("id")).to_a # => ["p1", "p2", "p3", "p4", "p5", "p6"]
22 |
23 | # select all nodes with class "jo"
24 | p parser.css("p.jo").map(&.attribute_by("id")).to_a # => ["p2", "p4", "p6"]
25 | p parser.css(".jo").map(&.attribute_by("id")).to_a # => ["p2", "p4", "p6"]
26 |
27 | # select odd child tag inside div, which not contain a
28 | p parser.css("div > :nth-child(2n+1):not(:has(a))").map(&.attribute_by("id")).to_a # => ["p1", "p4", "p6"]
29 |
30 | # all elements with class=jo inside last div tag
31 | p parser.css("div").to_a.last.css(".jo").map(&.attribute_by("id")).to_a # => ["p4", "p6"]
32 |
33 | # a element with href ends like .png
34 | p parser.css(%q{a[href$=".png"]}).map(&.attribute_by("id")).to_a # => ["a2"]
35 |
36 | # find all a tags inside , which href contain `html`
37 | p parser.css(%q{p[id=p3] > a[href*="html"]}).map(&.attribute_by("id")).to_a # => ["a1"]
38 |
39 | # find all a tags inside
, which href contain `html` or ends_with `.png`
40 | p parser.css(%q{p[id=p3] > a:matches([href *= "html"], [href $= ".png"])}).map(&.attribute_by("id")).to_a # => ["a1", "a2"]
41 |
42 | # create finder and use it in many places, this is faster, than create it many times
43 | finder = Modest::Finder.new(".jo")
44 | p parser.css(finder).map(&.attribute_by("id")).to_a # => ["p2", "p4", "p6"]
45 |
--------------------------------------------------------------------------------
/examples/example3.cr:
--------------------------------------------------------------------------------
1 | require "../src/modest"
2 |
3 | html = <<-PAGE
4 |
5 |
8 |
9 | 123 | other |
10 | foo | columns |
11 | bar | are |
12 | xyz | ignored |
13 |
14 |
15 | PAGE
16 |
17 | parser = Myhtml::Parser.new(html)
18 |
19 | p parser.css("#t2 tr td:first-child").map(&.inner_text).to_a # => ["123", "foo", "bar", "xyz"]
20 | p parser.css("#t2 tr td:first-child").map(&.to_html).to_a # => ["123 | ", "foo | ", "bar | ", "xyz | "]
21 |
--------------------------------------------------------------------------------
/shard.yml:
--------------------------------------------------------------------------------
1 | name: modest
2 | version: 0.18
3 |
4 | authors:
5 | - Konstantin Makarchev
6 |
7 | dependencies:
8 | myhtml:
9 | github: kostya/myhtml
10 | version: "< 1.0"
11 |
12 | scripts:
13 | postinstall: cd src/ext && make package
14 |
15 | license: MIT
16 |
--------------------------------------------------------------------------------
/spec/modest_spec.cr:
--------------------------------------------------------------------------------
1 | require "./spec_helper"
2 |
3 | describe Modest do
4 | it "direct use Finder" do
5 | html = ""
6 | selector = "div > :nth-child(2n+1):not(:has(a))"
7 |
8 | parser = Myhtml::Parser.new(html)
9 | finder = Modest::Finder.new(selector)
10 | nodes = finder.find(parser.html!).to_a
11 |
12 | nodes.size.should eq 2
13 |
14 | n1, n2 = nodes
15 |
16 | n1.tag_name.should eq "p"
17 | n1.attribute_by("id").should eq "p1"
18 |
19 | n2.tag_name.should eq "p"
20 | n2.attribute_by("id").should eq "p5"
21 | end
22 |
23 | it "css for root! node" do
24 | html = ""
25 |
26 | parser = Myhtml::Parser.new(html)
27 | nodes = parser.root!.css("div > :nth-child(2n+1):not(:has(a))").to_a
28 |
29 | nodes.size.should eq 2
30 |
31 | n1, n2 = nodes
32 |
33 | n1.tag_name.should eq "p"
34 | n1.attribute_by("id").should eq "p1"
35 |
36 | n2.tag_name.should eq "p"
37 | n2.attribute_by("id").should eq "p5"
38 | end
39 |
40 | it "another rule" do
41 | html = ""
42 |
43 | parser = Myhtml::Parser.new(html)
44 | parser.root!.css(".jo").to_a.map(&.attribute_by("id")).should eq %w(p2 p4 p6)
45 | end
46 |
47 | it "another rule for parser itself" do
48 | html = ""
49 |
50 | parser = Myhtml::Parser.new(html)
51 | parser.css(".jo").to_a.map(&.attribute_by("id")).should eq %w(p2 p4 p6)
52 | end
53 |
54 | it "work for another scope node" do
55 | html = ""
56 |
57 | parser = Myhtml::Parser.new(html)
58 | parser.nodes(:div).to_a.last.css(".jo").to_a.map(&.attribute_by("id")).should eq %w(p4 p6)
59 | parser.nodes(:div).to_a.first.css(".jo").to_a.map(&.attribute_by("id")).should eq %w(p2 p4 p6)
60 | end
61 |
62 | context "build finder" do
63 | it "for parser" do
64 | html = ""
65 |
66 | parser = Myhtml::Parser.new(html)
67 | finder = Modest::Finder.new(".jo")
68 |
69 | 10.times do
70 | parser.root!.css(finder).to_a.map(&.attribute_by("id")).should eq %w(p2 p4 p6)
71 | end
72 |
73 | finder.inspect.should eq "Modest::Finder(rule: `.jo`)"
74 | end
75 |
76 | it "for parser" do
77 | html = ""
78 |
79 | parser = Myhtml::Parser.new(html)
80 | finder = Modest::Finder.new(".jo")
81 |
82 | 10.times do
83 | parser.css(finder).to_a.map(&.attribute_by("id")).should eq %w(p2 p4 p6)
84 | end
85 | end
86 |
87 | it "for root node" do
88 | html = ""
89 |
90 | parser = Myhtml::Parser.new(html)
91 | finder = Modest::Finder.new(".jo")
92 |
93 | 10.times do
94 | parser.root!.css(finder).to_a.map(&.attribute_by("id")).should eq %w(p2 p4 p6)
95 | end
96 | end
97 | end
98 |
99 | it "should not raise on empty selector" do
100 | html = ""
101 |
102 | parser = Myhtml::Parser.new(html)
103 | finder = Modest::Finder.new("")
104 | parser.css(finder).to_a.size.should eq 0
105 | end
106 |
107 | it "integration test" do
108 | html = <<-PAGE
109 |
110 |
111 |
112 |
113 | link1
114 | link2
115 |
120 |
121 | PAGE
122 |
123 | parser = Myhtml::Parser.new(html)
124 |
125 | # select all p nodes which id like `*p*`
126 | parser.css("p[id*=p]").map(&.attribute_by("id")).to_a.should eq ["p1", "p2", "p3", "p4", "p5", "p6"]
127 |
128 | # select all nodes with class "jo"
129 | parser.css("p.jo").map(&.attribute_by("id")).to_a.should eq ["p2", "p4", "p6"]
130 | parser.css(".jo").map(&.attribute_by("id")).to_a.should eq ["p2", "p4", "p6"]
131 |
132 | # select odd child tag inside div, which not contain a
133 | parser.css("div > :nth-child(2n+1):not(:has(a))").map(&.attribute_by("id")).to_a.should eq ["p1", "p4", "p6"]
134 |
135 | # all elements with class=jo inside last div tag
136 | parser.css("div").to_a.last.css(".jo").map(&.attribute_by("id")).to_a.should eq ["p4", "p6"]
137 |
138 | # a element with href ends like .png
139 | parser.css(%q{a[href$=".png"]}).map(&.attribute_by("id")).to_a.should eq ["a2"]
140 |
141 | # find all a tags inside , which href contain `html`
142 | parser.css(%q{p[id=p3] > a[href*="html"]}).map(&.attribute_by("id")).to_a.should eq ["a1"]
143 |
144 | # find all a tags inside
, which href contain `html` or ends_with `.png`
145 | parser.css(%q{p[id=p3] > a:matches([href *= "html"], [href $= ".png"])}).map(&.attribute_by("id")).to_a.should eq ["a1", "a2"]
146 |
147 | # create finder and use it in many places
148 | finder = Modest::Finder.new(".jo")
149 | parser.css(finder).map(&.attribute_by("id")).to_a.should eq ["p2", "p4", "p6"]
150 | end
151 |
152 | it "integration test2" do
153 | html = <<-PAGE
154 |
155 |
158 |
159 | 123 | other |
160 | foo | columns |
161 | bar | are |
162 | xyz | ignored |
163 |
164 |
165 | PAGE
166 |
167 | parser = Myhtml::Parser.new(html)
168 | parser.css("#t2 tr td:first-child").map(&.inner_text).to_a.should eq ["123", "foo", "bar", "xyz"]
169 | parser.css("#t2 tr td:first-child").map(&.to_html).to_a.should eq ["123 | ", "foo | ", "bar | ", "xyz | "]
170 |
171 | res = [] of String
172 | parser.css("#t2 tr").each do |node|
173 | res << node.css("td:first-child").first.inner_text
174 | end
175 | res.join('|').should eq "123|foo|bar|xyz"
176 | end
177 |
178 | it "not sigfaulting on more than 1024 elements" do
179 | str = "" + "ooo
" * 20000 + ""
180 | parser = Myhtml::Parser.new(str)
181 |
182 | c = 0
183 | x = 0
184 | parser.css("div").each do |node|
185 | x += 1
186 | c += 1 if node.attribute_by("class") == "A"
187 | end
188 | x.should eq 20000
189 | c.should eq 20000
190 | end
191 |
192 | it "bug in css" do
193 | parser = Myhtml::Parser.new(%q{bla
})
194 | parser.css("div.jjjj").to_a.size.should eq 0
195 | end
196 |
197 | it "css with yield" do
198 | parser = Myhtml::Parser.new(%q{bla
})
199 | parser.css("div.jjjj") { |col| col.to_a.size }.should eq 1
200 | end
201 | end
202 |
--------------------------------------------------------------------------------
/spec/spec_helper.cr:
--------------------------------------------------------------------------------
1 | require "spec"
2 | require "../src/modest"
3 |
--------------------------------------------------------------------------------
/src/ext/Makefile:
--------------------------------------------------------------------------------
1 | # Fetches the Modest C library and builds it as a static archive.
2 | # REV pins the upstream commit that gets checked out after cloning.
3 | REV := bcc92e72aeda7791d14f66c3c61067086ed89229
4 |
5 | .PHONY: package
6 | package: ./modest-c/lib/libmodest_static.a
7 |
8 | ./modest-c:
9 | 	git clone https://github.com/lexborisov/Modest.git ./modest-c
10 | 	cd modest-c && git reset --hard $(REV)
11 |
12 | # The checkout is an order-only prerequisite (after `|`): it has to exist, but a
13 | # change in the directory's timestamp must not make the archive look out of date.
14 | # $(MAKE) instead of a bare `make` so the sub-make inherits the parent's flags.
15 | ./modest-c/lib/libmodest_static.a: | ./modest-c
16 | 	cd modest-c && $(MAKE) static MyHTML_BUILD_SHARED=OFF MyCORE_BUILD_WITHOUT_THREADS=YES PROJECT_OPTIMIZATION_LEVEL=-O3
17 |
18 | .PHONY: clean
19 | clean:
20 | 	rm -rf modest-c
21 |
--------------------------------------------------------------------------------
/src/modest.cr:
--------------------------------------------------------------------------------
1 | require "myhtml"
2 | require "./modest/*"
3 |
4 | module Modest # top-level namespace of the shard
5 | VERSION = "0.18" # same value as `version` in shard.yml
6 | end
7 |
--------------------------------------------------------------------------------
/src/modest/extend_myhtml.cr:
--------------------------------------------------------------------------------
1 | require "myhtml"
2 |
3 | struct Myhtml::Parser # reopened to add CSS selector lookup to the parser
4 | delegate :css, to: root! # parser.css(...) is forwarded to root!.css(...)
5 | end
6 |
7 | # CSS selector lookup for a node; the node itself is used as the search scope.
8 | struct Myhtml::Node
9 |   # Builds a temporary `Modest::Finder` for *rule*, runs it against this node
10 |   # and frees the finder before returning the result.
11 |   def css(rule : String)
12 |     finder = Modest::Finder.new(rule)
13 |     begin
14 |       css(finder)
15 |     ensure
16 |       finder.free
17 |     end
18 |   end
19 |
20 |   # Runs an already built *finder* against this node; the finder is not freed here.
21 |   def css(finder : Modest::Finder)
22 |     finder.find(self)
23 |   end
24 |
25 |   # Block form: yields the result of `css(arg)` and frees it once the block
26 |   # finishes, including when the block raises. Returns the block's value.
27 |   def css(arg)
28 |     found = css(arg)
29 |     begin
30 |       yield found
31 |     ensure
32 |       found.free
33 |     end
34 |   end
35 | end
36 |
--------------------------------------------------------------------------------
/src/modest/finder.cr:
--------------------------------------------------------------------------------
1 | # A parsed CSS selector that can be run against Myhtml nodes.
2 | #
3 | # Holds a native Modest finder, a `Mycss` context and the selector list parsed
4 | # from the rule. `free` releases all three and is also called from `finalize`.
5 | class Modest::Finder
6 |   @finder : LibModest::ModestFinderT*
7 |
8 |   # Parses *rule* as a UTF-8 selector string.
9 |   #
10 |   # Raises `Myhtml::Error` when parsing reports a status other than
11 |   # `MyCSS_STATUS_OK`; everything created so far is freed first.
12 |   def initialize(@rule : String)
13 |     @finalized = false
14 |     # Create the Mycss context before the native finder: `Mycss.new` can raise,
15 |     # and raising after the finder was created would leak it.
16 |     @css = Mycss.new
17 |     @finder = LibModest.finder_create_simple
18 |     @selectors = Modest::LibMyCss.entry_selectors(@css.raw_entry)
19 |     @list = LibMyCss.selectors_parse(@selectors, Myhtml::Lib::MyEncodingList::MyENCODING_UTF_8, rule.to_unsafe, rule.bytesize, out status)
20 |     if status != LibMyCss::MycssStatusT::MyCSS_STATUS_OK
21 |       free
22 |       raise Myhtml::Error.new("finder selectors_parse #{status}")
23 |     end
24 |   end
25 |
26 |   # Runs the selector with *scope_node* as the search scope and returns a
27 |   # `Myhtml::Iterator` over the resulting collection.
28 |   #
29 |   # The collection pointer starts out null and is passed by address to the
30 |   # native call. The status returned by that call is not checked.
31 |   def find(scope_node : Myhtml::Node)
32 |     col = Pointer(Myhtml::Lib::MyhtmlCollectionT).new(0)
33 |     LibModest.finder_by_selectors_list(@finder, scope_node.node, @list, pointerof(col))
34 |     Myhtml::Iterator.new(scope_node.tree, col)
35 |   end
36 |
37 |   # Destroys the selector list, the native finder and the Mycss context.
38 |   # Guarded by `@finalized`, so a second call does nothing.
39 |   def free
40 |     unless @finalized
41 |       @finalized = true
42 |       LibMyCss.selectors_list_destroy(@selectors, @list, true)
43 |       LibModest.finder_destroy(@finder, true)
44 |       @css.free
45 |     end
46 |   end
47 |
48 |   def finalize
49 |     free
50 |   end
51 |
52 |   # Writes "Modest::Finder(rule: `<rule>`)" to *io*.
53 |   def inspect(io)
54 |     io << "Modest::Finder(rule: `"
55 |     io << @rule
56 |     io << "`)"
57 |   end
58 | end
59 |
--------------------------------------------------------------------------------
/src/modest/lib.cr:
--------------------------------------------------------------------------------
1 | module Modest # C bindings used by Modest::Mycss and Modest::Finder
2 | @[Link(ldflags: "#{__DIR__}/../ext/modest-c/lib/libmodest_static.a")] # links the static archive built under src/ext
3 | lib LibMyCss # bindings for the mycss_* C functions
4 | enum MycssStatusT # status codes returned by the mycss calls declared below
5 | MyCSS_STATUS_OK = 0x000000
6 | MyCSS_STATUS_ERROR_MEMORY_ALLOCATION = 0x010001
7 | MyCSS_STATUS_ERROR_TOKENIZER_STATE_ALLOCATION = 0x010020
8 | MyCSS_STATUS_ERROR_TOKENIZER_INCOMING_BUFFER_ADD = 0x010021
9 | MyCSS_STATUS_ERROR_TOKENIZER_TOKEN_ALLOCATION = 0x010022
10 | MyCSS_STATUS_ERROR_INCOMING_BUFFER_INIT = 0x010030
11 | MyCSS_STATUS_ERROR_ENTRY_INCOMING_BUFFER_CREATE = 0x010039
12 | MyCSS_STATUS_ERROR_ENTRY_INCOMING_BUFFER_INIT = 0x010040
13 | MyCSS_STATUS_ERROR_ENTRY_TOKEN_INCOMING_BUFFER_INIT = 0x010041
14 | MyCSS_STATUS_ERROR_ENTRY_TOKEN_NODE_ADD = 0x010042
15 | MyCSS_STATUS_ERROR_SELECTORS_CREATE = 0x010100
16 | MyCSS_STATUS_ERROR_SELECTORS_ENTRIES_CREATE = 0x010101
17 | MyCSS_STATUS_ERROR_SELECTORS_ENTRIES_INIT = 0x010102
18 | MyCSS_STATUS_ERROR_SELECTORS_ENTRIES_NODE_ADD = 0x010103
19 | MyCSS_STATUS_ERROR_SELECTORS_LIST_CREATE = 0x010104
20 | MyCSS_STATUS_ERROR_SELECTORS_LIST_INIT = 0x010105
21 | MyCSS_STATUS_ERROR_SELECTORS_LIST_ADD_NODE = 0x010106
22 | MyCSS_STATUS_ERROR_NAMESPACE_CREATE = 0x010200
23 | MyCSS_STATUS_ERROR_NAMESPACE_INIT = 0x010201
24 | MyCSS_STATUS_ERROR_NAMESPACE_ENTRIES_CREATE = 0x010202
25 | MyCSS_STATUS_ERROR_NAMESPACE_ENTRIES_INIT = 0x010203
26 | MyCSS_STATUS_ERROR_NAMESPACE_NODE_ADD = 0x010204
27 | MyCSS_STATUS_ERROR_MEDIA_CREATE = 0x010404
28 | MyCSS_STATUS_ERROR_STRING_CREATE = 0x010501
29 | MyCSS_STATUS_ERROR_STRING_INIT = 0x010502
30 | MyCSS_STATUS_ERROR_STRING_NODE_INIT = 0x010503
31 | MyCSS_STATUS_ERROR_AN_PLUS_B_CREATE = 0x010600
32 | MyCSS_STATUS_ERROR_AN_PLUS_B_INIT = 0x010601
33 | MyCSS_STATUS_ERROR_DECLARATION_CREATE = 0x010700
34 | MyCSS_STATUS_ERROR_DECLARATION_INIT = 0x010701
35 | MyCSS_STATUS_ERROR_DECLARATION_ENTRY_CREATE = 0x010702
36 | MyCSS_STATUS_ERROR_DECLARATION_ENTRY_INIT = 0x010703
37 | MyCSS_STATUS_ERROR_PARSER_LIST_CREATE = 0x010800
38 | end
39 |
40 | type MycssT = Void* # the four types below are declared as Void* and only ever used through pointers
41 | type MycssEntryT = Void*
42 | type MysccSelectorsListT = Void*
43 | type MysccSelectorsT = Void*
44 |
45 | fun create = mycss_create : MycssT* # each `fun a = b` binds Crystal name `a` to C symbol `b`
46 | fun init = mycss_init(mycss : MycssT*) : MycssStatusT
47 | fun entry_create = mycss_entry_create : MycssEntryT*
48 | fun entry_init = mycss_entry_init(mycss : MycssT*, entry : MycssEntryT*) : MycssStatusT
49 | fun selectors_parse = mycss_selectors_parse(selectors : MysccSelectorsT*, encoding : Myhtml::Lib::MyEncodingList,
50 | data : UInt8*, data_size : LibC::SizeT, out_status : MycssStatusT*) : MysccSelectorsListT*
51 | fun selectors_list_destroy = mycss_selectors_list_destroy(selectors : MysccSelectorsT*, selector_list : MysccSelectorsListT*, self_destroy : Bool) : MysccSelectorsListT*
52 |
53 | fun entry_selectors = mycss_entry_selectors(entry : MycssEntryT*) : MysccSelectorsT*
54 | fun destroy = mycss_destroy(mycss : MycssT*, self_destroy : Bool) : MycssT*
55 | fun entry_destroy = mycss_entry_destroy(entry : MycssEntryT*, self_destroy : Bool) : MycssEntryT*
56 | end
57 |
58 | # cd src/ext && make
59 | @[Link(ldflags: "#{__DIR__}/../ext/modest-c/lib/libmodest_static.a")] # same archive as LibMyCss above
60 | lib LibModest # bindings for the modest_finder_* C functions
61 | # modest
62 | type ModestFinderT = Void*
63 |
64 | enum ModestStatusT # return type of finder_by_selectors_list
65 | MODEST_STATUS_OK = 0x000000
66 | MODEST_STATUS_ERROR = 0x020000
67 | MODEST_STATUS_ERROR_MEMORY_ALLOCATION = 0x020001
68 | end
69 |
70 | fun finder_create_simple = modest_finder_create_simple : ModestFinderT*
71 | fun finder_destroy = modest_finder_destroy(finder : ModestFinderT*, self_destroy : Bool) : ModestFinderT*
72 |
73 | fun finder_by_selectors_list = modest_finder_by_selectors_list(finder : ModestFinderT*,
74 | scope_node : Myhtml::Lib::MyhtmlTreeNodeT*,
75 | sel_list : LibMyCss::MysccSelectorsListT*,
76 | collection : Myhtml::Lib::MyhtmlCollectionT**) : ModestStatusT
77 | end
78 | end
79 |
--------------------------------------------------------------------------------
/src/modest/mycss.cr:
--------------------------------------------------------------------------------
1 | # Owns a native mycss instance together with one mycss entry initialised for it.
2 | #
3 | # Both handles are exposed through getters. `free` destroys them and is also
4 | # called from `finalize`.
5 | class Modest::Mycss
6 |   getter raw_mycss, raw_entry
7 |
8 |   # Creates and initialises the mycss instance, then its entry.
9 |   #
10 |   # Raises `Myhtml::Error` if either init call returns a status other than
11 |   # `MyCSS_STATUS_OK`; whatever was created up to that point is destroyed first.
12 |   def initialize
13 |     @finalized = false
14 |
15 |     @raw_mycss = LibMyCss.create
16 |     init_status = LibMyCss.init(@raw_mycss)
17 |     unless init_status == LibMyCss::MycssStatusT::MyCSS_STATUS_OK
18 |       # No entry exists yet, so only the instance has to be destroyed.
19 |       LibMyCss.destroy(@raw_mycss, true)
20 |       raise Myhtml::Error.new("mycss init error #{init_status}")
21 |     end
22 |
23 |     @raw_entry = LibMyCss.entry_create
24 |     entry_status = LibMyCss.entry_init(@raw_mycss, @raw_entry)
25 |     unless entry_status == LibMyCss::MycssStatusT::MyCSS_STATUS_OK
26 |       free
27 |       raise Myhtml::Error.new("mycss entry_init error #{entry_status}")
28 |     end
29 |   end
30 |
31 |   # Destroys the entry, then the mycss instance. Guarded by `@finalized`, so a
32 |   # second call does nothing.
33 |   def free
34 |     return if @finalized
35 |
36 |     @finalized = true
37 |     LibMyCss.entry_destroy(@raw_entry, true)
38 |     LibMyCss.destroy(@raw_mycss, true)
39 |   end
40 |
41 |   def finalize
42 |     free
43 |   end
44 | end
45 |
--------------------------------------------------------------------------------