├── .gitignore
├── misc
├── rubygems_guides.rb
├── ruby_hacking_guide.rb
├── ecma_262_3rd_edition_jp.rb
├── clojure.rb
├── getting_real_to_json.pl
├── SmoothCoffeeScript_to_json.pl
├── sicp_to_json.pl
└── perfect_guide_for_english_learning.pl
├── index.tx
├── ncx.tx
├── opf.tx
├── README.md
└── webiblo.pl
/.gitignore:
--------------------------------------------------------------------------------
1 | out
2 | tmp
3 |
--------------------------------------------------------------------------------
/misc/rubygems_guides.rb:
--------------------------------------------------------------------------------
1 | %w(uri json nokogiri curb cgi).each {|g| require g }
2 |
3 | root = 'http://guides.rubygems.org'
4 | book = {
5 | :title => 'RubyGems Guides',
6 | :authors => %w( rubygems ),
7 | :cover_image => 'http://guides.rubygems.org/images/logo.png',
8 | :content_xpath => '//section[@id="chapters"]',
9 | :chapters => []
10 | }
11 |
12 | def curl(url)
13 | c = Curl::Easy.new(url.to_s)
14 | c.follow_location = true
15 | c.perform
16 | c.body_str
17 | end
18 |
19 | doc = Nokogiri::HTML(curl(root))
20 | doc.xpath('//section[@id="chapters"]/h2/a').each do |a|
21 | chapter_url = URI(root) + a[:href]
22 | chapter = {
23 | :uri => chapter_url,
24 | :title => a.text,
25 | }
26 | book[:chapters] << chapter
27 | end
28 |
29 | puts JSON.pretty_generate(book)
30 |
--------------------------------------------------------------------------------
/misc/ruby_hacking_guide.rb:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | %w(uri json nokogiri curb cgi).each {|g| require g }
3 |
4 | root = 'http://i.loveruby.net/ja/rhg/book/'
5 | book = {
6 | :title => 'Rubyソースコード完全解説',
7 | :authors => [ 'Minero Aoki' ],
8 | :cover_image => 'http://direct.ips.co.jp/directsys/Images/Goods/1/1721B.gif',
9 | :content_xpath => '//body',
10 | :chapters => []
11 | }
12 |
13 | def curl(url)
14 | c = Curl::Easy.new(url.to_s)
15 | c.follow_location = true
16 | c.perform
17 | c.body_str
18 | end
19 |
20 | doc = Nokogiri::HTML(curl(root))
21 | doc.xpath('//ul/li/a').each do |a|
22 | chapter_url = URI(root) + a[:href]
23 | chapter = {
24 | :uri => chapter_url,
25 | :title => a.text,
26 | }
27 | book[:chapters] << chapter
28 | end
29 |
30 | puts JSON.pretty_generate(book)
31 |
--------------------------------------------------------------------------------
/misc/ecma_262_3rd_edition_jp.rb:
--------------------------------------------------------------------------------
1 | %w/uri json nokogiri curb cgi/.each {|g| require g }
2 |
3 | root = 'http://www2u.biglobe.ne.jp/~oz-07ams/prog/ecma262r3/fulltoc.html'
4 | book = {
5 | :title => 'ECMA-262 3rd edition',
6 | :authors => [
7 | 'TAKI'
8 | ],
9 | :cover_image => 'http://www.ecma-international.org/images/logo_printerf.jpg',
10 | :content_xpath => '//div[@class="section level1"]',
11 | :chapters => []
12 | }
13 |
14 | def curl(url)
15 | c = Curl::Easy.new(url.to_s)
16 | c.follow_location = true
17 | c.perform
18 | c.body_str
19 | end
20 |
21 | doc = Nokogiri::HTML(curl(root))
22 | doc.xpath('//body/dl/dt/a').each do |a|
23 | chapter_url = URI(root) + a[:href]
24 | chapter = {
25 | :uri => chapter_url,
26 | :title => a.text,
27 | }
28 | book[:chapters] << chapter
29 | end
30 |
31 | puts book.to_json
32 |
--------------------------------------------------------------------------------
/index.tx:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | [% title %]
6 |
7 |
8 | [% title %]
9 |
10 |
11 | [% FOREACH part IN parts -%]
12 | [% IF part.title -%]
13 | - [% part.title %]
14 | [% END -%]
15 |
16 | [% FOREACH chapter IN part.chapters -%]
17 | - [% chapter.title %]
18 |
19 | [% FOREACH section IN chapter.sections -%]
20 | - [% section.title %]
21 |
26 | [% END -%]
27 |
28 | [% END -%]
29 |
30 | [% END -%]
31 |
32 |
33 |
34 |
35 |
--------------------------------------------------------------------------------
/misc/clojure.rb:
--------------------------------------------------------------------------------
1 | %w/uri json nokogiri curb cgi/.each {|g| require g }
2 |
3 | root = 'http://clojure.org/'
4 | book = {
5 | :title => 'Clojure',
6 | :authors => [
7 | 'Rich Hickey'
8 | ],
9 | :cover_image => 'http://clojure.org/space/showimage/clojure-icon.gif',
10 | :content_xpath => '//div[@id="content_view"]',
11 | :exclude_xpath => '//div[@id="toc"]',
12 | :chapters => []
13 | }
14 |
15 | def curl(url)
16 | c = Curl::Easy.new(url.to_s)
17 | c.follow_location = true
18 | c.perform
19 | c.body_str
20 | end
21 |
22 | def sections(url)
23 | sections = []
24 | doc = Nokogiri::HTML(curl(url))
25 | doc.xpath('//div[@id="toc"]//a').each do |a|
26 | if a[:href] =~ /^#/
27 | sections << {
28 | :uri => url.to_s + a[:href],
29 | :title => a.text
30 | }
31 | end
32 | end
33 |
34 | sections
35 | end
36 |
37 | doc = Nokogiri::HTML(curl(root))
38 | doc.xpath('//div[@class="WikiCustomNav WikiElement wiki"]//a').each do |a|
39 | next if a[:href] =~ /^http/
40 | chapter_url = URI(root) + a[:href]
41 | chapter = {
42 | :uri => chapter_url,
43 | :title => a.text,
44 | #:sections => sections(chapter_url)
45 | }
46 | book[:chapters] << chapter
47 | end
48 |
49 | puts book.to_json
50 |
--------------------------------------------------------------------------------
/ncx.tx:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | [% title %]
5 |
6 |
7 | [% title %] - Table of Contents
8 |
9 | [% num = 1 -%]
10 |
11 | [% FOREACH part IN parts -%]
12 | [% IF part.href -%]
13 |
14 | [% part.title %]
15 |
16 | [% num = num + 1 -%]
17 | [% END -%]
18 | [% FOREACH chapter IN part.chapters -%]
19 | [% IF chapter.href -%]
20 |
21 | [% chapter.title %]
22 |
23 | [% num = num + 1 -%]
24 | [% END -%]
25 | [% FOREACH section IN chapter.sections -%]
26 |
27 | [% section.title %]
28 |
29 | [% num = num + 1 -%]
30 | [% END -%]
31 | [% END -%]
32 | [% END -%]
33 |
34 |
35 |
--------------------------------------------------------------------------------
/misc/getting_real_to_json.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 |
3 | # Builds the webiblo JSON description of "Getting Real" from the book's
4 | # online table of contents and prints it on STDOUT.
5 |
6 | use strict;
7 | use warnings;
8 |
9 | use JSON::XS;
10 | use LWP::Simple;
11 | use HTML::TreeBuilder::XPath;
12 | use YAML;
13 |
14 | my $base = 'http://gettingreal.37signals.com';
15 |
16 | # LWP::Simple::get returns undef on failure; stop instead of parsing nothing.
17 | my $toc_html = get("$base/toc.php");
18 | die "Failed to fetch $base/toc.php\n" unless defined $toc_html;
19 |
20 | my $tree = HTML::TreeBuilder::XPath->new;
21 | $tree->parse($toc_html);
22 | $tree->eof;    # signal end of document so the tree is finalized
23 |
24 | my $chapters = [];
25 | for my $chapter ( $tree->findnodes('//h2') ) {
26 |     my @contents = $chapter->content_list;
27 |
28 |     # Only headings whose first child is an element with a name attribute
29 |     # like "ch01" are chapters. Text children are plain strings (and an empty
30 |     # heading has no children), so check before calling attr().
31 |     next unless ref $contents[0];
32 |     my $name = $contents[0]->attr('name');
33 |     next if !$name or $name !~ /^ch\d+/;
34 |
35 |     # The second child is used as the chapter title, minus trailing whitespace.
36 |     my $title = $contents[1];
37 |     $title =~ s/\s+$//;
38 |
39 |     # Section links are looked up under the heading's immediate right sibling;
40 |     # a missing or plain-text sibling simply yields no sections.
41 |     my $sibling = $chapter->right;
42 |     my $sections = [];
43 |     for my $section ( ref $sibling ? $sibling->findnodes('li/a') : () ) {
44 |         push @$sections, {
45 |             uri => $base . '/' . $section->attr('href'),
46 |             title => $section->as_text,
47 |         };
48 |     }
49 |
50 |     push @$chapters, {
51 |         title => $title,
52 |         sections => $sections,
53 |     };
54 | }
55 |
56 | my $json = JSON::XS->new;
57 | $json->indent(1);
58 |
59 | print $json->encode({
60 |     title => 'Getting Real',
61 |     authors => ['37signals'],
62 |     date => '2012/1/9',
63 |     chapters => $chapters,
64 |     content_xpath => q{//div[@class="content"]},
65 |     exclude_xpath => q{//div[@class="next"]},
66 |     cover_image => 'http://ec2.images-amazon.com/images/I/31jvYr2h6GL._SS500_.jpg',
67 | });
68 |
69 | exit;
70 |
--------------------------------------------------------------------------------
/misc/SmoothCoffeeScript_to_json.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 |
3 | # Builds the webiblo JSON description of "Smooth CoffeeScript" from the
4 | # table-of-contents links of its single-page HTML edition and prints it on
5 | # STDOUT.
6 |
7 | use strict;
8 | use warnings;
9 | use LWP::Simple;
10 | use HTML::TreeBuilder::XPath;
11 | use JSON::XS;
12 |
13 | my $base = 'http://autotelicum.github.com/Smooth-CoffeeScript/SmoothCoffeeScript.html';
14 |
15 | # LWP::Simple::get returns undef on failure; stop instead of parsing nothing.
16 | my $content = get($base);
17 | die "Failed to fetch $base\n" unless defined $content;
18 |
19 | my $tree = HTML::TreeBuilder::XPath->new;
20 | $tree->no_expand_entities(1);
21 | $tree->parse($content);
22 | $tree->eof;
23 |
24 | # "author" is kept for existing consumers; "authors" is the list form used by
25 | # the README's JSON format and iterated by opf.tx.
26 | my $author = ($tree->findnodes('//div[@class="author"]'))[0]->as_text;
27 | my $book = {
28 |     title => ($tree->findnodes('//title'))[0]->as_text,
29 |     author => $author,
30 |     authors => [ $author ],
31 |     cover_image => 'http://autotelicum.github.com/Smooth-CoffeeScript/img/SmoothCoverWithSolutions.jpg',
32 | };
33 |
34 | my $parts = [];
35 |
36 | # The links arrive in document order, so each entry is appended to the most
37 | # recently opened part / chapter / section. ($link rather than $a, which is
38 | # the global used by sort.)
39 | for my $link ( $tree->findnodes('//a[@class="Link"]') ) {
40 |     my $href = $link->attr('href');
41 |     my $title = $link->as_text;
42 |
43 |     next if $href !~ /^#toc/ and $href !~ /^#Index/;
44 |
45 |     if ( $href =~ /^#toc-Part/ or $href eq '#Index' ) {
46 |         push @$parts, {
47 |             title => $title,
48 |             uri => $base . $href,
49 |             chapters => [],
50 |         };
51 |     }
52 |     elsif ( $href =~ /^#toc-Chapter/ or $href =~ /^#toc-Appendix/ ) {
53 |         push @{ $parts->[-1]->{chapters} }, {
54 |             title => $title,
55 |             uri => $base . $href,
56 |             sections => [],
57 |         };
58 |     }
59 |     elsif ( $href =~ /^#toc-Section/ ) {
60 |         push @{ $parts->[-1]->{chapters}->[-1]->{sections} }, {
61 |             title => $title,
62 |             uri => $base . $href,
63 |             subsections => [],
64 |         };
65 |     }
66 |     elsif ( $href =~ /^#toc-Subsection/ ) {
67 |         push @{ $parts->[-1]->{chapters}->[-1]->{sections}->[-1]->{subsections} }, {
68 |             title => $title,
69 |             uri => $base . $href,
70 |         };
71 |     }
72 | }
73 |
74 | $book->{parts} = $parts;
75 |
76 | my $json = JSON::XS->new;
77 | $json->indent(1);
78 |
79 | print $json->encode($book);
80 |
--------------------------------------------------------------------------------
/misc/sicp_to_json.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 |
3 | # Builds the webiblo JSON description of SICP from the table-of-contents page
4 | # of the online full text and prints it on STDOUT.
5 |
6 | use strict;
7 | use warnings;
8 | use LWP::Simple;
9 | use HTML::TreeBuilder::XPath;
10 | use JSON::XS;
11 |
12 | my $base = 'http://mitpress.mit.edu/sicp/full-text/book';
13 |
14 | my $book = {
15 |     title => 'Structure and Interpretation of Computer Programs',
16 |     authors => [ 'Harold Abelson', 'Gerald Jay Sussman', 'Julie Sussman' ],
17 |     cover_image => 'http://mitpress.mit.edu/sicp/full-text/book/cover.jpg',
18 |     exclude_xpath => q{//div[@class="navigation"]},
19 | };
20 |
21 | my $chapters = [];
22 |
23 | # LWP::Simple::get returns undef on failure; stop instead of parsing nothing.
24 | my $toc_url = "$base/book-Z-H-4.html";
25 | my $contents = get($toc_url);
26 | die "Failed to fetch $toc_url\n" unless defined $contents;
27 |
28 | my $tree = HTML::TreeBuilder::XPath->new;
29 | $tree->parse($contents);
30 | $tree->eof;
31 |
32 | # Table-of-contents anchors are identified by their name attribute. They are
33 | # visited in document order, so sections and subsections are appended to the
34 | # most recently added chapter / section. ($link rather than $a, which is the
35 | # global used by sort.)
36 | for my $link ( $tree->findnodes('//a') ) {
37 |     my $name = $link->attr('name');
38 |     next if !$name or $name !~ /\%_toc/ or $name eq '%_toc_start';
39 |
40 |     my $text = $link->as_text;
41 |     $text =~ s/\240/ /g;    # character 0xA0 (non-breaking space) -> plain space
42 |     my $href = "$base/" . $link->attr('href');
43 |
44 |     # "chap_Temp" entries: added as chapters without a sections list
45 |     if ( $name =~ /^\%_toc_\%_chap_Temp/ ) {
46 |         push @$chapters, {
47 |             title => $text,
48 |             uri => $href,
49 |         };
50 |     }
51 |     # entering the chapter
52 |     elsif ( $name =~ /^\%_toc_\%_chap_\d$/ ) {
53 |         push @$chapters, {
54 |             title => $text,
55 |             uri => $href,
56 |             sections => [],
57 |         };
58 |     }
59 |     # entering the section
60 |     elsif ( $name =~ /\%_toc_\%_sec_\d\.\d$/ ) {
61 |         push @{ $chapters->[-1]->{sections} }, {
62 |             title => $text,
63 |             uri => $href,
64 |             subsections => [],
65 |         };
66 |     }
67 |     # entering the subsection
68 |     elsif ( $name =~ /\%_toc_\%_sec_\d\.\d\.\d$/ ) {
69 |         push @{ $chapters->[-1]->{sections}->[-1]->{subsections} }, {
70 |             title => $text,
71 |             uri => $href,
72 |         };
73 |     }
74 | }
75 |
76 | $book->{chapters} = $chapters;
77 |
78 | my $json = JSON::XS->new;
79 | $json->indent(1);
80 |
81 | print $json->encode($book);
82 |
83 |
--------------------------------------------------------------------------------
/opf.tx:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
6 | [% title %]
7 | en-us
8 | [% FOREACH author IN authors -%]
9 | [% author %]
10 | [% END -%]
11 | [% description %]
12 | [% date %]
13 |
14 |
15 |
16 | [% cover_file %]
17 |
18 |
19 |
20 |
21 |
22 | [% num = 1 -%]
23 | [% FOREACH part IN parts -%]
24 | [% IF part.file -%]
25 |
26 | [% num = num + 1 -%]
27 | [% END -%]
28 | [% FOREACH chapter IN part.chapters -%]
29 | [% IF chapter.file -%]
30 |
31 | [% num = num + 1 -%]
32 | [% END -%]
33 | [% FOREACH section IN chapter.sections -%]
34 |
35 | [% num = num + 1 -%]
36 | [% END -%]
37 | [% END -%]
38 | [% END -%]
39 |
40 |
41 | [% num = 1 -%]
42 | [% FOREACH part IN parts -%]
43 | [% FOREACH chapter IN part.chapters -%]
44 | [% FOREACH section IN chapter.sections -%]
45 |
46 | [% num = num + 1 -%]
47 | [% END -%]
48 | [% END -%]
49 | [% END -%]
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Webiblo - web to ebook project
2 |
3 | ---------------------------------------
4 |
5 | ## Overview
6 |
7 | Webiblo is a project to convert web sites to ebooks.
8 |
9 | You can convert the Getting Real web data to mobipocket format like this:
10 |
11 | $ webiblo.pl http://mizzy.org/webiblo/data/Getting_Real.json
12 |
13 |
14 | ---------------------------------------
15 |
16 | ## JSON data format
17 |
18 | The JSON data format used to convert web data into an ebook looks like this:
19 |
20 | {
21 | "title" : "Structure and Interpretation of Computer Programs",
22 | "authors" : [
23 | "Harold Abelson",
24 | "Gerald Jay Sussman",
25 | "Julie Sussman"
26 | ],
27 | "cover_image" : "http://mitpress.mit.edu/sicp/full-text/book/cover.jpg",
28 | "content_xpath" : "//div[@class=\"content\"]", # Optional
29 | "exclude_xpath" : "//div[@class=\"navigation\"]", # Optional
30 | "chapters" : [
31 | {
32 | "title" : "Foreword",
33 | "uri" : "http://mitpress.mit.edu/sicp/full-text/book/book-Z-H-5.html#%_chap_Temp_2"
34 | },
35 | {
36 | "title" : "1 Building Abstractions with Procedures",
37 | "uri" : "http://mitpress.mit.edu/sicp/full-text/book/book-Z-H-9.html#%_chap_1",
38 | "sections" : [ {
39 | "title" : "1.1 The Elements of Programming",
40 | "uri" : "http://mitpress.mit.edu/sicp/full-text/book/book-Z-H-10.html#%_sec_1.1",
41 | "subsections" : [
42 | {
43 | "title" : "1.1.1 Expressions",
44 | "uri" : "http://mitpress.mit.edu/sicp/full-text/book/book-Z-H-10.html#%_sec_1.1.1"
45 | }
46 | ]
47 | } ]
48 | }
49 | ]
50 | }
51 |
52 | Here are some examples:
53 |
54 | * http://mizzy.org/webiblo/data/Getting_Real.json
55 | * http://mizzy.org/webiblo/data/SICP.json
56 | * http://mizzy.org/webiblo/data/SmoothCoffeeScript.json
57 |
58 | ---------------------------------------
59 |
60 | ## Try your own JSON data
61 |
62 | webiblo.pl takes JSON data from STDIN, so you can run webiblo.pl like this:
63 |
64 | $ cat data.json | webiblo.pl
65 |
66 |
67 | ## Share your JSON data
68 |
69 | JSON data files are kept on the [gh-pages branch](https://github.com/mizzy/webiblo/tree/gh-pages) and published on [GitHub Pages](http://mizzy.org/webiblo/).
70 |
71 | If you create JSON data for webiblo, please send me a pull request.
72 |
73 | ---------------------------------------
74 |
75 | ## TODO
76 |
77 | * Support formats other than mobipocket (e.g. EPUB3).
78 | * Search the JSON catalog from the CLI.
--------------------------------------------------------------------------------
/misc/perfect_guide_for_english_learning.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 |
3 | use strict;
4 | use warnings;
5 | #use utf8;
6 | use Web::Query qw(wq);
7 | use Data::Section::Simple qw(get_data_section);
8 | use JSON::XS qw(encode_json);
9 | use YAML::Syck;
10 |
11 | # Prints the webiblo JSON description of the book on STDOUT. The contents are
12 | # the YAML list after __DATA__: [ title, uri, [ title, uri, ... ] ] per chapter.
13 | my $meta = {
14 |     title => '英語上達完全マップ',
15 |     author => '森沢 洋介',
16 |     authors => [ '森沢 洋介' ],    # list form, as in the README's JSON format
17 |     cover_image => 'http://images-jp.amazon.com/images/P/4860641027.01.LZZZZZZZ.jpg',
18 |     content_xpath => '//body/table//tr[count(preceding-sibling::*) = 4 and parent::*]',
19 |     exclude_xpath => '//body/table//tr[count(preceding-sibling::*) = 3 and parent::*]',
20 |     chapters => [],
21 | };
22 |
23 | # Slurp the whole __DATA__ section in one read, then parse it as YAML.
24 | my $toc = YAML::Syck::Load do { local $/; <DATA> };
25 |
26 | for my $entry ( @{$toc} ) {
27 |     my ($title, $uri, $sec) = @{$entry};
28 |     my $chapter = { title => $title };
29 |     $chapter->{uri} = $uri if $uri;
30 |
31 |     # Section data is a flat list of alternating titles and URIs.
32 |     my @secs;
33 |     while (my ($sec_title, $sec_uri) = splice @{$sec}, 0, 2) {
34 |         push @secs, {
35 |             title => $sec_title,
36 |             ($sec_uri ? ('uri', $sec_uri) : ()),
37 |         };
38 |     }
39 |     $chapter->{sections} = \@secs;
40 |     push @{$meta->{chapters}}, $chapter;
41 | }
42 |
43 | my $json = JSON::XS->new;
44 | $json->indent(1);
45 | print $json->encode($meta);
46 |
47 | __DATA__
48 | ---
49 | -
50 | - はじめに
51 | - http://homepage3.nifty.com/mutuno/01_first/01_first.html
52 | -
53 | - 英語のテスト特にTOEICについて
54 | - http://homepage3.nifty.com/mutuno/02_toeic/02_toeic.html
55 | -
56 | - 英語は日本で上達する
57 | - http://homepage3.nifty.com/mutuno/03_japan/03_japan.html
58 | -
59 | - 英語力を解剖する
60 | - http://homepage3.nifty.com/mutuno/04_dissect/04_dissect.html
61 | -
62 | - 英語トレーニング法
63 | - http://homepage3.nifty.com/mutuno/05_training/05_training.html
64 | -
65 | - 音読パッケージ
66 | - HTtp://homepage3.nifty.com/mutuno/05_training/05_training01.html
67 | - 瞬間英作文
68 | - HTtp://homepage3.nifty.com/mutuno/05_training/05_training02.html
69 | - 文法
70 | - HTtp://homepage3.nifty.com/mutuno/05_training/05_training03.html
71 | - 精読
72 | - HTtp://homepage3.nifty.com/mutuno/05_training/05_training04.html
73 | - 多読(速読)
74 | - HTtp://homepage3.nifty.com/mutuno/05_training/05_training05.html
75 | - 語彙増強=ボキャビル
76 | - HTtp://homepage3.nifty.com/mutuno/05_training/05_training06.html
77 | - リスニング
78 | - HTtp://homepage3.nifty.com/mutuno/05_training/05_training07.html
79 | - 会話
80 | - HTtp://homepage3.nifty.com/mutuno/05_training/05_training08.html
81 | -
82 | - トレーニングの進め方
83 | - http://homepage3.nifty.com/mutuno/06_case/06_case.html
84 | -
85 | - 標準ケース
86 | - http://homepage3.nifty.com/mutuno/06_case/06_case01.html
87 | - 実例ケース
88 | - http://homepage3.nifty.com/mutuno/06_case/06_case02.html
89 | - 目的・タイプ別ケース
90 | - http://homepage3.nifty.com/mutuno/06_case/06_case03.html
91 | -
92 | - トレーニングを継続するために
93 | - http://homepage3.nifty.com/mutuno/07_continue/07_continue.html
94 | -
95 | - おすすめ教材集
96 | - http://homepage3.nifty.com/mutuno/08_book/08_book.html
97 | -
98 | - アドバイス集
99 | - http://homepage3.nifty.com/mutuno/09_advice/09_advice.html
100 | -
101 | - Q & A
102 | - http://homepage3.nifty.com/mutuno/10_QA/10_QA.html
103 | -
104 | - 教室案内
105 | - http://homepage3.nifty.com/mutuno/11_school/11_school.html
106 | -
107 | - リンク
108 | - http://homepage3.nifty.com/mutuno/12_link/12_link.html
109 |
--------------------------------------------------------------------------------
/webiblo.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 |
3 | use strict;
4 | use warnings;
5 | use JSON::Syck;
6 | use LWP::Simple;
7 | use URI;
8 | use HTML::TreeBuilder::XPath;
9 | use Text::Xslate;
10 | use Image::Resize;
11 |
12 | my $style = HTML::Element->new('style');
13 | $style->attr('type', 'text/css');
14 | $style->push_content(<