<\/embed><\/object>"
38 | end
39 |
40 | end
--------------------------------------------------------------------------------
/spec/extractula/custom_extractors/y_frog_spec.rb:
--------------------------------------------------------------------------------
# coding: utf-8

require File.dirname(__FILE__) + '/../../spec_helper'

describe Extractula::YFrog do
  # A parsed yfrog photo URL plus an empty document; matching here is
  # expected to be driven by the URL alone.
  before do
    @photo_url = Domainatrix.parse("http://img70.yfrog.com/i/8pfo.jpg/")
    @empty_doc = Nokogiri::HTML::Document.new
  end

  it "can extract images from yfrog.com" do
    Extractula::YFrog.can_extract?(@photo_url, @empty_doc).should be_true
  end

  it "should have media type 'image'" do
    Extractula::YFrog.media_type.should == 'image'
  end
end

describe "extracting from a YFrog page" do

  # Run the extractor once per example against the saved yfrog.html fixture.
  before do
    @result = Extractula::YFrog.new("http://img70.yfrog.com/i/8pfo.jpg/", read_test_file("yfrog.html")).extract
  end

  it "extracts the title" do
    @result.title.should == "Yfrog - 8pfo - Uploaded by cwgabriel"
  end

  it "extracts the content" do
    @result.content.should == "Done for today I think."
  end

  it "extracts the image url" do
    @result.image_urls.should include("http://img70.yfrog.com/img70/3152/8pfo.jpg")
  end

end
--------------------------------------------------------------------------------
/spec/extractula/custom_extractors/you_tube_spec.rb:
--------------------------------------------------------------------------------
require File.dirname(__FILE__) + '/../../spec_helper'

describe Extractula::YouTube do
  # A parsed watch-page URL plus an empty document; matching is URL-driven.
  before do
    @watch_url = Domainatrix.parse("http://www.youtube.com/watch?v=FzRH3iTQPrk")
    @empty_doc = Nokogiri::HTML::Document.new
  end

  it "can extract videos from youtube.com" do
    Extractula::YouTube.can_extract?(@watch_url, @empty_doc).should be_true
  end

  it "should have media type 'video'" do
    Extractula::YouTube.media_type.should == 'video'
  end
end

describe "extracting from a YouTube page" do

  # Stub the oEmbed round-trip with a canned JSON response so the extractor
  # never touches the network, then run it over the saved page fixture.
  before do
    @canned_response = Extractula::OEmbed::Response.new(read_test_file("youtube-oembed.json"))
    Extractula::OEmbed.stub!(:request).and_return(@canned_response)
    @result = Extractula::YouTube.new("http://www.youtube.com/watch?v=FzRH3iTQPrk", read_test_file("youtube.html")).extract
  end

  it "extracts the title" do
    @result.title.should == "The Sneezing Baby Panda"
  end

  it "extracts the content" do
    @result.content.should == "A Baby Panda Sneezing\n\nhttp://www.twitter.com/_jam..."
  end

  it "extracts the main video" do
    # NOTE(review): the expected embed markup appears to have been stripped
    # down to a single space in this copy of the spec — confirm against the
    # original repository before relying on this assertion.
    @result.video_embed.should == " "
  end

end
--------------------------------------------------------------------------------
/spec/extractula/extracted_content_spec.rb:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | require File.dirname(__FILE__) + '/../spec_helper'
3 |
4 | describe "extracted content" do
# NOTE(review): the examples below indicate ExtractedContent derives summary,
# image_urls and video_embed from :content when they are not passed explicitly
# — confirm against lib/extractula/extracted_content.rb.
5 | it "has a url" do
6 | Extractula::ExtractedContent.new(:url => "http://pauldix.net").url.should == "http://pauldix.net"
7 | end
8 |
9 | it "has a title" do
10 | Extractula::ExtractedContent.new(:title => "whatevs").title.should == "whatevs"
11 | end
12 |
13 | it "has content" do
14 | Extractula::ExtractedContent.new(:content => "some content").content.should == "some content"
15 | end
16 |
17 | describe "summary" do
# Per the assertions below, the summary is the leading paragraph of the
# content, with script tags and their bodies removed first.
18 | it "has a summary" do
19 | Extractula::ExtractedContent.new(:summary => "a summary!").summary.should == "a summary!"
20 | end
21 |
22 | it "generates the summary from the content" do
23 | extracted = Extractula::ExtractedContent.new(:content => "I've been quietly working on Typhoeus for the last few months. With the help of Wilson Bilkovich and David Balatero I've finished what I think is a significant improvement to the library. The new interface removes all the magic and opts instead for clarity.
\nIt's really slick and includes improved stubing support, caching, memoization, and (of course) parallelism. The Typhoeus readme highlights all of the awesomeness. It should be noted that the old interface of including Typhoeus into classes and defining remote methods has been deprecated. I'll be removing that sometime in the next six months.
\nIn addition to thanking everyone using the library and everyone contributing, I should also thank my employer kgbweb. If you're a solid Rubyist that likes parsing, crawling, and stuff, or a machine learning guy, or a Solr/Lucene/indexing bad ass, let me know. We need you and we're doing some crazy awesome stuff.
")
24 | extracted.summary.should == "I've been quietly working on Typhoeus for the last few months. With the help of Wilson Bilkovich and David Balatero I've finished what I think is a significant improvement to the library. The new interface removes all the magic and opts instead for clarity."
25 | end
26 |
27 | it "cleans script tags and their content" do
28 | Extractula::ExtractedContent.new(:content => read_test_file("script_tag_remove_case.html")).summary.should == "Obama to meet with House Republicans By Perry Bacon Jr. Washington Post Staff Writer Tuesday, January 26, 2010; A13 President Obama will meet Friday with perhaps his harshest critics outside of Fox News headquarters: the House Republicans."
29 | end
30 | end
31 |
32 | describe "image_urls" do
# NOTE(review): the inline content fixture below has had its HTML markup
# stripped by whatever produced this copy of the file; the assertion implies
# the original contained an <img> tag with the businessinsider.com src.
33 | it "has image_urls" do
34 | Extractula::ExtractedContent.new(:image_urls => ["first.jpg", "second.tiff"]).image_urls.should == ["first.jpg", "second.tiff"]
35 | end
36 |
37 | it "generates the image urls from the content" do
38 | extracted = Extractula::ExtractedContent.new(:content => "
\n\n\n
\nWhen designers start a new Web site, they often sketch out a first idea of the page layout using paper and stencil.
\nDesigners call this sketch a \"wireframe.\"
\nWoorkup.com's Antonio Lupetti collected 10 beautiful examples of wireframes.
\nHe gave us permission to republish them here >
")
39 | extracted.image_urls.should == ["http://static.businessinsider.com/~~/f?id=4b3a466f000000000086e662&maxX=311&maxY=233"]
40 | end
41 | end
42 |
43 | describe "video_embed" do
# NOTE(review): as above, the embed markup in the inline fixture was stripped
# in this copy; the expected value has collapsed to whitespace/newlines.
44 | it "has a video_embed" do
45 | Extractula::ExtractedContent.new(:video_embed => "some embed code").video_embed.should == "some embed code"
46 | end
47 |
48 | it "pulls video embed tags from the content" do
49 | extracted = Extractula::ExtractedContent.new(:content => "\n First Twitter , then Foursquare, now the Weather Channel? People are broadcasting their wedding proposals all over the place these days.
\nThat’s right, the other night Weather Channel meteorologist Kim Perez’s beau, police Sgt. Marty Cunningham (best name EVER), asked her to marry him during a routine forecast. Good thing she said yes, otherwise Cunningham’s disposition would have been cloudy with a serious chance of all-out mortification.
\nSocial media and viral videos have taken the place of the jumbotron when it comes to marriage proposals, allowing one to sound one’s not-so barbaric yawp over the roofs of the world. In today’s look-at-me society, public proposals are probably the least offensive byproduct. Meaning that even the most hardened of cynics can admit that they’re kind of sweet.
\nCheck out Cunningham’s proposal below (I personally enjoy that the weather map reads “ring ing in the New Year”), and then dive right into our list of even more social media wooers. What’s next? Entire domains dedicated to popping the question?
\n
\n\n \n \n \n \n
\n \n \nMore Wedding Bells and Whistles \n \nCONGRATS: Mashable Marriage Proposal Live at #SocialGood [Video]
\nMan Proposes Marriage via Foursquare Check-In
\nDid We Just Witness a Twitter Marriage Proposal?
\nSuccessful Marriage Proposal on Twitter Today: We #blamedrewscancer
\nJust Married: Groom Changes Facebook Relationship Status at the Altar [VIDEO]
")
50 | extracted.video_embed.should == " \n \n \n "
51 | end
52 | end
53 |
54 | describe "some regressions" do
# Regression guard: extraction of node-name-error.html must not raise
# "undefined method 'node_name' for nil:NilClass".
55 | it "doesn't error with undefined method 'node_name' for nil:NilClass when looking at elements" do
56 | extracted = Extractula::Extractor.new("http://viceland.com/caprica/", read_test_file("node-name-error.html")).extract
57 | extracted.title.should == "Syfy + Motherboard.tv Caprica Screenings Contest"
58 | end
59 | end
60 | end
61 |
--------------------------------------------------------------------------------
/spec/extractula/extractor_spec.rb:
--------------------------------------------------------------------------------
# coding: utf-8

require File.dirname(__FILE__) + '/../spec_helper'

describe Extractula::Extractor do

  before do
    @parsed_url = Domainatrix.parse("http://www.website.com/")
    @blank_doc  = Nokogiri::HTML::Document.new
  end

  # The base class matches nothing on its own; only subclasses declare domains.
  it "should not be able to extract anything" do
    Extractula::Extractor.can_extract?(@parsed_url, @blank_doc).should be_false
  end

  describe "extracting" do
    it "should give an empty ExtractedContent object" do
      extracted = Extractula::Extractor.new(@parsed_url, @blank_doc).extract
      extracted.title.should be_nil
      extracted.summary.should be_empty
      extracted.image_urls.should be_empty
      extracted.video_embed.should be_nil
    end
  end

  describe "when subclassing" do
    before do
      # Defining the subclass is itself the behaviour under test: the
      # inherited hook should register it with the Extractula module.
      class Thingy < Extractula::Extractor; end
    end

    it "should add the subclass as an extractor to the Extractula module" do
      Extractula.instance_variable_get(:@extractors).should include(Thingy)
    end

    describe "setting the domain" do
      before do
        Thingy.domain 'website'
      end

      it "should be able to extract urls from that domain" do
        Thingy.can_extract?(@parsed_url, @blank_doc).should be_true
      end

      it "can extract urls based on a domain regex" do
        # %r{} form avoids escaping the slash; same pattern as /www\.youtube\.com\/watch/.
        class Foo < Extractula::Extractor; domain %r{www\.youtube\.com/watch}; end
        Foo.can_extract?(Domainatrix.parse("http://www.youtube.com/watch?v=31g0YE61PLQ&feature=rec-fresh+div-r-1-HM"), nil).should be_true
        Foo.can_extract?(Domainatrix.parse("http://www.youtube.com/about.html"), nil).should_not be_true
      end
    end
  end

  describe "media type" do
    before do
      class Thingy < Extractula::Extractor; end
      @instance = Thingy.new @parsed_url, @blank_doc
    end

    it "should default to 'text'" do
      @instance.media_type.should == 'text'
    end

    describe "when set" do
      before do
        Thingy.media_type 'video'
      end

      it "should be the given media type" do
        @instance.media_type.should == 'video'
      end
    end
  end

  describe "post-processing blocks on attribute paths" do
    before do
      class Thingy < Extractula::Extractor; end
      # The block attached to title_path should post-process whatever text
      # the stubbed node returns for the '#element' selector.
      Thingy.title_path('#element') { |t| t.reverse }
      fake_node = stub('fake XML node', :text => 'This text is frontways.')
      @blank_doc.stub!(:at).with('#element').and_return(fake_node)
      @extractor = Thingy.new @parsed_url, @blank_doc
    end

    it "should run yield the value to the block" do
      @extractor.title.should == '.syawtnorf si txet sihT'
    end
  end
end
87 |
describe "dom extraction" do
  it "returns an extracted content object with the url set" do
    # Even with an empty document, extract returns an ExtractedContent
    # carrying the original url.
    extracted = Extractula::Extractor.new("http://pauldix.net", "").extract
    extracted.should be_a Extractula::ExtractedContent
    extracted.url.should == "http://pauldix.net"
  end
end
95 |
96 | describe "extraction cases" do
# End-to-end extraction cases driven by saved HTML fixtures in
# spec/test-files. before(:all) is used so each fixture is parsed once and
# shared across the examples of its describe block.
97 | describe "extracting from a typepad blog" do
98 | before(:all) do
99 | @extracted_content = Extractula::Extractor.new(
100 | "http://www.pauldix.net/2009/10/typhoeus-the-best-ruby-http-client-just-got-better.html",
101 | read_test_file("typhoeus-the-best-ruby-http-client-just-got-better.html")).extract
102 | end
103 |
104 | it "extracts the title" do
105 | @extracted_content.title.should == "Paul Dix Explains Nothing: Typhoeus, the best Ruby HTTP client just got better"
106 | end
107 |
108 | it "extracts the content" do
109 | @extracted_content.content.should == "I've been quietly working on Typhoeus for the last few months. With the help of Wilson Bilkovich and David Balatero I've finished what I think is a significant improvement to the library. The new interface removes all the magic and opts instead for clarity.
\nIt's really slick and includes improved stubing support, caching, memoization, and (of course) parallelism. The Typhoeus readme highlights all of the awesomeness. It should be noted that the old interface of including Typhoeus into classes and defining remote methods has been deprecated. I'll be removing that sometime in the next six months.
\nIn addition to thanking everyone using the library and everyone contributing, I should also thank my employer kgbweb. If you're a solid Rubyist that likes parsing, crawling, and stuff, or a machine learning guy, or a Solr/Lucene/indexing bad ass, let me know. We need you and we're doing some crazy awesome stuff.
"
110 | end
111 | end
112 |
113 | describe "extracting from wordpress - techcrunch" do
114 | before(:all) do
115 | @extracted_content = Extractula::Extractor.new(
116 | "http://www.techcrunch.com/2009/12/29/totlol-youtube/",
117 | read_test_file("totlol-youtube.html")).extract
118 | end
119 |
120 | it "extracts the title" do
121 | @extracted_content.title.should == "The Sad Tale Of Totlol And How YouTube’s Changing TOS Made It Hard To Make A Buck"
122 | end
123 |
124 | it "extracts the content" do
125 | @extracted_content.content.should == Nokogiri::HTML(read_test_file("totlol-youtube.html")).css("div.entry").first.inner_html.strip
126 | end
127 | end
128 |
129 | describe "extracting from wordpress - mashable" do
130 | before(:all) do
131 | @extracted_content = Extractula::Extractor.new(
132 | "http://mashable.com/2009/12/29/ustream-new-years-eve/",
133 | read_test_file("ustream-new-years-eve.html")).extract
134 | end
135 |
136 | it "extracts the title" do
137 | @extracted_content.title.should == "New Years Eve: Watch Live Celebrations on Ustream"
138 | end
139 |
140 | it "extracts the content" do
141 | @extracted_content.content.should == Nokogiri::HTML(read_test_file("ustream-new-years-eve.html")).css("div.text-content").first.inner_html.strip
142 | end
143 |
144 | it "extracts content with a video embed" do
145 | extracted = Extractula::Extractor.new(
146 | "http://mashable.com/2009/12/30/weather-channel-marriage-proposal-touching-with-a-chance-of-viral-status-video/",
147 | read_test_file("weather-channel-marriage-proposal-touching-with-a-chance-of-viral-status-video.html")).extract
148 | extracted.content.should == "\n First Twitter , then Foursquare, now the Weather Channel? People are broadcasting their wedding proposals all over the place these days.
\nThat’s right, the other night Weather Channel meteorologist Kim Perez’s beau, police Sgt. Marty Cunningham (best name EVER), asked her to marry him during a routine forecast. Good thing she said yes, otherwise Cunningham’s disposition would have been cloudy with a serious chance of all-out mortification.
\nSocial media and viral videos have taken the place of the jumbotron when it comes to marriage proposals, allowing one to sound one’s not-so barbaric yawp over the roofs of the world. In today’s look-at-me society, public proposals are probably the least offensive byproduct. Meaning that even the most hardened of cynics can admit that they’re kind of sweet.
\nCheck out Cunningham’s proposal below (I personally enjoy that the weather map reads “ring ing in the New Year”), and then dive right into our list of even more social media wooers. What’s next? Entire domains dedicated to popping the question?
\n
\n\n \n \n \n \n
\n \n \nMore Wedding Bells and Whistles \n \nCONGRATS: Mashable Marriage Proposal Live at #SocialGood [Video]
\nMan Proposes Marriage via Foursquare Check-In
\nDid We Just Witness a Twitter Marriage Proposal?
\nSuccessful Marriage Proposal on Twitter Today: We #blamedrewscancer
\nJust Married: Groom Changes Facebook Relationship Status at the Altar [VIDEO]
"
149 | end
150 | end
151 |
152 | describe "extracting from alleyinsider" do
153 | before(:all) do
154 | @extracted_content = Extractula::Extractor.new(
155 | "http://www.businessinsider.com/10-stunning-web-site-prototype-sketches-2009-12",
156 | read_test_file("10-stunning-web-site-prototype-sketches.html")).extract
157 | end
158 |
159 | it "extracts the title" do
160 | @extracted_content.title.should == "10 Stunning Web Site Prototype Sketches"
161 | end
162 |
163 | it "extracts the content" do
164 | @extracted_content.content.should == Nokogiri::HTML(read_test_file("10-stunning-web-site-prototype-sketches.html")).css("div.KonaBody").first.inner_html.strip
165 | end
166 | end
167 |
168 | describe "extracting from nytimes" do
169 | before(:all) do
170 | @front_page = Extractula::Extractor.new(
171 | "http://www.nytimes.com/",
172 | read_test_file("nytimes.html")).extract
173 | @story_page = Extractula::Extractor.new(
174 | "http://www.nytimes.com/2009/12/31/world/asia/31history.html?_r=1&hp",
175 | read_test_file("nytimes_story.html")).extract
176 | end
177 |
178 | it "extracts the title" do
179 | @front_page.title.should == "The New York Times - Breaking News, World News & Multimedia"
180 | end
181 |
182 | it "extracts the content" do
183 | @front_page.content.should == Nokogiri::HTML(read_test_file("nytimes.html")).css("div.story").first.inner_html.strip
184 | end
185 |
186 | it "extracts a story title" do
187 | @story_page.title.should == "Army Historians Document Early Missteps in Afghanistan - NYTimes.com"
188 | end
189 |
# NOTE(review): the selector below is "nyt_text" (no "." or "#"), i.e. it
# targets a literal <nyt_text> element rather than a class or id — NYT pages
# of this era used such custom tags, but confirm this is intentional.
190 | it "extracts the story content" do
191 | @story_page.content.should == Nokogiri::HTML(read_test_file("nytimes_story.html")).css("nyt_text").first.inner_html.strip
192 | end
193 | end
194 | end
195 |
--------------------------------------------------------------------------------
/spec/extractula/oembed_spec.rb:
--------------------------------------------------------------------------------
require File.dirname(__FILE__) + '/../spec_helper'

describe Extractula::OEmbed do

  describe "setting a global max height" do
    before do
      # Reopen a throwaway class mixing in OEmbed, then set the module-wide
      # default height.
      class Thing
        include Extractula::OEmbed
      end
      Extractula::OEmbed.max_height 500
    end

    it "should be the default height for all OEmbeddable classes" do
      Thing.oembed_max_height.should == 500
    end

    it "should not override a specified height" do
      # A per-class setting wins over the global default.
      Thing.oembed_max_height 300
      Thing.oembed_max_height.should == 300
    end
  end

  describe "setting a global max width" do
    before do
      class Thing
        include Extractula::OEmbed
      end
      Extractula::OEmbed.max_width 500
    end

    it "should be the default width for all OEmbeddable classes" do
      Thing.oembed_max_width.should == 500
    end

    it "should not override a specified width" do
      Thing.oembed_max_width 300
      Thing.oembed_max_width.should == 300
    end
  end

end
--------------------------------------------------------------------------------
/spec/extractula_spec.rb:
--------------------------------------------------------------------------------
require File.dirname(__FILE__) + '/spec_helper'

describe "extractula" do
  it "can add custom extractors" do
    # Subclassing Extractula::Extractor registers the class automatically
    # (see extractor_spec), so the explicit add_extractor call stays
    # commented out.
    greedy_extractor = Class.new(Extractula::Extractor) do
      def self.can_extract? url, html
        true
      end

      def extract
        Extractula::ExtractedContent.new :url => "custom extractor url", :summary => "my custom extractor"
      end
    end

    # Extractula.add_extractor greedy_extractor
    content = Extractula.extract("http://pauldix.net", "some html")
    content.url.should == "custom extractor url"
    content.summary.should == "my custom extractor"
    Extractula.remove_extractor greedy_extractor
  end

  it "skips custom extractors that can't extract the passed url and html" do
    refusing_extractor = Class.new(Extractula::Extractor) do
      def self.can_extract? url, html
        false
      end

      def extract
        Extractula::ExtractedContent.new :url => "this url", :summary => "this summary"
      end
    end

    # Extractula.add_extractor refusing_extractor
    content = Extractula.extract("http://pauldix.net", "some html")
    content.url.should_not == "this url"
    content.summary.should_not == "this summary"
    Extractula.remove_extractor refusing_extractor
  end

  it "extracts from a url and document and returns an ExtractedContent object" do
    result = Extractula.extract("http://pauldix.net", "")
    result.should be_a Extractula::ExtractedContent
    result.url.should == "http://pauldix.net"
  end

  it "saves a reference to the last extractor used" do
    always_matching = Class.new(Extractula::Extractor) do
      def self.can_extract? url, html
        true
      end
    end
    Extractula.extract "http://pauldix.net", "some html"
    Extractula.last_extractor.should == always_matching
    Extractula.remove_extractor always_matching
  end

  describe "defining an inline custom extractor" do
    it "takes a block form definition" do
      extractor = Extractula.custom_extractor do
        domain 'pauldix'
        content_path '#content'
      end
      Extractula.extractors.should include(extractor)
      Extractula.remove_extractor extractor
    end

    it "takes a hash form definition" do
      extractor = Extractula.custom_extractor :domain => 'pauldix', :content_path => '#content'
      Extractula.extractors.should include(extractor)
      Extractula.remove_extractor extractor
    end

    it "can be named" do
      # Passing a symbol should define the class as a constant under the
      # Extractula namespace.
      extractor = Extractula.custom_extractor :PaulDix do
        domain 'pauldix'
        content_path '#content'
      end
      Extractula.const_defined?(:PaulDix).should be_true
      Extractula.remove_extractor extractor
    end

    it "can contain the OEmbed module" do
      extractor = Extractula.custom_extractor :oembed => true
      extractor.should include(Extractula::OEmbed)
      Extractula.remove_extractor extractor
    end
  end
end
--------------------------------------------------------------------------------
/spec/spec.opts:
--------------------------------------------------------------------------------
1 | --diff
2 | --color
3 |
--------------------------------------------------------------------------------
/spec/spec_helper.rb:
--------------------------------------------------------------------------------
require "rubygems"
require "spec"

# gem install redgreen for colored test output
begin require "redgreen" unless ENV['TM_CURRENT_LINE']; rescue LoadError; end

path = File.expand_path(File.dirname(__FILE__) + "/../lib/")
$LOAD_PATH.unshift(path) unless $LOAD_PATH.include?(path)

# lib/ is already on the load path (unshifted above), so require the library
# by its canonical names. The previous `require "lib/extractula"` form only
# resolved via the current working directory, which works on Ruby 1.8 when
# running specs from the project root but breaks on 1.9.2+ (where "." was
# removed from the load path) and from any other directory.
require "extractula"
require "extractula/custom_extractors"

# Reads a fixture from spec/test-files, e.g. read_test_file("yfrog.html").
# Raises Errno::ENOENT if the fixture is missing.
def read_test_file(file_name)
  File.read("#{File.dirname(__FILE__)}/test-files/#{file_name}")
end
16 |
--------------------------------------------------------------------------------
/spec/test-files/dinosaur-comics.html:
--------------------------------------------------------------------------------
1 |
2 | Dinosaur Comics - December 30th, 2009 - awesome fun times!
The permalink for this comic is:
http://www.qwantz.com/index.php?comic=1624
To share this comic on social networking sites, use these links!
del.icio.us digg facebook reddit stumbleupon
To add this comic to your website or blog, copy and paste this code:
<a href="http://www.qwantz.com/index.php?comic=1624"><img src="http://www.qwantz.com/comics/comic2-503.png"></a> And to add this comic to a forum, copy and paste this code:
[URL="http://www.qwantz.com/index.php?comic=1624"][IMG]http://www.qwantz.com/comics/comic2-503.png[/IMG][/URL] hide this box?
--------------------------------------------------------------------------------
/spec/test-files/flickr-oembed.json:
--------------------------------------------------------------------------------
1 | {"version":"1.0","type":"photo","title":"Greyhound Fisheye","author_name":"kotobuki711","author_url":"http:\/\/www.flickr.com\/photos\/kotobuki711\/","cache_age":3600,"provider_name":"Flickr","provider_url":"http:\/\/www.flickr.com\/","width":"500","height":"500","url":"http:\/\/farm3.static.flickr.com\/2127\/1789570897_6db70a9dbe.jpg"}
2 |
--------------------------------------------------------------------------------
/spec/test-files/node-name-error.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Syfy + Motherboard.tv Caprica Screenings Contest
8 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
69 |
70 | For additional information on the
71 | contest, please visit
72 | motherboard.tv/caprica
73 |
74 |
75 |
76 |
77 |
78 |
79 |
83 |
88 |
89 |
90 |
91 |
--------------------------------------------------------------------------------
/spec/test-files/script_tag_remove_case.html:
--------------------------------------------------------------------------------
1 | \n\n
\n\n\n
\nObama to meet with House Republicans \n\nBy Perry Bacon Jr. \nWashington Post Staff Writer \nTuesday, January 26, 2010;\nA13\n \n
\n\n
\n\nPresident Obama will meet Friday with perhaps his harshest critics outside of Fox News headquarters: the House Republicans.\n
\n\nThe House GOP invited Obama this year to speak at its annual retreat, which will be held in Baltimore from Thursday to Saturday. Coming only two days after Obama's State of the Union address, the session could herald better relations between the two sides in 2010 -- or lift their tensions to an even higher level.\n
\n\nThe White House and congressional Republicans spent much of last year bickering over whom to blame for their inability to work together, as the administration constantly blasted the House GOP for unanimously opposing the economic stimulus, while Republicans said Obama and House Democrats refused to incorporate their ideas. A private meeting at the White House that included Obama and House Republicans in December on job growth turned into a griping session, with the president accusing the GOP of "scaring" Americans about his policies while Republicans said the anxiety in the country stemmed from his agenda.\n
\n\nSo far this year, nothing has changed. House Republicans have said Obama's policies led to the defeat of Democrat Martha Coakley in the special Senate election in Massachusetts. White House advisers, in turn, have blamed the GOP for the negative tone of Washington politics.\n
\n\nRep. Mike Pence (Ind.), the No. 3 in the House GOP leadership and the organizer of the retreat, said House Republicans wanted a stronger relationship with Obama and said the GOP 's goals of working with Obama and winning this fall's elections are not in conflict. "We serve our party best when we serve our country," he said. But he added that "the conversation with the president has to be a two-way street."\n
\n\nIn addition to Obama, the House GOP will hear from Virginia Gov. Robert F. McDonnell , one of the party's new stars, as well as former House speaker Newt Gingrich and former House majority leader Richard K. Armey, who heads up the conservative activist group FreedomWorks. Party leaders said they will focus on discussing a policy agenda for their candidates in the midterm elections .\n
\n\nLast year's retreat was at the Homestead in Hot Springs, Va. This year, worried about the appearance of a staying at a posh hotel as unemployment hovers over 10 percent, the Republicans have opted for a Marriott near the Inner Harbor. Earlier this month, Democrats eschewed holding a retreat at a luxury resort and heard from experts and the president in the Capitol's visitor center.\n
\n'Maybe I'm a masochist' \nWhile he deals with a energized GOP, Obama will also face an increasingly anxious left of his party in Congress. The Progressive Caucus, a group of more than 80 of the most liberal members in Congress, says Republican Scott Brown's upset victory in Massachusetts was not because Obama and Democrats were too liberal, but because they were insufficiently so.\n
\n\n"I don't think it was about health care, it was because change didn't happen fast enough -- that's the frustration," said Rep. Lynn Woolsey (D-Calif.), one of the group's leaders. "I believe that if we had pursued the populist, progressive agenda, such as a public option, we could have energized our base."\n
\n\nA Washington Post-Kaiser-Harvard poll of Massachusetts voters conducted after Brown's election showed that young and minority voters, who formed the backbone of Obama's support in 2008, represented a smaller percentage of the electorate in last Tuesday's special election. It's not clear whether policy issues or Obama's absence from the ballot caused some of these voters not to go to the polls.\n
\n\nWhatever the reason for the Massachusetts loss, Rep. Raul Raul Grijalva (D-Ariz.), leader of the Progressive Caucus, has outlined an agenda for 2010 that he says will appeal to the base: increased funding for education, a job-creation bill bigger than the $154 billion version that passed the House in December over the objections of many Democratic moderates, and immigration reform. The latter in particular is unlikely to pass this year.\n
\n\n"We are going to push," he said. "Maybe I'm masochist, but I'm still optimistic."\n
\nSelf-evident truths? \nThe tea party is coming to Capitol Hill. Hours before the president's speech on Wednesday, Rep. Michele Bachmann (R-Minn.), one of the lawmakers most closely allied with the movement, and FreedomWorks will hold an event with conservative activists and lawmakers to tout a "Declaration of Health Care Independence." An aide to Bachmann said the proposal would "protect the rights of the American to make their own health decisions," as well as include 10 conservative ideas for future health reform.\n
\n\nThe health-care event is one of the first steps the tea-party movement will take this year as it seeks to expand its influence. At a news conference Monday, FreedomWorks put out a list of candidates it is backing or opposing in key races this year. Florida Gov. Charlie Crist (R), a candidate for the Senate; Sen. Harry Reid (D-Nev.); and Rep. Alan Grayson (D-Fla.) each are labeled an "Enemy of Liberty" whom the group will oppose. FreedomWorks will back GOP Senate candidates Marco Rubio (Fla.), Pat Toomey (Pa.) and Rand Paul (Ky.) -- each, according to the group, is a "Champion of Freedom."\n
\n\nIn Session is a weekly look inside Congress. \n
\n\n
\n\n
\n \n \n\n\n
\n \n
\n\n\302\251\302\2402010\302\240The Washington Post Company
\n
--------------------------------------------------------------------------------
/spec/test-files/totlol-youtube.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | The Sad Tale Of Totlol And How YouTube’s Changing TOS Made It Hard To Make A Buck
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
17 |
18 |
19 |
23 |
25 |
29 |
30 |
35 |
36 |
37 |
38 |
47 |
48 |
49 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
92 |
93 |
94 |
109 |
110 |
111 |
112 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
146 |
147 |
148 |
149 |
150 |
151 |
183 |
184 |
185 |
188 |
189 |
190 |
191 |
227 |
228 |
229 |
For developers, the Web is increasingly becoming a rich trove of data which can be plucked and used as the foundation to build new services and applications. The data on the Web is becoming increasingly accessible through application programming interfaces (APIs), and some of the richest APIs come from the biggest sites on the Web: YouTube, Facebook, Twitter. But just as these APIs give life to tens of thousands of developers, they can also be limiting. Ron Ilan, the developer and entrepreneur behind the children’s video site Totlol , learned the hard way that if you live by the API, you can also die by the API.
230 |
Totlol is a site filled with children’s’ videos from YouTube curated by parents. Think of it as a safe, white-listed, children’s version of YouTube. It is built entirely on top of YouTube’s APIs. But a change in the terms of service (TOS) of those APIs caused Ron to shut down the free version of his site six months ago and move to a subscription model which never really became a going concern.
231 |
Ron clearly blames YouTube for his woes. You can read his version of the whole sad tale , which portrays YouTube as conspiring to change its API terms of service in response to Totlol. Whether or not there was any actual malice on the part of YouTube, or the change was just a coincidence in timing, as someone who was on the YouTube API team told Ilan via email, the episode is a cautionary tale for anyone trying to build a business on another company’s APIs.
232 |
The gist of what happened is that Ilan developed Totlol using YouTube’s APIs. The service wrapped YouTube videos in Totlol’s own player on its site, where people could create collections and do much more. YouTube noticed the app and even featured it in its Google Code widget on July 7, 2008, after some delay. That also happened to be the exact same day that Google changed the terms of service for its API to disallow commercial use without “YouTube’s prior written approval,” including for the following:
233 |
the sale of advertising, sponsorships, or promotions on any page of the API Client containing YouTube audiovisual content, unless other content not obtained from YouTube appears on the same page and is of sufficient value to be the basis for such sales
234 |
That pretty much killed Totlol’s revenue model, which was to place ads on the pages where the videos were played. Just bad luck, right? Ron asked YouTube for permission to run ads on his site, but he never got a response. Ron was understandably frustrated buy this turn of events. The site was his livelihood. In his post, he sums up what he thinks happened this way:
235 |
When the YouTube API team saw Totlol they liked it. At about the same time someone else at Google saw it, realized the potential it, and/or similar implementations may have, and initiated a ToS modification. An instruction was given to delay public acknowledgement of Totlol until the modified ToS where published. Later an instruction was given to avoid public acknowledgement at all.
236 |
Maybe there was a connection, or maybe this conspiracy existed only in Ron’s mind. It is hard to believe YouTube would modify it in response to a single developer. In a statement, YouTube responds:
237 |
Updates to our API Terms of Service generally take months of preparation and review and are pushed out primarily to better serve our users, partners and developers. When new Terms of Service are ready, we notify our developers through as many channels as possible, including on our developer blog.
238 |
And YouTube did at least try to reach out to him. In June of this year, he was approached by a director of product management at YouTube who wanted to know what YouTube could do to prevent such failures in the future. In an email, the YouTube director asked Ron:
239 |
What types of business models would we need to support in order to make this worth a developer’s while?
240 | . . . Semi-related: what about the YouTube APIs made it challenging to run the site as a standalone?
241 |
242 |
The questions make it clear that YouTube knew there were things it could do to make its APIs more developer-friendly. The two even met at a Starbucks, but nothing came of the meeting.
243 |
Ultimately, it was up Ron to build a site that not only attracted users but was also economically viable. But like many developers, he was at the mercy of YouTube’s rules. Live by the API, die by the API. Ron is now looking for a regular 9-to-5 job to support his family.
244 |
YouTube has no problem splitting revenues with bigger partners such as Vevo, which show their videos on both their own site and on YouTube. But maybe YouTube is making a distinction between splitting revenues with content creators and with content aggregators like Totlol. Is there not enough value in content aggregation when done creatively. The executives in charge of Google News, at least, would answer in the affirmative. YouTube is not a kid’s site, yet Totlol was able to create a kid’s site out of YouTube, with different features and a different look and feel.
245 |
YouTube wants to control the economics surrounding its videos, whether they are watched on YouTube or on another site. The last thing it wants is to encourage a bunch of spam sites filled with Youtube videos and AdSense. That’s fair enough. But Totlol was a legitimate site, even an innovative one. It was the kind of site YouTube should do everything it can to encourage. Tales like this one make you wonder how hard it is for developers who want to play by the rules to build businesses on top of those APIs. Is YouTube helping developers or thwarting them?
246 |
247 |
250 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
274 |
275 |
280 |
281 |
Advertisement
282 |
283 |
286 |
287 |
288 |
301 |
302 |
303 |
304 |
305 |
306 |
307 |
352 |
353 |
354 |
355 |
356 |
357 |
420 |
432 |
438 |
439 |
440 |
443 |
444 |
445 |
448 |
449 |
450 |
453 |
454 |
455 |
456 |
457 |
458 |
459 |
460 |
461 |
464 |
465 |
466 |
469 |
470 |
471 |
474 |
475 |
476 |
479 |
480 |
481 |
482 |
483 |
484 |
485 |
486 |
487 |
488 |
489 |
490 |
491 |
507 |
508 |
512 |
513 |
519 |
520 |
521 |
527 |
528 |
529 |
530 |
531 |
532 |
533 |
534 |
535 |
536 |
537 |
538 |
539 |
542 |
543 |
554 |
555 |
556 |
557 |
558 |
559 |
560 |
561 |
562 |
563 |
--------------------------------------------------------------------------------
/spec/test-files/twitpic.html:
--------------------------------------------------------------------------------
1 |
3 |
4 |
5 |
6 | @AMY_CLUB si te dejo Jack ? Lol on Twitpic
7 |
8 |
9 |
10 |
11 |
27 |
28 |
29 |
30 |
31 |
32 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 | @AMY_CLUB si te dejo Jack ? Lol
82 |
83 |
84 |
92 |
93 |
94 |
162 |
163 |
164 |
165 |
166 |
167 |
173 |
174 |
178 |
183 |
184 |
185 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
205 |
206 |
207 |
208 |
209 |
210 |
--------------------------------------------------------------------------------
/spec/test-files/typhoeus-the-best-ruby-http-client-just-got-better.html:
--------------------------------------------------------------------------------
1 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
18 |
19 |
20 |
21 |
22 | Paul Dix Explains Nothing: Typhoeus, the best Ruby HTTP client just got better
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
Entrepreneurship, programming, software development, politics, NYC, and random thoughts.
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 | « Bypassing wxWidgets error when building Erlang from Source on OS X |
52 | Main
53 | | Resources for Synchronous Reads, Asynchronous Writes at RubyConf 2009 »
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
I've been quietly working on Typhoeus for the last few months. With the help of Wilson Bilkovich and David Balatero I've finished what I think is a significant improvement to the library. The new interface removes all the magic and opts instead for clarity.
It's really slick and includes improved stubing support, caching, memoization, and (of course) parallelism. The Typhoeus readme highlights all of the awesomeness. It should be noted that the old interface of including Typhoeus into classes and defining remote methods has been deprecated. I'll be removing that sometime in the next six months.
In addition to thanking everyone using the library and everyone contributing, I should also thank my employer kgbweb. If you're a solid Rubyist that likes parsing, crawling, and stuff, or a machine learning guy, or a Solr/Lucene/indexing bad ass, let me know. We need you and we're doing some crazy awesome stuff.
66 |
67 |
68 |
69 |
70 |
71 |
83 |
84 |
85 |
86 |
87 |
88 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
186 |
187 |
188 |
189 |
321 |
322 |
323 |
324 |
325 |
326 |
327 |
328 |
329 |
330 |
331 |
332 |
333 |
334 |
335 |
336 |
339 |
340 |
346 |
347 |
352 |
353 |
354 |
355 |
360 |
361 |
362 |
363 | My Github
364 | Feedzirra My Ruby library for parsing and fetching feeds at blinding speed.
365 | SAX Machine My Ruby library exposes a DSL for building Nokogiri backed SAX parsers.
366 | Typhoeus My Ruby library for running HTTP requests quickly, easily, and in parallel.
367 |
368 |
369 |
370 |
371 |
372 |
400 |
401 |
402 |
403 |
404 |
405 |
406 |
407 |
408 |
445 |
446 |
447 |
448 |
449 |
450 |
451 |
482 |
More...
483 |
484 |
485 |
486 |
487 |
488 |
489 |
494 |
495 |
496 |
497 |
498 |
499 |
500 |
501 |
502 |
503 |
504 |
505 |
506 |
507 |
508 |
514 |
515 |
516 |
517 |
518 |
519 |
520 |
521 |
522 |
523 |
524 |
525 |
526 |
527 |
572 |
573 |
574 |
575 |
576 |
577 |
--------------------------------------------------------------------------------
/spec/test-files/vimeo.json:
--------------------------------------------------------------------------------
1 | {"type":"video","version":"1.0","provider_name":"Vimeo","provider_url":"http:\/\/vimeo.com\/","title":"Cracker Bag","author_name":"Glendyn Ivin","author_url":"http:\/\/vimeo.com\/user1783024","is_plus":"1","html":"<\/embed><\/object>","width":"640","height":"360","duration":"866","description":"Eddie spends her pocket money obsessively hoarding fireworks and carefully planning for cracker night. When it finally it arrives, Eddie and her family head to the local football oval. In the frosty air Eddie lights the fuse of her first cracker and experiences a pivotal moment, one of the seemingly small experiences of childhood, that affects us for the rest of our lives. \n\nSet in the 1980s, Cracker Bag is a gentle suburban observation which subtly reflects a disenchanting prelude to the coming of age. \n\nWinner of the Palme D'Or - Short Film Cannes Film Festival 2003\n\nwww.GlendynIvin.com\nwww.Exitfilms.com","thumbnail_url":"http:\/\/ts.vimeo.com.s3.amazonaws.com\/422\/231\/42223153_200.jpg","thumbnail_width":"200","thumbnail_height":"150","video_id":"8833777"}
--------------------------------------------------------------------------------
/spec/test-files/youtube-oembed.json:
--------------------------------------------------------------------------------
1 | {
2 | "provider_url": "http://www.youtube.com/",
3 | "title": "The Sneezing Baby Panda",
4 | "html": " ",
5 | "author_name": "jimvwmoss",
6 | "height": 344,
7 | "width": 425,
8 | "version": "1.0",
9 | "author_url": "http://www.youtube.com/user/jimvwmoss",
10 | "provider_name": "YouTube",
11 | "type": "video"
12 | }
--------------------------------------------------------------------------------