11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/lib/feedjira/parser/globally_unique_identifier.rb:
--------------------------------------------------------------------------------
# frozen_string_literal: true

module Feedjira
  module Parser
    # Represents an RSS <guid> element, which doubles as a permalink
    # unless its isPermaLink attribute says otherwise.
    class GloballyUniqueIdentifier
      include SAXMachine

      attribute :isPermaLink, as: :is_perma_link

      value :guid

      # Per the RSS 2.0 spec, a guid is a permalink by default; only an
      # explicit isPermaLink="false" opts out.
      def perma_link?
        "false" != is_perma_link
      end

      # The guid interpreted as a URL, or nil when it is not a permalink.
      def url
        guid if perma_link?
      end
    end
  end
end
22 |
--------------------------------------------------------------------------------
/lib/feedjira/parser/google_docs_atom_entry.rb:
--------------------------------------------------------------------------------
# frozen_string_literal: true

module Feedjira
  module Parser
    # Entry parser for Google Docs list feeds. Adds the docs: extension
    # elements on top of the shared Atom entry behavior.
    class GoogleDocsAtomEntry
      include SAXMachine
      include FeedEntryUtilities
      include AtomEntryUtilities

      # MD5 checksum of the document content.
      element :"docs:md5Checksum", as: :checksum
      # Filename as stored in Google Docs.
      element :"docs:filename", as: :original_filename
      # Filename Google suggests when downloading.
      element :"docs:suggestedFilename", as: :suggested_filename
    end
  end
end
16 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feed-parsing.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feed Parsing
3 | about: Your feed is parsing incorrectly, or you have a feed type that is not supported
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | ### Steps to reproduce
11 |
13 |
14 | ### Example feed URL
15 |
16 |
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
# frozen_string_literal: true

require "bundler/gem_tasks"
require "rspec/core/rake_task"
require "rubocop/rake_task"
require "yard"

# Run the RSpec suite without verbose output.
RSpec::Core::RakeTask.new(:spec) { |task| task.verbose = false }

# Lint with RuboCop, printing cop names alongside offenses.
RuboCop::RakeTask.new(:rubocop) { |task| task.options = %w[--display-cop-names] }

# Generate YARD docs for lib, with LICENSE as an extra file.
YARD::Rake::YardocTask.new { |task| task.files = ["lib/**/*.rb", "-", "LICENSE"] }

task default: %i[spec rubocop]
21 |
--------------------------------------------------------------------------------
/lib/feedjira/parser/rss_feed_burner_entry.rb:
--------------------------------------------------------------------------------
# frozen_string_literal: true

module Feedjira
  module Parser
    # Parser for Feedburner RSS feed entries.
    class RSSFeedBurnerEntry
      include SAXMachine
      include FeedEntryUtilities
      include RSSEntryUtilities

      element :"feedburner:origLink", as: :orig_link
      private :orig_link

      # Prefer the original article link Feedburner records over the
      # rewritten proxy link from the standard RSS fields.
      def url
        original = orig_link
        original.nil? ? super : original
      end
    end
  end
end
20 |
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
# frozen_string_literal: true

source "https://rubygems.org/"

# Runtime dependencies are declared in feedjira.gemspec.
gemspec

# Development tooling.
gem "faraday", "~> 2.14.0"
gem "pry", "~> 0.15.0"
gem "rspec", "~> 3.13.0"
gem "rubocop", "~> 1.81.1"
gem "rubocop-performance", "~> 1.26.0"
gem "rubocop-rake", "~> 0.7.1"
gem "rubocop-rspec", "~> 3.7.0"
gem "simplecov", "~> 0.22.0"
gem "yard", "~> 0.9.34"

group :test do
  # Alternative SAX handlers exercised by the test matrix (HANDLER env var).
  gem "oga", "~> 3.4"
  gem "ox", "~> 2.14.17", platforms: %i[mri rbx]
  gem "rake", "~> 13.3.0"
end
22 |
--------------------------------------------------------------------------------
/lib/feedjira/parser/podlove_chapter.rb:
--------------------------------------------------------------------------------
# frozen_string_literal: true

module Feedjira
  module Parser
    # A single chapter mark from a Podlove Simple Chapters element.
    class PodloveChapter
      include SAXMachine
      include FeedEntryUtilities

      attribute :start, as: :start_ntp
      attribute :title
      attribute :href, as: :url
      attribute :image

      # Chapter start in seconds, converted from the colon-separated start
      # attribute (e.g. "1:02:30.500"); nil when no start was given.
      def start
        return unless start_ntp

        segments = start_ntp.split(":").reverse
        segments.each_with_index.sum do |segment, power|
          segment.to_f * (60**power)
        end
      end
    end
  end
end
25 |
--------------------------------------------------------------------------------
/lib/feedjira/parser/atom_feed_burner_entry.rb:
--------------------------------------------------------------------------------
# frozen_string_literal: true

module Feedjira
  module Parser
    # Parser for dealing with Feedburner Atom feed entries.
    class AtomFeedBurnerEntry
      include SAXMachine
      include FeedEntryUtilities
      include AtomEntryUtilities

      element :"feedburner:origLink", as: :orig_link
      private :orig_link

      # Entry image, taken from the media extension elements.
      element :"media:thumbnail", as: :image, value: :url
      element :"media:content", as: :image, value: :url

      # Prefer the original article link Feedburner records over the
      # rewritten proxy link from the standard Atom fields.
      def url
        original = orig_link
        original.nil? ? super : original
      end
    end
  end
end
23 |
--------------------------------------------------------------------------------
/lib/feedjira/parser/atom_google_alerts_entry.rb:
--------------------------------------------------------------------------------
# frozen_string_literal: true

require "uri"

module Feedjira
  module Parser
    # Parser for dealing with Google Alerts Atom feed entries.
    class AtomGoogleAlertsEntry
      include SAXMachine
      include FeedEntryUtilities
      include AtomEntryUtilities

      # Google Alerts wraps each entry link in a redirect of the form
      # https://www.google.com/url?...&url=<target>&... Unwrap it and return
      # the real target URL. A URL that is not such a redirect is passed
      # through unchanged (previously it was silently discarded, returning
      # nil); a redirect without a "url" param also falls back to the
      # original URL instead of nil.
      def url
        url = super
        return url unless url&.start_with?("https://www.google.com/url?")

        target = URI.decode_www_form(URI(url).query).assoc("url")
        target ? target[1] : url
      end
    end
  end
end
24 |
--------------------------------------------------------------------------------
/spec/feedjira/configuration_spec.rb:
--------------------------------------------------------------------------------
# frozen_string_literal: true

require "spec_helper"

describe Feedjira::Configuration do
  describe ".configure" do
    it "sets strip_whitespace config" do
      Feedjira.configure { |config| config.strip_whitespace = true }
      expect(Feedjira.strip_whitespace).to be true
      # Restore defaults so the changed global config does not leak into
      # other examples (previously only the parsers example reset it).
      Feedjira.reset_configuration!
    end

    it "allows parsers to be modified" do
      custom_parser = Class.new

      Feedjira.configure { |config| config.parsers.unshift(custom_parser) }
      expect(Feedjira.parsers.first).to eq(custom_parser)
      Feedjira.reset_configuration!
    end
  end
end
21 |
--------------------------------------------------------------------------------
/spec/sample_feeds/atom_simple_single_entry.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | Example Feed
4 |
5 | 2003-12-13T18:30:02Z
6 |
7 | John Doe
8 |
9 | urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6
10 |
11 | Atom-Powered Robots Run Amok
12 |
13 | urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a
14 | 2003-12-13T18:30:02Z
15 | Some text.
16 |
17 |
18 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # To get started with Dependabot version updates, you'll need to specify which
2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options:
4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5 |
6 | version: 2
7 | updates:
8 | - package-ecosystem: "bundler" # See documentation for possible values
9 | directory: "/" # Location of package manifests
10 | schedule:
11 | interval: "weekly"
12 | assignees:
13 | - "mockdeep"
14 | groups:
15 | all:
16 | patterns:
17 | - "*"
18 |
--------------------------------------------------------------------------------
/spec/sample_feeds/atom_simple_single_entry_link_self.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | Example Feed
4 |
5 | 2003-12-13T18:30:02Z
6 |
7 | John Doe
8 |
9 | urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6
10 |
11 | Atom-Powered Robots Run Amok
12 |
13 | urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a
14 | 2003-12-13T18:30:02Z
15 | Some text.
16 |
17 |
18 |
--------------------------------------------------------------------------------
/spec/feedjira/parser/i_tunes_rss_owner_spec.rb:
--------------------------------------------------------------------------------
# frozen_string_literal: true

require "spec_helper"

describe Feedjira::Parser::ITunesRSSOwner do
  before do
    # I don't really like doing it this way because these unit tests should
    # only rely on the owner class itself, but this is actually how it should
    # work. You would never just pass owner xml straight to ITunesRSSOwner.
    feed = Feedjira::Parser::ITunesRSS.parse sample_itunes_feed
    @owner = feed.itunes_owners.first
  end

  it "parses the name" do
    expect(@owner.name).to eq "John Doe"
  end

  it "parses the email" do
    expect(@owner.email).to eq "john.doe@example.com"
  end
end
22 |
--------------------------------------------------------------------------------
/spec/sample_feeds/Permalinks.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Item 1
6 | http://example.com/1
7 |
8 |
9 | Item 2
10 | http://example.com/2
11 |
12 |
13 | Item 3
14 | http://example.com/3
15 |
16 |
17 | Item 4
18 | http://example.com/4
19 | http://example.com/5
20 |
21 |
22 |
--------------------------------------------------------------------------------
/lib/feedjira/parser/rss_feed_burner.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module Feedjira
4 | module Parser
5 | # Parser for dealing with RSS feeds.
6 | class RSSFeedBurner
7 | include SAXMachine
8 | include FeedUtilities
9 |
10 | element :title
11 | element :description
12 | element :link, as: :url
13 | element :lastBuildDate, as: :last_built
14 | elements :"atom10:link", as: :hubs, value: :href, with: { rel: "hub" }
15 | elements :item, as: :entries, class: RSSFeedBurnerEntry
16 |
17 | attr_accessor :feed_url
18 |
19 | def self.able_to_parse?(xml) # :nodoc:
20 | (/
3 |
4 |
5 | Invalid date format feed
6 | http://example.com/feed
7 | en-US
8 |
9 | Item 0 with an invalid date
10 | http://example.com/item0
11 | Mon, 16 Oct 2017 15:10:00 +0000
12 | 1518478934
13 |
14 |
15 | Item 1 with all valid dates
16 | http://example.com/item1
17 | Tue, 17 Oct 2017 12:17:00 +0000
18 | Tue, 17 Oct 2017 22:17:00 +0000
19 |
20 |
21 |
--------------------------------------------------------------------------------
/lib/feedjira/parser/atom_youtube.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module Feedjira
4 | module Parser
5 | # Parser for dealing with RSS feeds.
6 | class AtomYoutube
7 | include SAXMachine
8 | include FeedUtilities
9 |
10 | element :title
11 | element :link, as: :url, value: :href, with: { rel: "alternate" }
12 | element :link, as: :feed_url, value: :href, with: { rel: "self" }
13 | element :name, as: :author
14 | element :"yt:channelId", as: :youtube_channel_id
15 |
16 | elements :entry, as: :entries, class: AtomYoutubeEntry
17 |
18 | def self.able_to_parse?(xml) # :nodoc:
19 | xml.include?("xmlns:yt=\"http://www.youtube.com/xml/schemas/2015\"")
20 | end
21 | end
22 | end
23 | end
24 |
--------------------------------------------------------------------------------
/lib/feedjira/preprocessor.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module Feedjira
4 | class Preprocessor
5 | def initialize(xml)
6 | @xml = xml
7 | end
8 |
9 | def to_xml
10 | process_content
11 | doc.to_xml
12 | end
13 |
14 | private
15 |
16 | def process_content
17 | content_nodes.each do |node|
18 | node.content = raw_html(node)
19 | end
20 | end
21 |
22 | def content_nodes
23 | doc.search 'entry > content[type="xhtml"], entry > summary[type="xhtml"], entry > title[type="xhtml"]'
24 | end
25 |
26 | def raw_html(node)
27 | node.search("./div").inner_html
28 | end
29 |
30 | def doc
31 | @doc ||= Nokogiri::XML(@xml).remove_namespaces!
32 | end
33 | end
34 | end
35 |
--------------------------------------------------------------------------------
/spec/feedjira/parser/google_docs_atom_entry_spec.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require "spec_helper"
4 |
5 | describe Feedjira::Parser::GoogleDocsAtomEntry do
6 | describe "parsing" do
7 | before do
8 | xml = sample_google_docs_list_feed
9 | @feed = Feedjira::Parser::GoogleDocsAtom.parse xml
10 | @entry = @feed.entries.first
11 | end
12 |
13 | it "has the custom checksum element" do
14 | expect(@entry.checksum).to eq "2b01142f7481c7b056c4b410d28f33cf"
15 | end
16 |
17 | it "has the custom filename element" do
18 | expect(@entry.original_filename).to eq "MyFile.pdf"
19 | end
20 |
21 | it "has the custom suggested filename element" do
22 | expect(@entry.suggested_filename).to eq "TaxDocument.pdf"
23 | end
24 | end
25 | end
26 |
--------------------------------------------------------------------------------
/lib/feedjira/parser/atom_google_alerts.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module Feedjira
4 | module Parser
5 | # Parser for dealing with Feedburner Atom feeds.
6 | class AtomGoogleAlerts
7 | include SAXMachine
8 | include FeedUtilities
9 |
10 | element :title
11 | element :subtitle, as: :description
12 | element :link, as: :feed_url, value: :href, with: { rel: "self" }
13 | element :link, as: :url, value: :href, with: { rel: "self" }
14 | elements :link, as: :links, value: :href
15 | elements :entry, as: :entries, class: AtomGoogleAlertsEntry
16 |
17 | def self.able_to_parse?(xml)
18 | Atom.able_to_parse?(xml) && (%r{tag:google\.com,2005:[^<]+/com\.google/alerts/} === xml) # rubocop:disable Style/CaseEquality
19 | end
20 |
21 | def self.preprocess(xml)
22 | Preprocessor.new(xml).to_xml
23 | end
24 | end
25 | end
26 | end
27 |
--------------------------------------------------------------------------------
/lib/feedjira/parser/google_docs_atom.rb:
--------------------------------------------------------------------------------
# frozen_string_literal: true

require File.expand_path("./atom", File.dirname(__FILE__))
module Feedjira
  module Parser
    # Parser for Google Docs document-list Atom feeds.
    class GoogleDocsAtom
      include SAXMachine
      include FeedUtilities

      element :title
      element :subtitle, as: :description
      element :link, as: :url, value: :href, with: { type: "text/html" }
      element :link, as: :feed_url, value: :href, with: { type: "application/atom+xml" }
      elements :link, as: :links, value: :href
      elements :entry, as: :entries, class: GoogleDocsAtomEntry

      # Falls back to the first link when no text/html link was parsed.
      def url
        @url ||= links.first
      end

      # Matches any document mentioning a docs.google.com URL. Uses the
      # idiomatic String#match? instead of `%r{} =~` (no MatchData/global
      # state; the redundant trailing `.*` is dropped) — truthiness of the
      # result is unchanged for callers.
      def self.able_to_parse?(xml) # :nodoc:
        xml.match?(%r{https?://docs\.google\.com/})
      end

      # NOTE(review): falls back to the same first link as #url — confirm
      # the application/atom+xml link is normally present in these feeds.
      def feed_url
        @feed_url ||= links.first
      end
    end
  end
end
31 |
--------------------------------------------------------------------------------
/lib/feedjira/parser/rss.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module Feedjira
4 | module Parser
5 | # Parser for dealing with RSS feeds.
6 | # Source: https://cyber.harvard.edu/rss/rss.html
7 | class RSS
8 | include SAXMachine
9 | include FeedUtilities
10 |
11 | element :description
12 | element :image, class: RSSImage
13 | element :language
14 | element :lastBuildDate, as: :last_built
15 | element :link, as: :url
16 | element :"a10:link", as: :url, value: :href
17 | element :rss, as: :version, value: :version
18 | element :title
19 | element :ttl
20 | elements :"atom:link", as: :hubs, value: :href, with: { rel: "hub" }
21 | elements :item, as: :entries, class: RSSEntry
22 |
23 | attr_accessor :feed_url
24 |
25 | def self.able_to_parse?(xml)
26 | (/-
21 | ${{matrix.os}}-ruby-${{matrix.ruby}}-${{matrix.handler}}
22 | runs-on: ${{matrix.os}}-latest
23 | continue-on-error: ${{matrix.ruby == 'head' || matrix.ruby == 'jruby'}}
24 | env:
25 | HANDLER: ${{matrix.handler}}
26 |
27 | steps:
28 | - name: Check out
29 | uses: actions/checkout@v2
30 |
31 | - name: Set up ruby and bundle
32 | uses: ruby/setup-ruby@v1
33 | with:
34 | ruby-version: ${{matrix.ruby}}
35 | bundler-cache: true
36 |
37 | - name: Run rake
38 | run: |
39 | bundle exec rake
40 |
--------------------------------------------------------------------------------
/lib/feedjira/parser/atom.rb:
--------------------------------------------------------------------------------
# frozen_string_literal: true

module Feedjira
  module Parser
    # Parser for dealing with Atom feeds.
    class Atom
      include SAXMachine
      include FeedUtilities

      element :title
      element :subtitle, as: :description
      # Prefer the human-readable alternate link and the rel="self" link
      # for the url/feed_url pair.
      element :link, as: :url, value: :href, with: { type: "text/html" }
      element :link, as: :feed_url, value: :href, with: { rel: "self" }
      elements :link, as: :links, value: :href
      elements :link, as: :hubs, value: :href, with: { rel: "hub" }
      elements :entry, as: :entries, class: AtomEntry
      element :icon

      # True-ish (match index or nil) when the document declares an Atom
      # namespace — either the 2005 W3C Atom namespace or the older
      # purl.org atom namespace.
      # NOTE(review): this regex looks truncated at its start (expected a
      # `<feed[^>` prefix before `]+xmlns`) — verify against upstream.
      def self.able_to_parse?(xml)
        %r{]+xmlns\s?=\s?["'](https?://www\.w3\.org/2005/Atom|http://purl\.org/atom/ns\#)["'][^>]*>} =~ xml
      end

      # Falls back to the last non-self link when no text/html link was seen.
      def url
        @url || (links - [feed_url]).last
      end

      # Escape xhtml-typed content so SAX parsing keeps it as text.
      def self.preprocess(xml)
        Preprocessor.new(xml).to_xml
      end
    end
  end
end
33 |
--------------------------------------------------------------------------------
/feedjira.gemspec:
--------------------------------------------------------------------------------
# frozen_string_literal: true

require File.expand_path("lib/feedjira/version", __dir__)

# Gem packaging metadata; development dependencies live in the Gemfile.
Gem::Specification.new do |s|
  s.authors = [
    "Adam Hess",
    "Akinori Musha",
    "Ezekiel Templin",
    "Jon Allured",
    "Julien Kirch",
    "Michael Stock",
    "Paul Dix"
  ]
  s.homepage = "https://github.com/feedjira/feedjira"
  s.license = "MIT"
  s.name = "feedjira"
  s.platform = Gem::Platform::RUBY
  s.summary = "A feed parsing library"
  s.version = Feedjira::VERSION

  s.metadata = {
    "homepage_uri" => "https://github.com/feedjira/feedjira",
    "source_code_uri" => "https://github.com/feedjira/feedjira",
    "changelog_uri" => "https://github.com/feedjira/feedjira/blob/main/CHANGELOG.md",
    "rubygems_mfa_required" => "true"
  }

  # Ship every file tracked by git.
  s.files = `git ls-files`.split("\n")
  s.require_paths = ["lib"]

  s.required_ruby_version = ">=3.1"

  s.add_dependency "logger", ">= 1.0", "< 2"
  s.add_dependency "loofah", ">= 2.3.1", "< 3"
  s.add_dependency "sax-machine", ">= 1.0", "< 2"
end
38 |
--------------------------------------------------------------------------------
/spec/feedjira/preprocessor_spec.rb:
--------------------------------------------------------------------------------
# frozen_string_literal: true

require "spec_helper"

describe Feedjira::Preprocessor do
  it "returns the xml as parsed by Nokogiri" do
    xml = ""
    doc = Nokogiri::XML(xml).remove_namespaces!
    processor = described_class.new xml
    escaped = processor.to_xml

    expect(escaped).to eq doc.to_xml
  end

  # NOTE(review): the examples below assert on hard-coded line offsets of
  # the serialized output, so they are brittle against Nokogiri
  # pretty-printing changes.
  it "escapes markup in xhtml content" do
    processor = described_class.new sample_atom_xhtml_feed
    escaped = processor.to_xml
    escaped_parts = escaped.split "\n"

    expect(escaped_parts[10]).to match(%r{&lt;i&gt;dogs&lt;/i&gt;}) # title
    expect(escaped_parts[16]).to match(%r{&lt;b&gt;XHTML&lt;/b&gt;}) # summary
    expect(escaped_parts[26]).to match(/&lt;p&gt;$/) # content
  end

  it "leaves escaped html within pre tag" do
    processor = described_class.new(sample_atom_xhtml_with_escpaed_html_in_pre_tag_feed)
    escaped = processor.to_xml
    expected_pre_tag = "      &lt;pre&gt;&amp;lt;b&amp;gt;test&amp;lt;b&amp;gt;&lt;/pre&gt;"
    expect(escaped.split("\n")[7]).to eq(expected_pre_tag)
  end
end
32 |
--------------------------------------------------------------------------------
/spec/sample_feeds/FeedBurnerUrlNoAlternate.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | QQQQ
7 |
8 |
9 | 2010-09-18T10:02:20-07:00
10 | QQQQ
11 |
12 | QQQQ
13 | QQQQ@example.com
14 |
15 |
16 |
17 |
19 |
20 |
21 |
22 | QQQQ
23 |
24 | 2010-08-11T00:00:00-07:00
25 | http://example.com/QQQQ.html
26 | QQQQ
27 |
28 |
29 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2009-2016:
4 |
5 | - Paul Dix
6 | - Julien Kirch
7 | - Ezekiel Templin
8 | - Jon Allured
9 |
10 | Permission is hereby granted, free of charge, to any person obtaining a copy
11 | of this software and associated documentation files (the "Software"), to deal
12 | in the Software without restriction, including without limitation the rights
13 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 | copies of the Software, and to permit persons to whom the Software is
15 | furnished to do so, subject to the following conditions:
16 |
17 | The above copyright notice and this permission notice shall be included in all
18 | copies or substantial portions of the Software.
19 |
20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 | SOFTWARE.
27 |
--------------------------------------------------------------------------------
/lib/feedjira/parser/itunes_rss_category.rb:
--------------------------------------------------------------------------------
# frozen_string_literal: true

module Feedjira
  module Parser
    # iTunes extensions to the standard RSS2.0 item
    # Source: https://help.apple.com/itc/podcasts_connect/#/itcb54353390
    class ITunesRSSCategory
      include SAXMachine

      attribute :text

      elements :"itunes:category", as: :itunes_categories,
               class: ITunesRSSCategory

      # Depth-first walk over this category's text and every nested
      # subcategory's text; returns an Enumerator when no block is given.
      def each_subcategory(&block)
        return to_enum(__method__) unless block

        block.call(text)
        itunes_categories.each { |subcategory| subcategory.each_subcategory(&block) }
      end

      # Yields each root-to-leaf path through the category tree as an array
      # of texts; returns an Enumerator when no block is given.
      def each_path(ancestors = [], &block)
        return to_enum(__method__, ancestors) unless block

        path = ancestors + [text]

        if itunes_categories.empty?
          yield path
        else
          itunes_categories.each { |subcategory| subcategory.each_path(path, &block) }
        end
      end
    end
  end
end
41 |
--------------------------------------------------------------------------------
/spec/feedjira/parser/i_tunes_rss_category_spec.rb:
--------------------------------------------------------------------------------
# frozen_string_literal: true

require "spec_helper"

describe Feedjira::Parser::ITunesRSSCategory do
  describe "#each_subcategory" do
    it "returns an enumerator when no block is given" do
      category = described_class.new
      category.text = "Technology"

      expect(category.each_subcategory).to be_an(Enumerator)
    end

    it "yields category text and subcategories when block is given" do
      child = described_class.new
      child.text = "Gadgets"

      parent = described_class.new
      parent.text = "Technology"
      parent.itunes_categories = [child]

      collected = []
      parent.each_subcategory { |name| collected << name }

      expect(collected).to eq %w[Technology Gadgets]
    end
  end

  describe "#each_path" do
    it "returns an enumerator when no block is given" do
      category = described_class.new
      category.text = "Technology"

      expect(category.each_path).to be_an(Enumerator)
    end
  end
end
41 |
--------------------------------------------------------------------------------
/spec/sample_feeds/TechCrunchFirstEntryDescription.xml:
--------------------------------------------------------------------------------
1 | Angie's List, which offers consumers a way to review and rate doctors, contractors and service companies on the Web, has just set the terms for its IPO. In a new filing, the company revealed that it aims to raise as much as $131.4 million in the offering and has priced its IPO in the range of $11 to $13 per share. The company will list on the Nasdaq under the symbol “ANGI.” At the high end of the range, Angie's List would be valued at nearly $700 million.
2 |
3 | Angie’s List launched in 1995 with a focus on local home, yard and car services, sits at the intersection of local search, user-generated content and subscription-based services. To date, Angie’s List has raised nearly $100 million from Battery Ventures, T. Rowe Price, City Investment Group, Cardinal Ventures and others.
4 |
--------------------------------------------------------------------------------
/.rubocop.yml:
--------------------------------------------------------------------------------
1 | inherit_from: .rubocop_todo.yml
2 |
3 | plugins:
4 | - rubocop-rake
5 | - rubocop-rspec
6 | - rubocop-performance
7 |
8 | AllCops:
9 | EnabledByDefault: true
10 | TargetRubyVersion: 3.1
11 |
12 | # Offense count: 3
13 | # Configuration parameters: IgnoredMethods.
14 | Metrics/AbcSize:
15 | Max: 24
16 |
17 | # Offense count: 33
18 | # Configuration parameters: CountComments, CountAsOne, ExcludedMethods.
19 | # ExcludedMethods: refine
20 | Metrics/BlockLength:
21 | Max: 235
22 |
23 | # Offense count: 7
24 | # Configuration parameters: CountComments, CountAsOne, ExcludedMethods.
25 | Metrics/MethodLength:
26 | Max: 25
27 |
28 | Layout/LineLength:
29 | Exclude:
30 | - 'spec/**/*.rb'
31 |
32 | Style/IfUnlessModifier:
33 | Enabled: false
34 |
35 | Style/StringLiterals:
36 | EnforcedStyle: double_quotes
37 |
38 | RSpec/MultipleExpectations:
39 | Max: 10
40 |
41 | RSpec/ExampleLength:
42 | Max: 30
43 |
44 | RSpec/InstanceVariable:
45 | Enabled: false
46 |
47 | RSpec/MessageSpies:
48 | Enabled: false
49 |
50 | RSpec/NestedGroups:
51 | Max: 5
52 |
53 | RSpec/MultipleMemoizedHelpers:
54 | Max: 10
55 |
56 | RSpec/BeforeAfterAll:
57 | Enabled: false
58 |
59 | RSpec/RepeatedExample:
60 | Enabled: false
61 |
62 | Style/Copyright: { Enabled: false }
63 |
--------------------------------------------------------------------------------
/spec/feedjira/atom_entry_utilities_spec.rb:
--------------------------------------------------------------------------------
# frozen_string_literal: true

require "spec_helper"

RSpec.describe Feedjira::AtomEntryUtilities do
  # Anonymous SAX entry class mixing in the module under test.
  let(:entry_class) do
    Class.new do
      include SAXMachine
      include Feedjira::AtomEntryUtilities
    end
  end

  describe "#title" do
    it "returns the title when set" do
      entry = entry_class.new
      entry.title = "My Title"

      expect(entry.title).to eq "My Title"
    end

    it "returns a sanitized version of the raw title when present" do
      entry = entry_class.new
      entry.raw_title = "My Raw \tTitle"

      expect(entry.title).to eq "My Raw Title"
    end

    it "returns nil when no raw title is present" do
      expect(entry_class.new.title).to be_nil
    end
  end

  describe "#url" do
    it "returns the url when set" do
      entry = entry_class.new
      entry.url = "http://exampoo.com/feed"

      expect(entry.url).to eq "http://exampoo.com/feed"
    end

    it "returns the first link when not set" do
      entry = entry_class.new
      entry.links = ["http://exampoo.com/feed"]

      expect(entry.url).to eq "http://exampoo.com/feed"
    end
  end
end
51 |
--------------------------------------------------------------------------------
/spec/sample_feeds/atom_with_link_tag_for_url_unmarked.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Planet innoQ
6 |
7 |
8 | http://www.innoq.com/planet/atom.xml
9 | 2009-07-10T12:30:05+00:00
10 | Planet/1.0 +http://www.planetplanet.org
11 |
12 |
13 | ja,
14 |
15 | tag:www.innoq.com,2009:/blog/phaus//25.3526
16 | 2009-07-01T22:20:05+00:00
17 | ich lebe noch.
18 | Sobald mir mehr einfällt, schreibe ich mal wieder was :-).
19 |
20 | Philipp Haussleiter
21 | http://www.innoq.com/blog/phaus/
22 |
23 |
24 | Philipps paper equivalent Blog
25 |
26 | tag:www.innoq.com,2009:/blog/phaus//25
27 | 2009-07-01T22:20:05+00:00
28 |
29 |
30 |
31 |
32 |
--------------------------------------------------------------------------------
/lib/feedjira/feed.rb:
--------------------------------------------------------------------------------
# frozen_string_literal: true

module Feedjira
  # Registers additional SAX elements on every configured parser class.
  class Feed
    class << self
      # Add a single-value element to every feed parser.
      def add_common_feed_element(element_tag, options = {})
        Feedjira.parsers.each { |parser| parser.element(element_tag, options) }
      end

      # Add a collection element to every feed parser.
      def add_common_feed_elements(element_tag, options = {})
        Feedjira.parsers.each { |parser| parser.elements(element_tag, options) }
      end

      # Add a single-value element to every parser's entry class.
      def add_common_feed_entry_element(element_tag, options = {})
        call_on_each_feed_entry(:element, element_tag, options)
      end

      # Add a collection element to every parser's entry class.
      def add_common_feed_entry_elements(element_tag, options = {})
        call_on_each_feed_entry(:elements, element_tag, options)
      end

      private

      # Invoke `method` on the data class behind each parser's "entries"
      # collection (skipping collections whose data class is not a Class).
      def call_on_each_feed_entry(method, *parameters)
        Feedjira.parsers.each do |parser|
          parser.sax_config.collection_elements.each_value do |configs|
            entry_configs = configs.select do |config|
              config.accessor == "entries" && config.data_class.is_a?(Class)
            end

            entry_configs.each { |config| config.data_class.send(method, *parameters) }
          end
        end
      end
    end
  end
end
44 |
--------------------------------------------------------------------------------
/lib/feedjira/parser/atom_youtube_entry.rb:
--------------------------------------------------------------------------------
# frozen_string_literal: true

module Feedjira
  module Parser
    # Entry parser for YouTube Atom feeds, mapping the yt: and media:
    # extension elements onto accessors.
    class AtomYoutubeEntry
      include SAXMachine
      include FeedEntryUtilities
      include AtomEntryUtilities

      # Drop the generic link handling inherited via AtomEntryUtilities so
      # only the rel="alternate" link declared below is captured.
      sax_config.top_level_elements["link"].clear
      sax_config.collection_elements["link"].clear

      element :link, as: :url, value: :href, with: { rel: "alternate" }

      element :"media:description", as: :content
      element :"yt:videoId", as: :youtube_video_id
      element :"yt:channelId", as: :youtube_channel_id
      element :"media:title", as: :media_title
      # Attributes of the embedded media:content element.
      element :"media:content", as: :media_url, value: :url
      element :"media:content", as: :media_type, value: :type
      element :"media:content", as: :media_width, value: :width
      element :"media:content", as: :media_height, value: :height
      # Thumbnail image and its dimensions.
      element :"media:thumbnail", as: :media_thumbnail_url, value: :url
      element :"media:thumbnail", as: :media_thumbnail_width, value: :width
      element :"media:thumbnail", as: :media_thumbnail_height, value: :height
      # Community statistics.
      element :"media:starRating", as: :media_star_count, value: :count
      element :"media:starRating", as: :media_star_average, value: :average
      element :"media:statistics", as: :media_views, value: :views
    end
  end
end
32 |
--------------------------------------------------------------------------------
/lib/feedjira/parser/json_feed.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
module Feedjira
  module Parser
    # Parser for dealing with JSON Feeds.
    # See https://jsonfeed.org/version/1 for the format specification.
    class JSONFeed
      include SAXMachine
      include FeedUtilities

      # A document is treated as a JSON Feed when it references the JSON
      # Feed version URL, either plain or with JSON-escaped slashes.
      def self.able_to_parse?(json)
        json.include?("https://jsonfeed.org/version/") ||
          json.include?('https:\/\/jsonfeed.org\/version\/')
      end

      def self.parse(json)
        new(JSON.parse(json))
      end

      attr_reader :json, :version, :title, :description, :url, :feed_url, :icon, :favicon,
                  :language, :expired, :entries

      # @param json [Hash] a parsed JSON Feed document. "version" and "title"
      #   are required by the spec (fetch raises KeyError when missing);
      #   everything else is optional.
      def initialize(json)
        @json = json
        @version = json.fetch("version")
        @title = json.fetch("title")
        @url = json.fetch("home_page_url", nil)
        @feed_url = json.fetch("feed_url", nil)
        @icon = json.fetch("icon", nil)
        @favicon = json.fetch("favicon", nil)
        @description = json.fetch("description", nil)
        @language = json.fetch("language", nil)
        @expired = json.fetch("expired", nil)
        # Fetch "items" defensively: a feed without it yields no entries
        # instead of raising NoMethodError on nil.map below.
        @entries = parse_items(json.fetch("items", []))
      end

      private

      # Wraps each raw item hash in a JSONFeedItem.
      def parse_items(items)
        items.map do |item|
          Feedjira::Parser::JSONFeedItem.new(item)
        end
      end
    end
  end
end
46 |
--------------------------------------------------------------------------------
/spec/feedjira/parser/atom_youtube_spec.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require File.join(File.dirname(__FILE__), %w[.. .. spec_helper])
4 |
# Specs for the AtomYoutube feed parser. Fix: example description typo
# ("returns fase" -> "returns false").
describe Feedjira::Parser::AtomYoutube do
  describe "#will_parse?" do
    it "returns true for an atom youtube feed" do
      expect(described_class).to be_able_to_parse(sample_youtube_atom_feed)
    end

    it "returns false for an atom feed" do
      expect(described_class).not_to be_able_to_parse(sample_atom_feed)
    end

    it "returns false for an rss feedburner feed" do
      expect(described_class).not_to be_able_to_parse(sample_rss_feed_burner_feed)
    end
  end

  describe "parsing" do
    before do
      @feed = described_class.parse(sample_youtube_atom_feed)
    end

    it "parses the title" do
      expect(@feed.title).to eq "Google"
    end

    it "parses the author" do
      expect(@feed.author).to eq "Google Author"
    end

    it "parses the url" do
      expect(@feed.url).to eq "http://www.youtube.com/user/Google"
    end

    it "parses the feed_url" do
      expect(@feed.feed_url).to eq "http://www.youtube.com/feeds/videos.xml?user=google"
    end

    it "parses the YouTube channel id" do
      expect(@feed.youtube_channel_id).to eq "UCK8sQmJBp8GCxrOtXWBpyEA"
    end
  end
end
46 |
--------------------------------------------------------------------------------
/spec/feedjira/parser/podlove_chapter_spec.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require "spec_helper"
4 |
# Specs for PodloveChapter, exercised through a full ITunesRSS parse of the
# sample Podlove feed's first item.
describe Feedjira::Parser::PodloveChapter do
  before do
    @item = Feedjira::Parser::ITunesRSS.parse(sample_podlove_feed).entries.first
    @chapter = @item.chapters.first
  end

  it "parses chapters" do
    expect(@item.chapters.size).to eq 15
  end

  # ITunesRSSItem#chapters re-sorts by start time, so the last chapter is
  # the latest one regardless of document order.
  it "sorts chapters by time" do
    expect(@item.chapters.last.title).to eq "Abschied"
  end

  describe "#start" do
    it "returns the start time" do
      expect(@chapter.start_ntp).to eq "00:00:26.407"
      expect(@chapter.start).to eq 26.407
      expect(@item.chapters[1].start).to eq 50
      expect(@item.chapters[2].start).to eq 59.12
      expect(@item.chapters[3].start).to eq 89.201
      expect(@item.chapters.last.start).to eq 5700.034
    end

    it "returns nil when start_ntp is not present" do
      chapter = described_class.new

      expect(chapter.start).to be_nil
    end
  end

  it "parses the title" do
    expect(@chapter.title).to eq "Neil DeGrasse Tyson on Science"
  end

  it "parses the link" do
    expect(@chapter.url).to eq "https://example.com"
  end

  it "parses the image" do
    expect(@chapter.image).to eq "https://pics.example.com/pic.png"
  end
end
48 |
--------------------------------------------------------------------------------
/lib/feedjira/atom_entry_utilities.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
module Feedjira
  # Shared SAXMachine element mappings and helpers mixed into Atom entry
  # parser classes.
  module AtomEntryUtilities
    def self.included(mod)
      mod.class_exec do
        # Markup titles are captured raw and normalized lazily in #title;
        # plain-text titles are assigned directly.
        element :title, as: :raw_title, with: { type: "html" }
        element :title, as: :raw_title, with: { type: "xhtml" }
        element :title, as: :raw_title, with: { type: "xml" }
        element :title, as: :title, with: { type: "text" }
        element :title, as: :title, with: { type: nil }
        element :title, as: :title_type, value: :type

        element :name, as: :author
        element :content
        element :summary
        element :enclosure, as: :image, value: :href

        element :published
        element :id, as: :entry_id
        # Older Atom drafts used <created>/<issued>/<modified>.
        element :created, as: :published
        element :issued, as: :published
        element :updated
        element :modified, as: :updated

        elements :category, as: :categories, value: :term

        element :link, as: :url, value: :href, with: {
          type: "text/html",
          rel: "alternate"
        }

        elements :link, as: :links, value: :href
      end
    end

    # Entry title. HTML/XHTML titles captured as raw_title are flattened to
    # whitespace-normalized plain text via Loofah; plain titles pass through.
    def title
      @title ||=
        case @raw_title
        when String
          Loofah.fragment(@raw_title).xpath("normalize-space(.)")
        else
          @title
        end
    end

    # Falls back to the first link's href when no alternate text/html link
    # was matched.
    def url
      @url ||= links.first
    end
  end
end
52 |
--------------------------------------------------------------------------------
/lib/feedjira/util/parse_time.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require "time"
4 | require "date"
5 |
module Feedjira
  module Util
    # Module for safely parsing time strings
    module ParseTime
      # Parse a time value and convert it to UTC without raising errors.
      # Parses a flattened 14-digit time (YYYYmmddHHMMSS) as UTC.
      #
      # === Parameters
      # [datetime] Time definition to be parsed.
      #
      # === Returns
      # A Time instance in UTC or nil if there were errors while parsing.
      def self.call(datetime)
        if datetime.is_a?(Time)
          datetime.utc
        elsif datetime.respond_to?(:to_time)
          # Date/DateTime and anything else convertible. Probe for :to_time,
          # since that is the method actually invoked here (the previous
          # :to_datetime probe did not match the call below).
          datetime.to_time.utc
        else
          parse_string_safely datetime.to_s
        end
      rescue StandardError => e
        Feedjira.logger.debug("Failed to parse time #{datetime}")
        Feedjira.logger.debug(e)
        nil
      end

      # Parse a string safely, handling the special 14-digit format.
      #
      # === Parameters
      # [string] String to be parsed as time.
      #
      # === Returns
      # A Time instance in UTC or nil if there were errors while parsing.
      def self.parse_string_safely(string)
        return nil if string.empty?

        if /\A\d{14}\z/.match?(string)
          # Append "Z" so the flattened timestamp is interpreted as UTC.
          Time.parse("#{string}Z", true)
        else
          Time.parse(string).utc
        end
      end

      private_class_method :parse_string_safely
    end
  end
end
53 |
--------------------------------------------------------------------------------
/lib/feedjira/parser/atom_feed_burner.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
module Feedjira
  module Parser
    # Parser for dealing with Feedburner Atom feeds.
    class AtomFeedBurner
      include SAXMachine
      include FeedUtilities

      element :title
      element :subtitle, as: :description
      element :link, as: :url_text_html, value: :href,
                     with: { type: "text/html" }
      element :link, as: :url_notype, value: :href, with: { type: nil }
      element :link, as: :feed_url_link, value: :href, with: { type: "application/atom+xml" }
      element :"atom10:link", as: :feed_url_atom10_link, value: :href,
                              with: { type: "application/atom+xml" }
      elements :"atom10:link", as: :hubs, value: :href, with: { rel: "hub" }
      elements :entry, as: :entries, class: AtomFeedBurnerEntry

      attr_writer :url, :feed_url

      # NOTE(review): this method body was garbled in the source (angle-bracket
      # content stripped). Reconstructed as an Atom-root + feedburner substring
      # check — confirm against upstream before merging.
      def self.able_to_parse?(xml)
        (xml.include?("<feed") && xml.include?("feedburner")) || false
      end

      # Feed url is <link> with type="text/html" if present,
      # with no type attribute otherwise
      def url
        @url || @url_text_html || @url_notype
      end

      # Feed feed_url is <link> with type="application/atom+xml" if present,
      # <atom10:link> with type="application/atom+xml" otherwise
      def feed_url
        @feed_url || @feed_url_link || @feed_url_atom10_link
      end

      def self.preprocess(xml)
        Preprocessor.new(xml).to_xml
      end
    end
  end
end
45 |
--------------------------------------------------------------------------------
/spec/feedjira/util/parse_time_spec.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require "spec_helper"
4 |
# Specs for ParseTime. Fix: duplicated word in an example description
# ("returns the the datetime" -> "returns the datetime").
RSpec.describe Feedjira::Util::ParseTime do
  describe ".call" do
    it "returns the datetime in utc when given a Time" do
      time = Time.now

      expect(described_class.call(time)).to eq(time.utc)
    end

    it "returns the datetime in utc when given a Date" do
      date = Date.today

      expect(described_class.call(date)).to eq(date.to_time.utc)
    end

    it "returns the datetime in utc when given a String" do
      timestamp = "2016-01-01 00:00:00"

      expect(described_class.call(timestamp)).to eq(Time.parse(timestamp).utc)
    end

    it "returns nil when given an empty String" do
      timestamp = ""

      expect(described_class.call(timestamp)).to be_nil
    end

    it "returns the datetime in utc given a 14-digit time" do
      time = Time.now.utc
      timestamp = time.strftime("%Y%m%d%H%M%S")

      expect(described_class.call(timestamp)).to eq(time.floor)
    end

    context "when given an invalid time string" do
      it "returns nil" do
        timestamp = "2016-51-51 00:00:00"

        expect(described_class.call(timestamp)).to be_nil
      end

      it "logs an error" do
        timestamp = "2016-51-51 00:00:00"

        expect(Feedjira.logger)
          .to receive(:debug).with("Failed to parse time #{timestamp}")
        expect(Feedjira.logger)
          .to receive(:debug).with(an_instance_of(ArgumentError))

        described_class.call(timestamp)
      end
    end
  end
end
58 |
--------------------------------------------------------------------------------
/lib/feedjira/rss_entry_utilities.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
module Feedjira
  # Shared SAXMachine element mappings and helpers mixed into RSS entry
  # parser classes.
  module RSSEntryUtilities
    # rubocop:todo Metrics/MethodLength
    def self.included(mod) # rubocop:todo Metrics/AbcSize, Metrics/MethodLength
      mod.class_exec do
        element :title

        element :"content:encoded", as: :content
        element :"a10:content", as: :content
        element :description, as: :summary

        element :link, as: :url
        element :"a10:link", as: :url, value: :href

        element :author
        element :"dc:creator", as: :author
        element :"a10:name", as: :author

        # Publish date appears under many names across RSS dialects.
        element :pubDate, as: :published
        element :pubdate, as: :published
        element :issued, as: :published
        element :"dc:date", as: :published
        element :"dc:Date", as: :published
        element :"dcterms:created", as: :published

        element :"dcterms:modified", as: :updated
        element :"a10:updated", as: :updated

        # <guid> is parsed into a dedicated class so its isPermaLink
        # attribute can be inspected (see #entry_id / #url below).
        element :guid, as: :entry_id, class: Feedjira::Parser::GloballyUniqueIdentifier
        element :"dc:identifier", as: :dc_identifier

        element :"media:thumbnail", as: :image, value: :url
        element :"media:content", as: :image, value: :url
        element :enclosure, as: :image, value: :url

        element :comments

        elements :category, as: :categories
      end
    end
    # rubocop:enable Metrics/MethodLength

    # The entry id is the <guid> element's text, when one was parsed.
    def entry_id
      @entry_id&.guid
    end

    # Falls back to the guid's URL (permalink guids) when no <link> was found.
    def url
      @url || @entry_id&.url
    end

    # Best-available identifier: guid, then dc:identifier, then the url.
    def id
      entry_id || @dc_identifier || @url
    end
  end
end
58 |
--------------------------------------------------------------------------------
/spec/feedjira/parser/atom_google_alerts_entry_spec.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require "spec_helper"
4 |
# Specs for AtomGoogleAlertsEntry, parsed from the sample Google Alerts
# Atom feed.
describe Feedjira::Parser::AtomGoogleAlertsEntry do
  before do
    feed = Feedjira::Parser::AtomGoogleAlerts.parse sample_google_alerts_atom_feed
    @entry = feed.entries.first
  end

  it "parses the title" do
    expect(@entry.title).to eq "Report offers Prediction of Automotive Slack Market by Top key players like Haldex, Meritor, Bendix ..."
    expect(@entry.raw_title).to eq "Report offers Prediction of Automotive Slack Market by Top key players like Haldex, Meritor, Bendix ..."
    expect(@entry.title_type).to eq "html"
  end

  # Google Alerts wraps target URLs in a google.com redirect; the entry
  # extracts the real URL from the ?url= query param.
  it "parses the url out of the params when the host is google" do
    url = "https://www.exampoo.com"
    entry = described_class.new(url: "https://www.google.com/url?url=#{url}")

    expect(entry.url).to eq url
  end

  it "returns nil when the url is not present" do
    entry = described_class.new

    expect(entry.url).to be_nil
  end

  it "returns nil when the host is not google" do
    entry = described_class.new(url: "https://www.exampoo.com")

    expect(entry.url).to be_nil
  end

  it "parses the content" do
    expect(@entry.content).to eq "Automotive Slack Market reports provides a comprehensive overview of the global market size and share. It provides strategists, marketers and senior ..."
  end

  it "parses the published date" do
    published = Feedjira::Util::ParseTime.call "2019-07-10T11:53:37Z"
    expect(@entry.published).to eq published
  end

  it "parses the updated date" do
    updated = Feedjira::Util::ParseTime.call "2019-07-10T11:53:37Z"
    expect(@entry.updated).to eq updated
  end
end
50 |
--------------------------------------------------------------------------------
/lib/feedjira/parser/json_feed_item.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
module Feedjira
  module Parser
    # Parser for dealing with JSON Feed items.
    class JSONFeedItem
      include FeedEntryUtilities

      attr_reader :json, :entry_id, :url, :external_url, :title, :content, :summary,
                  :published, :updated, :image, :banner_image, :author, :categories

      # @param json [Hash] one object from the feed's "items" array. Per the
      #   JSON Feed spec only "id" is required (fetch raises KeyError when it
      #   is missing); every other field is optional.
      def initialize(json)
        @json = json
        @entry_id = json.fetch("id")
        # "url" is optional per the JSON Feed spec, so default to nil rather
        # than raising KeyError for feeds that omit it.
        @url = json.fetch("url", nil)
        @external_url = json.fetch("external_url", nil)
        @title = json.fetch("title", nil)
        @content = parse_content(json.fetch("content_html", nil), json.fetch("content_text", nil))
        @summary = json.fetch("summary", nil)
        @image = json.fetch("image", nil)
        @banner_image = json.fetch("banner_image", nil)
        @published = parse_published(json.fetch("date_published", nil))
        @updated = parse_updated(json.fetch("date_modified", nil))
        @author = author_name(json.fetch("author", nil))
        @categories = json.fetch("tags", [])
      end

      private

      # Returns a UTC Time, or nil when the item has no publish date.
      def parse_published(date_published)
        return nil unless date_published

        Feedjira::Util::ParseTime.call(date_published)
      end

      # Returns a UTC Time, or nil when the item has no modification date.
      def parse_updated(date_modified)
        return nil unless date_modified

        Feedjira::Util::ParseTime.call(date_modified)
      end

      # Convenience method to return the included content type.
      # Prefer content_html unless it isn't included.
      def parse_content(content_html, content_text)
        return content_html unless content_html.nil?

        content_text
      end

      # Extracts the display name from the optional author object.
      def author_name(author_obj)
        return nil if author_obj.nil?

        author_obj["name"]
      end
    end
  end
end
58 |
--------------------------------------------------------------------------------
/lib/feedjira/configuration.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | # Feedjira::Configuration
# Feedjira::Configuration
module Feedjira
  # Provides global configuration options for Feedjira
  #
  # @example Set configuration options using a block
  #   Feedjira.configure do |config|
  #     config.strip_whitespace = true
  #   end
  module Configuration
    attr_accessor(
      :logger,
      :parsers,
      :strip_whitespace
    )

    # Modify Feedjira's current configuration
    #
    # @yieldparam [Feedjira] config current Feedjira config
    # @example
    #   Feedjira.configure do |config|
    #     config.strip_whitespace = true
    #   end
    def configure
      yield self
    end

    # Reset Feedjira's configuration to defaults
    #
    # @example
    #   Feedjira.reset_configuration!
    def reset_configuration!
      set_default_configuration
    end

    # @private
    def self.extended(base)
      base.set_default_configuration
    end

    # @private
    def set_default_configuration
      self.logger = default_logger
      self.parsers = default_parsers
      self.strip_whitespace = false
    end

    private

    # @private
    # Default logger writes to $stdout at WARN level.
    def default_logger
      Logger.new($stdout).tap do |logger|
        logger.progname = "Feedjira"
        logger.level = Logger::WARN
      end
    end

    # @private
    # Order matters: more specific parsers are tried before generic ones.
    def default_parsers
      [
        Feedjira::Parser::ITunesRSS,
        Feedjira::Parser::RSSFeedBurner,
        Feedjira::Parser::GoogleDocsAtom,
        Feedjira::Parser::AtomYoutube,
        Feedjira::Parser::AtomFeedBurner,
        Feedjira::Parser::AtomGoogleAlerts,
        Feedjira::Parser::Atom,
        Feedjira::Parser::RSS,
        Feedjira::Parser::JSONFeed
      ]
    end
  end
end
75 |
--------------------------------------------------------------------------------
/spec/feedjira/parser/atom_google_alerts_spec.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require "spec_helper"
4 |
# Specs for the AtomGoogleAlerts parser. Fix: example description typo
# ("descripton" -> "description").
module Feedjira
  module Parser
    describe "#able_to_parse?" do
      it "returns true for a Google Alerts atom feed" do
        expect(AtomGoogleAlerts).to be_able_to_parse(sample_google_alerts_atom_feed)
      end

      it "returns false for an rdf feed" do
        expect(AtomGoogleAlerts).not_to be_able_to_parse(sample_rdf_feed)
      end

      it "returns false for a regular atom feed" do
        expect(AtomGoogleAlerts).not_to be_able_to_parse(sample_atom_feed)
      end

      it "returns false for a feedburner atom feed" do
        expect(AtomGoogleAlerts).not_to be_able_to_parse(sample_feedburner_atom_feed)
      end
    end

    describe "parsing" do
      before do
        @feed = AtomGoogleAlerts.parse(sample_google_alerts_atom_feed)
      end

      it "parses the title" do
        expect(@feed.title).to eq "Google Alert - Slack"
      end

      it "parses the description" do
        expect(@feed.description).to be_nil
      end

      it "parses the url" do
        expect(@feed.url).to eq "https://www.google.com/alerts/feeds/04175468913983673025/4428013283581841004"
      end

      it "parses the feed_url" do
        expect(@feed.feed_url).to eq "https://www.google.com/alerts/feeds/04175468913983673025/4428013283581841004"
      end

      it "parses entries" do
        expect(@feed.entries.size).to eq 20
      end
    end

    describe "preprocessing" do
      it "retains markup in xhtml content" do
        AtomGoogleAlerts.preprocess_xml = true

        feed = AtomGoogleAlerts.parse sample_google_alerts_atom_feed
        entry = feed.entries.first

        expect(entry.content).to include("Slack")
      end
    end
  end
end
63 |
--------------------------------------------------------------------------------
/lib/feedjira/parser/itunes_rss_item.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
module Feedjira
  module Parser
    # iTunes extensions to the standard RSS2.0 item
    # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
    class ITunesRSSItem
      include SAXMachine
      include FeedEntryUtilities
      include RSSEntryUtilities

      # Drop the generic enclosure mapping from RSSEntryUtilities; the
      # enclosure attributes are captured individually below.
      sax_config.top_level_elements["enclosure"].clear

      # If author is not present use author tag on the item
      element :"itunes:author", as: :itunes_author
      element :"itunes:block", as: :itunes_block
      element :"itunes:duration", as: :itunes_duration
      element :"itunes:explicit", as: :itunes_explicit
      element :"itunes:keywords", as: :itunes_keywords
      element :"itunes:subtitle", as: :itunes_subtitle
      element :"itunes:image", value: :href, as: :itunes_image
      element :"itunes:isClosedCaptioned", as: :itunes_closed_captioned
      element :"itunes:order", as: :itunes_order
      element :"itunes:season", as: :itunes_season
      element :"itunes:episode", as: :itunes_episode
      element :"itunes:title", as: :itunes_title
      element :"itunes:episodeType", as: :itunes_episode_type

      # If summary is not present, use the description tag
      element :"itunes:summary", as: :itunes_summary
      element :enclosure, value: :length, as: :enclosure_length
      element :enclosure, value: :type, as: :enclosure_type
      element :enclosure, value: :url, as: :enclosure_url
      elements "psc:chapter", as: :raw_chapters, class: Feedjira::Parser::PodloveChapter

      # Podlove requires clients to re-order by start time in the
      # event the publisher doesn't provide them in that
      # order. SAXMachine doesn't have any sort capability afaik, so
      # we have to sort chapters manually.
      def chapters
        raw_chapters.sort_by(&:start)
      end
    end
  end
end
46 |
--------------------------------------------------------------------------------
/spec/feedjira/parser/google_docs_atom_spec.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require "spec_helper"
4 |
# Specs for the GoogleDocsAtom parser. Fixes: describe-string typo
# (".able_to_parser?" -> ".able_to_parse?") and "interhited" -> "inherited".
module Feedjira
  module Parser
    describe ".able_to_parse?" do
      it "returns true for Google Docs feed" do
        expect(GoogleDocsAtom).to be_able_to_parse(sample_google_docs_list_feed)
      end

      it "is not able to parse another Atom feed" do
        expect(GoogleDocsAtom).not_to be_able_to_parse(sample_atom_feed)
      end
    end

    describe "parsing" do
      before do
        @feed = GoogleDocsAtom.parse(sample_google_docs_list_feed)
      end

      it "returns a bunch of objects" do
        expect(@feed.entries).not_to be_empty
      end

      it "populates a title, inherited from the Atom entry" do
        expect(@feed.title).not_to be_nil
      end

      it "returns a bunch of entries of type GoogleDocsAtomEntry" do
        expect(@feed.entries.first).to be_a GoogleDocsAtomEntry
      end
    end

    describe "#url" do
      it "returns the url when set" do
        feed = GoogleDocsAtom.new

        feed.url = "http://exampoo.com/feed"

        expect(feed.url).to eq "http://exampoo.com/feed"
      end

      it "returns the first link when not set" do
        feed = GoogleDocsAtom.new

        feed.links = ["http://exampoo.com/feed"]

        expect(feed.url).to eq "http://exampoo.com/feed"
      end
    end

    describe "#feed_url" do
      it "returns the feed_url when set" do
        feed = GoogleDocsAtom.new

        feed.feed_url = "http://exampoo.com/feed"

        expect(feed.feed_url).to eq "http://exampoo.com/feed"
      end

      it "returns the first link when not set" do
        feed = GoogleDocsAtom.new

        feed.links = ["http://exampoo.com/feed"]

        expect(feed.feed_url).to eq "http://exampoo.com/feed"
      end
    end
  end
end
72 |
--------------------------------------------------------------------------------
/spec/feedjira/parser/json_feed_spec.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require "spec_helper"
4 |
# Detection and parsing specs for the JSONFeed parser, driven by the sample
# JSON feed fixtures.
module Feedjira
  module Parser
    describe ".able_to_parse?" do
      it "returns true for a JSON feed" do
        expect(JSONFeed).to be_able_to_parse(sample_json_feed)
      end

      # Some publishers escape slashes in the version URL.
      it "returns true for a JSON feed with escaped URIs" do
        expect(JSONFeed).to be_able_to_parse(sample_json_feed_with_escaped_uris)
      end

      it "returns false for an RSS feed" do
        expect(JSONFeed).not_to be_able_to_parse(sample_rss_feed)
      end

      it "returns false for an Atom feed" do
        expect(JSONFeed).not_to be_able_to_parse(sample_atom_feed)
      end
    end

    describe "parsing" do
      before do
        @feed = JSONFeed.parse(sample_json_feed)
      end

      it "parses the version" do
        expect(@feed.version).to eq "https://jsonfeed.org/version/1"
      end

      it "parses the title" do
        expect(@feed.title).to eq "inessential.com"
      end

      it "parses the url" do
        expect(@feed.url).to eq "http://inessential.com/"
      end

      it "parses the feed_url" do
        expect(@feed.feed_url).to eq "http://inessential.com/feed.json"
      end

      it "parses the description" do
        expect(@feed.description).to eq "Brent Simmons’s weblog."
      end

      it "parses the favicon" do
        expect(@feed.favicon).to eq "http://inessential.com/favicon.ico"
      end

      it "parses the icon" do
        expect(@feed.icon).to eq "http://inessential.com/icon.png"
      end

      it "parses the language" do
        expect(@feed.language).to eq "en-US"
      end

      it "parses expired and return default (nil)" do
        expect(@feed.expired).to be_nil
      end

      it "parses entries" do
        expect(@feed.entries.size).to eq 20
      end
    end
  end
end
72 |
--------------------------------------------------------------------------------
/spec/sample_feeds/HREFConsideredHarmfulFirstEntry.xml:
--------------------------------------------------------------------------------
1 |
There's lots to like about Google's new web browser, Chrome, which was released today. When I read the awesome comic strip introduction yesterday, however, the thing that stood out most for me was in very small type: the name Lars Bak attached to the V8 JavaScript engine. I know of Lars from his work on Self, Strongtalk, HotSpot and OOVM, and his involvement in V8 says a lot about the kind of language implementation it will be. David Griswold has posted some more information on the Strongtalk list:
2 |
3 |
4 | The V8 development team has multiple members of the original
5 | Animorphic team; it is headed by Lars Bak, who was the technical lead
6 | for both Strongtalk and the HotSpot Java VM (as well as a huge
7 | contributor to the original Self VM). I think that you will find
8 | that V8 has a lot of the creamy goodness of the Strongtalk and Self
9 | VMs, with many big architectural improvements
10 |
11 |
12 | I'll post more on this later, but things are getting interesting...
13 |
14 |
Update: the V8 code is already available, and builds and runs fine on Mac OS X. From the design docs, it's pretty clear that this is indeed what I was hoping for: a mainstream, open source dynamic language implementation that learned and applies the lessons from Smalltalk, Self and Strongtalk. Most telling are that the only two papers cited in that document are titled "An Efficient Implementation of Self" and "An Efficient Implementation of the Smalltalk-80 System".
15 |
16 |
The "classes as nodes in a state machine" trick for expando properties is especially neat.
17 |
18 |
The bad news: V8 is over 100,000 lines of C++.
19 |
20 |
21 |
22 |
--------------------------------------------------------------------------------
/spec/feedjira/parser/rss_feed_burner_spec.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require "spec_helper"
4 |
# Detection and parsing specs for the RSSFeedBurner parser, driven by the
# sample TechCrunch feedburner RSS fixture.
module Feedjira
  module Parser
    describe "#will_parse?" do
      it "returns true for a feedburner rss feed" do
        expect(RSSFeedBurner).to be_able_to_parse sample_rss_feed_burner_feed
      end

      it "returns false for a regular RSS feed" do
        expect(RSSFeedBurner).not_to be_able_to_parse sample_rss_feed
      end

      it "returns false for a feedburner atom feed" do
        expect(RSSFeedBurner).not_to be_able_to_parse sample_feedburner_atom_feed
      end

      it "returns false for an rdf feed" do
        expect(RSSFeedBurner).not_to be_able_to_parse sample_rdf_feed
      end

      it "returns false for a regular atom feed" do
        expect(RSSFeedBurner).not_to be_able_to_parse sample_atom_feed
      end
    end

    describe "parsing" do
      before do
        @feed = RSSFeedBurner.parse(sample_rss_feed_burner_feed)
      end

      it "parses the title" do
        expect(@feed.title).to eq "TechCrunch"
      end

      it "parses the description" do
        description = "TechCrunch is a group-edited blog that profiles the companies, products and events defining and transforming the new web."
        expect(@feed.description).to eq description
      end

      it "parses the url" do
        expect(@feed.url).to eq "http://techcrunch.com"
      end

      it "parses the last build date" do
        expect(@feed.last_built).to eq "Wed, 02 Nov 2011 17:29:59 +0000"
      end

      it "parses the hub urls" do
        expect(@feed.hubs.count).to eq 2
        expect(@feed.hubs.first).to eq "http://pubsubhubbub.appspot.com/"
      end

      it "provides an accessor for the feed_url" do
        expect(@feed).to respond_to :feed_url
        expect(@feed).to respond_to :feed_url=
      end

      it "parses entries" do
        expect(@feed.entries.size).to eq 20
      end
    end
  end
end
67 |
--------------------------------------------------------------------------------
/lib/feedjira/feed_entry_utilities.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
module Feedjira
  # Behavior shared by all feed entry classes: date handling, id fallback,
  # sanitization, and hash-like field access.
  module FeedEntryUtilities
    include Enumerable

    # Falls back to the updated timestamp when no publish time was parsed.
    def published
      @published ||= @updated
    end

    # Parses +string+ into a UTC Time; returns nil (and logs at debug) on
    # any parse failure.
    def parse_datetime(string)
      DateTime.parse(string).to_time.utc
    rescue StandardError => e
      Feedjira.logger.debug("Failed to parse date #{string.inspect}")
      Feedjira.logger.debug(e)
      nil
    end

    ##
    # Returns the id of the entry or its url if not id is present, as some
    # formats don't support it
    # rubocop:disable Naming/MemoizedInstanceVariableName
    def id
      @entry_id ||= @url
    end
    # rubocop:enable Naming/MemoizedInstanceVariableName

    ##
    # Writer for published. By default, we keep the "oldest" publish time found.
    def published=(val)
      parsed = parse_datetime(val)
      @published = parsed if parsed && (!@published || parsed < @published)
    end

    ##
    # Writer for updated. By default, we keep the most recent update time found.
    def updated=(val)
      parsed = parse_datetime(val)
      @updated = parsed if parsed && (!@updated || parsed > @updated)
    end

    # Scrubs string-valued fields in place using Loofah's :prune scrubber.
    def sanitize!
      %w[title author summary content image].each do |name|
        next unless respond_to?(name)

        current_value = send(name)
        if current_value.is_a?(String)
          send(:"#{name}=", Loofah.scrub_fragment(current_value, :prune).to_s)
        end
      end
    end

    alias last_modified published

    # Yields each (field, value) pair. The field list is computed once from
    # the instance variables that have a matching public reader.
    def each
      @rss_fields ||= instance_variables.map do |ivar|
        ivar.to_s.sub("@", "")
      end.select do |field| # rubocop:disable Style/MultilineBlockChain
        # select callable (public) methods only
        respond_to?(field)
      end

      @rss_fields.each do |field|
        yield(field, instance_variable_get(:"@#{field}"))
      end
    end

    # Hash-style reader for an entry field.
    def [](field)
      instance_variable_get(:"@#{field}")
    end

    # Hash-style writer for an entry field.
    def []=(field, value)
      instance_variable_set(:"@#{field}", value)
    end
  end
end
77 |
--------------------------------------------------------------------------------
/spec/feedjira/parser/atom_feed_burner_entry_spec.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require "spec_helper"
4 |
# Unit specs for AtomFeedBurnerEntry, exercised through a full
# AtomFeedBurner parse of the PaulDixExplainsNothing.xml fixture.
describe Feedjira::Parser::AtomFeedBurnerEntry do
  before do
    # Disable XML preprocessing so the fixture is parsed verbatim.
    Feedjira::Parser::AtomFeedBurner.preprocess_xml = false
    # I don't really like doing it this way because these unit tests should
    # only rely on AtomEntry, but this is actually how it should work. You
    # would never just pass entry xml straight to the AtomEntry.
    feed = Feedjira::Parser::AtomFeedBurner.parse sample_feedburner_atom_feed
    @entry = feed.entries.first
  end

  it "parses the title" do
    expect(@entry.title).to eq "Making a Ruby C library even faster"
  end

  # Uses a dedicated fixture whose entry has no feedburner:origLink.
  it "is able to fetch a url via the 'alternate' rel if no origLink exists" do
    xml = File.read("#{File.dirname(__FILE__)}/../../sample_feeds/PaulDixExplainsNothingAlternate.xml")
    entry = Feedjira::Parser::AtomFeedBurner.parse(xml).entries.first
    expect(entry.url).to eq("http://feeds.feedburner.com/~r/PaulDixExplainsNothing/~3/519925023/making-a-ruby-c-library-even-faster.html")
  end

  it "parses the url" do
    expect(@entry.url).to eq "http://www.pauldix.net/2009/01/making-a-ruby-c-library-even-faster.html"
  end

  # Uses a fixture whose entry links carry no rel="alternate".
  it "parses the url when there is no alternate" do
    xml = File.read("#{File.dirname(__FILE__)}/../../sample_feeds/FeedBurnerUrlNoAlternate.xml")
    entry = Feedjira::Parser::AtomFeedBurner.parse(xml).entries.first
    expect(entry.url).to eq "http://example.com/QQQQ.html"
  end

  it "parses the author" do
    expect(@entry.author).to eq "Paul Dix"
  end

  it "parses the content" do
    expect(@entry.content).to eq sample_feedburner_atom_entry_content
  end

  it "provides a summary" do
    summary = "Last week I released the first version of a SAX based XML parsing library called SAX-Machine. It uses Nokogiri, which uses libxml, so it's pretty fast. However, I felt that it could be even faster. The only question was how..."
    expect(@entry.summary).to eq summary
  end

  it "parses the published date" do
    published = Feedjira::Util::ParseTime.call "Thu Jan 22 15:50:22 UTC 2009"
    expect(@entry.published).to eq published
  end

  it "parses the categories" do
    expect(@entry.categories).to eq ["Ruby", "Another Category"]
  end
end
57 |
--------------------------------------------------------------------------------
/spec/sample_feeds.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
# Spec helper mixin that exposes one zero-argument method per sample feed
# fixture (e.g. `sample_rss_feed` returns the contents of
# spec/sample_feeds/TenderLovemaking.xml).
module SampleFeeds
  # Maps helper-method names to fixture filenames under spec/sample_feeds/.
  FEEDS = {
    sample_atom_feed: "AmazonWebServicesBlog.xml",
    sample_atom_simple: "atom_simple_single_entry.xml",
    sample_atom_simple_link_self: "atom_simple_single_entry_link_self.xml",
    sample_atom_middleman_feed: "FeedjiraBlog.xml",
    sample_atom_xhtml_feed: "pet_atom.xml",
    sample_atom_feed_line_breaks: "AtomFeedWithSpacesAroundEquals.xml",
    sample_atom_entry_content: "AmazonWebServicesBlogFirstEntryContent.xml",
    sample_itunes_feed: "itunes.xml",
    sample_itunes_feedburner_feed: "itunes_feedburner.xml",
    sample_itunes_feed_with_single_quotes: "ITunesWithSingleQuotedAttributes.xml",
    sample_itunes_feed_with_spaces: "ITunesWithSpacesInAttributes.xml",
    sample_podlove_feed: "CRE.xml",
    sample_rdf_feed: "HREFConsideredHarmful.xml",
    sample_rdf_entry_content: "HREFConsideredHarmfulFirstEntry.xml",
    sample_rss_feed_burner_feed: "TechCrunch.xml",
    sample_rss_feed_burner_entry_content: "TechCrunchFirstEntry.xml",
    sample_rss_feed_burner_entry_description: "TechCrunchFirstEntryDescription.xml",
    sample_rss_feed: "TenderLovemaking.xml",
    sample_rss_entry_content: "TenderLovemakingFirstEntry.xml",
    sample_feedburner_atom_feed: "PaulDixExplainsNothing.xml",
    sample_feedburner_atom_feed_alternate: "GiantRobotsSmashingIntoOtherGiantRobots.xml",
    sample_feedburner_atom_entry_content: "PaulDixExplainsNothingFirstEntryContent.xml",
    sample_google_alerts_atom_feed: "google_alerts_atom.xml",
    sample_wfw_feed: "PaulDixExplainsNothingWFW.xml",
    sample_google_docs_list_feed: "GoogleDocsList.xml",
    sample_feed_burner_atom_xhtml_feed: "FeedBurnerXHTML.xml",
    sample_duplicate_content_atom_feed: "DuplicateContentAtomFeed.xml",
    sample_youtube_atom_feed: "youtube_atom.xml",
    sample_atom_xhtml_with_escpaed_html_in_pre_tag_feed: "AtomEscapedHTMLInPreTag.xml",
    sample_json_feed: "json_feed.json",
    sample_json_feed_with_escaped_uris: "json_feed_with_escaped_uris.json",
    sample_rss_feed_huffpost_ca: "HuffPostCanada.xml",
    sample_invalid_date_format_feed: "InvalidDateFormat.xml",
    sample_rss_feed_permalinks: "Permalinks.xml",
    sample_rss_feed_with_a10_namespace: "a10.xml",
    sample_rss_feed_with_comments: "RSSWithComments.xml"
  }.freeze

  FEEDS.each do |method, filename|
    define_method(method) { load_sample filename }
  end

  # Reads a fixture from the sample_feeds directory next to this file.
  # BUG FIX: the path previously ended in the literal "#(unknown)" instead
  # of interpolating +filename+, so every helper read a nonexistent file.
  def load_sample(filename)
    File.read("#{File.dirname(__FILE__)}/sample_feeds/#{filename}")
  end
end
51 |
--------------------------------------------------------------------------------
/lib/feedjira/parser/itunes_rss.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
module Feedjira
  module Parser
    # iTunes is RSS 2.0 + some apple extensions
    # Sources:
    # * https://cyber.harvard.edu/rss/rss.html
    # * http://lists.apple.com/archives/syndication-dev/2005/Nov/msg00002.html
    # * https://help.apple.com/itc/podcasts_connect/
    class ITunesRSS
      include SAXMachine
      include FeedUtilities

      # Set by the caller after fetching; not parsed from the document.
      attr_accessor :feed_url

      # RSS 2.0 elements that need including
      element :copyright
      element :description
      element :image, class: RSSImage
      element :language
      element :lastBuildDate, as: :last_built
      element :link, as: :url
      element :managingEditor, as: :managing_editor
      # Feed version comes from the version attribute of the <rss> root tag.
      element :rss, as: :version, value: :version
      element :title
      element :ttl

      # If author is not present use managingEditor on the channel
      element :"itunes:author", as: :itunes_author
      element :"itunes:block", as: :itunes_block
      # itunes:image carries its URL in the href attribute, not in its body.
      element :"itunes:image", value: :href, as: :itunes_image
      element :"itunes:explicit", as: :itunes_explicit
      element :"itunes:complete", as: :itunes_complete
      element :"itunes:keywords", as: :itunes_keywords
      element :"itunes:type", as: :itunes_type

      # New URL for the podcast feed
      element :"itunes:new_feed_url", as: :itunes_new_feed_url
      element :"itunes:subtitle", as: :itunes_subtitle

      # If summary is not present, use the description tag
      element :"itunes:summary", as: :itunes_summary

      # iTunes RSS feeds can have multiple main categories and multiple
      # sub-categories per category.
      elements :"itunes:category", as: :_itunes_categories,
               class: ITunesRSSCategory
      private :_itunes_categories

      # Flattened list built from each category's each_subcategory
      # enumerator (see ITunesRSSCategory for the exact shape).
      def itunes_categories
        _itunes_categories.flat_map do |itunes_category|
          itunes_category.enum_for(:each_subcategory).to_a
        end
      end

      # Flattened list built from each category's each_path enumerator —
      # presumably category/sub-category paths; confirm in ITunesRSSCategory.
      def itunes_category_paths
        _itunes_categories.flat_map do |itunes_category|
          itunes_category.enum_for(:each_path).to_a
        end
      end

      elements :"itunes:owner", as: :itunes_owners, class: ITunesRSSOwner
      elements :item, as: :entries, class: ITunesRSSItem

      # A document is treated as an iTunes feed when it declares the iTunes
      # podcast DTD namespace (case-insensitive, single or double quotes,
      # optional single space around the equals sign).
      def self.able_to_parse?(xml)
        %r{xmlns:itunes\s?=\s?["']http://www\.itunes\.com/dtds/podcast-1\.0\.dtd["']}i =~ xml
      end
    end
  end
end
71 |
--------------------------------------------------------------------------------
/spec/feedjira/parser/rss_spec.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require "spec_helper"
4 |
# Specs for the plain RSS 2.0 parser, driven by the TenderLovemaking.xml
# fixture (see spec/sample_feeds.rb for the helper methods).
describe Feedjira::Parser::RSS do
  describe "#will_parse?" do
    it "returns true for an RSS feed" do
      expect(described_class).to be_able_to_parse(sample_rss_feed)
    end

    it "returns false for an atom feed" do
      expect(described_class).not_to be_able_to_parse(sample_atom_feed)
    end

    # FeedBurner RSS is handled by a more specific parser class.
    it "returns false for an rss feedburner feed" do
      able = described_class.able_to_parse? sample_rss_feed_burner_feed
      expect(able).to be false
    end
  end

  describe "parsing" do
    before do
      @feed = described_class.parse(sample_rss_feed)
    end

    it "parses the version" do
      expect(@feed.version).to eq "2.0"
    end

    it "parses the title" do
      expect(@feed.title).to eq "Tender Lovemaking"
    end

    it "parses the description" do
      expect(@feed.description).to eq "The act of making love, tenderly."
    end

    it "parses the url" do
      expect(@feed.url).to eq "http://tenderlovemaking.com"
    end

    it "parses the ttl" do
      expect(@feed.ttl).to eq "60"
    end

    it "parses the last build date" do
      expect(@feed.last_built).to eq "Sat, 07 Sep 2002 09:42:31 GMT"
    end

    it "parses the hub urls" do
      expect(@feed.hubs.count).to eq 1
      expect(@feed.hubs.first).to eq "http://pubsubhubbub.appspot.com/"
    end

    # feed_url is an attr_accessor set by the caller, never parsed.
    it "provides an accessor for the feed_url" do
      expect(@feed).to respond_to :feed_url
      expect(@feed).to respond_to :feed_url=
    end

    it "parses the language" do
      expect(@feed.language).to eq "en"
    end

    it "parses the image url" do
      expect(@feed.image.url).to eq "https://tenderlovemaking.com/images/header-logo-text-trimmed.png"
    end

    it "parses the image title" do
      expect(@feed.image.title).to eq "Tender Lovemaking"
    end

    it "parses the image link" do
      expect(@feed.image.link).to eq "http://tenderlovemaking.com"
    end

    it "parses the image width" do
      expect(@feed.image.width).to eq "766"
    end

    it "parses the image height" do
      expect(@feed.image.height).to eq "138"
    end

    it "parses the image description" do
      expect(@feed.image.description).to eq "The act of making love, tenderly."
    end

    it "parses entries" do
      expect(@feed.entries.size).to eq 10
    end
  end
end
93 |
--------------------------------------------------------------------------------
/spec/feedjira/feed_spec.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require "spec_helper"
4 |
# Specs for Feedjira::Feed's class-level hooks that add elements to every
# feed/entry parser class.
#
# NOTE(review): these helpers appear to register elements on the parser
# classes themselves, so additions made in before blocks persist for the
# rest of the suite — confirm this global mutation is intended.
describe Feedjira::Feed do
  describe ".add_common_feed_element" do
    before(:all) do
      described_class.add_common_feed_element("generator")
    end

    it "parses the added element out of Atom feeds" do
      expect(Feedjira.parse(sample_wfw_feed).generator).to eq "TypePad"
    end

    it "parses the added element out of Atom Feedburner feeds" do
      expect(Feedjira::Parser::Atom.new).to respond_to(:generator)
    end

    it "parses the added element out of RSS feeds" do
      expect(Feedjira::Parser::RSS.new).to respond_to(:generator)
    end
  end

  describe ".add_common_feed_elements" do
    before do
      described_class.add_common_feed_elements(:foos)
    end

    # Plural elements default to an empty collection when absent.
    it "parses the added element out of Atom feeds" do
      expect(Feedjira.parse(sample_wfw_feed).foos).to eq []
    end

    it "parses the added element out of Atom Feedburner feeds" do
      expect(Feedjira::Parser::Atom.new).to respond_to(:foos)
    end

    it "parses the added element out of RSS feeds" do
      expect(Feedjira::Parser::RSS.new).to respond_to(:foos)
    end
  end

  describe ".add_common_feed_entry_element" do
    before(:all) do
      tag = "wfw:commentRss"
      described_class.add_common_feed_entry_element tag, as: :comment_rss
    end

    it "parses the added element out of Atom feeds entries" do
      entry = Feedjira.parse(sample_wfw_feed).entries.first
      expect(entry.comment_rss).to eq "this is the new val"
    end

    it "parses the added element out of Atom Feedburner feeds entries" do
      expect(Feedjira::Parser::AtomEntry.new).to respond_to(:comment_rss)
    end

    it "parses the added element out of RSS feeds entries" do
      expect(Feedjira::Parser::RSSEntry.new).to respond_to(:comment_rss)
    end
  end

  describe ".add_common_feed_entry_elements" do
    before do
      described_class.add_common_feed_entry_elements(:things)
    end

    it "parses the added element out of Atom feeds entries" do
      entry = Feedjira.parse(sample_wfw_feed).entries.first
      expect(entry.things).to eq []
    end

    it "parses the added element out of Atom Feedburner feeds entries" do
      expect(Feedjira::Parser::AtomEntry.new).to respond_to(:things)
    end

    it "parses the added element out of RSS feeds entries" do
      expect(Feedjira::Parser::RSSEntry.new).to respond_to(:things)
    end
  end
end
81 |
--------------------------------------------------------------------------------
/lib/feedjira.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require "zlib"
4 | require "sax-machine"
5 | require "loofah"
6 | require "logger"
7 | require "json"
8 |
9 | require_relative "feedjira/util"
10 | require_relative "feedjira/util/parse_time"
11 | require_relative "feedjira/configuration"
12 | require_relative "feedjira/feed_entry_utilities"
13 | require_relative "feedjira/feed_utilities"
14 | require_relative "feedjira/feed"
15 | require_relative "feedjira/rss_entry_utilities"
16 | require_relative "feedjira/atom_entry_utilities"
17 | require_relative "feedjira/parser"
18 | require_relative "feedjira/parser/globally_unique_identifier"
19 | require_relative "feedjira/parser/rss_entry"
20 | require_relative "feedjira/parser/rss_image"
21 | require_relative "feedjira/parser/rss"
22 | require_relative "feedjira/parser/atom_entry"
23 | require_relative "feedjira/parser/atom"
24 | require_relative "feedjira/preprocessor"
25 | require_relative "feedjira/version"
26 |
27 | require_relative "feedjira/parser/rss_feed_burner_entry"
28 | require_relative "feedjira/parser/rss_feed_burner"
29 | require_relative "feedjira/parser/podlove_chapter"
30 | require_relative "feedjira/parser/itunes_rss_owner"
31 | require_relative "feedjira/parser/itunes_rss_category"
32 | require_relative "feedjira/parser/itunes_rss_item"
33 | require_relative "feedjira/parser/itunes_rss"
34 | require_relative "feedjira/parser/atom_feed_burner_entry"
35 | require_relative "feedjira/parser/atom_feed_burner"
36 | require_relative "feedjira/parser/atom_google_alerts_entry"
37 | require_relative "feedjira/parser/atom_google_alerts"
38 | require_relative "feedjira/parser/google_docs_atom_entry"
39 | require_relative "feedjira/parser/google_docs_atom"
40 | require_relative "feedjira/parser/atom_youtube_entry"
41 | require_relative "feedjira/parser/atom_youtube"
42 | require_relative "feedjira/parser/json_feed"
43 | require_relative "feedjira/parser/json_feed_item"
44 |
# Feedjira — top-level API for detecting a parser class and parsing a feed.
module Feedjira
  # Raised by .parse when no registered parser recognizes the document.
  NoParserAvailable = Class.new(StandardError)

  extend Configuration

  module_function

  # Parse XML with first compatible parser (or an explicitly supplied one).
  #
  # @example
  #   xml = HTTParty.get("http://example.com").body
  #   Feedjira.parse(xml)
  def parse(xml, parser: nil, &block)
    chosen = parser || parser_for_xml(xml)

    raise NoParserAvailable, "No valid parser for XML." if chosen.nil?

    chosen.parse(xml, &block)
  end

  # Find compatible parser for given XML, or nil when none matches.
  #
  # @example
  #   xml = HTTParty.get("http://example.com").body
  #   parser = Feedjira.parser_for_xml(xml)
  #   parser.parse(xml)
  def parser_for_xml(xml)
    Feedjira.parsers.detect { |candidate| candidate.able_to_parse?(xml) }
  end
end
78 |
--------------------------------------------------------------------------------
/spec/feedjira/parser/i_tunes_rss_item_spec.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require "spec_helper"
4 |
# Specs for a single iTunes RSS <item>, obtained via a full ITunesRSS
# parse of the itunes.xml fixture.
describe Feedjira::Parser::ITunesRSSItem do
  before do
    # I don't really like doing it this way because these unit tests should
    # only rely on ITunesRssItem, but this is actually how it should work.
    # You would never just pass entry xml straight to the ITunesRssItem.
    @item = Feedjira::Parser::ITunesRSS.parse(sample_itunes_feed).entries.first
  end

  it "parses the title" do
    expect(@item.title).to eq "Shake Shake Shake Your Spices"
  end

  it "parses the itunes title" do
    expect(@item.itunes_title).to eq "Shake Shake Shake Your Spices"
  end

  it "parses the author" do
    expect(@item.itunes_author).to eq "John Doe"
  end

  it "parses the subtitle" do
    expect(@item.itunes_subtitle).to eq "A short primer on table spices"
  end

  it "parses the summary" do
    summary = "This week we talk about salt and pepper shakers, comparing and contrasting pour rates, construction materials, and overall aesthetics. Come and join the party!"
    expect(@item.itunes_summary).to eq summary
  end

  it "parses the itunes season" do
    expect(@item.itunes_season).to eq "1"
  end

  it "parses the itunes episode number" do
    expect(@item.itunes_episode).to eq "3"
  end

  it "parses the itunes episode type" do
    expect(@item.itunes_episode_type).to eq "full"
  end

  it "parses the enclosure" do
    expect(@item.enclosure_length).to eq "8727310"
    expect(@item.enclosure_type).to eq "audio/x-m4a"
    expect(@item.enclosure_url).to eq "http://example.com/podcasts/everything/AllAboutEverythingEpisode3.m4a"
  end

  it "parses the guid as id" do
    expect(@item.id).to eq "http://example.com/podcasts/archive/aae20050615.m4a"
  end

  it "parses the published date" do
    published = Feedjira::Util::ParseTime.call "Wed Jun 15 19:00:00 UTC 2005"
    expect(@item.published).to eq published
  end

  it "parses the duration" do
    expect(@item.itunes_duration).to eq "7:04"
  end

  it "parses the keywords" do
    expect(@item.itunes_keywords).to eq "salt, pepper, shaker, exciting"
  end

  it "parses the image" do
    expect(@item.itunes_image).to eq "http://example.com/podcasts/everything/AllAboutEverything.jpg"
  end

  it "parses the order" do
    expect(@item.itunes_order).to eq "12"
  end

  it "parses the closed captioned flag" do
    expect(@item.itunes_closed_captioned).to eq "yes"
  end

  it "parses the encoded content" do
    # The expected value deliberately keeps the literal newlines from the
    # fixture's encoded-content element.
    content = "
TOPIC: Gooseneck Options
"
    expect(@item.content).to eq content
  end
end
86 |
--------------------------------------------------------------------------------
/spec/sample_feeds/a10.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Some Title
5 | Some Description
6 | Thu, 14 May 2020 10:00:18 Z
7 | Some Category
8 |
9 | Sat, 16 May 2020 08:50:40 GMT
10 |
11 | Title 5
12 | Description 5
13 | Thu, 14 May 2020 10:00:18 Z
14 |
15 |
16 | John Doe
17 | http://www.example.com/
18 | john.doe@example.com
19 |
20 | 2020-05-14T10:00:18Z
21 |
22 |
23 | Title 4
24 | Description 4
25 | Wed, 13 May 2020 10:17:57 Z
26 |
27 |
28 | John Doe
29 | http://www.example.com/
30 | john.doe@example.com
31 |
32 | 2020-05-13T10:17:57Z
33 |
34 |
35 | Title 3
36 | Dfescription 3
37 | Tue, 12 May 2020 15:00:00 Z
38 |
39 |
40 | John Doe
41 | http://www.example.com/
42 | john.doe@example.com
43 |
44 | 2020-05-12T15:00:00Z
45 |
46 |
47 | Title 2
48 | Description 2
49 | Tue, 12 May 2020 07:52:36 Z
50 |
51 |
52 | John Doe
53 | http://www.example.com/
54 | john.doe@example.com
55 |
56 | 2020-05-12T07:52:36Z
57 |
58 |
59 | Title 1
60 | Description 1
61 | Thu, 07 May 2020 07:36:53 Z
62 |
63 |
64 | John Doe
65 | http://www.example.com/
66 | john.doe@example.com
67 |
68 | 2020-05-07T07:36:53Z
69 |
70 |
71 |
72 |
73 |
--------------------------------------------------------------------------------
/spec/feedjira/parser/atom_youtube_entry_spec.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
# Consistency: every sibling spec loads the helper via the load path.
require "spec_helper"
4 |
# Specs for a single YouTube Atom entry, obtained via a full AtomYoutube
# parse of the youtube_atom.xml fixture.
describe Feedjira::Parser::AtomYoutubeEntry do
  describe "parsing" do
    before do
      @feed = Feedjira::Parser::AtomYoutube.parse(sample_youtube_atom_feed)
      @entry = @feed.entries.first
    end

    it "has the title" do
      expect(@entry.title).to eq "The Google app: Questions Title"
    end

    it "has the url" do
      expect(@entry.url).to eq "http://www.youtube.com/watch?v=5shykyfmb28"
    end

    it "has the entry id" do
      expect(@entry.entry_id).to eq "yt:video:5shykyfmb28"
    end

    it "has the published date" do
      expect(@entry.published).to eq Feedjira::Util::ParseTime.call("2015-05-04T00:01:27+00:00")
    end

    it "has the updated date" do
      expect(@entry.updated).to eq Feedjira::Util::ParseTime.call("2015-05-13T17:38:30+00:00")
    end

    it "has the content populated from the media:description element" do
      expect(@entry.content).to eq "A question is the most powerful force in the world. It can start you on an adventure or spark a connection. See where a question can take you. The Google app is available on iOS and Android. Download the app here: http://www.google.com/search/about/download"
    end

    # YouTube Atom entries carry no summary element.
    it "has the summary but blank" do
      expect(@entry.summary).to be_nil
    end

    it "has the custom youtube video id" do
      expect(@entry.youtube_video_id).to eq "5shykyfmb28"
    end

    it "has the custom media title" do
      expect(@entry.media_title).to eq "The Google app: Questions"
    end

    it "has the custom media url" do
      expect(@entry.media_url).to eq "https://www.youtube.com/v/5shykyfmb28?version=3"
    end

    it "has the custom media type" do
      expect(@entry.media_type).to eq "application/x-shockwave-flash"
    end

    it "has the custom media width" do
      expect(@entry.media_width).to eq "640"
    end

    it "has the custom media height" do
      expect(@entry.media_height).to eq "390"
    end

    it "has the custom media thumbnail url" do
      expect(@entry.media_thumbnail_url).to eq "https://i2.ytimg.com/vi/5shykyfmb28/hqdefault.jpg"
    end

    it "has the custom media thumbnail width" do
      expect(@entry.media_thumbnail_width).to eq "480"
    end

    it "has the custom media thumbnail height" do
      expect(@entry.media_thumbnail_height).to eq "360"
    end

    it "has the custom media star count" do
      expect(@entry.media_star_count).to eq "3546"
    end

    it "has the custom media star average" do
      expect(@entry.media_star_average).to eq "4.79"
    end

    it "has the custom media views" do
      expect(@entry.media_views).to eq "251497"
    end
  end
end
89 |
--------------------------------------------------------------------------------
/lib/feedjira/feed_utilities.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
module Feedjira
  # Shared behaviour mixed into every feed parser class: parse-time hooks,
  # update tracking, and detection of new entries between fetches.
  module FeedUtilities
    # Attributes copied from a freshly fetched feed by #update_from_feed.
    UPDATABLE_ATTRIBUTES = %w[title feed_url url last_modified etag].freeze

    attr_writer :new_entries, :updated, :last_modified
    attr_accessor :etag

    def self.included(base)
      base.extend ClassMethods
    end

    module ClassMethods
      # Strips whitespace, optionally preprocesses the XML, then delegates
      # to the underlying (SAXMachine-generated) parser.
      def parse(xml, &)
        xml = strip_whitespace(xml)
        xml = preprocess(xml) if preprocess_xml
        super(xml, &)
      end

      # Hook for subclasses; the default implementation is a no-op.
      def preprocess(xml)
        # noop
        xml
      end

      def preprocess_xml=(value)
        @preprocess_xml = value
      end

      def preprocess_xml
        @preprocess_xml
      end

      # Strips both ends when Feedjira.strip_whitespace is set, otherwise
      # only leading whitespace.
      def strip_whitespace(xml)
        if Feedjira.strip_whitespace
          xml.strip
        else
          xml.lstrip
        end
      end
    end

    # Most recent publish time across entries; nil when no entry is dated.
    def last_modified
      @last_modified ||= entries.reject { |e| e.published.nil? }.max_by(&:published)&.published
    end

    # True once #update_from_feed has detected any changed attribute.
    def updated?
      @updated || false
    end

    def new_entries
      @new_entries ||= []
    end

    def new_entries?
      !new_entries.empty?
    end

    # Merges a freshly fetched +feed+ into this one: prepends entries not
    # seen before and copies over every updatable attribute that changed.
    def update_from_feed(feed)
      self.new_entries += find_new_entries_for(feed)
      entries.unshift(*self.new_entries)

      @updated = false

      UPDATABLE_ATTRIBUTES.each do |name|
        # BUG FIX: `@updated ||= update_attribute(...)` short-circuited once
        # one attribute changed, so the remaining attributes were never
        # copied over. Always evaluate update_attribute for every attribute.
        @updated = update_attribute(feed, name) || @updated
      end
    end

    # Copies attribute +name+ from +feed+ when it differs from the current
    # value; returns true when a change was made.
    def update_attribute(feed, name)
      old_value = send(name)
      new_value = feed.send(name)

      if old_value == new_value
        false
      else
        send(:"#{name}=", new_value)
        true
      end
    end

    def sanitize_entries!
      entries.each(&:sanitize!)
    end

    private

    # This implementation is a hack, which is why it's so ugly. It's to get
    # around the fact that not all feeds have a published date. However,
    # they're always ordered with the newest one first. So we go through the
    # entries just parsed and insert each one as a new entry until we get to
    # one that has the same id as the newest for the feed.
    def find_new_entries_for(feed)
      return feed.entries if entries.empty?

      latest_entry = entries.first
      found_new_entries = []

      feed.entries.each do |entry|
        break unless new_entry?(entry, latest_entry)

        found_new_entries << entry
      end

      found_new_entries
    end

    # An entry is new when its id (or url, for id-less formats) differs
    # from the newest entry already known.
    def new_entry?(entry, latest)
      nil_ids = entry.entry_id.nil? && latest.entry_id.nil?
      new_id = entry.entry_id != latest.entry_id
      new_url = entry.url != latest.url

      (nil_ids || new_id) && new_url
    end
  end
end
117 |
--------------------------------------------------------------------------------
/spec/sample_feeds/AmazonWebServicesBlogFirstEntryContent.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | Late last year an entrepreneur from Turkey visited me at Amazon HQ in Seattle.
4 |
5 | We talked about his plans to use AWS as part of his new social video
6 | portal startup. I won't spill any beans before he's ready to talk
7 | about it himself, but I will say that he has a really good concept,
8 | strong backers, and infectious enthusiasm for the online world.
9 |
10 |
11 |
12 | He's now ready to hire a software architect and designer in order to
13 | bring his vision to life. I've posted the job below; you can
14 | send your resume to apply@web.tv
15 | if you are interested, qualified, and located in the right part
16 | of the world.
17 |
18 |
19 |
20 |
21 |
Software Architect & Designer
22 |
23 |
24 | We are a reputable Internet technology, software services and e-commerce company based
25 | in
26 | Istanbul and
27 | Bursa,
28 | Turkey.
29 | We are looking for a talented Software Architect who will
30 | be working in Istanbul for a certain period of time, for our new global scale
31 | "social video portal" project. Below are the qualifications required and job
32 | description for the position to be held.
33 |
34 |
35 |
38 |
39 |
Qualifications:
40 |
41 |
42 |
Extensive knowledge of web technologies.
43 |
Experienced in web based application design and development.
44 |
Solid bacground in object oriented design and development.
45 |
Preferrably experienced in live broadcasting over the internet, video streaming, video sharing and social networking web site development and design.
46 |
Knowledge and experience of design and development of multi-tier, distributed, massively multi-user systems.
47 |
Experienced in Cloud Computing applications (preferably with AWS).
48 |
Very good command of PHP or Python.
49 |
Experinced in relational database design.
50 |
Familarity with Erlang, and knowledge or experience of Java, C/C++, Ajax, Adobe Flex, mySQL is a plus.
51 |
Self motivated, enthusiastic, team player.
52 |
53 |
54 |
Job Description:
55 |
56 |
Will be mainly responsible for designing the overall system for a multi-tier, massively multi-user live video multi-casting, videosharing web site which will also have features of a social network.
57 |
Will be involved in Design and Development phases of software development cycle. Will contribute to the Analysis phase.
58 |
Will lead the Software Development Team for the period of the contract and report to the Project Coordinator.
"my_id", "url" => "my_url", **overrides }
8 | end
9 |
10 | before do
11 | # I don't really like doing it this way because these unit test should only
12 | # rely on JSONFeed, but this is actually how it should work. You would
13 | # never just pass entry json straight to the JSONFeedItem
14 | @entry = Feedjira::Parser::JSONFeed.parse(sample_json_feed).entries.first
15 | end
16 |
17 | it "parses the id" do
18 | expect(@entry.id).to eq "http://inessential.com/2017/06/02/james_dempsey_and_the_breakpoints_benefi"
19 | end
20 |
21 | it "parses the url" do
22 | expect(@entry.url).to eq "http://inessential.com/2017/06/02/james_dempsey_and_the_breakpoints_benefi"
23 | end
24 |
25 | it "parses the title" do
26 | expect(@entry.title).to eq "James Dempsey and the Breakpoints Benefit App Camp for Girls"
27 | end
28 |
29 | it "parses the content" do
30 | content = "
On Wednesday night I know where I’ll be — playing keyboard for a few songs at the James Dempsey and the Breakpoints concert benefitting App Camp for Girls.
"
31 | expect(@entry.content).to eq content
32 | end
33 |
34 | it "parses the published date" do
35 | published = Feedjira::Util::ParseTime.call "2017-06-02T22:05:47-07:00"
36 | expect(@entry.published).to eq published
37 | end
38 |
39 | it "sets the published date to nil when not present" do
40 | entry = described_class.new(params)
41 |
42 | expect(entry.published).to be_nil
43 | end
44 |
45 | it "sets updated to date_modified when present" do
46 | updated = "2017-06-02T22:05:47-07:00"
47 | entry = described_class.new(params("date_modified" => updated))
48 |
49 | updated = Feedjira::Util::ParseTime.call "2017-06-02T22:05:47-07:00"
50 | expect(entry.updated).to eq updated
51 | end
52 |
53 | it "sets updated to nil when date_modified is not present" do
54 | entry = described_class.new(params)
55 |
56 | expect(entry.updated).to be_nil
57 | end
58 |
59 | it "sets the author when nested author object is present" do
60 | entry = described_class.new(params("author" => { "name" => "John Doe" }))
61 |
62 | expect(entry.author).to eq "John Doe"
63 | end
64 |
65 | it "sets the author to nil when nested author object is not present" do
66 | entry = described_class.new(params)
67 |
68 | expect(entry.author).to be_nil
69 | end
70 |
71 | it "supports each" do
72 | expect(@entry).to respond_to :each
73 | end
74 |
75 | it "is able to list out all the fields with each" do
76 | all_fields = []
77 | title_value = ""
78 | @entry.each do |field, value|
79 | all_fields << field
80 | title_value = value if field == "title"
81 | end
82 |
83 | expect(title_value).to eq "James Dempsey and the Breakpoints Benefit App Camp for Girls"
84 |
85 | expected_fields = %w[
86 | author
87 | banner_image
88 | categories
89 | content
90 | entry_id
91 | external_url
92 | image
93 | json
94 | published
95 | summary
96 | title
97 | updated
98 | url
99 | ]
100 | expect(all_fields).to match_array expected_fields
101 | end
102 |
103 | it "supports checking if a field exists in the entry" do
104 | expect(@entry).to include "title"
105 | expect(@entry).to include "url"
106 | end
107 |
108 | it "allows access to fields with hash syntax" do
109 | expect(@entry["title"]).to eq "James Dempsey and the Breakpoints Benefit App Camp for Girls"
110 | expect(@entry["url"]).to eq "http://inessential.com/2017/06/02/james_dempsey_and_the_breakpoints_benefi"
111 | end
112 |
113 | it "allows setting field values with hash syntax" do
114 | @entry["title"] = "Foobar"
115 | expect(@entry.title).to eq "Foobar"
116 | end
117 | end
118 |
--------------------------------------------------------------------------------
/spec/sample_feeds/TechCrunchFirstEntry.xml:
--------------------------------------------------------------------------------
1 |
Angie’s List, which offers consumers a way to review and rate doctors, contractors and service companies on the Web, has just set the terms for its IPO. In a new filing, the company revealed that it aims to raise as much as $131.4 million in the offering and has priced its IPO in the range of $11 to $13 per share. The company will list on the Nasdaq under the symbol “ANGI.” At the high end of the range, Angie’s List would be valued at nearly $700 million.
2 |
Angie’s List launched in 1995 with a focus on local home, yard and car services, sits at the intersection of local search, user-generated content and subscription-based services. To date, Angie’s List has raised nearly $100 million from Battery Ventures, T. Rowe Price, City Investment Group, Cardinal Ventures and others.
3 |
As of September 30, 2011, the company offered its service to paying members in 175 local markets in the United States (compared to 170 as of August). Angie’s List now has more than 1 million (up from 820,000) paid memberships.
4 |
Angie’s List incurred marketing expenses of $30.2 million and $48 million in 2010 and the nine months ended September 30, 2011, respectively. In 2010 and the nine months ended September 30, 2011, the company’s revenue was $59.0 million and $62.6 million, respectively. In the same periods, Angie’s net loss was $27.2 million and $43.2 million. Angie’s List has incurred net losses its start and had an accumulated deficit of $160.6 million as of September 30, 2011.
Last week I released the first version of a SAX based XML parsing library called SAX-Machine. It uses Nokogiri, which uses libxml, so it's pretty fast. However, I felt that it could be even faster. The only question was how to make a ruby library that is already using c underneath perform better. Since I've never written a Ruby C extension and it's been a few years since I've touched C, I decided it would be a good educational experience to give it a try.
2 |
3 |
First, let's look into how Nokogiri and SAX-Machine perform a parse. The syntax for SAX-Machine builds up a set of class variables (actually, instance variables on a class object) that describe what you're interested in parsing. So when you see something like this:
4 |
5 | It calls the 'element' and 'elements' methods inserted by the SAXMachine module that build up ruby objects that describe what XML tags we're interested in for the Entry class. That's all pretty straight forward and not really the source of any slowdown in the parsing process. These calls only happen once, when you first load the class.
6 |
7 |
Things get interesting when you run a parse. So you run Entry.parse(some_xml). That makes the call to Nokogiri, which in turn makes a call to libxml. Libxml then parses over the stream (or string) and makes calls to C methods (in Nokogiri) on certain events. For our purposes, the most interesting are start_element, end_element, and characters_func. The C code in Nokogiri for these is basic. It simply converts those C variables into Ruby ones and then makes calls to whatever instance of Nokogiri::XML:SAX::Document (a Ruby object) is associated with this parse. This is where SAXMachine comes back in. It has handlers for these events that match up the tags with the previously defined SAXMachine objects attached to the Entry class. It ignores the events that don't match a tag (however, it still needs to determine if the tag should be ignored).
8 |
9 |
The only possible place I saw to speed things up was to push more of SAX event handling down into the C code. Unfortunately, the only way to do this was to abandon Nokogiri and write my own code to interface with libxml. I used the xml_sax_parser.c from Nokogiri as a base and added to it. I changed it so the SAXMachine definitions of what was interesting would be stored in C. I then changed the SAX handling code to capture the events in C and determine if a tag was of interest there before sending it off to the Ruby objects. The end result is that calls are only made to Ruby when there is an actual event of interest. Thus, I avoid doing any comparisons in Ruby and those classes are simply wrappers that call out to the correct value setters.
10 |
11 |
Here are the results of a quick speed comparison against the Nokogiri SAXMachine, parsing my atom feed using code from my last post.
12 |
user system total real sax c 0.060000 0.000000 0.060000 ( 0.069990) sax nokogiri 0.500000 0.010000 0.510000 ( 0.520278)
13 | The SAX C is 7.4 times faster than SAX Nokogiri. Now, that doesn't seem like a whole lot, but I think it's quite good considering it was against a library that was already half in C. It's even more punctuated when you look at the comparison of these two against rfeedparser.
14 |
user system total real sax c 0.060000 0.000000 0.060000 ( 0.069990) sax nokogiri 0.500000 0.010000 0.510000 ( 0.520278) rfeedparser 13.770000 1.730000 15.500000 ( 15.690309)
15 |
The SAX C version is 224 times faster than rfeedparser! The 7 times multiple from the Nokogiri version of SAXMachine really makes a difference. Unfortunately, I really only wrote this code as a test. It's not even close to something I would use for real. It has memory leaks, isn't thread safe, is completely unreadable, and has hidden bugs that I know about. You can take a look at it in all its misery on the c-rafactor branch of SAXMachine on github. Even though the code is awful, I think it's interesting that there can be this much variability in performance on Ruby libraries that are using C.
16 |
17 |
I could actually turn this into a legitimate working version, but it would take more work than I think it's worth at this point. Also, I'm not excited about the idea of dealing with C issues in SAXMachine. I would be more excited for it if I could get this type of SAX parsing thing into Nokogiri (in addition to the one that is there now). For now, I'll move on to using the Nokogiri version of SAXMachine to create a feed parsing library.
18 |
19 |
--------------------------------------------------------------------------------
/spec/feedjira/feed_utilities_entry_spec.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require "spec_helper"
4 |
5 | describe Feedjira::FeedUtilities do
6 | before do
7 | @klass = Class.new do
8 | include Feedjira::FeedEntryUtilities
9 | end
10 | end
11 |
12 | describe "handling dates" do
13 | it "parses an ISO 8601 formatted datetime into Time" do
14 | time = @klass.new.parse_datetime("2008-02-20T8:05:00-010:00")
15 | expect(time.class).to eq Time
16 | expect(time).to eq Feedjira::Util::ParseTime.call("Wed Feb 20 18:05:00 UTC 2008")
17 | end
18 |
19 | it "parses a ISO 8601 with milliseconds into Time" do
20 | time = @klass.new.parse_datetime("2013-09-17T08:20:13.931-04:00")
21 | expect(time.class).to eq Time
22 | expect(time).to eq Time.strptime("Tue Sep 17 12:20:13.931 UTC 2013", "%a %b %d %H:%M:%S.%N %Z %Y")
23 | end
24 | end
25 |
26 | describe "updated= method" do
27 | it "sets updated when no existing updated value and parsed date is valid" do
28 | instance = @klass.new
29 | instance.updated = "2023-01-01T10:00:00Z"
30 | expect(instance["updated"]).to eq Time.parse("2023-01-01T10:00:00Z").utc
31 | end
32 |
33 | it "updates to newer date when existing updated value is older" do
34 | instance = @klass.new
35 | instance.updated = "2023-01-01T10:00:00Z"
36 | instance.updated = "2023-01-02T10:00:00Z"
37 | expect(instance["updated"]).to eq Time.parse("2023-01-02T10:00:00Z").utc
38 | end
39 |
40 | it "keeps existing updated value when new date is older" do
41 | instance = @klass.new
42 | instance.updated = "2023-01-02T10:00:00Z"
43 | instance.updated = "2023-01-01T10:00:00Z"
44 | expect(instance["updated"]).to eq Time.parse("2023-01-02T10:00:00Z").utc
45 | end
46 |
47 | it "does not set updated when date parsing fails" do
48 | instance = @klass.new
49 | instance.updated = "invalid-date"
50 | expect(instance["updated"]).to be_nil
51 | end
52 |
53 | it "does not change existing updated when new date is invalid" do
54 | instance = @klass.new
55 | instance.updated = "2023-01-01T10:00:00Z"
56 | original_updated = instance["updated"]
57 | instance.updated = "invalid-date"
58 | expect(instance["updated"]).to eq original_updated
59 | end
60 | end
61 |
62 | describe "published= method" do
63 | it "sets published when no existing published value and parsed date is valid" do
64 | instance = @klass.new
65 | instance.published = "2023-01-01T10:00:00Z"
66 | expect(instance["published"]).to eq Time.parse("2023-01-01T10:00:00Z").utc
67 | end
68 |
69 | it "updates to older date when existing published value is newer" do
70 | instance = @klass.new
71 | instance.published = "2023-01-02T10:00:00Z"
72 | instance.published = "2023-01-01T10:00:00Z"
73 | expect(instance["published"]).to eq Time.parse("2023-01-01T10:00:00Z").utc
74 | end
75 |
76 | it "keeps existing published value when new date is newer" do
77 | instance = @klass.new
78 | instance.published = "2023-01-01T10:00:00Z"
79 | instance.published = "2023-01-02T10:00:00Z"
80 | expect(instance["published"]).to eq Time.parse("2023-01-01T10:00:00Z").utc
81 | end
82 |
83 | it "does not set published when date parsing fails" do
84 | instance = @klass.new
85 | instance.published = "invalid-date"
86 | expect(instance["published"]).to be_nil
87 | end
88 |
89 | it "does not change existing published when new date is invalid" do
90 | instance = @klass.new
91 | instance.published = "2023-01-01T10:00:00Z"
92 | original_published = instance["published"]
93 | instance.published = "invalid-date"
94 | expect(instance["published"]).to eq original_published
95 | end
96 | end
97 |
98 | describe "sanitizing" do
    before do
      # Parse the shared Atom fixture; the examples below exercise sanitizing
      # on its first entry. @feed is kept as an ivar — other examples in this
      # describe block may read it too.
      @feed = Feedjira.parse(sample_atom_feed)
      @entry = @feed.entries.first
    end
103 |
104 | it "doesn't fail when no elements are defined on includer" do
105 | expect { @klass.new.sanitize! }.not_to raise_error
106 | end
107 |
    it "provides a sanitized title" do
      # NOTE(review): this interpolation is a no-op as written — the original
      # spec likely prefixed unsafe markup (e.g. a <script> tag) that was lost
      # in extraction. Confirm against upstream before trusting this assertion,
      # since as-is both sides scrub the same already-clean string.
      new_title = "#{@entry.title}"
      @entry.title = new_title
      scrubbed_title = Loofah.scrub_fragment(new_title, :prune).to_s
      expect(Loofah.scrub_fragment(@entry.title, :prune).to_s).to eq scrubbed_title
    end
114 |
115 | it "sanitizes content in place" do
116 | new_content = "