├── .rspec ├── lib ├── feedjira │ ├── version.rb │ ├── parser.rb │ ├── util.rb │ ├── parser │ │ ├── rss_entry.rb │ │ ├── itunes_rss_owner.rb │ │ ├── rss_image.rb │ │ ├── atom_entry.rb │ │ ├── globally_unique_identifier.rb │ │ ├── google_docs_atom_entry.rb │ │ ├── rss_feed_burner_entry.rb │ │ ├── podlove_chapter.rb │ │ ├── atom_feed_burner_entry.rb │ │ ├── atom_google_alerts_entry.rb │ │ ├── rss_feed_burner.rb │ │ ├── atom_youtube.rb │ │ ├── atom_google_alerts.rb │ │ ├── google_docs_atom.rb │ │ ├── rss.rb │ │ ├── atom.rb │ │ ├── itunes_rss_category.rb │ │ ├── atom_youtube_entry.rb │ │ ├── json_feed.rb │ │ ├── atom_feed_burner.rb │ │ ├── json_feed_item.rb │ │ ├── itunes_rss_item.rb │ │ └── itunes_rss.rb │ ├── preprocessor.rb │ ├── feed.rb │ ├── atom_entry_utilities.rb │ ├── util │ │ └── parse_time.rb │ ├── rss_entry_utilities.rb │ ├── configuration.rb │ ├── feed_entry_utilities.rb │ └── feed_utilities.rb └── feedjira.rb ├── .github ├── ISSUE_TEMPLATE │ ├── general-issue.md │ └── feed-parsing.md ├── dependabot.yml ├── workflows │ └── ruby.yml └── copilot-instructions.md ├── .gitignore ├── spec ├── support │ └── coverage.rb ├── spec_helper.rb ├── sample_feeds │ ├── AtomEscapedHTMLInPreTag.xml │ ├── atom_simple_single_entry.xml │ ├── atom_simple_single_entry_link_self.xml │ ├── Permalinks.xml │ ├── InvalidDateFormat.xml │ ├── FeedBurnerUrlNoAlternate.xml │ ├── TechCrunchFirstEntryDescription.xml │ ├── atom_with_link_tag_for_url_unmarked.xml │ ├── HREFConsideredHarmfulFirstEntry.xml │ ├── a10.xml │ ├── AmazonWebServicesBlogFirstEntryContent.xml │ ├── ITunesWithSpacesInAttributes.xml │ ├── AtomFeedWithSpacesAroundEquals.xml │ ├── ITunesWithSingleQuotedAttributes.xml │ ├── TechCrunchFirstEntry.xml │ ├── FeedjiraBlog.xml │ ├── itunes.xml │ ├── PaulDixExplainsNothingFirstEntryContent.xml │ ├── TenderLovemakingFirstEntry.xml │ ├── itunes_feedburner.xml │ └── GoogleDocsList.xml ├── feedjira │ ├── configuration_spec.rb │ ├── parser │ │ ├── i_tunes_rss_owner_spec.rb │ │ ├── google_docs_atom_entry_spec.rb │ │ ├── i_tunes_rss_category_spec.rb │ │ ├── atom_youtube_spec.rb │ │ ├── podlove_chapter_spec.rb │ │ ├── atom_google_alerts_entry_spec.rb │ │ ├── atom_google_alerts_spec.rb │ │ ├── google_docs_atom_spec.rb │ │ ├── json_feed_spec.rb │ │ ├── rss_feed_burner_spec.rb │ │ ├── atom_feed_burner_entry_spec.rb │ │ ├── rss_spec.rb │ │ ├── i_tunes_rss_item_spec.rb │ │ ├── atom_youtube_entry_spec.rb │ │ ├── atom_entry_spec.rb │ │ ├── rss_feed_burner_entry_spec.rb │ │ ├── itunes_rss_spec.rb │ │ ├── atom_feed_burner_spec.rb │ │ ├── json_feed_item_spec.rb │ │ ├── rss_entry_spec.rb │ │ └── atom_spec.rb │ ├── preprocessor_spec.rb │ ├── atom_entry_utilities_spec.rb │ ├── util │ │ └── parse_time_spec.rb │ ├── feed_spec.rb │ └── feed_utilities_entry_spec.rb ├── sample_feeds.rb └── feedjira_spec.rb ├── Rakefile ├── Gemfile ├── feedjira.gemspec ├── LICENSE ├── .rubocop.yml ├── CODE_OF_CONDUCT.md └── README.md /.rspec: -------------------------------------------------------------------------------- 1 | --color -------------------------------------------------------------------------------- /lib/feedjira/version.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | VERSION = "4.0.1" 5 | end 6 | -------------------------------------------------------------------------------- /lib/feedjira/parser.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 
4 | module Parser 5 | end 6 | end 7 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/general-issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feed Parsing 3 | about: Anything else 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- -------------------------------------------------------------------------------- /lib/feedjira/util.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | # Utility modules and helper functions 5 | module Util 6 | end 7 | end 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .bundle 2 | .projections.json 3 | .ruby-gemset 4 | .ruby-version 5 | Gemfile.lock 6 | doc/ 7 | .yardoc/ 8 | pkg/ 9 | rdoc/ 10 | coverage/ 11 | vendor/bundle/ 12 | -------------------------------------------------------------------------------- /spec/support/coverage.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "simplecov" 4 | 5 | SimpleCov.start do 6 | enable_coverage :branch 7 | add_filter "_spec.rb" 8 | end 9 | 10 | SimpleCov.minimum_coverage(line: 100, branch: 100) 11 | -------------------------------------------------------------------------------- /lib/feedjira/parser/rss_entry.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | module Parser 5 | # Parser for dealing with RDF feed entries. 6 | class RSSEntry 7 | include SAXMachine 8 | include FeedEntryUtilities 9 | include RSSEntryUtilities 10 | end 11 | end 12 | end 13 | -------------------------------------------------------------------------------- /lib/feedjira/parser/itunes_rss_owner.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | module Parser 5 | class ITunesRSSOwner 6 | include SAXMachine 7 | include FeedUtilities 8 | 9 | element :"itunes:name", as: :name 10 | element :"itunes:email", as: :email 11 | end 12 | end 13 | end 14 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | # coverage setup must come before loading lib/ code 4 | require "support/coverage" 5 | 6 | require File.expand_path("#{File.dirname(__FILE__)}/../lib/feedjira") 7 | require "sample_feeds" 8 | 9 | SAXMachine.handler = ENV["HANDLER"].to_sym if ENV["HANDLER"] 10 | 11 | RSpec.configure do |c| 12 | c.include SampleFeeds 13 | end 14 | -------------------------------------------------------------------------------- /lib/feedjira/parser/rss_image.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | module Parser 5 | # Parser for dealing with RSS images 6 | class RSSImage 7 | include SAXMachine 8 | 9 | element :description 10 | element :height 11 | element :link 12 | element :title 13 | element :url 14 | element :width 15 | end 16 | end 17 | end 18 | -------------------------------------------------------------------------------- /lib/feedjira/parser/atom_entry.rb: 
-------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | module Parser 5 | # Parser for dealing with Atom feed entries. 6 | class AtomEntry 7 | include SAXMachine 8 | include FeedEntryUtilities 9 | include AtomEntryUtilities 10 | 11 | element :"media:thumbnail", as: :image, value: :url 12 | element :"media:content", as: :image, value: :url 13 | end 14 | end 15 | end 16 | -------------------------------------------------------------------------------- /spec/sample_feeds/AtomEscapedHTMLInPreTag.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Test feed 4 | 5 | Test entry 6 | 7 |
8 | This is escaped html:
9 | <pre>&lt;b&gt;test&lt;b&gt;</pre>
10 |
11 |
12 |
13 |
14 | -------------------------------------------------------------------------------- /lib/feedjira/parser/globally_unique_identifier.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | module Parser 5 | class GloballyUniqueIdentifier 6 | include SAXMachine 7 | 8 | attribute :isPermaLink, as: :is_perma_link 9 | 10 | value :guid 11 | 12 | def perma_link? 13 | is_perma_link != "false" 14 | end 15 | 16 | def url 17 | perma_link? ? guid : nil 18 | end 19 | end 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /lib/feedjira/parser/google_docs_atom_entry.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | module Parser 5 | class GoogleDocsAtomEntry 6 | include SAXMachine 7 | include FeedEntryUtilities 8 | include AtomEntryUtilities 9 | 10 | element :"docs:md5Checksum", as: :checksum 11 | element :"docs:filename", as: :original_filename 12 | element :"docs:suggestedFilename", as: :suggested_filename 13 | end 14 | end 15 | end 16 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feed-parsing.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feed Parsing 3 | about: Your feed is parsing incorrectly, or you have a feed type that is not supported 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ### Steps to reproduce 11 | 13 | 14 | ### Example feed URL 15 | 16 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "bundler/gem_tasks" 4 | require "rspec/core/rake_task" 5 | require "rubocop/rake_task" 6 | require "yard" 7 | 8 | RSpec::Core::RakeTask.new(:spec) do |t| 9 | t.verbose = false 10 | end 11 | 12 | RuboCop::RakeTask.new(:rubocop) do |t| 13 | t.options = ["--display-cop-names"] 14 | end 15 | 16 | YARD::Rake::YardocTask.new do |t| 17 | t.files = ["lib/**/*.rb", "-", "LICENSE"] 18 | end 19 | 20 | task default: %i[spec rubocop] 21 | -------------------------------------------------------------------------------- /lib/feedjira/parser/rss_feed_burner_entry.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | module Parser 5 | # Parser for dealing with RDF feed entries. 
6 | class RSSFeedBurnerEntry 7 | include SAXMachine 8 | include FeedEntryUtilities 9 | include RSSEntryUtilities 10 | 11 | element :"feedburner:origLink", as: :orig_link 12 | private :orig_link 13 | 14 | def url 15 | orig_link || super 16 | end 17 | end 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | source "https://rubygems.org/" 4 | 5 | gemspec 6 | 7 | gem "faraday", "~> 2.14.0" 8 | gem "pry", "~> 0.15.0" 9 | gem "rspec", "~> 3.13.0" 10 | gem "rubocop", "~> 1.81.1" 11 | gem "rubocop-performance", "~> 1.26.0" 12 | gem "rubocop-rake", "~> 0.7.1" 13 | gem "rubocop-rspec", "~> 3.7.0" 14 | gem "simplecov", "~> 0.22.0" 15 | gem "yard", "~> 0.9.34" 16 | 17 | group :test do 18 | gem "oga", "~> 3.4" 19 | gem "ox", "~> 2.14.17", platforms: %i[mri rbx] 20 | gem "rake", "~> 13.3.0" 21 | end 22 | -------------------------------------------------------------------------------- /lib/feedjira/parser/podlove_chapter.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | module Parser 5 | class PodloveChapter 6 | include SAXMachine 7 | include FeedEntryUtilities 8 | 9 | attribute :start, as: :start_ntp 10 | attribute :title 11 | attribute :href, as: :url 12 | attribute :image 13 | 14 | def start 15 | return unless start_ntp 16 | 17 | parts = start_ntp.split(":") 18 | parts.reverse.to_enum.with_index.sum do |part, index| 19 | part.to_f * (60**index) 20 | end 21 | end 22 | end 23 | end 24 | end 25 | -------------------------------------------------------------------------------- /lib/feedjira/parser/atom_feed_burner_entry.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | module Parser 5 | # Parser for dealing with Feedburner Atom feed entries. 6 | class AtomFeedBurnerEntry 7 | include SAXMachine 8 | include FeedEntryUtilities 9 | include AtomEntryUtilities 10 | 11 | element :"feedburner:origLink", as: :orig_link 12 | private :orig_link 13 | 14 | element :"media:thumbnail", as: :image, value: :url 15 | element :"media:content", as: :image, value: :url 16 | 17 | def url 18 | orig_link || super 19 | end 20 | end 21 | end 22 | end 23 | -------------------------------------------------------------------------------- /lib/feedjira/parser/atom_google_alerts_entry.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "uri" 4 | 5 | module Feedjira 6 | module Parser 7 | # Parser for dealing with Feedburner Atom feed entries. 
8 | class AtomGoogleAlertsEntry 9 | include SAXMachine 10 | include FeedEntryUtilities 11 | include AtomEntryUtilities 12 | 13 | def url 14 | url = super 15 | return unless url&.start_with?("https://www.google.com/url?") 16 | 17 | uri = URI(url) 18 | cons = URI.decode_www_form(uri.query).assoc("url") 19 | cons && cons[1] 20 | end 21 | end 22 | end 23 | end 24 | -------------------------------------------------------------------------------- /spec/feedjira/configuration_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe Feedjira::Configuration do 6 | describe ".configure" do 7 | it "sets strip_whitespace config" do 8 | Feedjira.configure { |config| config.strip_whitespace = true } 9 | expect(Feedjira.strip_whitespace).to be true 10 | end 11 | 12 | it "allows parsers to be modified" do 13 | custom_parser = Class.new 14 | 15 | Feedjira.configure { |config| config.parsers.unshift(custom_parser) } 16 | expect(Feedjira.parsers.first).to eq(custom_parser) 17 | Feedjira.reset_configuration! 18 | end 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /spec/sample_feeds/atom_simple_single_entry.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Example Feed 4 | 5 | 2003-12-13T18:30:02Z 6 | 7 | John Doe 8 | 9 | urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6 10 | 11 | Atom-Powered Robots Run Amok 12 | 13 | urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 14 | 2003-12-13T18:30:02Z 15 | Some text. 16 | 17 | 18 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "bundler" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | assignees: 13 | - "mockdeep" 14 | groups: 15 | all: 16 | patterns: 17 | - "*" 18 | -------------------------------------------------------------------------------- /spec/sample_feeds/atom_simple_single_entry_link_self.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Example Feed 4 | 5 | 2003-12-13T18:30:02Z 6 | 7 | John Doe 8 | 9 | urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6 10 | 11 | Atom-Powered Robots Run Amok 12 | 13 | urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 14 | 2003-12-13T18:30:02Z 15 | Some text. 16 | 17 | 18 | -------------------------------------------------------------------------------- /spec/feedjira/parser/i_tunes_rss_owner_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe Feedjira::Parser::ITunesRSSOwner do 6 | before do 7 | # I don't really like doing it this way because these unit test should only 8 | # rely on RSSEntry, but this is actually how it should work. 
You would 9 | # never just pass entry xml straight to the ITunesRssOwner 10 | feed = Feedjira::Parser::ITunesRSS.parse sample_itunes_feed 11 | @owner = feed.itunes_owners.first 12 | end 13 | 14 | it "parses the name" do 15 | expect(@owner.name).to eq "John Doe" 16 | end 17 | 18 | it "parses the email" do 19 | expect(@owner.email).to eq "john.doe@example.com" 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /spec/sample_feeds/Permalinks.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Item 1 6 | http://example.com/1 7 | 8 | 9 | Item 2 10 | http://example.com/2 11 | 12 | 13 | Item 3 14 | http://example.com/3 15 | 16 | 17 | Item 4 18 | http://example.com/4 19 | http://example.com/5 20 | 21 | 22 | -------------------------------------------------------------------------------- /lib/feedjira/parser/rss_feed_burner.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | module Parser 5 | # Parser for dealing with RSS feeds. 6 | class RSSFeedBurner 7 | include SAXMachine 8 | include FeedUtilities 9 | 10 | element :title 11 | element :description 12 | element :link, as: :url 13 | element :lastBuildDate, as: :last_built 14 | elements :"atom10:link", as: :hubs, value: :href, with: { rel: "hub" } 15 | elements :item, as: :entries, class: RSSFeedBurnerEntry 16 | 17 | attr_accessor :feed_url 18 | 19 | def self.able_to_parse?(xml) # :nodoc: 20 | (/ 3 | 4 | 5 | Invalid date format feed 6 | http://example.com/feed 7 | en-US 8 | 9 | Item 0 with an invalid date 10 | http://example.com/item0 11 | Mon, 16 Oct 2017 15:10:00 +0000 12 | 1518478934 13 | 14 | 15 | Item 1 with all valid dates 16 | http://example.com/item1 17 | Tue, 17 Oct 2017 12:17:00 +0000 18 | Tue, 17 Oct 2017 22:17:00 +0000 19 | 20 | 21 | -------------------------------------------------------------------------------- /lib/feedjira/parser/atom_youtube.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | module Parser 5 | # Parser for dealing with RSS feeds. 
6 | class AtomYoutube 7 | include SAXMachine 8 | include FeedUtilities 9 | 10 | element :title 11 | element :link, as: :url, value: :href, with: { rel: "alternate" } 12 | element :link, as: :feed_url, value: :href, with: { rel: "self" } 13 | element :name, as: :author 14 | element :"yt:channelId", as: :youtube_channel_id 15 | 16 | elements :entry, as: :entries, class: AtomYoutubeEntry 17 | 18 | def self.able_to_parse?(xml) # :nodoc: 19 | xml.include?("xmlns:yt=\"http://www.youtube.com/xml/schemas/2015\"") 20 | end 21 | end 22 | end 23 | end 24 | -------------------------------------------------------------------------------- /lib/feedjira/preprocessor.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | class Preprocessor 5 | def initialize(xml) 6 | @xml = xml 7 | end 8 | 9 | def to_xml 10 | process_content 11 | doc.to_xml 12 | end 13 | 14 | private 15 | 16 | def process_content 17 | content_nodes.each do |node| 18 | node.content = raw_html(node) 19 | end 20 | end 21 | 22 | def content_nodes 23 | doc.search 'entry > content[type="xhtml"], entry > summary[type="xhtml"], entry > title[type="xhtml"]' 24 | end 25 | 26 | def raw_html(node) 27 | node.search("./div").inner_html 28 | end 29 | 30 | def doc 31 | @doc ||= Nokogiri::XML(@xml).remove_namespaces! 32 | end 33 | end 34 | end 35 | -------------------------------------------------------------------------------- /spec/feedjira/parser/google_docs_atom_entry_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe Feedjira::Parser::GoogleDocsAtomEntry do 6 | describe "parsing" do 7 | before do 8 | xml = sample_google_docs_list_feed 9 | @feed = Feedjira::Parser::GoogleDocsAtom.parse xml 10 | @entry = @feed.entries.first 11 | end 12 | 13 | it "has the custom checksum element" do 14 | expect(@entry.checksum).to eq "2b01142f7481c7b056c4b410d28f33cf" 15 | end 16 | 17 | it "has the custom filename element" do 18 | expect(@entry.original_filename).to eq "MyFile.pdf" 19 | end 20 | 21 | it "has the custom suggested filename element" do 22 | expect(@entry.suggested_filename).to eq "TaxDocument.pdf" 23 | end 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /lib/feedjira/parser/atom_google_alerts.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | module Parser 5 | # Parser for dealing with Feedburner Atom feeds. 
6 | class AtomGoogleAlerts 7 | include SAXMachine 8 | include FeedUtilities 9 | 10 | element :title 11 | element :subtitle, as: :description 12 | element :link, as: :feed_url, value: :href, with: { rel: "self" } 13 | element :link, as: :url, value: :href, with: { rel: "self" } 14 | elements :link, as: :links, value: :href 15 | elements :entry, as: :entries, class: AtomGoogleAlertsEntry 16 | 17 | def self.able_to_parse?(xml) 18 | Atom.able_to_parse?(xml) && (%r{tag:google\.com,2005:[^<]+/com\.google/alerts/} === xml) # rubocop:disable Style/CaseEquality 19 | end 20 | 21 | def self.preprocess(xml) 22 | Preprocessor.new(xml).to_xml 23 | end 24 | end 25 | end 26 | end 27 | -------------------------------------------------------------------------------- /lib/feedjira/parser/google_docs_atom.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require File.expand_path("./atom", File.dirname(__FILE__)) 4 | module Feedjira 5 | module Parser 6 | class GoogleDocsAtom 7 | include SAXMachine 8 | include FeedUtilities 9 | 10 | element :title 11 | element :subtitle, as: :description 12 | element :link, as: :url, value: :href, with: { type: "text/html" } 13 | element :link, as: :feed_url, value: :href, with: { type: "application/atom+xml" } 14 | elements :link, as: :links, value: :href 15 | elements :entry, as: :entries, class: GoogleDocsAtomEntry 16 | 17 | def url 18 | @url ||= links.first 19 | end 20 | 21 | def self.able_to_parse?(xml) # :nodoc: 22 | %r{https?://docs\.google\.com/.*} =~ xml 23 | end 24 | 25 | def feed_url 26 | @feed_url ||= links.first 27 | end 28 | end 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /lib/feedjira/parser/rss.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | module Parser 5 | # Parser for dealing with RSS feeds. 6 | # Source: https://cyber.harvard.edu/rss/rss.html 7 | class RSS 8 | include SAXMachine 9 | include FeedUtilities 10 | 11 | element :description 12 | element :image, class: RSSImage 13 | element :language 14 | element :lastBuildDate, as: :last_built 15 | element :link, as: :url 16 | element :"a10:link", as: :url, value: :href 17 | element :rss, as: :version, value: :version 18 | element :title 19 | element :ttl 20 | elements :"atom:link", as: :hubs, value: :href, with: { rel: "hub" } 21 | elements :item, as: :entries, class: RSSEntry 22 | 23 | attr_accessor :feed_url 24 | 25 | def self.able_to_parse?(xml) 26 | (/- 21 | ${{matrix.os}}-ruby-${{matrix.ruby}}-${{matrix.handler}} 22 | runs-on: ${{matrix.os}}-latest 23 | continue-on-error: ${{matrix.ruby == 'head' || matrix.ruby == 'jruby'}} 24 | env: 25 | HANDLER: ${{matrix.handler}} 26 | 27 | steps: 28 | - name: Check out 29 | uses: actions/checkout@v2 30 | 31 | - name: Set up ruby and bundle 32 | uses: ruby/setup-ruby@v1 33 | with: 34 | ruby-version: ${{matrix.ruby}} 35 | bundler-cache: true 36 | 37 | - name: Run rake 38 | run: | 39 | bundle exec rake 40 | -------------------------------------------------------------------------------- /lib/feedjira/parser/atom.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | module Parser 5 | # Parser for dealing with Atom feeds. 
6 | class Atom 7 | include SAXMachine 8 | include FeedUtilities 9 | 10 | element :title 11 | element :subtitle, as: :description 12 | element :link, as: :url, value: :href, with: { type: "text/html" } 13 | element :link, as: :feed_url, value: :href, with: { rel: "self" } 14 | elements :link, as: :links, value: :href 15 | elements :link, as: :hubs, value: :href, with: { rel: "hub" } 16 | elements :entry, as: :entries, class: AtomEntry 17 | element :icon 18 | 19 | def self.able_to_parse?(xml) 20 | %r{]+xmlns\s?=\s?["'](https?://www\.w3\.org/2005/Atom|http://purl\.org/atom/ns\#)["'][^>]*>} =~ xml 21 | end 22 | 23 | def url 24 | @url || (links - [feed_url]).last 25 | end 26 | 27 | def self.preprocess(xml) 28 | Preprocessor.new(xml).to_xml 29 | end 30 | end 31 | end 32 | end 33 | -------------------------------------------------------------------------------- /feedjira.gemspec: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require File.expand_path("lib/feedjira/version", __dir__) 4 | 5 | Gem::Specification.new do |s| 6 | s.authors = [ 7 | "Adam Hess", 8 | "Akinori Musha", 9 | "Ezekiel Templin", 10 | "Jon Allured", 11 | "Julien Kirch", 12 | "Michael Stock", 13 | "Paul Dix" 14 | ] 15 | s.homepage = "https://github.com/feedjira/feedjira" 16 | s.license = "MIT" 17 | s.name = "feedjira" 18 | s.platform = Gem::Platform::RUBY 19 | s.summary = "A feed parsing library" 20 | s.version = Feedjira::VERSION 21 | 22 | s.metadata = { 23 | "homepage_uri" => "https://github.com/feedjira/feedjira", 24 | "source_code_uri" => "https://github.com/feedjira/feedjira", 25 | "changelog_uri" => "https://github.com/feedjira/feedjira/blob/main/CHANGELOG.md", 26 | "rubygems_mfa_required" => "true" 27 | } 28 | 29 | s.files = `git ls-files`.split("\n") 30 | s.require_paths = ["lib"] 31 | 32 | s.required_ruby_version = ">=3.1" 33 | 34 | s.add_dependency "logger", ">= 1.0", "< 2" 35 | s.add_dependency "loofah", ">= 2.3.1", "< 3" 36 | s.add_dependency "sax-machine", ">= 1.0", "< 2" 37 | end 38 | -------------------------------------------------------------------------------- /spec/feedjira/preprocessor_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe Feedjira::Preprocessor do 6 | it "returns the xml as parsed by Nokogiri" do 7 | xml = "" 8 | doc = Nokogiri::XML(xml).remove_namespaces! 
9 | processor = described_class.new xml 10 | escaped = processor.to_xml 11 | 12 | expect(escaped).to eq doc.to_xml 13 | end 14 | 15 | it "escapes markup in xhtml content" do 16 | processor = described_class.new sample_atom_xhtml_feed 17 | escaped = processor.to_xml 18 | escaped_parts = escaped.split "\n" 19 | 20 | expect(escaped_parts[10]).to match(%r{<i>dogs</i>}) # title 21 | expect(escaped_parts[16]).to match(%r{<b>XHTML</b>}) # summary 22 | expect(escaped_parts[26]).to match(/<p>$/) # content 23 | end 24 | 25 | it "leaves escaped html within pre tag" do 26 | processor = described_class.new(sample_atom_xhtml_with_escpaed_html_in_pre_tag_feed) 27 | escaped = processor.to_xml 28 | expected_pre_tag = " <pre>&lt;b&gt;test&lt;b&gt;</pre>" 29 | expect(escaped.split("\n")[7]).to eq(expected_pre_tag) 30 | end 31 | end 32 | -------------------------------------------------------------------------------- /spec/sample_feeds/FeedBurnerUrlNoAlternate.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | QQQQ 7 | 8 | 9 | 2010-09-18T10:02:20-07:00 10 | QQQQ 11 | 12 | QQQQ 13 | QQQQ@example.com 14 | 15 | 16 | 17 | 19 | 20 | 21 | 22 | QQQQ 23 | 24 | 2010-08-11T00:00:00-07:00 25 | http://example.com/QQQQ.html 26 | QQQQ 27 | 28 | 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2009-2016: 4 | 5 | - Paul Dix 6 | - Julien Kirch 7 | - Ezekiel Templin 8 | - Jon Allured 9 | 10 | Permission is hereby granted, free of charge, to any person obtaining a copy 11 | of this software and associated documentation files (the "Software"), to deal 12 | in the Software without restriction, including without limitation the rights 13 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 | copies of the Software, and to permit persons to whom the Software is 15 | furnished to do so, subject to the following conditions: 16 | 17 | The above copyright notice and this permission notice shall be included in all 18 | copies or substantial portions of the Software. 19 | 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 23 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 26 | SOFTWARE. 
27 | -------------------------------------------------------------------------------- /lib/feedjira/parser/itunes_rss_category.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | module Parser 5 | # iTunes extensions to the standard RSS2.0 item 6 | # Source: https://help.apple.com/itc/podcasts_connect/#/itcb54353390 7 | class ITunesRSSCategory 8 | include SAXMachine 9 | 10 | attribute :text 11 | 12 | elements :"itunes:category", as: :itunes_categories, 13 | class: ITunesRSSCategory 14 | 15 | def each_subcategory(&block) 16 | return to_enum(__method__) unless block 17 | 18 | yield text 19 | 20 | itunes_categories.each do |itunes_category| 21 | itunes_category.each_subcategory(&block) 22 | end 23 | end 24 | 25 | def each_path(ancestors = [], &block) 26 | return to_enum(__method__, ancestors) unless block 27 | 28 | category_hierarchy = ancestors + [text] 29 | 30 | if itunes_categories.empty? 31 | yield category_hierarchy 32 | else 33 | itunes_categories.each do |itunes_category| 34 | itunes_category.each_path(category_hierarchy, &block) 35 | end 36 | end 37 | end 38 | end 39 | end 40 | end 41 | -------------------------------------------------------------------------------- /spec/feedjira/parser/i_tunes_rss_category_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe Feedjira::Parser::ITunesRSSCategory do 6 | describe "#each_subcategory" do 7 | it "returns an enumerator when no block is given" do 8 | category = described_class.new 9 | category.text = "Technology" 10 | 11 | result = category.each_subcategory 12 | expect(result).to be_an(Enumerator) 13 | end 14 | 15 | it "yields category text and subcategories when block is given" do 16 | parent_category = described_class.new 17 | parent_category.text = "Technology" 18 | 19 | subcategory = described_class.new 20 | subcategory.text = "Gadgets" 21 | 22 | parent_category.itunes_categories = [subcategory] 23 | 24 | yielded_categories = [] 25 | parent_category.each_subcategory { |cat| yielded_categories << cat } 26 | 27 | expect(yielded_categories).to eq %w[Technology Gadgets] 28 | end 29 | end 30 | 31 | describe "#each_path" do 32 | it "returns an enumerator when no block is given" do 33 | category = described_class.new 34 | category.text = "Technology" 35 | 36 | result = category.each_path 37 | expect(result).to be_an(Enumerator) 38 | end 39 | end 40 | end 41 | -------------------------------------------------------------------------------- /spec/sample_feeds/TechCrunchFirstEntryDescription.xml: -------------------------------------------------------------------------------- 1 | angies-listAngie's List, which offers consumers a way to review and rate doctors, contractors and service companies on the Web, has just set the terms for its IPO. In a new filing, the company revealed that it aims to raise as much as $131.4 million in the offering and has priced its IPO in the range of $11 to $13 per share. The company will list on the Nasdaq under the symbol “ANGI.” At the high end of the range, Angie's List would be valued at nearly $700 million. 2 | 3 | Angie’s List launched in 1995 with a focus on local home, yard and car services, sits at the intersection of local search, user-generated content and subscription-based services. To date, Angie’s List has raised nearly $100 million from Battery Ventures, T. 
Rowe Price, City Investment Group, Cardinal Ventures and others. 4 | -------------------------------------------------------------------------------- /.rubocop.yml: -------------------------------------------------------------------------------- 1 | inherit_from: .rubocop_todo.yml 2 | 3 | plugins: 4 | - rubocop-rake 5 | - rubocop-rspec 6 | - rubocop-performance 7 | 8 | AllCops: 9 | EnabledByDefault: true 10 | TargetRubyVersion: 3.1 11 | 12 | # Offense count: 3 13 | # Configuration parameters: IgnoredMethods. 14 | Metrics/AbcSize: 15 | Max: 24 16 | 17 | # Offense count: 33 18 | # Configuration parameters: CountComments, CountAsOne, ExcludedMethods. 19 | # ExcludedMethods: refine 20 | Metrics/BlockLength: 21 | Max: 235 22 | 23 | # Offense count: 7 24 | # Configuration parameters: CountComments, CountAsOne, ExcludedMethods. 25 | Metrics/MethodLength: 26 | Max: 25 27 | 28 | Layout/LineLength: 29 | Exclude: 30 | - 'spec/**/*.rb' 31 | 32 | Style/IfUnlessModifier: 33 | Enabled: false 34 | 35 | Style/StringLiterals: 36 | EnforcedStyle: double_quotes 37 | 38 | RSpec/MultipleExpectations: 39 | Max: 10 40 | 41 | RSpec/ExampleLength: 42 | Max: 30 43 | 44 | RSpec/InstanceVariable: 45 | Enabled: false 46 | 47 | RSpec/MessageSpies: 48 | Enabled: false 49 | 50 | RSpec/NestedGroups: 51 | Max: 5 52 | 53 | RSpec/MultipleMemoizedHelpers: 54 | Max: 10 55 | 56 | RSpec/BeforeAfterAll: 57 | Enabled: false 58 | 59 | RSpec/RepeatedExample: 60 | Enabled: false 61 | 62 | Style/Copyright: { Enabled: false } 63 | -------------------------------------------------------------------------------- /spec/feedjira/atom_entry_utilities_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | RSpec.describe Feedjira::AtomEntryUtilities do 6 | def klass 7 | Class.new do 8 | include SAXMachine 9 | include Feedjira::AtomEntryUtilities 10 | end 11 | end 12 | 13 | describe "#title" do 14 | it "returns the title when set" do 15 | entry = klass.new 16 | entry.title = "My Title" 17 | 18 | expect(entry.title).to eq "My Title" 19 | end 20 | 21 | it "returns a sanitized version of the raw title when present" do 22 | entry = klass.new 23 | entry.raw_title = "My Raw \tTitle" 24 | 25 | expect(entry.title).to eq "My Raw Title" 26 | end 27 | 28 | it "returns nil when no raw title is present" do 29 | entry = klass.new 30 | 31 | expect(entry.title).to be_nil 32 | end 33 | end 34 | 35 | describe "#url" do 36 | it "returns the url when set" do 37 | entry = klass.new 38 | entry.url = "http://exampoo.com/feed" 39 | 40 | expect(entry.url).to eq "http://exampoo.com/feed" 41 | end 42 | 43 | it "returns the first link when not set" do 44 | entry = klass.new 45 | entry.links = ["http://exampoo.com/feed"] 46 | 47 | expect(entry.url).to eq "http://exampoo.com/feed" 48 | end 49 | end 50 | end 51 | -------------------------------------------------------------------------------- /spec/sample_feeds/atom_with_link_tag_for_url_unmarked.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Planet innoQ 6 | 7 | 8 | http://www.innoq.com/planet/atom.xml 9 | 2009-07-10T12:30:05+00:00 10 | Planet/1.0 +http://www.planetplanet.org 11 | 12 | 13 | ja, 14 | 15 | tag:www.innoq.com,2009:/blog/phaus//25.3526 16 | 2009-07-01T22:20:05+00:00 17 | ich lebe noch. 18 | Sobald mir mehr einfällt, schreibe ich mal wieder was :-). 
19 | 20 | Philipp Haussleiter 21 | http://www.innoq.com/blog/phaus/ 22 | 23 | 24 | Philipps paper equivalent Blog 25 | 26 | tag:www.innoq.com,2009:/blog/phaus//25 27 | 2009-07-01T22:20:05+00:00 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /lib/feedjira/feed.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | class Feed 5 | class << self 6 | def add_common_feed_element(element_tag, options = {}) 7 | Feedjira.parsers.each do |k| 8 | k.element(element_tag, options) 9 | end 10 | end 11 | 12 | def add_common_feed_elements(element_tag, options = {}) 13 | Feedjira.parsers.each do |k| 14 | k.elements(element_tag, options) 15 | end 16 | end 17 | 18 | def add_common_feed_entry_element(element_tag, options = {}) 19 | call_on_each_feed_entry(:element, element_tag, options) 20 | end 21 | 22 | def add_common_feed_entry_elements(element_tag, options = {}) 23 | call_on_each_feed_entry(:elements, element_tag, options) 24 | end 25 | 26 | private 27 | 28 | def call_on_each_feed_entry(method, *parameters) 29 | Feedjira.parsers.each do |klass| 30 | klass.sax_config.collection_elements.each_value do |value| 31 | collection_configs = value.select do |v| 32 | v.accessor == "entries" && v.data_class.is_a?(Class) 33 | end 34 | 35 | collection_configs.each do |config| 36 | config.data_class.send(method, *parameters) 37 | end 38 | end 39 | end 40 | end 41 | end 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /lib/feedjira/parser/atom_youtube_entry.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | module Parser 5 | class AtomYoutubeEntry 6 | include SAXMachine 7 | include FeedEntryUtilities 8 | include AtomEntryUtilities 9 | 10 | sax_config.top_level_elements["link"].clear 11 | sax_config.collection_elements["link"].clear 12 | 13 | element :link, as: :url, value: :href, with: { rel: "alternate" } 14 | 15 | element :"media:description", as: :content 16 | element :"yt:videoId", as: :youtube_video_id 17 | element :"yt:channelId", as: :youtube_channel_id 18 | element :"media:title", as: :media_title 19 | element :"media:content", as: :media_url, value: :url 20 | element :"media:content", as: :media_type, value: :type 21 | element :"media:content", as: :media_width, value: :width 22 | element :"media:content", as: :media_height, value: :height 23 | element :"media:thumbnail", as: :media_thumbnail_url, value: :url 24 | element :"media:thumbnail", as: :media_thumbnail_width, value: :width 25 | element :"media:thumbnail", as: :media_thumbnail_height, value: :height 26 | element :"media:starRating", as: :media_star_count, value: :count 27 | element :"media:starRating", as: :media_star_average, value: :average 28 | element :"media:statistics", as: :media_views, value: :views 29 | end 30 | end 31 | end 32 | -------------------------------------------------------------------------------- /lib/feedjira/parser/json_feed.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | module Parser 5 | # Parser for dealing with JSON Feeds. 
6 | class JSONFeed 7 | include SAXMachine 8 | include FeedUtilities 9 | 10 | def self.able_to_parse?(json) 11 | json.include?("https://jsonfeed.org/version/") || 12 | json.include?('https:\/\/jsonfeed.org\/version\/') 13 | end 14 | 15 | def self.parse(json) 16 | new(JSON.parse(json)) 17 | end 18 | 19 | attr_reader :json, :version, :title, :description, :url, :feed_url, :icon, :favicon, 20 | :language, :expired, :entries 21 | 22 | def initialize(json) 23 | @json = json 24 | @version = json.fetch("version") 25 | @title = json.fetch("title") 26 | @url = json.fetch("home_page_url", nil) 27 | @feed_url = json.fetch("feed_url", nil) 28 | @icon = json.fetch("icon", nil) 29 | @favicon = json.fetch("favicon", nil) 30 | @description = json.fetch("description", nil) 31 | @language = json.fetch("language", nil) 32 | @expired = json.fetch("expired", nil) 33 | @entries = parse_items(json["items"]) 34 | end 35 | 36 | private 37 | 38 | def parse_items(items) 39 | items.map do |item| 40 | Feedjira::Parser::JSONFeedItem.new(item) 41 | end 42 | end 43 | end 44 | end 45 | end 46 | -------------------------------------------------------------------------------- /spec/feedjira/parser/atom_youtube_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require File.join(File.dirname(__FILE__), %w[.. .. spec_helper]) 4 | 5 | describe Feedjira::Parser::AtomYoutube do 6 | describe "#will_parse?" do 7 | it "returns true for an atom youtube feed" do 8 | expect(described_class).to be_able_to_parse(sample_youtube_atom_feed) 9 | end 10 | 11 | it "returns fase for an atom feed" do 12 | expect(described_class).not_to be_able_to_parse(sample_atom_feed) 13 | end 14 | 15 | it "returns false for an rss feedburner feed" do 16 | expect(described_class).not_to be_able_to_parse(sample_rss_feed_burner_feed) 17 | end 18 | end 19 | 20 | describe "parsing" do 21 | before do 22 | @feed = described_class.parse(sample_youtube_atom_feed) 23 | end 24 | 25 | it "parses the title" do 26 | expect(@feed.title).to eq "Google" 27 | end 28 | 29 | it "parses the author" do 30 | expect(@feed.author).to eq "Google Author" 31 | end 32 | 33 | it "parses the url" do 34 | expect(@feed.url).to eq "http://www.youtube.com/user/Google" 35 | end 36 | 37 | it "parses the feed_url" do 38 | expect(@feed.feed_url).to eq "http://www.youtube.com/feeds/videos.xml?user=google" 39 | end 40 | 41 | it "parses the YouTube channel id" do 42 | expect(@feed.youtube_channel_id).to eq "UCK8sQmJBp8GCxrOtXWBpyEA" 43 | end 44 | end 45 | end 46 | -------------------------------------------------------------------------------- /spec/feedjira/parser/podlove_chapter_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe Feedjira::Parser::PodloveChapter do 6 | before do 7 | @item = Feedjira::Parser::ITunesRSS.parse(sample_podlove_feed).entries.first 8 | @chapter = @item.chapters.first 9 | end 10 | 11 | it "parses chapters" do 12 | expect(@item.chapters.size).to eq 15 13 | end 14 | 15 | it "sorts chapters by time" do 16 | expect(@item.chapters.last.title).to eq "Abschied" 17 | end 18 | 19 | describe "#start" do 20 | it "returns the start time" do 21 | expect(@chapter.start_ntp).to eq "00:00:26.407" 22 | expect(@chapter.start).to eq 26.407 23 | expect(@item.chapters[1].start).to eq 50 24 | expect(@item.chapters[2].start).to eq 59.12 25 | expect(@item.chapters[3].start).to eq 89.201 26 | 
expect(@item.chapters.last.start).to eq 5700.034 27 | end 28 | 29 | it "returns nil when start_ntp is not present" do 30 | chapter = described_class.new 31 | 32 | expect(chapter.start).to be_nil 33 | end 34 | end 35 | 36 | it "parses the title" do 37 | expect(@chapter.title).to eq "Neil DeGrasse Tyson on Science" 38 | end 39 | 40 | it "parses the link" do 41 | expect(@chapter.url).to eq "https://example.com" 42 | end 43 | 44 | it "parses the image" do 45 | expect(@chapter.image).to eq "https://pics.example.com/pic.png" 46 | end 47 | end 48 | -------------------------------------------------------------------------------- /lib/feedjira/atom_entry_utilities.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | module AtomEntryUtilities 5 | def self.included(mod) 6 | mod.class_exec do 7 | element :title, as: :raw_title, with: { type: "html" } 8 | element :title, as: :raw_title, with: { type: "xhtml" } 9 | element :title, as: :raw_title, with: { type: "xml" } 10 | element :title, as: :title, with: { type: "text" } 11 | element :title, as: :title, with: { type: nil } 12 | element :title, as: :title_type, value: :type 13 | 14 | element :name, as: :author 15 | element :content 16 | element :summary 17 | element :enclosure, as: :image, value: :href 18 | 19 | element :published 20 | element :id, as: :entry_id 21 | element :created, as: :published 22 | element :issued, as: :published 23 | element :updated 24 | element :modified, as: :updated 25 | 26 | elements :category, as: :categories, value: :term 27 | 28 | element :link, as: :url, value: :href, with: { 29 | type: "text/html", 30 | rel: "alternate" 31 | } 32 | 33 | elements :link, as: :links, value: :href 34 | end 35 | end 36 | 37 | def title 38 | @title ||= 39 | case @raw_title 40 | when String 41 | Loofah.fragment(@raw_title).xpath("normalize-space(.)") 42 | else 43 | @title 44 | end 45 | end 46 | 47 | def url 48 | @url ||= links.first 49 | end 50 | end 51 | end 52 | -------------------------------------------------------------------------------- /lib/feedjira/util/parse_time.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "time" 4 | require "date" 5 | 6 | module Feedjira 7 | module Util 8 | # Module for safely parsing time strings 9 | module ParseTime 10 | # Parse a time string and convert it to UTC without raising errors. 11 | # Parses a flattened 14-digit time (YYYYmmddHHMMMSS) as UTC. 12 | # 13 | # === Parameters 14 | # [dt] Time definition to be parsed. 15 | # 16 | # === Returns 17 | # A Time instance in UTC or nil if there were errors while parsing. 18 | def self.call(datetime) 19 | if datetime.is_a?(Time) 20 | datetime.utc 21 | elsif datetime.respond_to?(:to_datetime) 22 | datetime.to_time.utc 23 | else 24 | parse_string_safely datetime.to_s 25 | end 26 | rescue StandardError => e 27 | Feedjira.logger.debug("Failed to parse time #{datetime}") 28 | Feedjira.logger.debug(e) 29 | nil 30 | end 31 | 32 | # Parse a string safely, handling special 14-digit format 33 | # 34 | # === Parameters 35 | # [string] String to be parsed as time. 36 | # 37 | # === Returns 38 | # A Time instance in UTC or nil if there were errors while parsing. 39 | def self.parse_string_safely(string) 40 | return nil if string.empty? 
41 | 42 | if /\A\d{14}\z/.match?(string) 43 | Time.parse("#{string}Z", true) 44 | else 45 | Time.parse(string).utc 46 | end 47 | end 48 | 49 | private_class_method :parse_string_safely 50 | end 51 | end 52 | end 53 | -------------------------------------------------------------------------------- /lib/feedjira/parser/atom_feed_burner.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | module Parser 5 | # Parser for dealing with Feedburner Atom feeds. 6 | class AtomFeedBurner 7 | include SAXMachine 8 | include FeedUtilities 9 | 10 | element :title 11 | element :subtitle, as: :description 12 | element :link, as: :url_text_html, value: :href, 13 | with: { type: "text/html" } 14 | element :link, as: :url_notype, value: :href, with: { type: nil } 15 | element :link, as: :feed_url_link, value: :href, with: { type: "application/atom+xml" } 16 | element :"atom10:link", as: :feed_url_atom10_link, value: :href, 17 | with: { type: "application/atom+xml" } 18 | elements :"atom10:link", as: :hubs, value: :href, with: { rel: "hub" } 19 | elements :entry, as: :entries, class: AtomFeedBurnerEntry 20 | 21 | attr_writer :url, :feed_url 22 | 23 | def self.able_to_parse?(xml) 24 | (xml.include?(" with type="text/html" if present, 28 | # with no type attribute otherwise 29 | def url 30 | @url || @url_text_html || @url_notype 31 | end 32 | 33 | # Feed feed_url is with type="application/atom+xml" if present, 34 | # with type="application/atom+xml" otherwise 35 | def feed_url 36 | @feed_url || @feed_url_link || @feed_url_atom10_link 37 | end 38 | 39 | def self.preprocess(xml) 40 | Preprocessor.new(xml).to_xml 41 | end 42 | end 43 | end 44 | end 45 | -------------------------------------------------------------------------------- /spec/feedjira/util/parse_time_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | RSpec.describe Feedjira::Util::ParseTime do 6 | describe ".call" do 7 | it "returns the datetime in utc when given a Time" do 8 | time = Time.now 9 | 10 | expect(described_class.call(time)).to eq(time.utc) 11 | end 12 | 13 | it "returns the datetime in utc when given a Date" do 14 | date = Date.today 15 | 16 | expect(described_class.call(date)).to eq(date.to_time.utc) 17 | end 18 | 19 | it "returns the datetime in utc when given a String" do 20 | timestamp = "2016-01-01 00:00:00" 21 | 22 | expect(described_class.call(timestamp)).to eq(Time.parse(timestamp).utc) 23 | end 24 | 25 | it "returns nil when given an empty String" do 26 | timestamp = "" 27 | 28 | expect(described_class.call(timestamp)).to be_nil 29 | end 30 | 31 | it "returns the the datetime in utc given a 14-digit time" do 32 | time = Time.now.utc 33 | timestamp = time.strftime("%Y%m%d%H%M%S") 34 | 35 | expect(described_class.call(timestamp)).to eq(time.floor) 36 | end 37 | 38 | context "when given an invalid time string" do 39 | it "returns nil" do 40 | timestamp = "2016-51-51 00:00:00" 41 | 42 | expect(described_class.call(timestamp)).to be_nil 43 | end 44 | 45 | it "logs an error" do 46 | timestamp = "2016-51-51 00:00:00" 47 | 48 | expect(Feedjira.logger) 49 | .to receive(:debug).with("Failed to parse time #{timestamp}") 50 | expect(Feedjira.logger) 51 | .to receive(:debug).with(an_instance_of(ArgumentError)) 52 | 53 | described_class.call(timestamp) 54 | end 55 | end 56 | end 57 | end 58 | 
-------------------------------------------------------------------------------- /lib/feedjira/rss_entry_utilities.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | module RSSEntryUtilities 5 | # rubocop:todo Metrics/MethodLength 6 | def self.included(mod) # rubocop:todo Metrics/AbcSize, Metrics/MethodLength 7 | mod.class_exec do 8 | element :title 9 | 10 | element :"content:encoded", as: :content 11 | element :"a10:content", as: :content 12 | element :description, as: :summary 13 | 14 | element :link, as: :url 15 | element :"a10:link", as: :url, value: :href 16 | 17 | element :author 18 | element :"dc:creator", as: :author 19 | element :"a10:name", as: :author 20 | 21 | element :pubDate, as: :published 22 | element :pubdate, as: :published 23 | element :issued, as: :published 24 | element :"dc:date", as: :published 25 | element :"dc:Date", as: :published 26 | element :"dcterms:created", as: :published 27 | 28 | element :"dcterms:modified", as: :updated 29 | element :"a10:updated", as: :updated 30 | 31 | element :guid, as: :entry_id, class: Feedjira::Parser::GloballyUniqueIdentifier 32 | element :"dc:identifier", as: :dc_identifier 33 | 34 | element :"media:thumbnail", as: :image, value: :url 35 | element :"media:content", as: :image, value: :url 36 | element :enclosure, as: :image, value: :url 37 | 38 | element :comments 39 | 40 | elements :category, as: :categories 41 | end 42 | end 43 | # rubocop:enable Metrics/MethodLength 44 | 45 | def entry_id 46 | @entry_id&.guid 47 | end 48 | 49 | def url 50 | @url || @entry_id&.url 51 | end 52 | 53 | def id 54 | entry_id || @dc_identifier || @url 55 | end 56 | end 57 | end 58 | -------------------------------------------------------------------------------- /spec/feedjira/parser/atom_google_alerts_entry_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe Feedjira::Parser::AtomGoogleAlertsEntry do 6 | before do 7 | feed = Feedjira::Parser::AtomGoogleAlerts.parse sample_google_alerts_atom_feed 8 | @entry = feed.entries.first 9 | end 10 | 11 | it "parses the title" do 12 | expect(@entry.title).to eq "Report offers Prediction of Automotive Slack Market by Top key players like Haldex, Meritor, Bendix ..." 13 | expect(@entry.raw_title).to eq "Report offers Prediction of Automotive Slack Market by Top key players like Haldex, Meritor, Bendix ..." 14 | expect(@entry.title_type).to eq "html" 15 | end 16 | 17 | it "parses the url out of the params when the host is google" do 18 | url = "https://www.exampoo.com" 19 | entry = described_class.new(url: "https://www.google.com/url?url=#{url}") 20 | 21 | expect(entry.url).to eq url 22 | end 23 | 24 | it "returns nil when the url is not present" do 25 | entry = described_class.new 26 | 27 | expect(entry.url).to be_nil 28 | end 29 | 30 | it "returns nil when the host is not google" do 31 | entry = described_class.new(url: "https://www.exampoo.com") 32 | 33 | expect(entry.url).to be_nil 34 | end 35 | 36 | it "parses the content" do 37 | expect(@entry.content).to eq "Automotive Slack Market reports provides a comprehensive overview of the global market size and share. It provides strategists, marketers and senior ..." 
38 | end 39 | 40 | it "parses the published date" do 41 | published = Feedjira::Util::ParseTime.call "2019-07-10T11:53:37Z" 42 | expect(@entry.published).to eq published 43 | end 44 | 45 | it "parses the updated date" do 46 | updated = Feedjira::Util::ParseTime.call "2019-07-10T11:53:37Z" 47 | expect(@entry.updated).to eq updated 48 | end 49 | end 50 | -------------------------------------------------------------------------------- /lib/feedjira/parser/json_feed_item.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | module Parser 5 | # Parser for dealing with JSON Feed items. 6 | class JSONFeedItem 7 | include FeedEntryUtilities 8 | 9 | attr_reader :json, :entry_id, :url, :external_url, :title, :content, :summary, 10 | :published, :updated, :image, :banner_image, :author, :categories 11 | 12 | def initialize(json) 13 | @json = json 14 | @entry_id = json.fetch("id") 15 | @url = json.fetch("url") 16 | @external_url = json.fetch("external_url", nil) 17 | @title = json.fetch("title", nil) 18 | @content = parse_content(json.fetch("content_html", nil), json.fetch("content_text", nil)) 19 | @summary = json.fetch("summary", nil) 20 | @image = json.fetch("image", nil) 21 | @banner_image = json.fetch("banner_image", nil) 22 | @published = parse_published(json.fetch("date_published", nil)) 23 | @updated = parse_updated(json.fetch("date_modified", nil)) 24 | @author = author_name(json.fetch("author", nil)) 25 | @categories = json.fetch("tags", []) 26 | end 27 | 28 | private 29 | 30 | def parse_published(date_published) 31 | return nil unless date_published 32 | 33 | Feedjira::Util::ParseTime.call(date_published) 34 | end 35 | 36 | def parse_updated(date_modified) 37 | return nil unless date_modified 38 | 39 | Feedjira::Util::ParseTime.call(date_modified) 40 | end 41 | 42 | # Convenience method to return the included content type. 43 | # Prefer content_html unless it isn't included. 44 | def parse_content(content_html, content_text) 45 | return content_html unless content_html.nil? 46 | 47 | content_text 48 | end 49 | 50 | def author_name(author_obj) 51 | return nil if author_obj.nil? 52 | 53 | author_obj["name"] 54 | end 55 | end 56 | end 57 | end 58 | -------------------------------------------------------------------------------- /lib/feedjira/configuration.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | # Feedjira::Configuration 4 | module Feedjira 5 | # Provides global configuration options for Feedjira 6 | # 7 | # @example Set configuration options using a block 8 | # Feedjira.configure do |config| 9 | # config.strip_whitespace = true 10 | # end 11 | module Configuration 12 | attr_accessor( 13 | :logger, 14 | :parsers, 15 | :strip_whitespace 16 | ) 17 | 18 | # Modify Feedjira's current configuration 19 | # 20 | # @yieldparam [Feedjria] config current Feedjira config 21 | # @example 22 | # Feedjira.configure do |config| 23 | # config.strip_whitespace = true 24 | # end 25 | def configure 26 | yield self 27 | end 28 | 29 | # Reset Feedjira's configuration to defaults 30 | # 31 | # @example 32 | # Feedjira.reset_configuration! 33 | def reset_configuration! 
34 | set_default_configuration 35 | end 36 | 37 | # @private 38 | def self.extended(base) 39 | base.set_default_configuration 40 | end 41 | 42 | # @private 43 | def set_default_configuration 44 | self.logger = default_logger 45 | self.parsers = default_parsers 46 | self.strip_whitespace = false 47 | end 48 | 49 | private 50 | 51 | # @private 52 | def default_logger 53 | Logger.new($stdout).tap do |logger| 54 | logger.progname = "Feedjira" 55 | logger.level = Logger::WARN 56 | end 57 | end 58 | 59 | # @private 60 | def default_parsers 61 | [ 62 | Feedjira::Parser::ITunesRSS, 63 | Feedjira::Parser::RSSFeedBurner, 64 | Feedjira::Parser::GoogleDocsAtom, 65 | Feedjira::Parser::AtomYoutube, 66 | Feedjira::Parser::AtomFeedBurner, 67 | Feedjira::Parser::AtomGoogleAlerts, 68 | Feedjira::Parser::Atom, 69 | Feedjira::Parser::RSS, 70 | Feedjira::Parser::JSONFeed 71 | ] 72 | end 73 | end 74 | end 75 | -------------------------------------------------------------------------------- /spec/feedjira/parser/atom_google_alerts_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | module Feedjira 6 | module Parser 7 | describe "#able_to_parse?" do 8 | it "returns true for a Google Alerts atom feed" do 9 | expect(AtomGoogleAlerts).to be_able_to_parse(sample_google_alerts_atom_feed) 10 | end 11 | 12 | it "returns false for an rdf feed" do 13 | expect(AtomGoogleAlerts).not_to be_able_to_parse(sample_rdf_feed) 14 | end 15 | 16 | it "returns false for a regular atom feed" do 17 | expect(AtomGoogleAlerts).not_to be_able_to_parse(sample_atom_feed) 18 | end 19 | 20 | it "returns false for a feedburner atom feed" do 21 | expect(AtomGoogleAlerts).not_to be_able_to_parse(sample_feedburner_atom_feed) 22 | end 23 | end 24 | 25 | describe "parsing" do 26 | before do 27 | @feed = AtomGoogleAlerts.parse(sample_google_alerts_atom_feed) 28 | end 29 | 30 | it "parses the title" do 31 | expect(@feed.title).to eq "Google Alert - Slack" 32 | end 33 | 34 | it "parses the descripton" do 35 | expect(@feed.description).to be_nil 36 | end 37 | 38 | it "parses the url" do 39 | expect(@feed.url).to eq "https://www.google.com/alerts/feeds/04175468913983673025/4428013283581841004" 40 | end 41 | 42 | it "parses the feed_url" do 43 | expect(@feed.feed_url).to eq "https://www.google.com/alerts/feeds/04175468913983673025/4428013283581841004" 44 | end 45 | 46 | it "parses entries" do 47 | expect(@feed.entries.size).to eq 20 48 | end 49 | end 50 | 51 | describe "preprocessing" do 52 | it "retains markup in xhtml content" do 53 | AtomGoogleAlerts.preprocess_xml = true 54 | 55 | feed = AtomGoogleAlerts.parse sample_google_alerts_atom_feed 56 | entry = feed.entries.first 57 | 58 | expect(entry.content).to include("Slack") 59 | end 60 | end 61 | end 62 | end 63 | -------------------------------------------------------------------------------- /lib/feedjira/parser/itunes_rss_item.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | module Parser 5 | # iTunes extensions to the standard RSS2.0 item 6 | # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html 7 | class ITunesRSSItem 8 | include SAXMachine 9 | include FeedEntryUtilities 10 | include RSSEntryUtilities 11 | 12 | sax_config.top_level_elements["enclosure"].clear 13 | 14 | # If author is not present use author tag on the item 15 | element :"itunes:author", as: :itunes_author 16 | 
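      # Each `element` declaration below maps an XML tag onto an accessor via
      # SAXMachine, roughly like this (hypothetical snippet, illustration only):
      #
      #   item = Feedjira::Parser::ITunesRSSItem.parse(
      #     "<item><itunes:duration>7:04</itunes:duration></item>"
      #   )
      #   item.itunes_duration # => "7:04"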
element :"itunes:block", as: :itunes_block 17 | element :"itunes:duration", as: :itunes_duration 18 | element :"itunes:explicit", as: :itunes_explicit 19 | element :"itunes:keywords", as: :itunes_keywords 20 | element :"itunes:subtitle", as: :itunes_subtitle 21 | element :"itunes:image", value: :href, as: :itunes_image 22 | element :"itunes:isClosedCaptioned", as: :itunes_closed_captioned 23 | element :"itunes:order", as: :itunes_order 24 | element :"itunes:season", as: :itunes_season 25 | element :"itunes:episode", as: :itunes_episode 26 | element :"itunes:title", as: :itunes_title 27 | element :"itunes:episodeType", as: :itunes_episode_type 28 | 29 | # If summary is not present, use the description tag 30 | element :"itunes:summary", as: :itunes_summary 31 | element :enclosure, value: :length, as: :enclosure_length 32 | element :enclosure, value: :type, as: :enclosure_type 33 | element :enclosure, value: :url, as: :enclosure_url 34 | elements "psc:chapter", as: :raw_chapters, class: Feedjira::Parser::PodloveChapter 35 | 36 | # Podlove requires clients to re-order by start time in the 37 | # event the publisher doesn't provide them in that 38 | # order. SAXMachine doesn't have any sort capability afaik, so 39 | # we have to sort chapters manually. 40 | def chapters 41 | raw_chapters.sort_by(&:start) 42 | end 43 | end 44 | end 45 | end 46 | -------------------------------------------------------------------------------- /spec/feedjira/parser/google_docs_atom_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | module Feedjira 6 | module Parser 7 | describe ".able_to_parser?" do 8 | it "returns true for Google Docs feed" do 9 | expect(GoogleDocsAtom).to be_able_to_parse(sample_google_docs_list_feed) 10 | end 11 | 12 | it "is not able to parse another Atom feed" do 13 | expect(GoogleDocsAtom).not_to be_able_to_parse(sample_atom_feed) 14 | end 15 | end 16 | 17 | describe "parsing" do 18 | before do 19 | @feed = GoogleDocsAtom.parse(sample_google_docs_list_feed) 20 | end 21 | 22 | it "returns a bunch of objects" do 23 | expect(@feed.entries).not_to be_empty 24 | end 25 | 26 | it "populates a title, interhited from the Atom entry" do 27 | expect(@feed.title).not_to be_nil 28 | end 29 | 30 | it "returns a bunch of entries of type GoogleDocsAtomEntry" do 31 | expect(@feed.entries.first).to be_a GoogleDocsAtomEntry 32 | end 33 | end 34 | 35 | describe "#url" do 36 | it "returns the url when set" do 37 | feed = GoogleDocsAtom.new 38 | 39 | feed.url = "http://exampoo.com/feed" 40 | 41 | expect(feed.url).to eq "http://exampoo.com/feed" 42 | end 43 | 44 | it "returns the first link when not set" do 45 | feed = GoogleDocsAtom.new 46 | 47 | feed.links = ["http://exampoo.com/feed"] 48 | 49 | expect(feed.url).to eq "http://exampoo.com/feed" 50 | end 51 | end 52 | 53 | describe "#feed_url" do 54 | it "returns the feed_url when set" do 55 | feed = GoogleDocsAtom.new 56 | 57 | feed.feed_url = "http://exampoo.com/feed" 58 | 59 | expect(feed.feed_url).to eq "http://exampoo.com/feed" 60 | end 61 | 62 | it "returns the first link when not set" do 63 | feed = GoogleDocsAtom.new 64 | 65 | feed.links = ["http://exampoo.com/feed"] 66 | 67 | expect(feed.feed_url).to eq "http://exampoo.com/feed" 68 | end 69 | end 70 | end 71 | end 72 | -------------------------------------------------------------------------------- /spec/feedjira/parser/json_feed_spec.rb: 
-------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | module Feedjira 6 | module Parser 7 | describe ".able_to_parse?" do 8 | it "returns true for a JSON feed" do 9 | expect(JSONFeed).to be_able_to_parse(sample_json_feed) 10 | end 11 | 12 | it "returns true for a JSON feed with escaped URIs" do 13 | expect(JSONFeed).to be_able_to_parse(sample_json_feed_with_escaped_uris) 14 | end 15 | 16 | it "returns false for an RSS feed" do 17 | expect(JSONFeed).not_to be_able_to_parse(sample_rss_feed) 18 | end 19 | 20 | it "returns false for an Atom feed" do 21 | expect(JSONFeed).not_to be_able_to_parse(sample_atom_feed) 22 | end 23 | end 24 | 25 | describe "parsing" do 26 | before do 27 | @feed = JSONFeed.parse(sample_json_feed) 28 | end 29 | 30 | it "parses the version" do 31 | expect(@feed.version).to eq "https://jsonfeed.org/version/1" 32 | end 33 | 34 | it "parses the title" do 35 | expect(@feed.title).to eq "inessential.com" 36 | end 37 | 38 | it "parses the url" do 39 | expect(@feed.url).to eq "http://inessential.com/" 40 | end 41 | 42 | it "parses the feed_url" do 43 | expect(@feed.feed_url).to eq "http://inessential.com/feed.json" 44 | end 45 | 46 | it "parses the description" do 47 | expect(@feed.description).to eq "Brent Simmons’s weblog." 48 | end 49 | 50 | it "parses the favicon" do 51 | expect(@feed.favicon).to eq "http://inessential.com/favicon.ico" 52 | end 53 | 54 | it "parses the icon" do 55 | expect(@feed.icon).to eq "http://inessential.com/icon.png" 56 | end 57 | 58 | it "parses the language" do 59 | expect(@feed.language).to eq "en-US" 60 | end 61 | 62 | it "parses expired and return default (nil)" do 63 | expect(@feed.expired).to be_nil 64 | end 65 | 66 | it "parses entries" do 67 | expect(@feed.entries.size).to eq 20 68 | end 69 | end 70 | end 71 | end 72 | -------------------------------------------------------------------------------- /spec/sample_feeds/HREFConsideredHarmfulFirstEntry.xml: -------------------------------------------------------------------------------- 1 |

There's lots to like about Google's new web browser, Chrome, which was released today.  When I read the awesome comic strip introduction yesterday, however, the thing that stood out most for me was in very small type: the name Lars Bak attached to the V8 JavaScript engine.  I know of Lars from his work on Self, Strongtalk, HotSpot and OOVM, and his involvement in V8 says a lot about the kind of language implementation it will be.  David Griswold has posted some more information on the Strongtalk list: 2 | 3 |

4 | The V8 development team has multiple members of the original 5 | Animorphic team; it is headed by Lars Bak, who was the technical lead 6 | for both Strongtalk and the HotSpot Java VM (as well as a huge 7 | contributor to the original Self VM).   I think that you will find 8 | that V8 has a lot of the creamy goodness of the Strongtalk and Self 9 | VMs, with many big architectural improvements 10 |

11 | 12 | I'll post more on this later, but things are getting interesting...

13 | 14 |

Update: the V8 code is already available, and builds and runs fine on Mac OS X.  From the design docs, it's pretty clear that this is indeed what I was hoping for: a mainstream, open source dynamic language implementation that learned and applies the lessons from Smalltalk, Self and Strongtalk.  Most telling are that the only two papers cited in that document are titled "An Efficient Implementation of Self" and "An Efficient Implementation of the Smalltalk-80 System".

15 | 16 |

The "classes as nodes in a state machine" trick for expando properties is especially neat.

17 | 18 |

The bad news: V8 is over 100,000 lines of C++.

19 | 20 |

 

21 | 22 |

-------------------------------------------------------------------------------- /spec/feedjira/parser/rss_feed_burner_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | module Feedjira 6 | module Parser 7 | describe "#will_parse?" do 8 | it "returns true for a feedburner rss feed" do 9 | expect(RSSFeedBurner).to be_able_to_parse sample_rss_feed_burner_feed 10 | end 11 | 12 | it "returns false for a regular RSS feed" do 13 | expect(RSSFeedBurner).not_to be_able_to_parse sample_rss_feed 14 | end 15 | 16 | it "returns false for a feedburner atom feed" do 17 | expect(RSSFeedBurner).not_to be_able_to_parse sample_feedburner_atom_feed 18 | end 19 | 20 | it "returns false for an rdf feed" do 21 | expect(RSSFeedBurner).not_to be_able_to_parse sample_rdf_feed 22 | end 23 | 24 | it "returns false for a regular atom feed" do 25 | expect(RSSFeedBurner).not_to be_able_to_parse sample_atom_feed 26 | end 27 | end 28 | 29 | describe "parsing" do 30 | before do 31 | @feed = RSSFeedBurner.parse(sample_rss_feed_burner_feed) 32 | end 33 | 34 | it "parses the title" do 35 | expect(@feed.title).to eq "TechCrunch" 36 | end 37 | 38 | it "parses the description" do 39 | description = "TechCrunch is a group-edited blog that profiles the companies, products and events defining and transforming the new web." 40 | expect(@feed.description).to eq description 41 | end 42 | 43 | it "parses the url" do 44 | expect(@feed.url).to eq "http://techcrunch.com" 45 | end 46 | 47 | it "parses the last build date" do 48 | expect(@feed.last_built).to eq "Wed, 02 Nov 2011 17:29:59 +0000" 49 | end 50 | 51 | it "parses the hub urls" do 52 | expect(@feed.hubs.count).to eq 2 53 | expect(@feed.hubs.first).to eq "http://pubsubhubbub.appspot.com/" 54 | end 55 | 56 | it "provides an accessor for the feed_url" do 57 | expect(@feed).to respond_to :feed_url 58 | expect(@feed).to respond_to :feed_url= 59 | end 60 | 61 | it "parses entries" do 62 | expect(@feed.entries.size).to eq 20 63 | end 64 | end 65 | end 66 | end 67 | -------------------------------------------------------------------------------- /lib/feedjira/feed_entry_utilities.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | module FeedEntryUtilities 5 | include Enumerable 6 | 7 | def published 8 | @published ||= @updated 9 | end 10 | 11 | def parse_datetime(string) 12 | DateTime.parse(string).to_time.utc 13 | rescue StandardError => e 14 | Feedjira.logger.debug("Failed to parse date #{string.inspect}") 15 | Feedjira.logger.debug(e) 16 | nil 17 | end 18 | 19 | ## 20 | # Returns the id of the entry or its url if not id is present, as some 21 | # formats don't support it 22 | # rubocop:disable Naming/MemoizedInstanceVariableName 23 | def id 24 | @entry_id ||= @url 25 | end 26 | # rubocop:enable Naming/MemoizedInstanceVariableName 27 | 28 | ## 29 | # Writer for published. By default, we keep the "oldest" publish time found. 30 | def published=(val) 31 | parsed = parse_datetime(val) 32 | @published = parsed if parsed && (!@published || parsed < @published) 33 | end 34 | 35 | ## 36 | # Writer for updated. By default, we keep the most recent update time found. 37 | def updated=(val) 38 | parsed = parse_datetime(val) 39 | @updated = parsed if parsed && (!@updated || parsed > @updated) 40 | end 41 | 42 | def sanitize! 
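      # Scrubs each string attribute below with Loofah's :prune scrubber,
      # which drops unsafe markup. A rough, hypothetical example:
      #
      #   entry.title = "<script>alert(1)</script>Hello"
      #   entry.sanitize!
      #   entry.title # => "Hello"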
43 | %w[title author summary content image].each do |name| 44 | next unless respond_to?(name) 45 | 46 | current_value = send(name) 47 | if current_value.is_a?(String) 48 | send(:"#{name}=", Loofah.scrub_fragment(current_value, :prune).to_s) 49 | end 50 | end 51 | end 52 | 53 | alias last_modified published 54 | 55 | def each 56 | @rss_fields ||= instance_variables.map do |ivar| 57 | ivar.to_s.sub("@", "") 58 | end.select do |field| # rubocop:disable Style/MultilineBlockChain 59 | # select callable (public) methods only 60 | respond_to?(field) 61 | end 62 | 63 | @rss_fields.each do |field| 64 | yield(field, instance_variable_get(:"@#{field}")) 65 | end 66 | end 67 | 68 | def [](field) 69 | instance_variable_get(:"@#{field}") 70 | end 71 | 72 | def []=(field, value) 73 | instance_variable_set(:"@#{field}", value) 74 | end 75 | end 76 | end 77 | -------------------------------------------------------------------------------- /spec/feedjira/parser/atom_feed_burner_entry_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe Feedjira::Parser::AtomFeedBurnerEntry do 6 | before do 7 | Feedjira::Parser::AtomFeedBurner.preprocess_xml = false 8 | # I don't really like doing it this way because these unit test should only 9 | # rely on AtomEntry, but this is actually how it should work. You would 10 | # never just pass entry xml straight to the AtomEnry 11 | feed = Feedjira::Parser::AtomFeedBurner.parse sample_feedburner_atom_feed 12 | @entry = feed.entries.first 13 | end 14 | 15 | it "parses the title" do 16 | expect(@entry.title).to eq "Making a Ruby C library even faster" 17 | end 18 | 19 | it "is able to fetch a url via the 'alternate' rel if no origLink exists" do 20 | xml = File.read("#{File.dirname(__FILE__)}/../../sample_feeds/PaulDixExplainsNothingAlternate.xml") 21 | entry = Feedjira::Parser::AtomFeedBurner.parse(xml).entries.first 22 | expect(entry.url).to eq("http://feeds.feedburner.com/~r/PaulDixExplainsNothing/~3/519925023/making-a-ruby-c-library-even-faster.html") 23 | end 24 | 25 | it "parses the url" do 26 | expect(@entry.url).to eq "http://www.pauldix.net/2009/01/making-a-ruby-c-library-even-faster.html" 27 | end 28 | 29 | it "parses the url when there is no alternate" do 30 | xml = File.read("#{File.dirname(__FILE__)}/../../sample_feeds/FeedBurnerUrlNoAlternate.xml") 31 | entry = Feedjira::Parser::AtomFeedBurner.parse(xml).entries.first 32 | expect(entry.url).to eq "http://example.com/QQQQ.html" 33 | end 34 | 35 | it "parses the author" do 36 | expect(@entry.author).to eq "Paul Dix" 37 | end 38 | 39 | it "parses the content" do 40 | expect(@entry.content).to eq sample_feedburner_atom_entry_content 41 | end 42 | 43 | it "provides a summary" do 44 | summary = "Last week I released the first version of a SAX based XML parsing library called SAX-Machine. It uses Nokogiri, which uses libxml, so it's pretty fast. However, I felt that it could be even faster. The only question was how..." 
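    # Aside: FeedEntryUtilities (above) also provides hash-style access, so the
    # same value can be read as @entry["summary"] (illustration only):
    #
    #   @entry["summary"] # => same string as @entry.summary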
45 | expect(@entry.summary).to eq summary 46 | end 47 | 48 | it "parses the published date" do 49 | published = Feedjira::Util::ParseTime.call "Thu Jan 22 15:50:22 UTC 2009" 50 | expect(@entry.published).to eq published 51 | end 52 | 53 | it "parses the categories" do 54 | expect(@entry.categories).to eq ["Ruby", "Another Category"] 55 | end 56 | end 57 | -------------------------------------------------------------------------------- /spec/sample_feeds.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module SampleFeeds 4 | FEEDS = { 5 | sample_atom_feed: "AmazonWebServicesBlog.xml", 6 | sample_atom_simple: "atom_simple_single_entry.xml", 7 | sample_atom_simple_link_self: "atom_simple_single_entry_link_self.xml", 8 | sample_atom_middleman_feed: "FeedjiraBlog.xml", 9 | sample_atom_xhtml_feed: "pet_atom.xml", 10 | sample_atom_feed_line_breaks: "AtomFeedWithSpacesAroundEquals.xml", 11 | sample_atom_entry_content: "AmazonWebServicesBlogFirstEntryContent.xml", 12 | sample_itunes_feed: "itunes.xml", 13 | sample_itunes_feedburner_feed: "itunes_feedburner.xml", 14 | sample_itunes_feed_with_single_quotes: "ITunesWithSingleQuotedAttributes.xml", 15 | sample_itunes_feed_with_spaces: "ITunesWithSpacesInAttributes.xml", 16 | sample_podlove_feed: "CRE.xml", 17 | sample_rdf_feed: "HREFConsideredHarmful.xml", 18 | sample_rdf_entry_content: "HREFConsideredHarmfulFirstEntry.xml", 19 | sample_rss_feed_burner_feed: "TechCrunch.xml", 20 | sample_rss_feed_burner_entry_content: "TechCrunchFirstEntry.xml", 21 | sample_rss_feed_burner_entry_description: "TechCrunchFirstEntryDescription.xml", 22 | sample_rss_feed: "TenderLovemaking.xml", 23 | sample_rss_entry_content: "TenderLovemakingFirstEntry.xml", 24 | sample_feedburner_atom_feed: "PaulDixExplainsNothing.xml", 25 | sample_feedburner_atom_feed_alternate: "GiantRobotsSmashingIntoOtherGiantRobots.xml", 26 | sample_feedburner_atom_entry_content: "PaulDixExplainsNothingFirstEntryContent.xml", 27 | sample_google_alerts_atom_feed: "google_alerts_atom.xml", 28 | sample_wfw_feed: "PaulDixExplainsNothingWFW.xml", 29 | sample_google_docs_list_feed: "GoogleDocsList.xml", 30 | sample_feed_burner_atom_xhtml_feed: "FeedBurnerXHTML.xml", 31 | sample_duplicate_content_atom_feed: "DuplicateContentAtomFeed.xml", 32 | sample_youtube_atom_feed: "youtube_atom.xml", 33 | sample_atom_xhtml_with_escpaed_html_in_pre_tag_feed: "AtomEscapedHTMLInPreTag.xml", 34 | sample_json_feed: "json_feed.json", 35 | sample_json_feed_with_escaped_uris: "json_feed_with_escaped_uris.json", 36 | sample_rss_feed_huffpost_ca: "HuffPostCanada.xml", 37 | sample_invalid_date_format_feed: "InvalidDateFormat.xml", 38 | sample_rss_feed_permalinks: "Permalinks.xml", 39 | sample_rss_feed_with_a10_namespace: "a10.xml", 40 | sample_rss_feed_with_comments: "RSSWithComments.xml" 41 | }.freeze 42 | 43 | FEEDS.each do |method, filename| 44 | define_method(method) { load_sample filename } 45 | end 46 | 47 | def load_sample(filename) 48 | File.read("#{File.dirname(__FILE__)}/sample_feeds/#{filename}") 49 | end 50 | end 51 | -------------------------------------------------------------------------------- /lib/feedjira/parser/itunes_rss.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | module Parser 5 | # iTunes is RSS 2.0 + some apple extensions 6 | # Sources: 7 | # * https://cyber.harvard.edu/rss/rss.html 8 | # * 
http://lists.apple.com/archives/syndication-dev/2005/Nov/msg00002.html 9 | # * https://help.apple.com/itc/podcasts_connect/ 10 | class ITunesRSS 11 | include SAXMachine 12 | include FeedUtilities 13 | 14 | attr_accessor :feed_url 15 | 16 | # RSS 2.0 elements that need including 17 | element :copyright 18 | element :description 19 | element :image, class: RSSImage 20 | element :language 21 | element :lastBuildDate, as: :last_built 22 | element :link, as: :url 23 | element :managingEditor, as: :managing_editor 24 | element :rss, as: :version, value: :version 25 | element :title 26 | element :ttl 27 | 28 | # If author is not present use managingEditor on the channel 29 | element :"itunes:author", as: :itunes_author 30 | element :"itunes:block", as: :itunes_block 31 | element :"itunes:image", value: :href, as: :itunes_image 32 | element :"itunes:explicit", as: :itunes_explicit 33 | element :"itunes:complete", as: :itunes_complete 34 | element :"itunes:keywords", as: :itunes_keywords 35 | element :"itunes:type", as: :itunes_type 36 | 37 | # New URL for the podcast feed 38 | element :"itunes:new_feed_url", as: :itunes_new_feed_url 39 | element :"itunes:subtitle", as: :itunes_subtitle 40 | 41 | # If summary is not present, use the description tag 42 | element :"itunes:summary", as: :itunes_summary 43 | 44 | # iTunes RSS feeds can have multiple main categories and multiple 45 | # sub-categories per category. 46 | elements :"itunes:category", as: :_itunes_categories, 47 | class: ITunesRSSCategory 48 | private :_itunes_categories 49 | 50 | def itunes_categories 51 | _itunes_categories.flat_map do |itunes_category| 52 | itunes_category.enum_for(:each_subcategory).to_a 53 | end 54 | end 55 | 56 | def itunes_category_paths 57 | _itunes_categories.flat_map do |itunes_category| 58 | itunes_category.enum_for(:each_path).to_a 59 | end 60 | end 61 | 62 | elements :"itunes:owner", as: :itunes_owners, class: ITunesRSSOwner 63 | elements :item, as: :entries, class: ITunesRSSItem 64 | 65 | def self.able_to_parse?(xml) 66 | %r{xmlns:itunes\s?=\s?["']http://www\.itunes\.com/dtds/podcast-1\.0\.dtd["']}i =~ xml 67 | end 68 | end 69 | end 70 | end 71 | -------------------------------------------------------------------------------- /spec/feedjira/parser/rss_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe Feedjira::Parser::RSS do 6 | describe "#will_parse?" do 7 | it "returns true for an RSS feed" do 8 | expect(described_class).to be_able_to_parse(sample_rss_feed) 9 | end 10 | 11 | it "returns false for an atom feed" do 12 | expect(described_class).not_to be_able_to_parse(sample_atom_feed) 13 | end 14 | 15 | it "returns false for an rss feedburner feed" do 16 | able = described_class.able_to_parse? sample_rss_feed_burner_feed 17 | expect(able).to be false 18 | end 19 | end 20 | 21 | describe "parsing" do 22 | before do 23 | @feed = described_class.parse(sample_rss_feed) 24 | end 25 | 26 | it "parses the version" do 27 | expect(@feed.version).to eq "2.0" 28 | end 29 | 30 | it "parses the title" do 31 | expect(@feed.title).to eq "Tender Lovemaking" 32 | end 33 | 34 | it "parses the description" do 35 | expect(@feed.description).to eq "The act of making love, tenderly." 
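    # Aside: each parser advertises what it can handle via .able_to_parse?;
    # ITunesRSS (above), for instance, matches on the itunes DTD namespace.
    # A rough sketch with hypothetical XML, for illustration only:
    #
    #   xml = %(<rss xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd"></rss>)
    #   Feedjira::Parser::ITunesRSS.able_to_parse?(xml) # matches (truthy)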
36 | end 37 | 38 | it "parses the url" do 39 | expect(@feed.url).to eq "http://tenderlovemaking.com" 40 | end 41 | 42 | it "parses the ttl" do 43 | expect(@feed.ttl).to eq "60" 44 | end 45 | 46 | it "parses the last build date" do 47 | expect(@feed.last_built).to eq "Sat, 07 Sep 2002 09:42:31 GMT" 48 | end 49 | 50 | it "parses the hub urls" do 51 | expect(@feed.hubs.count).to eq 1 52 | expect(@feed.hubs.first).to eq "http://pubsubhubbub.appspot.com/" 53 | end 54 | 55 | it "provides an accessor for the feed_url" do 56 | expect(@feed).to respond_to :feed_url 57 | expect(@feed).to respond_to :feed_url= 58 | end 59 | 60 | it "parses the language" do 61 | expect(@feed.language).to eq "en" 62 | end 63 | 64 | it "parses the image url" do 65 | expect(@feed.image.url).to eq "https://tenderlovemaking.com/images/header-logo-text-trimmed.png" 66 | end 67 | 68 | it "parses the image title" do 69 | expect(@feed.image.title).to eq "Tender Lovemaking" 70 | end 71 | 72 | it "parses the image link" do 73 | expect(@feed.image.link).to eq "http://tenderlovemaking.com" 74 | end 75 | 76 | it "parses the image width" do 77 | expect(@feed.image.width).to eq "766" 78 | end 79 | 80 | it "parses the image height" do 81 | expect(@feed.image.height).to eq "138" 82 | end 83 | 84 | it "parses the image description" do 85 | expect(@feed.image.description).to eq "The act of making love, tenderly." 86 | end 87 | 88 | it "parses entries" do 89 | expect(@feed.entries.size).to eq 10 90 | end 91 | end 92 | end 93 | -------------------------------------------------------------------------------- /spec/feedjira/feed_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe Feedjira::Feed do 6 | describe ".add_common_feed_element" do 7 | before(:all) do 8 | described_class.add_common_feed_element("generator") 9 | end 10 | 11 | it "parses the added element out of Atom feeds" do 12 | expect(Feedjira.parse(sample_wfw_feed).generator).to eq "TypePad" 13 | end 14 | 15 | it "parses the added element out of Atom Feedburner feeds" do 16 | expect(Feedjira::Parser::Atom.new).to respond_to(:generator) 17 | end 18 | 19 | it "parses the added element out of RSS feeds" do 20 | expect(Feedjira::Parser::RSS.new).to respond_to(:generator) 21 | end 22 | end 23 | 24 | describe ".add_common_feed_elements" do 25 | before do 26 | described_class.add_common_feed_elements(:foos) 27 | end 28 | 29 | it "parses the added element out of Atom feeds" do 30 | expect(Feedjira.parse(sample_wfw_feed).foos).to eq [] 31 | end 32 | 33 | it "parses the added element out of Atom Feedburner feeds" do 34 | expect(Feedjira::Parser::Atom.new).to respond_to(:foos) 35 | end 36 | 37 | it "parses the added element out of RSS feeds" do 38 | expect(Feedjira::Parser::RSS.new).to respond_to(:foos) 39 | end 40 | end 41 | 42 | describe ".add_common_feed_entry_element" do 43 | before(:all) do 44 | tag = "wfw:commentRss" 45 | described_class.add_common_feed_entry_element tag, as: :comment_rss 46 | end 47 | 48 | it "parses the added element out of Atom feeds entries" do 49 | entry = Feedjira.parse(sample_wfw_feed).entries.first 50 | expect(entry.comment_rss).to eq "this is the new val" 51 | end 52 | 53 | it "parses the added element out of Atom Feedburner feeds entries" do 54 | expect(Feedjira::Parser::AtomEntry.new).to respond_to(:comment_rss) 55 | end 56 | 57 | it "parses the added element out of RSS feeds entries" do 58 | expect(Feedjira::Parser::RSSEntry.new).to 
respond_to(:comment_rss) 59 | end 60 | end 61 | 62 | describe ".add_common_feed_entry_elements" do 63 | before do 64 | described_class.add_common_feed_entry_elements(:things) 65 | end 66 | 67 | it "parses the added element out of Atom feeds entries" do 68 | entry = Feedjira.parse(sample_wfw_feed).entries.first 69 | expect(entry.things).to eq [] 70 | end 71 | 72 | it "parses the added element out of Atom Feedburner feeds entries" do 73 | expect(Feedjira::Parser::AtomEntry.new).to respond_to(:things) 74 | end 75 | 76 | it "parses the added element out of RSS feeds entries" do 77 | expect(Feedjira::Parser::RSSEntry.new).to respond_to(:things) 78 | end 79 | end 80 | end 81 | -------------------------------------------------------------------------------- /lib/feedjira.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "zlib" 4 | require "sax-machine" 5 | require "loofah" 6 | require "logger" 7 | require "json" 8 | 9 | require_relative "feedjira/util" 10 | require_relative "feedjira/util/parse_time" 11 | require_relative "feedjira/configuration" 12 | require_relative "feedjira/feed_entry_utilities" 13 | require_relative "feedjira/feed_utilities" 14 | require_relative "feedjira/feed" 15 | require_relative "feedjira/rss_entry_utilities" 16 | require_relative "feedjira/atom_entry_utilities" 17 | require_relative "feedjira/parser" 18 | require_relative "feedjira/parser/globally_unique_identifier" 19 | require_relative "feedjira/parser/rss_entry" 20 | require_relative "feedjira/parser/rss_image" 21 | require_relative "feedjira/parser/rss" 22 | require_relative "feedjira/parser/atom_entry" 23 | require_relative "feedjira/parser/atom" 24 | require_relative "feedjira/preprocessor" 25 | require_relative "feedjira/version" 26 | 27 | require_relative "feedjira/parser/rss_feed_burner_entry" 28 | require_relative "feedjira/parser/rss_feed_burner" 29 | require_relative "feedjira/parser/podlove_chapter" 30 | require_relative "feedjira/parser/itunes_rss_owner" 31 | require_relative "feedjira/parser/itunes_rss_category" 32 | require_relative "feedjira/parser/itunes_rss_item" 33 | require_relative "feedjira/parser/itunes_rss" 34 | require_relative "feedjira/parser/atom_feed_burner_entry" 35 | require_relative "feedjira/parser/atom_feed_burner" 36 | require_relative "feedjira/parser/atom_google_alerts_entry" 37 | require_relative "feedjira/parser/atom_google_alerts" 38 | require_relative "feedjira/parser/google_docs_atom_entry" 39 | require_relative "feedjira/parser/google_docs_atom" 40 | require_relative "feedjira/parser/atom_youtube_entry" 41 | require_relative "feedjira/parser/atom_youtube" 42 | require_relative "feedjira/parser/json_feed" 43 | require_relative "feedjira/parser/json_feed_item" 44 | 45 | # Feedjira 46 | module Feedjira 47 | NoParserAvailable = Class.new(StandardError) 48 | 49 | extend Configuration 50 | 51 | # Parse XML with first compatible parser 52 | # 53 | # @example 54 | # xml = HTTParty.get("http://example.com").body 55 | # Feedjira.parse(xml) 56 | def parse(xml, parser: nil, &block) 57 | parser ||= parser_for_xml(xml) 58 | 59 | if parser.nil? 60 | raise NoParserAvailable, "No valid parser for XML." 
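      # Detection can also be bypassed by passing a parser explicitly,
      # for example (illustrative only):
      #
      #   Feedjira.parse(xml, parser: Feedjira::Parser::ITunesRSS)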
61 | end 62 | 63 | parser.parse(xml, &block) 64 | end 65 | module_function :parse 66 | 67 | # Find compatible parser for given XML 68 | # 69 | # @example 70 | # xml = HTTParty.get("http://example.com").body 71 | # parser = Feedjira.parser_for_xml(xml) 72 | # parser.parse(xml) 73 | def parser_for_xml(xml) 74 | Feedjira.parsers.detect { |klass| klass.able_to_parse?(xml) } 75 | end 76 | module_function :parser_for_xml 77 | end 78 | -------------------------------------------------------------------------------- /spec/feedjira/parser/i_tunes_rss_item_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe Feedjira::Parser::ITunesRSSItem do 6 | before do 7 | # I don't really like doing it this way because these unit test should only 8 | # rely on ITunesRssItem, but this is actually how it should work. You would 9 | # never just pass entry xml straight to the ITunesRssItem 10 | @item = Feedjira::Parser::ITunesRSS.parse(sample_itunes_feed).entries.first 11 | end 12 | 13 | it "parses the title" do 14 | expect(@item.title).to eq "Shake Shake Shake Your Spices" 15 | end 16 | 17 | it "parses the itunes title" do 18 | expect(@item.itunes_title).to eq "Shake Shake Shake Your Spices" 19 | end 20 | 21 | it "parses the author" do 22 | expect(@item.itunes_author).to eq "John Doe" 23 | end 24 | 25 | it "parses the subtitle" do 26 | expect(@item.itunes_subtitle).to eq "A short primer on table spices" 27 | end 28 | 29 | it "parses the summary" do 30 | summary = "This week we talk about salt and pepper shakers, comparing and contrasting pour rates, construction materials, and overall aesthetics. Come and join the party!" 31 | expect(@item.itunes_summary).to eq summary 32 | end 33 | 34 | it "parses the itunes season" do 35 | expect(@item.itunes_season).to eq "1" 36 | end 37 | 38 | it "parses the itunes episode number" do 39 | expect(@item.itunes_episode).to eq "3" 40 | end 41 | 42 | it "parses the itunes episode type" do 43 | expect(@item.itunes_episode_type).to eq "full" 44 | end 45 | 46 | it "parses the enclosure" do 47 | expect(@item.enclosure_length).to eq "8727310" 48 | expect(@item.enclosure_type).to eq "audio/x-m4a" 49 | expect(@item.enclosure_url).to eq "http://example.com/podcasts/everything/AllAboutEverythingEpisode3.m4a" 50 | end 51 | 52 | it "parses the guid as id" do 53 | expect(@item.id).to eq "http://example.com/podcasts/archive/aae20050615.m4a" 54 | end 55 | 56 | it "parses the published date" do 57 | published = Feedjira::Util::ParseTime.call "Wed Jun 15 19:00:00 UTC 2005" 58 | expect(@item.published).to eq published 59 | end 60 | 61 | it "parses the duration" do 62 | expect(@item.itunes_duration).to eq "7:04" 63 | end 64 | 65 | it "parses the keywords" do 66 | expect(@item.itunes_keywords).to eq "salt, pepper, shaker, exciting" 67 | end 68 | 69 | it "parses the image" do 70 | expect(@item.itunes_image).to eq "http://example.com/podcasts/everything/AllAboutEverything.jpg" 71 | end 72 | 73 | it "parses the order" do 74 | expect(@item.itunes_order).to eq "12" 75 | end 76 | 77 | it "parses the closed captioned flag" do 78 | expect(@item.itunes_closed_captioned).to eq "yes" 79 | end 80 | 81 | it "parses the encoded content" do 82 | content = "

<p><strong>TOPIC</strong>: Gooseneck Options</p>
" 83 | expect(@item.content).to eq content 84 | end 85 | end 86 | -------------------------------------------------------------------------------- /spec/sample_feeds/a10.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Some Title 5 | Some Description 6 | Thu, 14 May 2020 10:00:18 Z 7 | Some Category 8 | 9 | Sat, 16 May 2020 08:50:40 GMT 10 | 11 | Title 5 12 | Description 5 13 | Thu, 14 May 2020 10:00:18 Z 14 | 15 | 16 | John Doe 17 | http://www.example.com/ 18 | john.doe@example.com 19 | 20 | 2020-05-14T10:00:18Z 21 | 22 | 23 | Title 4 24 | Description 4 25 | Wed, 13 May 2020 10:17:57 Z 26 | 27 | 28 | John Doe 29 | http://www.example.com/ 30 | john.doe@example.com 31 | 32 | 2020-05-13T10:17:57Z 33 | 34 | 35 | Title 3 36 | Dfescription 3 37 | Tue, 12 May 2020 15:00:00 Z 38 | 39 | 40 | John Doe 41 | http://www.example.com/ 42 | john.doe@example.com 43 | 44 | 2020-05-12T15:00:00Z 45 | 46 | 47 | Title 2 48 | Description 2 49 | Tue, 12 May 2020 07:52:36 Z 50 | 51 | 52 | John Doe 53 | http://www.example.com/ 54 | john.doe@example.com 55 | 56 | 2020-05-12T07:52:36Z 57 | 58 | 59 | Title 1 60 | Description 1 61 | Thu, 07 May 2020 07:36:53 Z 62 | 63 | 64 | John Doe 65 | http://www.example.com/ 66 | john.doe@example.com 67 | 68 | 2020-05-07T07:36:53Z 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /spec/feedjira/parser/atom_youtube_entry_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require File.join(File.dirname(__FILE__), %w[.. .. spec_helper]) 4 | 5 | describe Feedjira::Parser::AtomYoutubeEntry do 6 | describe "parsing" do 7 | before do 8 | @feed = Feedjira::Parser::AtomYoutube.parse(sample_youtube_atom_feed) 9 | @entry = @feed.entries.first 10 | end 11 | 12 | it "has the title" do 13 | expect(@entry.title).to eq "The Google app: Questions Title" 14 | end 15 | 16 | it "has the url" do 17 | expect(@entry.url).to eq "http://www.youtube.com/watch?v=5shykyfmb28" 18 | end 19 | 20 | it "has the entry id" do 21 | expect(@entry.entry_id).to eq "yt:video:5shykyfmb28" 22 | end 23 | 24 | it "has the published date" do 25 | expect(@entry.published).to eq Feedjira::Util::ParseTime.call("2015-05-04T00:01:27+00:00") 26 | end 27 | 28 | it "has the updated date" do 29 | expect(@entry.updated).to eq Feedjira::Util::ParseTime.call("2015-05-13T17:38:30+00:00") 30 | end 31 | 32 | it "has the content populated from the media:description element" do 33 | expect(@entry.content).to eq "A question is the most powerful force in the world. It can start you on an adventure or spark a connection. See where a question can take you. The Google app is available on iOS and Android. 
Download the app here: http://www.google.com/search/about/download" 34 | end 35 | 36 | it "has the summary but blank" do 37 | expect(@entry.summary).to be_nil 38 | end 39 | 40 | it "has the custom youtube video id" do 41 | expect(@entry.youtube_video_id).to eq "5shykyfmb28" 42 | end 43 | 44 | it "has the custom media title" do 45 | expect(@entry.media_title).to eq "The Google app: Questions" 46 | end 47 | 48 | it "has the custom media url" do 49 | expect(@entry.media_url).to eq "https://www.youtube.com/v/5shykyfmb28?version=3" 50 | end 51 | 52 | it "has the custom media type" do 53 | expect(@entry.media_type).to eq "application/x-shockwave-flash" 54 | end 55 | 56 | it "has the custom media width" do 57 | expect(@entry.media_width).to eq "640" 58 | end 59 | 60 | it "has the custom media height" do 61 | expect(@entry.media_height).to eq "390" 62 | end 63 | 64 | it "has the custom media thumbnail url" do 65 | expect(@entry.media_thumbnail_url).to eq "https://i2.ytimg.com/vi/5shykyfmb28/hqdefault.jpg" 66 | end 67 | 68 | it "has the custom media thumbnail width" do 69 | expect(@entry.media_thumbnail_width).to eq "480" 70 | end 71 | 72 | it "has the custom media thumbnail height" do 73 | expect(@entry.media_thumbnail_height).to eq "360" 74 | end 75 | 76 | it "has the custom media star count" do 77 | expect(@entry.media_star_count).to eq "3546" 78 | end 79 | 80 | it "has the custom media star average" do 81 | expect(@entry.media_star_average).to eq "4.79" 82 | end 83 | 84 | it "has the custom media views" do 85 | expect(@entry.media_views).to eq "251497" 86 | end 87 | end 88 | end 89 | -------------------------------------------------------------------------------- /lib/feedjira/feed_utilities.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Feedjira 4 | module FeedUtilities 5 | UPDATABLE_ATTRIBUTES = %w[title feed_url url last_modified etag].freeze 6 | 7 | attr_writer :new_entries, :updated, :last_modified 8 | attr_accessor :etag 9 | 10 | def self.included(base) 11 | base.extend ClassMethods 12 | end 13 | 14 | module ClassMethods 15 | def parse(xml, &) 16 | xml = strip_whitespace(xml) 17 | xml = preprocess(xml) if preprocess_xml 18 | super(xml, &) 19 | end 20 | 21 | def preprocess(xml) 22 | # noop 23 | xml 24 | end 25 | 26 | def preprocess_xml=(value) 27 | @preprocess_xml = value 28 | end 29 | 30 | def preprocess_xml 31 | @preprocess_xml 32 | end 33 | 34 | def strip_whitespace(xml) 35 | if Feedjira.strip_whitespace 36 | xml.strip 37 | else 38 | xml.lstrip 39 | end 40 | end 41 | end 42 | 43 | def last_modified 44 | @last_modified ||= entries.reject { |e| e.published.nil? }.max_by(&:published)&.published 45 | end 46 | 47 | def updated? 48 | @updated || false 49 | end 50 | 51 | def new_entries 52 | @new_entries ||= [] 53 | end 54 | 55 | def new_entries? 56 | !new_entries.empty? 57 | end 58 | 59 | def update_from_feed(feed) 60 | self.new_entries += find_new_entries_for(feed) 61 | entries.unshift(*self.new_entries) 62 | 63 | @updated = false 64 | 65 | UPDATABLE_ATTRIBUTES.each do |name| 66 | @updated ||= update_attribute(feed, name) 67 | end 68 | end 69 | 70 | def update_attribute(feed, name) 71 | old_value = send(name) 72 | new_value = feed.send(name) 73 | 74 | if old_value == new_value 75 | false 76 | else 77 | send(:"#{name}=", new_value) 78 | true 79 | end 80 | end 81 | 82 | def sanitize_entries! 83 | entries.each(&:sanitize!) 
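      # Typical usage alongside update_from_feed (hypothetical variables,
      # shown for illustration only):
      #
      #   feed = Feedjira.parse(xml)
      #   feed.sanitize_entries!          # scrub each entry's string fields
      #   feed.update_from_feed(newer)    # merge in entries not seen before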
84 | end 85 | 86 | private 87 | 88 | # This implementation is a hack, which is why it's so ugly. It's to get 89 | # around the fact that not all feeds have a published date. However, 90 | # they're always ordered with the newest one first. So we go through the 91 | # entries just parsed and insert each one as a new entry until we get to 92 | # one that has the same id as the the newest for the feed. 93 | def find_new_entries_for(feed) 94 | return feed.entries if entries.empty? 95 | 96 | latest_entry = entries.first 97 | found_new_entries = [] 98 | 99 | feed.entries.each do |entry| 100 | break unless new_entry?(entry, latest_entry) 101 | 102 | found_new_entries << entry 103 | end 104 | 105 | found_new_entries 106 | end 107 | 108 | def new_entry?(entry, latest) 109 | nil_ids = entry.entry_id.nil? && latest.entry_id.nil? 110 | new_id = entry.entry_id != latest.entry_id 111 | new_url = entry.url != latest.url 112 | 113 | (nil_ids || new_id) && new_url 114 | end 115 | end 116 | end 117 | -------------------------------------------------------------------------------- /spec/sample_feeds/AmazonWebServicesBlogFirstEntryContent.xml: -------------------------------------------------------------------------------- 1 | 2 |

3 | Late last year an entrepreneur from Turkey visited me at Amazon HQ in Seattle. 4 | 5 | We talked about his plans to use AWS as part of his new social video 6 | portal startup. I won't spill any beans before he's ready to talk 7 | about it himself, but I will say that he has a really good concept, 8 | strong backers, and infectious enthusiasm for the online world.

9 |

10 | 11 |

12 | He's now ready to hire a software architect and designer in order to 13 | bring his vision to life. I've posted the job below; you can 14 | send your resume to apply@web.tv 15 | if you are interested, qualified, and located in the right part 16 | of the world. 17 |

18 | 19 |
20 | 21 |
Software Architect & Designer
22 | 23 |

24 | We are a reputable Internet technology, software services and e-commerce company based 25 | in 26 | Istanbul and 27 | Bursa, 28 | Turkey. 29 | We are looking for a talented Software Architect who will 30 | be working in Istanbul for a certain period of time, for our new global scale 31 | "social video portal" project. Below are the qualifications required and job 32 | description for the position to be held. 33 |

34 | 35 | 38 | 39 |

Qualifications:

40 | 41 |
    42 |
  • Extensive knowledge of web technologies.
  • 43 |
  • Experienced in web based application design and development.
  • 44 |
  • Solid bacground in object oriented design and development.
  • 45 |
  • Preferrably experienced in live broadcasting over the internet, video streaming, video sharing and social networking web site development and design.
  • 46 |
  • Knowledge and experience of design and development of multi-tier, distributed, massively multi-user systems.
  • 47 |
  • Experienced in Cloud Computing applications (preferably with AWS).
  • 48 |
  • Very good command of PHP or Python.
  • 49 |
  • Experinced in relational database design.
  • 50 |
  • Familarity with Erlang, and knowledge or experience of Java, C/C++, Ajax, Adobe Flex, mySQL is a plus.
  • 51 |
  • Self motivated, enthusiastic, team player.
  • 52 |
53 | 54 |

Job Description:

55 |
    56 |
  • Will be mainly responsible for designing the overall system for a multi-tier, massively multi-user live video multi-casting, videosharing web site which will also have features of a social network.
  • 57 |
  • Will be involved in Design and Development phases of software development cycle. Will contribute to the Analysis phase.
  • 58 |
  • Will lead the Software Development Team for the period of the contract and report to the Project Coordinator.
  • 59 |
60 |
61 | 62 |

-- Jeff;

63 |
64 | -------------------------------------------------------------------------------- /spec/sample_feeds/ITunesWithSpacesInAttributes.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | 8 | All About Everything 9 | http://www.example.com/podcasts/everything/index.html 10 | en-us 11 | ℗ & © 2005 John Doe & Family 12 | A show about everything 13 | John Doe 14 | All About Everything is a show about everything. Each week we dive into any subject known to man and talk about it as much as we can. Look for our Podcast in the iTunes Music Store 15 | All About Everything is a show about everything. Each week we dive into any subject known to man and talk about it as much as we can. Look for our Podcast in the iTunes Music Store 16 | 17 | John Doe 18 | john.doe@example.com 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | Shake Shake Shake Your Spices 28 | John Doe 29 | A short primer on table spices 30 | This week we talk about salt and pepper shakers, comparing and contrasting pour rates, construction materials, and overall aesthetics. Come and join the party! 31 | 32 | http://example.com/podcasts/archive/aae20050615.m4a 33 | Wed, 15 Jun 2005 19:00:00 GMT 34 | 7:04 35 | salt, pepper, shaker, exciting 36 | 37 | 38 | 39 | Socket Wrench Shootout 40 | Jane Doe 41 | Comparing socket wrenches is fun! 42 | This week we talk about metric vs. old english socket wrenches. Which one is better? Do you really need both? Get all of your answers here. 43 | 44 | http://example.com/podcasts/archive/aae20050608.mp3 45 | Wed, 8 Jun 2005 19:00:00 GMT 46 | 4:34 47 | metric, socket, wrenches, tool 48 | 49 | 50 | 51 | Red, Whine, & Blue 52 | Various 53 | Red + Blue != Purple 54 | This week we talk about surviving in a Red state if you are a Blue person. Or vice versa. 55 | 56 | http://example.com/podcasts/archive/aae20050601.mp3 57 | Wed, 1 Jun 2005 19:00:00 GMT 58 | 3:59 59 | politics, red, blue, state 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, gender identity and expression, level of experience, 9 | nationality, personal appearance, race, religion, or sexual identity and 10 | orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at mikeastock@gmail.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at [http://contributor-covenant.org/version/1/4][version] 72 | 73 | [homepage]: http://contributor-covenant.org 74 | [version]: http://contributor-covenant.org/version/1/4/ 75 | -------------------------------------------------------------------------------- /spec/sample_feeds/AtomFeedWithSpacesAroundEquals.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 13 | 18 | 19 | 20 | 21 | http://alerts.weather.gov/cap/wwaatmget.php?x=MEC015&y=0 22 | NWS CAP Server 23 | 2012-11-07T09:17:00-05:00 24 | 25 | w-nws.webmaster@noaa.gov 26 | 27 | Current Watches, Warnings and Advisories for Lincoln (MEC015) Maine Issued by the National Weather Service 28 | 29 | 30 | http://alerts.weather.gov/cap/wwacapget.php?x=ME124CCF70CDD4.WinterWeatherAdvisory.124CCF729F10ME.CARWSWCAR.b113b32cf3dd0946aab63451118d16e7 31 | 2012-11-07T04:09:00-05:00 32 | 2012-11-07T04:09:00-05:00 33 | 34 | w-nws.webmaster@noaa.gov 35 | 36 | Winter Weather Advisory issued November 07 at 4:09AM EST until November 08 at 12:00PM EST by NWS 37 | 38 | ...WINTER WEATHER ADVISORY IN EFFECT FROM 7 PM THIS EVENING TO NOON EST THURSDAY... THE NATIONAL WEATHER SERVICE IN CARIBOU HAS ISSUED A WINTER WEATHER ADVISORY FOR SNOW AND MIXED PRECIPITATION...WHICH IS IN EFFECT FROM 7 PM THIS EVENING TO NOON EST THURSDAY. * PRECIPITATION TYPE...SNOW...SLEET AND FREEZING RAIN 39 | Winter Weather Advisory 40 | 2012-11-07T04:09:00-05:00 41 | 2012-11-07T16:00:00-05:00 42 | Actual 43 | Alert 44 | Met 45 | Expected 46 | Minor 47 | Likely 48 | Central Penobscot; Central Washington; Interior Hancock; Northern Penobscot; Northern Washington; Southeast Aroostook; Southern Penobscot; Southern Piscataquis 49 | 50 | 51 | FIPS6 52 | 023003 023009 023019 023021 023029 53 | UGC 54 | MEZ005 MEZ006 MEZ011 MEZ015 MEZ016 MEZ017 MEZ031 MEZ032 55 | 56 | 57 | VTEC 58 | /O.NEW.KCAR.WW.Y.0024.121108T0000Z-121108T1700Z/ 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /spec/feedjira/parser/atom_entry_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe Feedjira::Parser::AtomEntry do 6 | before do 7 | # I don't really like doing it this way because these unit test should only 8 | # rely on AtomEntry, but this is actually how it should work. 
You would 9 | # never just pass entry xml straight to the AtomEnry 10 | @entry = Feedjira::Parser::Atom.parse(sample_atom_feed).entries.first 11 | end 12 | 13 | it "parses the title" do 14 | title = "AWS Job: Architect & Designer Position in Turkey" 15 | expect(@entry.title).to eq title 16 | end 17 | 18 | it "parses the url" do 19 | expect(@entry.url).to eq "http://aws.typepad.com/aws/2009/01/aws-job-architect-designer-position-in-turkey.html" 20 | end 21 | 22 | it "parses the url even when" do 23 | xml = load_sample("atom_with_link_tag_for_url_unmarked.xml") 24 | entries = Feedjira::Parser::Atom.parse(xml).entries 25 | expect(entries.first.url).to eq "http://www.innoq.com/blog/phaus/2009/07/ja.html" 26 | end 27 | 28 | it "parses the author" do 29 | expect(@entry.author).to eq "AWS Editor" 30 | end 31 | 32 | it "parses the content" do 33 | expect(@entry.content).to eq sample_atom_entry_content 34 | end 35 | 36 | it "provides a summary" do 37 | summary = "Late last year an entrepreneur from Turkey visited me at Amazon HQ in Seattle. We talked about his plans to use AWS as part of his new social video portal startup. I won't spill any beans before he's ready to..." 38 | expect(@entry.summary).to eq summary 39 | end 40 | 41 | it "parses the published date" do 42 | published = Feedjira::Util::ParseTime.call "Fri Jan 16 18:21:00 UTC 2009" 43 | expect(@entry.published).to eq published 44 | end 45 | 46 | it "parses the categories" do 47 | expect(@entry.categories).to eq %w[Turkey Seattle] 48 | end 49 | 50 | it "parses the updated date" do 51 | updated = Feedjira::Util::ParseTime.call "Fri Jan 16 18:21:00 UTC 2009" 52 | expect(@entry.updated).to eq updated 53 | end 54 | 55 | it "parses the id" do 56 | expect(@entry.id).to eq "tag:typepad.com,2003:post-61484736" 57 | end 58 | 59 | it "supports each" do 60 | expect(@entry).to respond_to :each 61 | end 62 | 63 | it "is able to list out all fields with each" do 64 | all_fields = [] 65 | title_value = "" 66 | 67 | @entry.each do |field, value| 68 | all_fields << field 69 | title_value = value if field == "title" 70 | end 71 | 72 | expect(title_value).to eq "AWS Job: Architect & Designer Position in Turkey" 73 | 74 | expected_fields = %w[ 75 | author 76 | categories 77 | content 78 | entry_id 79 | links 80 | published 81 | summary 82 | title 83 | title_type 84 | updated 85 | url 86 | ] 87 | expect(all_fields.sort).to eq expected_fields 88 | end 89 | 90 | it "supports checking if a field exists in the entry" do 91 | expect(@entry).to include "author" 92 | expect(@entry).to include "title" 93 | end 94 | 95 | it "allows access to fields with hash syntax" do 96 | title = "AWS Job: Architect & Designer Position in Turkey" 97 | expect(@entry["title"]).to eq title 98 | expect(@entry["author"]).to eq "AWS Editor" 99 | end 100 | 101 | it "allows setting field values with hash syntax" do 102 | @entry["title"] = "Foobar" 103 | expect(@entry.title).to eq "Foobar" 104 | end 105 | end 106 | -------------------------------------------------------------------------------- /spec/feedjira/parser/rss_feed_burner_entry_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe Feedjira::Parser::RSSFeedBurnerEntry do 6 | before do 7 | tag = "wfw:commentRss" 8 | Feedjira::Feed.add_common_feed_entry_element(tag, as: :comment_rss) 9 | # I don't really like doing it this way because these unit test should only 10 | # rely on RSSEntry, but this is actually how it 
should work. You would 11 | # never just pass entry xml straight to the AtomEnry 12 | feed = Feedjira::Parser::RSSFeedBurner.parse sample_rss_feed_burner_feed 13 | @entry = feed.entries.first 14 | end 15 | 16 | after do 17 | # We change the title in one or more specs to test []= 18 | if @entry.title != "Angie’s List Sets Price Range IPO At $11 To $13 Per Share; Valued At Over $600M" 19 | feed = Feedjira::Parser::RSS.parse sample_rss_feed_burner_feed 20 | @entry.title = feed.entries.first.title 21 | end 22 | end 23 | 24 | it "parses the title" do 25 | title = "Angie’s List Sets Price Range IPO At $11 To $13 Per Share; Valued At Over $600M" 26 | expect(@entry.title).to eq title 27 | end 28 | 29 | it "parses the original url" do 30 | expect(@entry.url).to eq "http://techcrunch.com/2011/11/02/angies-list-prices-ipo-at-11-to-13-per-share-valued-at-over-600m/" 31 | end 32 | 33 | it "parses the author" do 34 | expect(@entry.author).to eq "Leena Rao" 35 | end 36 | 37 | it "parses the content" do 38 | expect(@entry.content).to eq sample_rss_feed_burner_entry_content 39 | end 40 | 41 | it "provides a summary" do 42 | expect(@entry.summary).to eq sample_rss_feed_burner_entry_description 43 | end 44 | 45 | it "parses the published date" do 46 | published = Feedjira::Util::ParseTime.call "Wed Nov 02 17:25:27 UTC 2011" 47 | expect(@entry.published).to eq published 48 | end 49 | 50 | it "parses the categories" do 51 | expect(@entry.categories).to eq ["TC", "angie\\'s list"] 52 | end 53 | 54 | it "parses the guid as id" do 55 | expect(@entry.id).to eq "http://techcrunch.com/?p=446154" 56 | end 57 | 58 | it "supports each" do 59 | expect(@entry).to respond_to :each 60 | end 61 | 62 | it "is able to list out all fields with each" do 63 | all_fields = [] 64 | title_value = "" 65 | 66 | @entry.each do |field, value| 67 | all_fields << field 68 | title_value = value if field == "title" 69 | end 70 | 71 | title = "Angie’s List Sets Price Range IPO At $11 To $13 Per Share; Valued At Over $600M" 72 | expect(title_value).to eq title 73 | 74 | expected_fields = %w[ 75 | author 76 | categories 77 | comment_rss 78 | comments 79 | content 80 | entry_id 81 | image 82 | published 83 | summary 84 | title 85 | url 86 | ] 87 | expect(all_fields.sort).to eq expected_fields 88 | end 89 | 90 | it "supports checking if a field exists in the entry" do 91 | expect(@entry).to include "author" 92 | expect(@entry).to include "title" 93 | end 94 | 95 | it "allows access to fields with hash syntax" do 96 | expect(@entry["author"]).to eq "Leena Rao" 97 | title = "Angie’s List Sets Price Range IPO At $11 To $13 Per Share; Valued At Over $600M" 98 | expect(@entry["title"]).to eq title 99 | end 100 | 101 | it "allows setting field values with hash syntax" do 102 | @entry["title"] = "Foobar" 103 | expect(@entry.title).to eq "Foobar" 104 | end 105 | end 106 | -------------------------------------------------------------------------------- /spec/sample_feeds/ITunesWithSingleQuotedAttributes.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | All About Everything 7 | http://www.example.com/podcasts/everything/index.html 8 | en-us 9 | ℗ & © 2005 John Doe & Family 10 | A show about everything 11 | http://example.com/new.xml 12 | John Doe 13 | All About Everything is a show about everything. Each week we dive into any subject known to man and talk about it as much as we can. Look for our Podcast in the iTunes Music Store 14 | All About Everything is a show about everything. 
Each week we dive into any subject known to man and talk about it as much as we can. Look for our Podcast in the iTunes Music Store 15 | 16 | John Doe 17 | john.doe@example.com 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | Shake Shake Shake Your Spices 26 | John Doe 27 | A short primer on table spices 28 | This week we talk about salt and pepper shakers, comparing and contrasting pour rates, construction materials, and overall aesthetics. Come and join the party! 29 | 30 | http://example.com/podcasts/archive/aae20050615.m4a 31 | Wed, 15 Jun 2005 19:00:00 GMT 32 | 7:04 33 | salt, pepper, shaker, exciting 34 | 35 | 12 36 | yes 37 | <p><strong>TOPIC</strong>: Gooseneck Options</p> 38 | 39 | 40 | 41 | Socket Wrench Shootout 42 | Jane Doe 43 | Comparing socket wrenches is fun! 44 | This week we talk about metric vs. old english socket wrenches. Which one is better? Do you really need both? Get all of your answers here. 45 | 46 | http://example.com/podcasts/archive/aae20050608.mp3 47 | Wed, 8 Jun 2005 19:00:00 GMT 48 | 4:34 49 | metric, socket, wrenches, tool 50 | 51 | 52 | 53 | 54 | Red, Whine, & Blue 55 | Various 56 | Red + Blue != Purple 57 | This week we talk about surviving in a Red state if you are a Blue person. Or vice versa. 58 | 59 | http://example.com/podcasts/archive/aae20050601.mp3 60 | Wed, 1 Jun 2005 19:00:00 GMT 61 | 3:59 62 | politics, red, blue, state 63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /spec/feedjira/parser/itunes_rss_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | module Feedjira 6 | module Parser 7 | describe "#will_parse?" do 8 | it "returns true for an itunes RSS feed" do 9 | expect(ITunesRSS).to be_able_to_parse(sample_itunes_feed) 10 | end 11 | 12 | it "returns true for an itunes RSS feed with spaces between attribute names, equals sign, and values" do 13 | expect(ITunesRSS).to be_able_to_parse(sample_itunes_feed_with_spaces) 14 | end 15 | 16 | it "returns true for an itunes RSS feed with single-quoted attributes" do 17 | expect(ITunesRSS).to be_able_to_parse(sample_itunes_feed_with_single_quotes) 18 | end 19 | 20 | it "returns false for an atom feed" do 21 | expect(ITunesRSS).not_to be_able_to_parse(sample_atom_feed) 22 | end 23 | 24 | it "returns false for an rss feedburner feed" do 25 | expect(ITunesRSS).not_to be_able_to_parse(sample_rss_feed_burner_feed) 26 | end 27 | end 28 | 29 | describe "parsing" do 30 | before do 31 | @feed = ITunesRSS.parse(sample_itunes_feed) 32 | end 33 | 34 | it "parses the ttl" do 35 | expect(@feed.ttl).to eq "60" 36 | end 37 | 38 | it "parses the last build date" do 39 | expect(@feed.last_built).to eq "Sat, 07 Sep 2002 09:42:31 GMT" 40 | end 41 | 42 | it "parses the subtitle" do 43 | expect(@feed.itunes_subtitle).to eq "A show about everything" 44 | end 45 | 46 | it "parses the author" do 47 | expect(@feed.itunes_author).to eq "John Doe" 48 | end 49 | 50 | it "parses an owner" do 51 | expect(@feed.itunes_owners.size).to eq 1 52 | end 53 | 54 | it "parses an image" do 55 | expect(@feed.itunes_image).to eq "http://example.com/podcasts/everything/AllAboutEverything.jpg" 56 | end 57 | 58 | it "parses the image url" do 59 | expect(@feed.image.url).to eq "http://example.com/podcasts/everything/AllAboutEverything.jpg" 60 | end 61 | 62 | it "parses the image title" do 63 | expect(@feed.image.title).to eq "All About Everything" 64 | end 65 | 66 | it "parses the image
link" do 67 | expect(@feed.image.link).to eq "http://www.example.com/podcasts/everything/index.html" 68 | end 69 | 70 | it "parses the image width" do 71 | expect(@feed.image.width).to eq "88" 72 | end 73 | 74 | it "parses the image height" do 75 | expect(@feed.image.height).to eq "31" 76 | end 77 | 78 | it "parses the image description" do 79 | description = "All About Everything is a show about everything. Each week we dive into any subject known to man and talk about it as much as we can. Look for our Podcast in the iTunes Music Store" 80 | expect(@feed.image.description).to eq description 81 | end 82 | 83 | it "parses categories" do 84 | expect(@feed.itunes_categories).to eq [ 85 | "Technology", 86 | "Gadgets", 87 | "TV & Film", 88 | "Arts", 89 | "Design", 90 | "Food" 91 | ] 92 | 93 | expect(@feed.itunes_category_paths).to eq [ 94 | %w[Technology Gadgets], 95 | ["TV & Film"], 96 | %w[Arts Design], 97 | %w[Arts Food] 98 | ] 99 | end 100 | 101 | it "parses the itunes type" do 102 | expect(@feed.itunes_type).to eq "episodic" 103 | end 104 | 105 | it "parses the summary" do 106 | summary = "All About Everything is a show about everything. Each week we dive into any subject known to man and talk about it as much as we can. Look for our Podcast in the iTunes Music Store" 107 | expect(@feed.itunes_summary).to eq summary 108 | end 109 | 110 | it "parses the complete tag" do 111 | expect(@feed.itunes_complete).to eq "yes" 112 | end 113 | 114 | it "parses entries" do 115 | expect(@feed.entries.size).to eq 3 116 | end 117 | 118 | it "parses the new-feed-url" do 119 | expect(@feed.itunes_new_feed_url).to eq "http://example.com/new.xml" 120 | end 121 | end 122 | end 123 | end 124 | -------------------------------------------------------------------------------- /spec/feedjira/parser/atom_feed_burner_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | module Feedjira 6 | module Parser 7 | describe "#will_parse?" do 8 | it "returns true for a feedburner atom feed" do 9 | expect(AtomFeedBurner).to be_able_to_parse(sample_feedburner_atom_feed) 10 | end 11 | 12 | it "returns false for an rdf feed" do 13 | expect(AtomFeedBurner).not_to be_able_to_parse(sample_rdf_feed) 14 | end 15 | 16 | it "returns false for a regular atom feed" do 17 | expect(AtomFeedBurner).not_to be_able_to_parse(sample_atom_feed) 18 | end 19 | 20 | it "returns false for an rss feedburner feed" do 21 | expect(AtomFeedBurner).not_to be_able_to_parse sample_rss_feed_burner_feed 22 | end 23 | end 24 | 25 | describe "parsing old style feeds" do 26 | before do 27 | @feed = AtomFeedBurner.parse(sample_feedburner_atom_feed) 28 | end 29 | 30 | it "parses the title" do 31 | expect(@feed.title).to eq "Paul Dix Explains Nothing" 32 | end 33 | 34 | it "parses the description" do 35 | description = "Entrepreneurship, programming, software development, politics, NYC, and random thoughts." 
36 | expect(@feed.description).to eq description 37 | end 38 | 39 | it "parses the url" do 40 | expect(@feed.url).to eq "http://www.pauldix.net/" 41 | end 42 | 43 | it "parses the feed_url" do 44 | expect(@feed.feed_url).to eq "http://feeds.feedburner.com/PaulDixExplainsNothing" 45 | end 46 | 47 | it "parses no hub urls" do 48 | expect(@feed.hubs.count).to eq 0 49 | end 50 | 51 | it "parses hub urls" do 52 | AtomFeedBurner.preprocess_xml = false 53 | feed_with_hub = AtomFeedBurner.parse(load_sample("TypePadNews.xml")) 54 | expect(feed_with_hub.hubs.count).to eq 1 55 | end 56 | 57 | it "parses entries" do 58 | expect(@feed.entries.size).to eq 5 59 | end 60 | 61 | it "changes url" do 62 | new_url = "http://some.url.com" 63 | expect { @feed.url = new_url }.not_to raise_error 64 | expect(@feed.url).to eq new_url 65 | end 66 | 67 | it "changes feed_url" do 68 | new_url = "http://some.url.com" 69 | expect { @feed.feed_url = new_url }.not_to raise_error 70 | expect(@feed.feed_url).to eq new_url 71 | end 72 | end 73 | 74 | describe "parsing alternate style feeds" do 75 | before do 76 | @feed = AtomFeedBurner.parse(sample_feedburner_atom_feed_alternate) 77 | end 78 | 79 | it "parses the title" do 80 | expect(@feed.title).to eq "Giant Robots Smashing Into Other Giant Robots" 81 | end 82 | 83 | it "parses the description" do 84 | description = "Written by thoughtbot" 85 | expect(@feed.description).to eq description 86 | end 87 | 88 | it "parses the url" do 89 | expect(@feed.url).to eq "https://robots.thoughtbot.com" 90 | end 91 | 92 | it "parses the feed_url" do 93 | expect(@feed.feed_url).to eq "http://feeds.feedburner.com/GiantRobotsSmashingIntoOtherGiantRobots" 94 | end 95 | 96 | it "parses hub urls" do 97 | expect(@feed.hubs.count).to eq 1 98 | end 99 | 100 | it "parses entries" do 101 | expect(@feed.entries.size).to eq 3 102 | end 103 | 104 | it "changes url" do 105 | new_url = "http://some.url.com" 106 | expect { @feed.url = new_url }.not_to raise_error 107 | expect(@feed.url).to eq new_url 108 | end 109 | 110 | it "changes feed_url" do 111 | new_url = "http://some.url.com" 112 | expect { @feed.feed_url = new_url }.not_to raise_error 113 | expect(@feed.feed_url).to eq new_url 114 | end 115 | end 116 | 117 | describe "preprocessing" do 118 | it "retains markup in xhtml content" do 119 | AtomFeedBurner.preprocess_xml = true 120 | 121 | feed = AtomFeedBurner.parse sample_feed_burner_atom_xhtml_feed 122 | entry = feed.entries.first 123 | 124 | expect(entry.content).to match(/\A

"my_id", "url" => "my_url", **overrides } 8 | end 9 | 10 | before do 11 | # I don't really like doing it this way because these unit test should only 12 | # rely on JSONFeed, but this is actually how it should work. You would 13 | # never just pass entry json straight to the JSONFeedItem 14 | @entry = Feedjira::Parser::JSONFeed.parse(sample_json_feed).entries.first 15 | end 16 | 17 | it "parses the id" do 18 | expect(@entry.id).to eq "http://inessential.com/2017/06/02/james_dempsey_and_the_breakpoints_benefi" 19 | end 20 | 21 | it "parses the url" do 22 | expect(@entry.url).to eq "http://inessential.com/2017/06/02/james_dempsey_and_the_breakpoints_benefi" 23 | end 24 | 25 | it "parses the title" do 26 | expect(@entry.title).to eq "James Dempsey and the Breakpoints Benefit App Camp for Girls" 27 | end 28 | 29 | it "parses the content" do 30 | content = "

On Wednesday night I know where I’ll be — playing keyboard for a few songs at the James Dempsey and the Breakpoints concert benefitting App Camp for Girls.

\n\n

You should get tickets. It’s a fun time for a great cause.

\n\n

Bonus: James writes about how this concert is full circle for him. It’s a special night.

" 31 | expect(@entry.content).to eq content 32 | end 33 | 34 | it "parses the published date" do 35 | published = Feedjira::Util::ParseTime.call "2017-06-02T22:05:47-07:00" 36 | expect(@entry.published).to eq published 37 | end 38 | 39 | it "sets the published date to nil when not present" do 40 | entry = described_class.new(params) 41 | 42 | expect(entry.published).to be_nil 43 | end 44 | 45 | it "sets updated to date_modified when present" do 46 | updated = "2017-06-02T22:05:47-07:00" 47 | entry = described_class.new(params("date_modified" => updated)) 48 | 49 | updated = Feedjira::Util::ParseTime.call "2017-06-02T22:05:47-07:00" 50 | expect(entry.updated).to eq updated 51 | end 52 | 53 | it "sets updated to nil when date_modified is not present" do 54 | entry = described_class.new(params) 55 | 56 | expect(entry.updated).to be_nil 57 | end 58 | 59 | it "sets the author when nested author object is present" do 60 | entry = described_class.new(params("author" => { "name" => "John Doe" })) 61 | 62 | expect(entry.author).to eq "John Doe" 63 | end 64 | 65 | it "sets the author to nil when nested author object is not present" do 66 | entry = described_class.new(params) 67 | 68 | expect(entry.author).to be_nil 69 | end 70 | 71 | it "supports each" do 72 | expect(@entry).to respond_to :each 73 | end 74 | 75 | it "is able to list out all the fields with each" do 76 | all_fields = [] 77 | title_value = "" 78 | @entry.each do |field, value| 79 | all_fields << field 80 | title_value = value if field == "title" 81 | end 82 | 83 | expect(title_value).to eq "James Dempsey and the Breakpoints Benefit App Camp for Girls" 84 | 85 | expected_fields = %w[ 86 | author 87 | banner_image 88 | categories 89 | content 90 | entry_id 91 | external_url 92 | image 93 | json 94 | published 95 | summary 96 | title 97 | updated 98 | url 99 | ] 100 | expect(all_fields).to match_array expected_fields 101 | end 102 | 103 | it "supports checking if a field exists in the entry" do 104 | expect(@entry).to include "title" 105 | expect(@entry).to include "url" 106 | end 107 | 108 | it "allows access to fields with hash syntax" do 109 | expect(@entry["title"]).to eq "James Dempsey and the Breakpoints Benefit App Camp for Girls" 110 | expect(@entry["url"]).to eq "http://inessential.com/2017/06/02/james_dempsey_and_the_breakpoints_benefi" 111 | end 112 | 113 | it "allows setting field values with hash syntax" do 114 | @entry["title"] = "Foobar" 115 | expect(@entry.title).to eq "Foobar" 116 | end 117 | end 118 | -------------------------------------------------------------------------------- /spec/sample_feeds/TechCrunchFirstEntry.xml: -------------------------------------------------------------------------------- 1 | angies-list

Angie’s List, which offers consumers a way to review and rate doctors, contractors and service companies on the Web, has just set the terms for its IPO. In a new filing, the company revealed that it aims to raise as much as $131.4 million in the offering and has priced its IPO in the range of $11 to $13 per share. The company will list on the Nasdaq under the symbol “ANGI.” At the high end of the range, Angie’s List would be valued at nearly $700 million.

2 |

Angie’s List launched in 1995 with a focus on local home, yard and car services, sits at the intersection of local search, user-generated content and subscription-based services. To date, Angie’s List has raised nearly $100 million from Battery Ventures, T. Rowe Price, City Investment Group, Cardinal Ventures and others.

3 |

As of September 30, 2011, the company offered its service to paying members in 175 local markets in the United States (compared to 170 as of August). Angie’s List now has more than 1 million (up from 820,000) paid memberships.

4 |

Angie’s List incurred marketing expenses of $30.2 million and $48 million in 2010 and the nine months ended September 30, 2011, respectively. In 2010 and the nine months ended September 30, 2011, the company’s revenue was $59.0 million and $62.6 million, respectively. In the same periods, Angie’s net loss was $27.2 million and $43.2 million. Angie’s List has incurred net losses its start and had an accumulated deficit of $160.6 million as of September 30, 2011.

5 |
6 |


7 |

8 | 9 |
-------------------------------------------------------------------------------- /spec/sample_feeds/FeedjiraBlog.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Feedjira Blog 4 | A Blog for Feedjira 5 | http://feedjira.com/blog 6 | 7 | 8 | 2014-03-17T00:00:00Z 9 | 10 | Jon Allured 11 | 12 | 13 | Feedjira Goes One-Point-Oh 14 | 15 | http://feedjira.com/blog/2014/03/17/feedjira-goes-one-point-oh.html 16 | 2014-03-17T00:00:00Z 17 | 2014-03-17T08:02:45-05:00 18 | 19 | Jon Allured 20 | 21 | <p>Last fall, I asked <a href="http://www.pauldix.net">Paul Dix</a> if I could take over maintenance of his gem 22 | Feedzirra. My request was totally out of the blue, so I was pretty pumped when 23 | he got right back to me and said yes. He said that he didn&rsquo;t have time to work 24 | on it anymore and so I should feel free to do whatever I thought was best.</p> 25 | 26 | <p>Score!</p> 27 | 28 | <p>My first order of business was to go through the many open issues and pull 29 | requests on GitHub. When I started there were over 60, a number that I&rsquo;ve gotten 30 | down to just a few. I thought it was important to ensure that users saw me treat 31 | their issue as important and even if it was very old (which many were), I asked 32 | if there was anything I could do to help.</p> 33 | 34 | <p>I was pleasantly surprised by the nice way many people responded and we got to 35 | work addressing their questions and issues.</p> 36 | 37 | <p>As I was working through issues and pull requests, I kept <a href="http://semver.org">SemVer</a> in mind - 38 | bug fixes in patch releases and backward-compatible changes in minor releases. 39 | But I also realized that it was past time for this project to be at version 1.0. 40 | In the SemVer FAQ, they talk about when to release version 1.0 and Feedzirra fit 41 | the bill: it was being used in production, there was a stable API and I was 42 | taking backwards compatibilty seriously.</p> 43 | 44 | <p>So I treated it as a project at 1.0 and I did my best to release versions that 45 | were backward compatible and added deprecations for what I wanted to do in 1.0. 46 | I saw things that I wanted to completely rewrite, but I resisted the urge to 47 | burn it all down and start again.</p> 48 | 49 | <p>When I was close to being caught up on the backlog of issues and pull requests, 50 | I started thinking about releasing version 1.0, and I knew I wanted to create a 51 | website for the project. I worked with <a href="http://danielariza.com">Daniel Ariza</a> to make it happen. I 52 | ripped apart the README and rewrote just about all the sections.</p> 53 | 54 | <p>There was an open issue on the project about renaming the Gem and I knew that 55 | launching the website and releasing 1.0 would be the perfect opportunity, so I 56 | went for it. There was a suggestion to change the name to Feedzilla, but since 57 | that is already a thing, I went with Feedjira. I bought the domain and setup an 58 | organization by that name on GitHub.</p> 59 | 60 | <p>With those things in place, I needed to actually update the code for these 61 | changes. I wanted to make this transition as easy as possible and devised a 62 | simple way to use <a href="/versions.html">three versions</a> to make the jump to 1.0.</p> 63 | 64 | <p>For most users, upgrading to 1.0 should be a breeze, but I have an <a href="/upgrading.html">upgrade 65 | page</a> to help with a couple details. 
If you have any trouble upgrading, 66 | please let me know by <a href="https://github.com/feedjira/feedjira/issues">opening an issue</a>.</p> 67 | 68 | <p>There are still lots of things I&rsquo;d like to do with this Gem. I mentioned seeing 69 | things that I wanted to completely rewrite, so that&rsquo;ll be something that I work 70 | on for a 2.0 release, but that&rsquo;s a ways off. I&rsquo;d like to officially support 71 | JRuby. Many people use Feedjira with Rails, so a separate project that helps 72 | those users get up and running quickly seems to have value.</p> 73 | 74 | <p>The list goes on.</p> 75 | 76 | <p>I do have a request before I finish this thing: I&rsquo;d like to hear from users that 77 | have apps in production using Feedjira. If you&rsquo;re using Feedjira for a 78 | commercial app, please <a href="feedjira@gmail.com">email me</a>!</p> 79 | 80 | <p>Thanks to everyone who has helped me accomplish this, but especially <a href="http://www.pauldix.net">Paul 81 | Dix</a> for creating such a fun project to work on, <a href="http://danielariza.com">Daniel Ariza</a> for a 82 | badass website design and the many people who opened issues or sent pull 83 | requests. Open source is fun to work on because of people like you!! &lt;3 &lt;3 &lt;3</p> 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /spec/sample_feeds/itunes.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | All About Everything 7 | http://www.example.com/podcasts/everything/index.html 8 | en-us 9 | ℗ & © 2005 John Doe & Family 10 | Sat, 07 Sep 2002 09:42:31 GMT 11 | 60 12 | episodic 13 | A show about everything 14 | http://example.com/new.xml 15 | John Doe 16 | All About Everything is a show about everything. Each week we dive into any subject known to man and talk about it as much as we can. Look for our Podcast in the iTunes Music Store 17 | All About Everything is a show about everything. 18 | All About Everything is a show about everything. Each week we dive into any subject known to man and talk about it as much as we can. Look for our Podcast in the iTunes Music Store 19 | yes 20 | 21 | John Doe 22 | john.doe@example.com 23 | 24 | 25 | 26 | http://example.com/podcasts/everything/AllAboutEverything.jpg 27 | All About Everything 28 | http://www.example.com/podcasts/everything/index.html 29 | 88 30 | 31 31 | All About Everything is a show about everything. Each week we dive into any subject known to man and talk about it as much as we can. Look for our Podcast in the iTunes Music Store 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | Shake Shake Shake Your Spices 43 | Shake Shake Shake Your Spices 44 | full 45 | 1 46 | 3 47 | John Doe 48 | A short primer on table spices 49 | This week we talk about salt and pepper shakers, comparing and contrasting pour rates, construction materials, and overall aesthetics. Come and join the party! 50 | 51 | http://example.com/podcasts/archive/aae20050615.m4a 52 | Wed, 15 Jun 2005 19:00:00 GMT 53 | 7:04 54 | salt, pepper, shaker, exciting 55 | 56 | 12 57 | yes 58 | <p><strong>TOPIC</strong>: Gooseneck Options</p> 59 | 60 | 61 | 62 | Socket Wrench Shootout 63 | Socket Wrench Shootout 64 | full 65 | 1 66 | 2 67 | Jane Doe 68 | Comparing socket wrenches is fun! 69 | This week we talk about metric vs. old english socket wrenches. Which one is better? Do you really need both? Get all of your answers here. 
70 | 71 | http://example.com/podcasts/archive/aae20050608.mp3 72 | Wed, 8 Jun 2005 19:00:00 GMT 73 | 4:34 74 | metric, socket, wrenches, tool 75 | 76 | 77 | 78 | 79 | Red, Whine, & Blue 80 | Red, Whine, & Blue 81 | full 82 | 1 83 | 1 84 | Various 85 | Red + Blue != Purple 86 | This week we talk about surviving in a Red state if you are a Blue person. Or vice versa. 87 | 88 | http://example.com/podcasts/archive/aae20050601.mp3 89 | Wed, 1 Jun 2005 19:00:00 GMT 90 | 3:59 91 | politics, red, blue, state 92 | 93 | 94 | 95 | 96 | 97 | -------------------------------------------------------------------------------- /spec/sample_feeds/PaulDixExplainsNothingFirstEntryContent.xml: -------------------------------------------------------------------------------- 1 |

Last week I released the first version of a SAX based XML parsing library called SAX-Machine. It uses Nokogiri, which uses libxml, so it's pretty fast. However, I felt that it could be even faster. The only question was how to make a ruby library that is already using c underneath perform better. Since I've never written a Ruby C extension and it's been a few years since I've touched C, I decided it would be a good educational experience to give it a try.

2 | 3 |

First, let's look into how Nokogiri and SAX-Machine perform a parse. The syntax for SAX-Machine builds up a set of class variables (actually, instance variables on a class object) that describe what you're interested in parsing. So when you see something like this: 4 |

5 | It calls the 'element' and 'elements' methods inserted by the SAXMachine module that build up ruby objects that describe what XML tags we're interested in for the Entry class. That's all pretty straight forward and not really the source of any slowdown in the parsing process. These calls only happen once, when you first load the class. 6 | 7 |

Things get interesting when you run a parse. So you run Entry.parse(some_xml). That makes the call to Nokogiri, which in turn makes a call to libxml. Libxml then parses over the stream (or string) and makes calls to C methods (in Nokogiri) on certain events. For our purposes, the most interesting are start_element, end_element, and characters_func. The C code in Nokogiri for these is basic. It simply converts those C variables into Ruby ones and then makes calls to whatever instance of Nokogiri::XML:SAX::Document (a Ruby object) is associated with this parse. This is where SAXMachine comes back in. It has handlers for these events that match up the tags with the previously defined SAXMachine objects attached to the Entry class. It ignores the events that don't match a tag (however, it still needs to determine if the tag should be ignored).

8 | 9 |

The only possible place I saw to speed things up was to push more of SAX event handling down into the C code. Unfortunately, the only way to do this was to abandon Nokogiri and write my own code to interface with libxml. I used the xml_sax_parser.c from Nokogiri as a base and added to it. I changed it so the SAXMachine definitions of what was interesting would be stored in C. I then changed the SAX handling code to capture the events in C and determine if a tag was of interest there before sending it off to the Ruby objects. The end result is that calls are only made to Ruby when there is an actual event of interest. Thus, I avoid doing any comparisons in Ruby and those classes are simply wrappers that call out to the correct value setters.

10 | 11 |

Here are the results of a quick speed comparison against the Nokogiri SAXMachine, parsing my atom feed using code from my last post.

12 |
        user   system  total   real
sax c 0.060000 0.000000 0.060000 ( 0.069990)
sax nokogiri 0.500000 0.010000 0.510000 ( 0.520278)

13 | The SAX C is 7.4 times faster than SAX Nokogiri. Now, that doesn't seem like a whole lot, but I think it's quite good considering it was against a library that was already half in C. It's even more punctuated when you look at the comparison of these two against rfeedparser. 14 |

        user   system  total    real
sax c 0.060000 0.000000 0.060000 ( 0.069990)
sax nokogiri 0.500000 0.010000 0.510000 ( 0.520278)
rfeedparser 13.770000 1.730000 15.500000 ( 15.690309)
15 |

The SAX C version is 224 times faster than rfeedparser! The 7 times multiple from the Nokogiri version of SAXMachine really makes a difference. Unfortunately, I really only wrote this code as a test. It's not even close to something I would use for real. It has memory leaks, isn't thread safe, is completely unreadable, and has hidden bugs that I know about. You can take a look at it in all its misery on the c-rafactor branch of SAXMachine on github. Even though the code is awful, I think it's interesting that there can be this much variability in performance on Ruby libraries that are using C.

16 | 17 |

I could actually turn this into a legitimate working version, but it would take more work than I think it's worth at this point. Also, I'm not excited about the idea of dealing with C issues in SAXMachine. I would be more excited for it if I could get this type of SAX parsing thing into Nokogiri (in addition to the one that is there now). For now, I'll move on to using the Nokogiri version of SAXMachine to create a feed parsing library.

18 | 19 |
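A rough, illustrative sketch of the SAX-Machine declaration style the post above describes. It is not part of the sample feed itself; it assumes the public sax-machine gem's element/elements macros and parse class method, and the specific fields and options shown are hypothetical:

require "sax-machine"

class Entry
  include SAXMachine
  # Each declaration registers a tag of interest; during Entry.parse the SAX
  # handlers assign values only for declared tags and skip everything else.
  element :title
  element :link, as: :url, value: :href, with: { rel: "alternate" }
  elements :category, as: :categories
end

entry = Entry.parse(some_xml) # some_xml: any Atom entry XML string (assumed)
entry.title # => text of the title element

Declaring the interesting tags up front is what lets the event handlers decide, per SAX callback, whether a tag can be ignored, which is where the post locates the remaining Ruby-side overhead.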
-------------------------------------------------------------------------------- /spec/feedjira/feed_utilities_entry_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe Feedjira::FeedUtilities do 6 | before do 7 | @klass = Class.new do 8 | include Feedjira::FeedEntryUtilities 9 | end 10 | end 11 | 12 | describe "handling dates" do 13 | it "parses an ISO 8601 formatted datetime into Time" do 14 | time = @klass.new.parse_datetime("2008-02-20T8:05:00-010:00") 15 | expect(time.class).to eq Time 16 | expect(time).to eq Feedjira::Util::ParseTime.call("Wed Feb 20 18:05:00 UTC 2008") 17 | end 18 | 19 | it "parses a ISO 8601 with milliseconds into Time" do 20 | time = @klass.new.parse_datetime("2013-09-17T08:20:13.931-04:00") 21 | expect(time.class).to eq Time 22 | expect(time).to eq Time.strptime("Tue Sep 17 12:20:13.931 UTC 2013", "%a %b %d %H:%M:%S.%N %Z %Y") 23 | end 24 | end 25 | 26 | describe "updated= method" do 27 | it "sets updated when no existing updated value and parsed date is valid" do 28 | instance = @klass.new 29 | instance.updated = "2023-01-01T10:00:00Z" 30 | expect(instance["updated"]).to eq Time.parse("2023-01-01T10:00:00Z").utc 31 | end 32 | 33 | it "updates to newer date when existing updated value is older" do 34 | instance = @klass.new 35 | instance.updated = "2023-01-01T10:00:00Z" 36 | instance.updated = "2023-01-02T10:00:00Z" 37 | expect(instance["updated"]).to eq Time.parse("2023-01-02T10:00:00Z").utc 38 | end 39 | 40 | it "keeps existing updated value when new date is older" do 41 | instance = @klass.new 42 | instance.updated = "2023-01-02T10:00:00Z" 43 | instance.updated = "2023-01-01T10:00:00Z" 44 | expect(instance["updated"]).to eq Time.parse("2023-01-02T10:00:00Z").utc 45 | end 46 | 47 | it "does not set updated when date parsing fails" do 48 | instance = @klass.new 49 | instance.updated = "invalid-date" 50 | expect(instance["updated"]).to be_nil 51 | end 52 | 53 | it "does not change existing updated when new date is invalid" do 54 | instance = @klass.new 55 | instance.updated = "2023-01-01T10:00:00Z" 56 | original_updated = instance["updated"] 57 | instance.updated = "invalid-date" 58 | expect(instance["updated"]).to eq original_updated 59 | end 60 | end 61 | 62 | describe "published= method" do 63 | it "sets published when no existing published value and parsed date is valid" do 64 | instance = @klass.new 65 | instance.published = "2023-01-01T10:00:00Z" 66 | expect(instance["published"]).to eq Time.parse("2023-01-01T10:00:00Z").utc 67 | end 68 | 69 | it "updates to older date when existing published value is newer" do 70 | instance = @klass.new 71 | instance.published = "2023-01-02T10:00:00Z" 72 | instance.published = "2023-01-01T10:00:00Z" 73 | expect(instance["published"]).to eq Time.parse("2023-01-01T10:00:00Z").utc 74 | end 75 | 76 | it "keeps existing published value when new date is newer" do 77 | instance = @klass.new 78 | instance.published = "2023-01-01T10:00:00Z" 79 | instance.published = "2023-01-02T10:00:00Z" 80 | expect(instance["published"]).to eq Time.parse("2023-01-01T10:00:00Z").utc 81 | end 82 | 83 | it "does not set published when date parsing fails" do 84 | instance = @klass.new 85 | instance.published = "invalid-date" 86 | expect(instance["published"]).to be_nil 87 | end 88 | 89 | it "does not change existing published when new date is invalid" do 90 | instance = @klass.new 91 | instance.published = "2023-01-01T10:00:00Z" 92 | 
original_published = instance["published"] 93 | instance.published = "invalid-date" 94 | expect(instance["published"]).to eq original_published 95 | end 96 | end 97 | 98 | describe "sanitizing" do 99 | before do 100 | @feed = Feedjira.parse(sample_atom_feed) 101 | @entry = @feed.entries.first 102 | end 103 | 104 | it "doesn't fail when no elements are defined on includer" do 105 | expect { @klass.new.sanitize! }.not_to raise_error 106 | end 107 | 108 | it "provides a sanitized title" do 109 | new_title = "#{@entry.title}" 110 | @entry.title = new_title 111 | scrubbed_title = Loofah.scrub_fragment(new_title, :prune).to_s 112 | expect(Loofah.scrub_fragment(@entry.title, :prune).to_s).to eq scrubbed_title 113 | end 114 | 115 | it "sanitizes content in place" do 116 | new_content = "