# Gem specification for amazon-review: package metadata, the files that are
# shipped, and the runtime dependency on Nokogiri.
Gem::Specification.new do |spec|
  # Identity and release information.
  spec.name     = 'amazon-review'
  spec.version  = '0.0.1'
  spec.date     = '2014-09-09'
  spec.license  = 'MIT'
  spec.homepage = 'http://rubygems.org/gems/amazon-review'

  # Human-readable descriptions shown by RubyGems.
  spec.summary     = "A simple gem to parse Amazon product reviews"
  spec.description = "A simple gem to parse Amazon product reviews"

  # Maintainer contact details.
  spec.authors = ["Jeff Mekler"]
  spec.email   = 'contact@jeffmekler.com'

  # Only the two library files are packaged.
  spec.files = %w[lib/amazon-review.rb lib/amazon-review/review.rb]

  # The HTML of each review page is parsed with Nokogiri.
  spec.add_runtime_dependency "nokogiri", "> 1.5.6"
end
require 'date'
require 'open-uri'

# A small scraper for Amazon product reviews.
#
# @example Parse the reviews of a product by its ASIN
#   require 'amazon-review'
#
#   reviews = AmazonReview.find_reviews('0596516177')
#
#   r = reviews.first
#   r.url           #=> "http://www.amazon.com/review/RLJ78T9DWIPBD"
#   r.title         #=> "This is the new authoritative Ruby book and the one to buy."
#   r.rating        #=> 5.0
#   r.helpful_count #=> 128.0
#   r.helpful_ratio #=> 0.9624060150375939
#   r.date          #=> a Date
#   r.user_id       #=> "A2P9K69PISA1IO"
module AmazonReview
  # CSS selector matching the named anchor that precedes every review.
  REVIEW_ANCHOR_SELECTOR = "#productReviews td > a[name]"

  # Seconds to wait after the first page; shrinks while requests succeed.
  INITIAL_DELAY = 0.5
  # Seconds removed from the delay after every successfully parsed page.
  DELAY_DECAY = 0.1
  # Seconds added to the delay after every failed request.
  RETRY_BACKOFF = 0.5
  # Consecutive failed requests for one page before the error is re-raised.
  MAX_FETCH_ATTEMPTS = 5

  # Collects every review of a product by walking the paginated review
  # listing until a page without reviews is reached.
  #
  # Failed requests are retried with a growing pause. Only the request itself
  # is retried, so a page is never appended twice, and the number of retries
  # is bounded so a permanent failure cannot loop forever.
  #
  # @param asin [String] the product's ASIN, interpolated into the listing URL
  # @return [Array<AmazonReview::Review>] reviews in listing order
  # @raise [StandardError] the last request error once the same page has
  #   failed MAX_FETCH_ATTEMPTS times in a row
  def self.find_reviews(asin)
    reviews = []
    delay = INITIAL_DELAY
    page = 1
    attempts = 0

    loop do
      begin
        doc = fetch_page(asin, page)
      rescue StandardError # request failed (the original author suspected 503s)
        attempts += 1
        raise if attempts >= MAX_FETCH_ATTEMPTS

        delay += RETRY_BACKOFF
        sleep(delay) # actually back off before asking for the same page again
        next
      end
      attempts = 0

      anchors = doc.css(REVIEW_ANCHOR_SELECTOR)
      break if anchors.empty? # a page without reviews ends the listing

      anchors.each { |anchor| reviews << Review.new(anchor) }
      page += 1

      # Pause between pages, shortening the pause while things go well.
      delay = [0, delay - DELAY_DECAY].max
      sleep(delay)
    end

    reviews
  end

  # Downloads and parses one page of the review listing.
  #
  # @param asin [String] the product's ASIN
  # @param page [Integer] 1-based page number
  # @return [Object] the document returned by Nokogiri::HTML
  def self.fetch_page(asin, page)
    # Loaded lazily so the rest of the module can be loaded without the parser.
    require 'nokogiri'

    url = "http://www.amazon.com/product-reviews/#{asin}/?ie=UTF8&showViewpoints=0&pageNumber=#{page}&sortBy=bySubmissionDateAscending"
    # URI.open replaces Kernel#open for URLs; the block form closes the handle.
    URI.open(url) { |io| Nokogiri::HTML(io) }
  end
  private_class_method :fetch_page

  # One review, read lazily from the HTML nodes of a review listing page.
  class Review
    # Attribute readers exported by #to_hash, in output order.
    ATTRIBUTES = %i[id url user_id title date text rating helpful_count helpful_ratio].freeze

    # @param html [#next_element, #[]] the named anchor that precedes the
    #   review; the node two siblings after it is used as the review body
    def initialize(html)
      @html = html
      @div = html.next_element.next_element
    end

    # @return [String] a short representation that identifies the review
    def inspect
      "<Review: id=#{id}>"
    end

    # @return [String] the value of the anchor's "name" attribute
    def id
      @id ||= @html['name']
    end

    # @return [String] the review URL built from #id
    def url
      @url ||= "http://www.amazon.com/review/#{id}"
    end

    # @return [String] the first run of capitals and digits in the href of
    #   the reviewer's profile link
    def user_id
      regex = /[A-Z0-9]+/
      @user_id ||= @div.css('a[href^="/gp/pdp/profile"]').first["href"][regex]
    end

    # @return [String] stripped text of the first <b> element
    def title
      @title ||= @div.css("b").first.text.strip
    end

    # @return [Date] date parsed from the text of the first <nobr> element
    def date
      @date ||= Date.parse(@div.css("nobr").first.text)
    end

    # @return [String] review body without leading or trailing whitespace
    def text
      @text ||= @div.css(".reviewText").first.content.strip
    end

    # @return [Float] first number found in the rating sprite's title
    def rating
      regex = /[0-9\.]+/
      @rating ||= Float(@div.css("span.swSprite").first['title'][regex])
    end

    # @return [Float, nil] the first number of the "X of Y people" text, or
    #   nil when the review has no such text
    def helpful_count
      return unless helpful_match

      @helpful_count ||= Float(helpful_match[1])
    end

    # @return [Float, nil] X / Y from the "X of Y people" text; nil when the
    #   text is missing or Y is zero (which would otherwise produce NaN)
    def helpful_ratio
      return unless helpful_match

      total = Float(helpful_match[2])
      return if total.zero?

      @helpful_ratio ||= helpful_count / total
    end

    # @return [Hash{Symbol => Object}] every attribute in ATTRIBUTES by name
    def to_hash
      ATTRIBUTES.each_with_object({}) do |attr, hash|
        hash[attr] = public_send(attr)
      end
    end

    private

    # Memoised with defined? so that a missing match (nil) is cached too.
    #
    # @return [MatchData, nil] match of "X of Y people" in the review body
    def helpful_match
      return @helpful_match if defined?(@helpful_match)

      @helpful_match = @div.text.match(/(\d+) of (\d+) people/)
    end
  end
end