├── lib ├── index.coffee ├── page │ ├── ReviewsIframePage.coffee │ ├── ReviewDesktopPage.coffee │ ├── ReviewMobilePage.coffee │ ├── Page.coffee │ └── AllCustomerReviewsPage.coffee ├── README.md ├── getReview.coffee └── getReviewIds.coffee ├── index.js ├── .gitignore ├── test ├── getReviewIds.test.coffee ├── page │ ├── Page.test.coffee │ ├── ReviewDesktopPage.test.coffee │ ├── ReviewMobilePage.test.coffee │ ├── AllCustomerReviewsPage.test.coffee │ └── ReviewsIframePage.test.coffee └── getReview.test.coffee ├── package.json └── README.md /lib/index.coffee: -------------------------------------------------------------------------------- 1 | 2 | module.exports = 3 | getReview: require './getReview' 4 | getReviewIds: require './getReviewIds' 5 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | var CoffeeScript = require('coffee-script'); 2 | 3 | if (CoffeeScript.register) { 4 | CoffeeScript.register(); 5 | } 6 | 7 | module.exports = require('./lib/index'); 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | lib-cov 2 | *.seed 3 | *.log 4 | *.csv 5 | *.dat 6 | *.out 7 | *.pid 8 | *.gz 9 | 10 | pids 11 | logs 12 | results 13 | 14 | node_modules 15 | npm-debug.log 16 | 17 | sftp-config.json 18 | 19 | *.un~ 20 | .DS_Store 21 | -------------------------------------------------------------------------------- /test/getReviewIds.test.coffee: -------------------------------------------------------------------------------- 1 | should = require 'should' 2 | 3 | getReviewIds = require '../lib/getReviewIds' 4 | 5 | 6 | describe 'getReviewIds(...)', -> 7 | it 'should be done', (done) -> 8 | getReviewIds 9 | productId: 'B00DFFT76U' 10 | , 11 | (err, reviewIds) -> 12 | should.not.exist err 13 | should.exist reviewIds 14 | 15 | 
done() 16 | -------------------------------------------------------------------------------- /lib/page/ReviewsIframePage.coffee: -------------------------------------------------------------------------------- 1 | Page = require './Page' 2 | 3 | 4 | class ReviewsIframePage extends Page 5 | #### get all customer reviews page url from the reviews iframe url. 6 | getAllCustomerReviewsPageUrl: -> 7 | linkTags = @$('.small a') 8 | return null if linkTags.length is 0 9 | 10 | @$('.small a').eq(0).attr 'href' 11 | 12 | 13 | 14 | module.exports = ReviewsIframePage 15 | -------------------------------------------------------------------------------- /test/page/Page.test.coffee: -------------------------------------------------------------------------------- 1 | should = require 'should' 2 | 3 | Page = require '../../lib/page/Page' 4 | 5 | 6 | describe 'Page', -> 7 | describe 'constructor(...)', -> 8 | it 'should be done', (done) -> 9 | page = new Page 10 | url: 'http://www.google.com' 11 | , 12 | (err, $) -> 13 | should.not.exist err 14 | should.exist $ 15 | done() 16 | 17 | it 'should be `no url.` error', (done) -> 18 | page = new Page {}, (err, $) -> 19 | should.exist err 20 | done() 21 | -------------------------------------------------------------------------------- /lib/page/ReviewDesktopPage.coffee: --------------------------------------------------------------------------------
1 | Page = require './Page'
2 |
3 |
4 | #### the desktop version of a single review page.
5 | class ReviewDesktopPage extends Page
6 |   #### extract the reviewer profile from the loaded document.
7 |   # returns an `Error` (it is returned, not thrown) when the page is not
8 |   # loaded; otherwise an object whose `profile` is set only when a link is
9 |   # found under `.crAuthorInfo`.
10 |   parse: ->
11 |     return new Error 'the page is not loaded.' if not @$?
12 |
13 |     result = {}
14 |
15 |     # a cheerio selection always exists, even when it matched nothing, so
16 |     # test its length instead of using the `?` existence operator.
17 |     profileTag = @$('.crAuthorInfo').find('a').eq(0)
18 |     if profileTag.length > 0
19 |       profileUrl = profileTag.attr 'href'
20 |
21 |       # the id is the 5th `/`-separated segment of the profile url.
22 |       result.profile =
23 |         name: profileTag.text()
24 |         url: profileUrl
25 |         id: profileUrl?.split('/')[4]
26 |
27 |     result
28 |
29 |
30 |
31 | module.exports = ReviewDesktopPage
32 |
-------------------------------------------------------------------------------- /test/page/ReviewDesktopPage.test.coffee: -------------------------------------------------------------------------------- 1 | should = require 'should' 2 | 3 | ReviewDesktopPage = require '../../lib/page/ReviewDesktopPage' 4 | 5 | 6 | describe 'ReviewDesktopPage', -> 7 | describe 'parse()', -> 8 | it 'should be done', (done) -> 9 | reviewDesktopPage = new ReviewDesktopPage 10 | url: 'http://www.amazon.com/review/RDQO5C2XEPVPC' 11 | , 12 | (err, $) -> 13 | should.not.exist err 14 | should.exist $ 15 | 16 | result = reviewDesktopPage.parse() 17 | should.exist result 18 | 19 | done() 20 | -------------------------------------------------------------------------------- /test/page/ReviewMobilePage.test.coffee: -------------------------------------------------------------------------------- 1 | should = require 'should' 2 | 3 | ReviewMobilePage = require '../../lib/page/ReviewMobilePage' 4 | 5 | 6 | describe 'ReviewMobilePage', -> 7 | describe 'parse()', -> 8 | it 'should be done', (done) -> 9 | reviewMobilePage = new ReviewMobilePage 10 | url: 'http://www.amazon.com/gp/aw/review/B00DFFT76U/RDQO5C2XEPVPC' 11 | , 12 | (err, $) -> 13 | should.not.exist err 14 | should.exist $ 15 | 16 | result = reviewMobilePage.parse() 17 | should.exist result 18 | 19 | done() 20 | -------------------------------------------------------------------------------- /test/getReview.test.coffee: -------------------------------------------------------------------------------- 1 | should = require 'should' 2 | 3 | getReview = require '../lib/getReview' 4 | 5 | 6 | describe 'getReview()', -> 7 | it 'should be done', (done) -> 8 | getReview 9 | productId: 'B00DFFT76U' 10 | reviewId: 'RDQO5C2XEPVPC' 11 | , 12 | 
(err, review) -> 13 | should.not.exist err 14 | should.exist review 15 | 16 | review.title.should.equal 'Swaddlers vs Cruisers Size 4' 17 | review.starCount.should.equal 5 18 | should.exist review.descText 19 | should.exist review.profile 20 | review.profile.name.should.equal 'Rebecca N' 21 | review.profile.id.should.equal 'A276OI0NHBYORX' 22 | 23 | done() 24 | -------------------------------------------------------------------------------- /lib/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Class Hierarchy 3 | * Page 4 | * ReviewsIframePage 5 | * AllCustomerReviewsPage 6 | * ReviewMobilePage 7 | * ReviewDesktopPage 8 | 9 | According to the [Amazon Product Advertising API](https://affiliate-program.amazon.com/gp/advertising/api/detail/main.html), customer reviews are not provided directly. Instead, the API provides a URL that redirects to an IFrame page containing the customer reviews for an item. 10 | 11 | ## Basic flow 12 | 1. Get an `IFrame page url` from Amazon Product Advertising API 13 | 1. Find `All Customer Reviews Page url` and load it 14 | 1. Find ID array of the reviews on the page 15 | 1. Load the `Review page` and crawl it 16 | 1. Find `Next page url` 17 | 1. 
Repeat crawling until the end page 18 | -------------------------------------------------------------------------------- /test/page/AllCustomerReviewsPage.test.coffee: -------------------------------------------------------------------------------- 1 | should = require 'should' 2 | 3 | AllCustomerReviewsPage = require '../../lib/page/AllCustomerReviewsPage' 4 | 5 | 6 | describe 'AllCustomerReviewsPage', -> 7 | it 'should be done', (done) -> 8 | allCustomerReviewsPage = new AllCustomerReviewsPage 9 | url: 'http://www.amazon.com/Pampers-Swaddlers-Diapers-Economy-Count/product-reviews/B00DFFT76U' 10 | , 11 | (err, $) -> 12 | should.not.exist err 13 | should.exist $ 14 | 15 | productId = allCustomerReviewsPage.getProductId() 16 | should.exist productId 17 | productId.should.equal 'B00DFFT76U' 18 | 19 | nextPageUrl = allCustomerReviewsPage.getNextPageUrl() 20 | should.exist nextPageUrl 21 | 22 | reviewIds = allCustomerReviewsPage.getReviewIds() 23 | reviewIds.should.length 10 24 | 25 | done() 26 | -------------------------------------------------------------------------------- /test/page/ReviewsIframePage.test.coffee: -------------------------------------------------------------------------------- 1 | should = require 'should' 2 | 3 | ReviewsIframePage = require '../../lib/page/ReviewsIframePage' 4 | 5 | 6 | # describe 'ReviewsIframePage', -> 7 | # describe 'getAllCustomerReviewsPageUrl()', -> 8 | # it 'should be done', (done) -> 9 | # reviewsIframePage = new ReviewsIframePage 10 | # # TODO: change this url. 
11 | # url: 'http://www.amazon.com/reviews/iframe?akid=AKIAIPG2BJRKIIQIKQ5Q&alinkCode=xm2&asin=B00DFFT76U&atag=PutYourAssociateTagHere&exp=2013-10-08T22%3A55%3A26Z&v=2&sig=3SSmSMbVRMlIgfRQjH5cfNnw1Pb%2Bp%2F%2Fr%2F8f7iEhsQZQ%3D' 12 | # , 13 | # (err, $) -> 14 | # should.not.exist err 15 | # should.exist $ 16 | 17 | # allCustomerReviewsPageUrl = reviewsIframePage.getAllCustomerReviewsPageUrl() 18 | # should.exist allCustomerReviewsPageUrl 19 | 20 | # done() 21 | -------------------------------------------------------------------------------- /lib/page/ReviewMobilePage.coffee: --------------------------------------------------------------------------------
1 | Page = require './Page'
2 |
3 |
4 | class ReviewMobilePage extends Page
5 |   #### collect the review fields from the loaded document.
6 |   # returns an `Error` (it is returned, not thrown) when the page is not
7 |   # loaded; otherwise an object with `title`, `starCount`, `createdAt`,
8 |   # `descText`, `helpfulCount` and `voteCount`.
9 |   parse: ->
10 |     unless @$?
11 |       return new Error 'the page is not loaded.'
12 |
13 |     # drop every new line from a text; passes `undefined` through untouched.
14 |     stripNewlines = (text) -> text?.replace /\n/g, ''
15 |
16 |     review =
17 |       title: stripNewlines @$('#reviews-list h4').text()
18 |       starCount: @$('.a-icon-star-full').length
19 |
20 |     # the date is the second `-`-separated segment of the first secondary span.
21 |     rawDate = @$('span.a-color-secondary').eq(0).text().split('-')[1]
22 |     review.createdAt = new Date stripNewlines rawDate
23 |
24 |     # swap each br tag for a new line before reading the description text.
25 |     @$('.a-spacing-micro').find('br').replaceWith '\n'
26 |     review.descText = @$('.a-spacing-micro').text()
27 |
28 |     review.helpfulCount = Number @$('.votes-helpful').text()
29 |     review.voteCount = Number @$('.votes-total').text()
30 |
31 |     review
32 |
33 |
34 |
35 | module.exports = ReviewMobilePage
36 |
-------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "amazon-reviews", 3 | "version": "0.0.4", 4 | "description": "A node.js module to crawl product reviews from Amazon.", 5 | "main": "index.js", 6 | "dependencies": { 7 | "coffee-script": "~1.7.1", 8 | "async": "~0.2.10", 9 | "request": "~2.34.0", 10 | "cheerio": "~0.14.0" 11 | }, 12 | "devDependencies": { 13 | "should": "~1.0.0", 14 | "mocha": "~1.3.0" 15 | }, 16 | "directories": { 17 | "test": "test" 18 | }, 19 | "scripts": { 20 | "test": "mocha --compilers coffee:coffee-script --require coffee-script/register --globals lw --recursive ./test -t 50000" 21 | }, 22 | "repository": { 23 | "type": "git", 24 | "url": "https://github.com/xissy/node-amazon-reviews.git" 25 | }, 26 | "keywords": [ 27 | "amazon", 28 | "product", 29 | "review", 30 | "crawl" 31 | ], 32 | "author": "Taeho Kim ", 33 | "license": "MIT", 34 | "bugs": { 35 | "url": "https://github.com/xissy/node-amazon-reviews/issues" 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /lib/getReview.coffee: -------------------------------------------------------------------------------- 1 | ReviewMobilePage = require './page/ReviewMobilePage' 2 | ReviewDesktopPage = require './page/ReviewDesktopPage' 3 | 4 | 5 | #### retrieve a review by productId and reviewId. 6 | module.exports = (options, callback) -> 7 | return callback new Error 'no options.productId' if not options?.productId? 
8 | return callback new Error 'no options.reviewId' if not options?.reviewId? 9 | 10 | options.url = "http://www.amazon.com/gp/aw/review/#{options.productId}/#{options.reviewId}" 11 | 12 | reviewMobilePage = new ReviewMobilePage options, (err, $) -> 13 | return callback err if err? 14 | 15 | reviewMobilePage.$ = $ 16 | 17 | review = reviewMobilePage.parse() 18 | 19 | options.url = "http://www.amazon.com/review/#{options.reviewId}" 20 | 21 | reviewDesktopPage = new ReviewDesktopPage options, (err, $) -> 22 | return callback err if err? 23 | 24 | reviewDesktopPage.$ = $ 25 | 26 | for k, v of reviewDesktopPage.parse() 27 | review[k] = v 28 | 29 | callback null, review 30 | -------------------------------------------------------------------------------- /lib/page/Page.coffee: --------------------------------------------------------------------------------
1 | request = require 'request'
2 | cheerio = require 'cheerio'
3 |
4 |
5 | class Page
6 |   #### default options for loading a web page.
7 |   # each entry is copied onto the request options when the caller left it unset.
8 |   defaultOptions:
9 |     headers:
10 |       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
11 |       'Accept-Language': 'en-US,en;q=0.8'
12 |       'Cache-Control': 'no-cache'
13 |       'Connection': 'keep-alive'
14 |       'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3'
15 |
16 |
17 |   #### load and parse the page.
18 |   # `options` must have `url`. it is kept as `@options`, completed with
19 |   # `defaultOptions` (the passed object itself is modified) and handed to
20 |   # `request`.
21 |   # `callback(err, $)` receives the cheerio document, also kept as `@$`, or an
22 |   # error when `url` is missing, the request fails or the status is not 200.
23 |   constructor: (@options, callback) ->
24 |     # `options` itself may be missing, so guard it before reading `url`.
25 |     return callback new Error 'no url.' if not options?.url?
26 |
27 |     for k, v of @defaultOptions
28 |       options[k] = v if not options[k]?
29 |
30 |     request options, (err, response, body) =>
31 |       return callback err if err?
32 |
33 |       if response?.statusCode isnt 200
34 |         # this branch is also taken when `response` is undefined, so the
35 |         # message must not dereference it unguarded.
36 |         return callback new Error "#{response?.statusCode} - #{body}"
37 |
38 |       @$ = cheerio.load body
39 |       callback null, @$
40 |
41 |
42 |
43 | module.exports = Page
44 |
-------------------------------------------------------------------------------- /lib/page/AllCustomerReviewsPage.coffee: -------------------------------------------------------------------------------- 1 | Page = require './Page' 2 | urlModule = require 'url' 3 | 4 | 5 | class AllCustomerReviewsPage extends Page 6 | 7 | #### get the product id. 8 | getProductId: -> 9 | parsedUrl = urlModule.parse @options.url 10 | path = parsedUrl.pathname 11 | path.split('/')[3] 12 | 13 | 14 | #### get next page url from the all customer reviews page document. 15 | getNextPageUrl: -> 16 | pageLinkTags = @$('.paging a') 17 | nextTag = pageLinkTags.eq(pageLinkTags.length - 1) 18 | nextText = pageLinkTags.eq(pageLinkTags.length - 1).text() 19 | 20 | if nextText is 'Next ›' 21 | nextUrl = nextTag.attr 'href' 22 | else 23 | nextUrl = null 24 | 25 | nextUrl 26 | 27 | 28 | #### get last page number from the all customer reviews page document. 29 | getLastPageNo: -> 30 | pageLinkTags = @$('.paging a') 31 | lastTag = pageLinkTags.eq(pageLinkTags.length - 2) 32 | lastText = pageLinkTags.eq(pageLinkTags.length - 2).text() 33 | 34 | if lastText isnt '' 35 | lastPageNo = parseInt lastText 36 | else 37 | lastPageNo = 1 38 | 39 | lastPageNo 40 | 41 | 42 | #### get review id array. 43 | getReviewIds: -> 44 | reviewLinkTags = @$('#productReviews a') 45 | 46 | reviewIds = [] 47 | reviewLinkTags.each (index, element) -> 48 | # there should be `name` attribute but no `style` attribute. 49 | name = @?.attr('name') 50 | style = @?.attr('style') 51 | 52 | if name? and name[...3] isnt 'oc-' and not style? 
53 | reviewIds.push @?.attr('name') 54 | 55 | reviewIds 56 | 57 | 58 | 59 | module.exports = AllCustomerReviewsPage 60 | -------------------------------------------------------------------------------- /lib/getReviewIds.coffee: --------------------------------------------------------------------------------
1 | async = require 'async'
2 |
3 | AllCustomerReviewsPage = require './page/AllCustomerReviewsPage'
4 |
5 |
6 | #### retrieve review IDs by a productId.
7 | # `options.productId` is required.
8 | # `callback(err, reviewIds)` receives the ids found on the first page and on
9 | # pages 2..last, concatenated in page order.
10 | module.exports = (options, callback) ->
11 |   return callback new Error 'no options.productId' if not options?.productId?
12 |
13 |   firstPageUrl = "http://www.amazon.com/product-reviews/#{options.productId}/ref=cm_cr_pr_top_recent?ie=UTF8&showViewpoints=0&sortBy=bySubmissionDateDescending"
14 |
15 |   firstPage = new AllCustomerReviewsPage
16 |     url: firstPageUrl
17 |   ,
18 |     (err, $) ->
19 |       return callback err if err?
20 |
21 |       firstPage.$ = $
22 |
23 |       # ids are kept per page so that the result does not depend on the
24 |       # order in which the concurrent requests below happen to finish.
25 |       reviewIdsByPage = [ firstPage.getReviewIds() ]
26 |       nextPageUrl = firstPage.getNextPageUrl()
27 |       lastPageNo = firstPage.getLastPageNo()
28 |
29 |       # without a next page link there is no url to derive the other pages
30 |       # from, so return what the first page gave instead of crashing.
31 |       if lastPageNo <= 1 or not nextPageUrl?
32 |         return callback null, reviewIdsByPage[0]
33 |
34 |       reviewPageNos = [ 2..lastPageNo ]
35 |
36 |       async.forEachLimit reviewPageNos, 100
37 |       ,
38 |         (reviewPageNo, next) ->
39 |           # `page` belongs to this iteration only; concurrent iterations
40 |           # must not share one page variable.
41 |           page = new AllCustomerReviewsPage
42 |             url: nextPageUrl.replace /pageNumber=[\d]*/g, "pageNumber=#{reviewPageNo}"
43 |           ,
44 |             (err, $) ->
45 |               return next err if err?
46 |
47 |               page.$ = $
48 |               reviewIdsByPage[reviewPageNo - 1] = page.getReviewIds()
49 |
50 |               next null
51 |       ,
52 |         (err) ->
53 |           return callback err if err?
54 |
55 |           # flatten the per-page arrays into one array, in page order.
56 |           callback null, [].concat reviewIdsByPage...
57 |
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # node-amazon-reviews 2 | > A node.js module to crawl product reviews from Amazon. 3 | 4 | Amazon Product Advertising API provides almost attributes about a product. But review data cannot be gathered by API. Use this module if you want to get product reviews. 5 | 6 | 7 | ## Installation 8 | Via [npm](https://npmjs.org): 9 | 10 | $ npm install amazon-reviews 11 | 12 | 13 | ## Usage 14 | 15 | ### Load in the module 16 | ```javascript 17 | var AmazonReviews = require('amazon-reviews'); 18 | ``` 19 | 20 | ### Get review IDs by a product ID 21 | ```javascript 22 | AmazonReviews.getReviewIds({ 23 | productId: 'PRODUCT_ID_HERE' 24 | }, function(err, reviewIds) { 25 | ... 26 | }); 27 | ``` 28 | 29 | ### Get a review by product ID and review ID 30 | ```javascript 31 | AmazonReviews.getReview({ 32 | productId: 'PRODUCT_ID_HERE', 33 | reviewId: 'REVIEW_ID_HERE' 34 | }, function(err, review) { 35 | ... 36 | }); 37 | ``` 38 | 39 | 40 | ## License 41 | 42 | Released under the MIT License 43 | 44 | Copyright (c) 2013 Taeho Kim 45 | 46 | Permission is hereby granted, free of charge, to any person obtaining a copy 47 | of this software and associated documentation files (the "Software"), to deal 48 | in the Software without restriction, including without limitation the rights 49 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 50 | copies of the Software, and to permit persons to whom the Software is 51 | furnished to do so, subject to the following conditions: 52 | 53 | The above copyright notice and this permission notice shall be included in 54 | all copies or substantial portions of the Software. 
55 | 56 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 57 | --------------------------------------------------------------------------------