├── .gitignore ├── README.md ├── extract_ngram.py ├── find_amazon_id.py ├── generate_ngrams.py ├── jstor.EXAMPLE.cnf ├── process_reviews.py ├── read_reviews.py └── scrape.py /.gitignore: -------------------------------------------------------------------------------- 1 | jstor.cnf 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | amazon-review-scraper 2 | ===================== 3 | 4 | A simple scraper written in python, using BeautifulSoup to take the title and fetch all of the reviews that have been written about that book. 5 | 6 | Attempts have been made to rely on as little structure as possible, but all scrapers are inevitably tied to the markup. Scraper works as of last commit date, but it may not work anymore. 7 | 8 | License 9 | ======= 10 | Copyright Juan Pablo Alperin 11 | 12 | This program is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 | This program is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU General Public License for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program. If not, see . 24 | -------------------------------------------------------------------------------- /extract_ngram.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import shelve 3 | from collections import defaultdict 4 | 5 | import MySQLdb as mdb 6 | 7 | import ConfigParser 8 | Config = ConfigParser.ConfigParser() 9 | Config.read('jstor.cnf') 10 | 11 | con = None 12 | try: 13 | con = mdb.connect('localhost', Config.get('database', 'username'), Config.get('database', 'password'), Config.get('database', 'database')); 14 | 15 | cur = con.cursor() 16 | cur.execute("SELECT VERSION()") 17 | 18 | data = cur.fetchone() 19 | print "Database version : %s " % data 20 | 21 | except mdb.Error, e: 22 | print "Error %d: %s" % (e.args[0],e.args[1]) 23 | sys.exit(1) 24 | 25 | cur = con.cursor() 26 | 27 | try: 28 | discipline = sys.argv[1] + '-discipline' 29 | year = sys.argv[2] 30 | # cur.execute("SELECT DISTINCT r.doi FROM j_reviews r JOIN j_disciplines d ON (r.j_review_id = d.j_review_id) WHERE r.language = 'eng' AND r.num_reviewed_works = 1 AND r.reviewed_works != '' AND r.reviewed_works IS NOT NULL AND d.discipline = %s AND r.year = %s;", (discipline, int(year))) 31 | 32 | # joins with amazon data (only things in both) 33 | cur.execute("SELECT DISTINCT r.doi, tim.amazon_id FROM j_reviews r JOIN j_disciplines d ON (r.j_review_id = d.j_review_id) JOIN title_id_map tim ON (r.reviewed_works_hash = tim.title_hash) JOIN a_reviews ar ON (tim.amazon_id = ar.amazon_id) WHERE r.language = 'eng' AND r.num_reviewed_works = 1 AND r.reviewed_works != '' AND r.reviewed_works IS NOT NULL AND d.discipline = %s AND r.year = %s;", (discipline, int(year))) 34 | 35 | rowcount = cur.rowcount 36 | if rowcount == 0: 37 | print "No records found for %s (%s)" % (discipline, year) 38 | except mdb.Error, e: 39 | print "Error %d: %s" % (e.args[0],e.args[1]) 40 | sys.exit(1) 41 | 42 | which_ngram = sys.argv[3] 43 | 44 | print "Going after %s of %s (%s)" 
% (which_ngram, discipline, year) 45 | 46 | dataDir = Config.get('files', 'datadir') 47 | if dataDir[-1] != '/': dataDir = dataDir + '/' # ensure trailing slash 48 | 49 | ngramDir = dataDir + which_ngram + '/' 50 | outfile = "%sextracts/%s-%s-%s.txt" % (dataDir, discipline, year, which_ngram) 51 | 52 | dois = set([]) 53 | doi_amazon_id_map = {} 54 | for i in range(rowcount): 55 | row = cur.fetchone() 56 | dois.add(row[0]) 57 | if len(row) > 1: # allow for not joining with the amazon id 58 | doi_amazon_id_map[row[0]] = row[1] 59 | 60 | print "Looking for %s DOIs" % len(dois) 61 | 62 | o = open(outfile, 'wb') 63 | i=0 64 | dois_found=set([]) 65 | for part in os.listdir(ngramDir): 66 | f = open(ngramDir + part, 'rb') 67 | for line in f.readlines(): 68 | i+=1 69 | doi = line[1:].split()[0] 70 | if doi in dois: 71 | dois_found.add(doi) 72 | if len(doi_amazon_id_map): # allow for not joining with the amazon id 73 | o.write(doi_amazon_id_map[doi] + "\t" + line.replace('"', '')) 74 | else: 75 | o.write(line.replace('"', '')) 76 | f.close() 77 | o.close() 78 | print 'Found %s DOIs' % len(dois_found) 79 | print 'Checked %s lines' % i 80 | 81 | -------------------------------------------------------------------------------- /find_amazon_id.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import csv 3 | import re 4 | import shelve 5 | from collections import defaultdict 6 | 7 | import MySQLdb as mdb, sqlite3 8 | import ConfigParser 9 | Config = ConfigParser.ConfigParser() 10 | Config.read('jstor.cnf') 11 | 12 | import scrape 13 | 14 | con = None 15 | try: 16 | con = mdb.connect('localhost', Config.get('database', 'username'), Config.get('database', 'password'), Config.get('database', 'database'), use_unicode=True) 17 | 18 | cur = con.cursor() 19 | cur.execute("SELECT VERSION()") 20 | 21 | data = cur.fetchone() 22 | print "Database version : %s " % data 23 | 24 | except mdb.Error, e: 25 | print "Error %d: %s" % (e.args[0],e.args[1]) 26 | sys.exit(1) 27 | 28 | dataDir = Config.get('files', 'datadir') 29 | title_id_map_db = dataDir + 'title_id_map.db' 30 | 31 | new_file = os.path.isfile(title_id_map_db) 32 | 33 | scon = sqlite3.connect(title_id_map_db) 34 | scon.isolation_level = None 35 | scur = scon.cursor() 36 | 37 | if not new_file: 38 | scur.execute("""CREATE TABLE title_id_map ( 39 | title varchar(2048), 40 | amazon_id varchar(20), 41 | amazon_title varchar(2048) 42 | );""") 43 | scur.execute("""CREATE INDEX titles ON title_id_map (title);""") 44 | scur.execute("""CREATE INDEX amazon_ids ON title_id_map (amazon_id);""") 45 | 46 | cur.execute("""SELECT DISTINCT reviewed_works 47 | FROM j_reviews 48 | WHERE language = 'eng' 49 | AND num_reviewed_works = 1 50 | AND reviewed_works != '' 51 | AND reviewed_works IS NOT NULL 52 | AND year >= 2005 53 | ORDER BY year DESC""") 54 | 55 | for i in range(cur.rowcount): 56 | row = cur.fetchone() 57 | if i % 5000 == 0: 58 | print '=================================================================' 59 | print ' '.join(map(str, [i] + list(row))) 60 | print '=================================================================' 61 | 62 | 63 | title = row[0] 64 | # scur.execute("SELECT * FROM title_id_map WHERE title = ?", (title,)) 65 | # data=scur.fetchone() 66 | # if data is not None: 67 | # # we have fetched this title before 68 | # # print "Had this title already: %s" % title 69 | # continue 70 | 71 | try: 72 | (url, amazon_title) = scrape.find_book_url(title) 73 | except KeyboardInterrupt: 74 | raise 
KeyboardInterrupt 75 | except: 76 | scur.execute("INSERT INTO title_id_map (title, amazon_id, amazon_title) VALUES (?, ?, ?)", (title, None, None)) 77 | continue 78 | 79 | id_from_url_regex = re.compile('^.*/dp/([^/]+).*') 80 | m=id_from_url_regex.match(url) 81 | if m: 82 | amazon_id = m.group(1) 83 | else: 84 | amazon_id = None 85 | print "Amazon ID not found in %s:" % url 86 | 87 | try: 88 | scur.execute("INSERT INTO title_id_map (title, amazon_id, amazon_title) VALUES (?, ?, ?)", (title, amazon_id, amazon_title)) 89 | except KeyboardInterrupt: 90 | raise KeyboardInterrupt 91 | except: 92 | print "problem with DB (%s, %s)" % (title, amazon_id) 93 | 94 | con.close() 95 | scon.close() 96 | -------------------------------------------------------------------------------- /generate_ngrams.py: -------------------------------------------------------------------------------- 1 | import re, nltk 2 | 3 | # 4 | # Use the same tokenization as JSTOR 5 | # see https://github.com/ITHAKA-AT/ejc-mapreduce/blob/master/ngrams.py 6 | # 7 | 8 | #Helper function to generate ngrams from raw text using NLTK ngram generator 9 | def generate_ngrams(raw_text, n=1): 10 | tokenized_sentences = get_tokenized_sentences(raw_text) 11 | grams = {} 12 | for tokens in tokenized_sentences: 13 | if n == 1: 14 | for gram in tokens: 15 | if gram.isalpha(): 16 | grams[(gram,)] = grams.get((gram,),0) + 1 17 | else: 18 | for gram in nltk.ngrams(tokens, n): 19 | grams[gram] = grams.get(gram,0) + 1 20 | sorted_grams = [] 21 | for gram, count in grams.items(): 22 | sorted_grams.append([' '.join(gram), count]) 23 | sorted_grams.sort(lambda y, x: cmp(x[1],y[1])) 24 | return sorted_grams 25 | 26 | 27 | #Tokenize our text input using NLTK tokenizer. 28 | #We're not calling this directly, but rather letting generate_ngrams() call it for us. 29 | #note that some special tokens are inserted for sentence start/end, numbers are converted to a single token, punctuation is reduced to fewer tokens 30 | #if you wanted to apply stemming, spell checking, etc, this is probably where you'd do it. NLTK provides a lot of this type of functionality. 
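#For illustration (added comment, not in the original source): the input "The cat sat on the mat." tokenizes to
#[['#SENTENCE_START#', 'the', 'cat', 'sat', 'on', 'the', 'mat', '#SENTENCE_END#']],
#and generate_ngrams() on that text with n=2 then counts bigrams such as 'the cat' and 'cat sat'.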
31 | def get_tokenized_sentences(raw_text): 32 | tokenized_sentences = [] 33 | if raw_text: 34 | # normalize whitespace 35 | raw_text = re.sub('\s+', ' ', raw_text) 36 | raw_text = re.sub('-\s+', '', raw_text) 37 | for sentence in nltk.tokenize.sent_tokenize(raw_text): 38 | tokens = ['#SENTENCE_START#'] 39 | for token in sentence.lower().replace('.','').split(' '): 40 | if token: 41 | if (token.isalpha()): 42 | tokens.append(token) 43 | elif token.isdigit(): 44 | tokens.append('#NUMBER#') 45 | else: 46 | tokens.append('#NON_ALPHANUM#') 47 | tokens.append('#SENTENCE_END#') 48 | tokenized_sentences.append(tokens) 49 | return tokenized_sentences 50 | -------------------------------------------------------------------------------- /jstor.EXAMPLE.cnf: -------------------------------------------------------------------------------- 1 | [database] 2 | username: xxx 3 | password: xxx 4 | database: jstor 5 | 6 | [files] 7 | datadir: /some/data/dir/jstor/ 8 | -------------------------------------------------------------------------------- /process_reviews.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import re 3 | from datetime import datetime 4 | import generate_ngrams 5 | 6 | import MySQLdb as mdb 7 | 8 | import ConfigParser 9 | Config = ConfigParser.ConfigParser() 10 | Config.read('jstor.cnf') 11 | 12 | import pdb 13 | 14 | con = None 15 | try: 16 | con = mdb.connect('localhost', Config.get('database', 'username'), Config.get('database', 'password'), Config.get('database', 'database')); 17 | 18 | cur = con.cursor() 19 | cur.execute("SELECT VERSION()") 20 | 21 | data = cur.fetchone() 22 | print "Database version : %s " % data 23 | 24 | except mdb.Error, e: 25 | print "Error %d: %s" % (e.args[0],e.args[1]) 26 | sys.exit(1) 27 | 28 | cur = con.cursor() 29 | 30 | dataDir = Config.get('files', 'datadir') 31 | jstorDir = dataDir + 'amazon/' 32 | 33 | filename = sys.argv[1] 34 | 35 | def get_data(line): 36 | return line[line.find(": ") + 2:].strip().encode('utf8') 37 | 38 | f = open(jstorDir + filename) 39 | while True: 40 | line = f.readline() 41 | if not line: break 42 | 43 | if len(line.strip()) == 0: 44 | # print (amazon_id, reviewer_id, reviewer_name, helpfulness.split("/")[0], helpfulness.split("/")[1], score, datetime.fromtimestamp(float(review_date)).strftime("%Y-%m-%d %H:%M:%S"), review_title, review_text, len(re.split('\s*', review_text))) 45 | cur.execute("INSERT INTO a_reviews (amazon_id, reviewer_id, reviewer_name, helpfulness, helpfulness_out_of, score, review_date, review_title, review_text, review_word_count) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)", (amazon_id, reviewer_id, reviewer_name, helpfulness.split("/")[0], helpfulness.split("/")[1], score, datetime.fromtimestamp(float(review_date)).strftime("%Y-%m-%d %H:%M:%S"), review_title, review_text, len(re.split('\s*', review_text)))) 46 | print cur._last_executed 47 | else: 48 | amazon_id = get_data(line) 49 | reviewer_id = get_data(f.readline()) 50 | reviewer_name = get_data(f.readline()) 51 | helpfulness = get_data(f.readline()) 52 | score = get_data(f.readline()) 53 | review_date = get_data(f.readline()) 54 | review_title = get_data(f.readline()) 55 | review_text = get_data(f.readline()) 56 | 57 | f.close() 58 | cur.close() -------------------------------------------------------------------------------- /read_reviews.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3.0 3 | 4 | # 5 | 6 | import 
MySQLdb as mdb 7 | import csv 8 | import re 9 | import chardet, django.utils.encoding 10 | from collections import defaultdict 11 | 12 | import ConfigParser 13 | Config = ConfigParser.ConfigParser() 14 | Config.read('jstor.cnf') 15 | 16 | con = None 17 | try: 18 | con = mdb.connect('localhost', Config.get('database', 'username'), Config.get('database', 'password'), Config.get('database', 'database'), charset='utf8', use_unicode=True) 19 | 20 | cur = con.cursor() 21 | cur.execute("SELECT VERSION()") 22 | 23 | data = cur.fetchone() 24 | print "Database version : %s " % data 25 | 26 | except mdb.Error, e: 27 | print "Error %d: %s" % (e.args[0],e.args[1]) 28 | sys.exit(1) 29 | 30 | cur = con.cursor() 31 | 32 | # 33 | 34 | def clean_string(s): 35 | if len(s) == 0: 36 | return s 37 | s = s.decode('string-escape') 38 | return django.utils.encoding.smart_text(s) 39 | try: 40 | dec = s.decode('utf8') 41 | except: 42 | dec = s.decode(chardet.detect(s)['encoding']) 43 | return dec.encode('utf8') 44 | 45 | # 46 | 47 | reviewFile = Config.get('files', 'datadir') + 'reviews.csv' 48 | # reviewFile = Config.get('files', 'datadir') + 'chartest.csv' 49 | f=open(reviewFile, 'rb') 50 | csvReader = csv.reader(f) 51 | csvReader.next() 52 | 53 | i=0 54 | 55 | split_regex = re.compile(r'[^\\]\|') 56 | escaped_split_regex = re.compile(r'\\\|') 57 | 58 | all_disciplines = set() 59 | all_subjects = set() 60 | 61 | year_counts = {} 62 | 63 | for row in csvReader: 64 | i+=1 65 | 66 | doi = row[0] 67 | sn = row[1] # don't know what this is, but its always blank 68 | journal = row[2] 69 | vol = row[3] 70 | num = row[4] 71 | year = int(row[5][0:4]) 72 | pubdate = str(year) + '-' + row[5][4:6] + '-' + row[5][6:8] 73 | title = clean_string(row[6]) 74 | author = clean_string(row[7]) 75 | 76 | # TODO: check if I am not screwing up the encoding here 77 | rwi = clean_string(row[8]) 78 | 79 | rwi = split_regex.split(rwi) 80 | num_reviewed_works = len(rwi) 81 | rwi_tuples = [tuple(escaped_split_regex.split(r)) for r in rwi] 82 | rwi_titles = [] 83 | rwi_authors = [] 84 | for r in rwi_tuples: 85 | rwi_titles.append(r[0]) 86 | if len(r) < 2: 87 | rwi_authors.append('') 88 | else: 89 | rwi_authors.append(r[1]) 90 | 91 | # note: out of order in original file 92 | disciplines = row[10] 93 | subjects = row[12] 94 | keywords = row[9] 95 | 96 | language = row[11] 97 | page_count = row[13] 98 | publisher = row[14] 99 | 100 | review_id = 0 101 | try: 102 | cur.execute("INSERT INTO j_reviews (doi, journal, volume, number, year, publication_date, title, author, num_reviewed_works, reviewed_works, reviewed_authors, language, disciplines, subjects, keywords, page_count, publisher) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);", (doi, journal, vol, num, year, pubdate, title, author, num_reviewed_works, '|'.join(rwi_titles), '|'.join(rwi_authors), language, disciplines, subjects, keywords, page_count, publisher)) 103 | 104 | review_id = cur.lastrowid 105 | 106 | cur.executemany("INSERT INTO j_keywords (j_review_id, keyword) VALUES (%s, %s)", [(review_id, v) for v in keywords.split('|')]) 107 | cur.executemany("INSERT INTO j_disciplines (j_review_id, discipline) VALUES (%s, %s)", [(review_id, v) for v in disciplines.split('|')]) 108 | cur.executemany("INSERT INTO j_subjects (j_review_id, subject) VALUES (%s, %s)", [(review_id, v) for v in subjects.split('|')]) 109 | 110 | except mdb.Error, e: 111 | print "Error on review %d: %s" % (e.args[0],e.args[1]) 112 | print "id %s" % i 113 | 114 | 115 | # close the DB 
conncetion 116 | con.close() 117 | -------------------------------------------------------------------------------- /scrape.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup, Comment 2 | import re, itertools, random 3 | import urllib, urllib2 4 | from datetime import datetime 5 | 6 | import generate_ngrams 7 | import difflib 8 | 9 | from time import sleep 10 | 11 | AMAZON_ADV_SEARCH_BASE_URL = 'http://www.amazon.com/gp/search/ref=sr_adv_b/' 12 | 13 | class Book: 14 | ''' simple class to hold together properties''' 15 | pass 16 | 17 | class Review: 18 | ''' simple class to hold together properties''' 19 | pass 20 | 21 | def get_soup(url): 22 | ''' 23 | Open the URL and make a Soup with the content 24 | ''' 25 | # sleep(random.randrange(1,3)) # prevent too many requests at the same time 26 | try: 27 | content = urllib2.urlopen(url).read() 28 | except: 29 | raise Exception('UrlOpenFail', url) 30 | 31 | soup = BeautifulSoup(content, "html.parser") 32 | return soup 33 | 34 | 35 | def find_book_url(title, checkTitle = False): 36 | ''' 37 | Search for the title using the keywords field (more tolerant than title field) 38 | and return the URL of the first result 39 | ''' 40 | # try: 41 | data = urllib.urlencode({'search-alias': 'stripbooks', 'field-keywords': title}) 42 | search_page = get_soup(AMAZON_ADV_SEARCH_BASE_URL + '?' + data) 43 | result0=search_page.find(id='result_0') 44 | try: 45 | a = result0.find('div', {'class': 'productTitle'}).find('a') 46 | aTitle = a.text.strip() 47 | url = a.attrs['href'] 48 | except KeyboardInterrupt: 49 | raise KeyboardInterrupt 50 | except: 51 | raise Exception('NoResultsFound', title) 52 | 53 | if not checkTitle: 54 | return (url, aTitle) 55 | 56 | titleParts = re.split("[\:\(\!\,]+", title) 57 | aTitleParts = re.split("[\:\(\!\,]+", aTitle) 58 | 59 | if difflib.SequenceMatcher(None, title, aTitle).ratio() > .85: 60 | return (url, aTitle) 61 | elif len(titleParts) > 1 and len(aTitleParts) > 1 and difflib.SequenceMatcher(None, titleParts[0].strip(), aTitleParts[0].strip()).ratio() > .85: 62 | return (url, aTitle) 63 | elif len(titleParts) > 1 and difflib.SequenceMatcher(None, titleParts[0].strip(), aTitle).ratio() > .85: 64 | return (url, aTitle) 65 | elif len(aTitleParts) > 1 and difflib.SequenceMatcher(None, titleParts, aTitleParts[0].strip()).ratio() > .85: 66 | return (url, aTitle) 67 | else: 68 | raise Exception('TitleNotFound', title) 69 | 70 | def get_review_url(main_page): 71 | ''' 72 | Get the URL that has the reviews off the main product page 73 | Tries by the item id, falls back on a structure approach 74 | ''' 75 | # try by id (not always present) 76 | a=main_page.find(id="revSAR") # returns an "a" tag 77 | if a: 78 | review_url = a.attrs['href'] # pull out the href 79 | else: 80 | # back-up to by structure 81 | reviews_summary = main_page.find(id="revSum") 82 | all_a = reviews_summary.find_all(href=re.compile('product-reviews')) 83 | if len(all_a): 84 | review_url = all_a[-1].attrs['href'] 85 | else: 86 | print 'No reviews found' 87 | return False 88 | return review_url 89 | 90 | def get_num_each_rating(review_page): 91 | ''' 92 | how many reviews of each rating? 
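    Returns a list of integer review counts taken from the rating-histogram table,
    one per star level, in the order the rows appear (typically 5-star down to 1-star).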
93 | ''' 94 | try: 95 | product_summary_div = review_page.find(id="productSummary") 96 | s = product_summary_div.find('b').string 97 | num_reviews = int(s.split(' ')[0].replace(',','')) 98 | num_reviews_by_star = [] 99 | 100 | star_table = product_summary_div.find('table') 101 | for tr in star_table('tr'): 102 | s = tr('td')[-1].string.strip() # last td, take out white space 103 | if (len(s) > 2 and s[1:-1].isdigit()): 104 | n = s[1:-1].replace(',','') # take out ( ), strip comma 105 | num_reviews_by_star.append(int(n)) 106 | 107 | return num_reviews_by_star 108 | except: 109 | raise Exception('NoRatingCountsFound') 110 | 111 | def pull_out_reviews(review_page): 112 | ''' 113 | This method is likely to break over time as it relies on very 114 | specific structure for the review 115 | Particularly, it depends on the review being embedded between 116 | "This review is from .." and "Help other customers .. " 117 | ''' 118 | try: 119 | helpfulness_regex = re.compile(r'^\s*(\d+)\s+of\s+(\d+) people found the following review helpful\s*$') 120 | reviewer_href_regex = re.compile(r'/gp/pdp/profile/([^/])+') 121 | 122 | reviews = [] 123 | 124 | # get the part of the page wth the reviews 125 | product_reviews_section = review_page.find(id="productReviews").find('td') 126 | 127 | boundaries = product_reviews_section.find_all(text=lambda text:isinstance(text, Comment)) 128 | # dates = product_reviews_section.find_all('nobr') 129 | 130 | if (boundaries): 131 | for boundary in boundaries: 132 | review = Review() 133 | # get metadata 134 | 135 | date = boundary.find_next('nobr') 136 | try: 137 | # parse the date string 138 | review.date = datetime.strptime(date.text, '%B %d, %Y').date() 139 | except: 140 | raise Exception('CouldNotParseDate') 141 | 142 | reviewer = boundary.find_next('a', href=reviewer_href_regex) 143 | reviewer_href = reviewer.attrs['href'] 144 | # reviewer = (reviewer_id, reviewer_name, reviewer_url) 145 | review.reviewer_id = reviewer_href.split('/')[-1] 146 | review.reviewer_username = reviewer.text.strip('"') 147 | review.reviewer_url = reviewer_href 148 | 149 | texts = boundary.find_all_next(text=True) 150 | start = False 151 | skip = False 152 | review_text = '' 153 | for t in texts: 154 | t = t.strip() 155 | if start and t.startswith('Help other customers'): 156 | break 157 | 158 | helpfulness_match = helpfulness_regex.match(t) 159 | if helpfulness_match: 160 | helpfulness = (int(helpfulness_match.group(1)), int(helpfulness_match.group(2))) 161 | 162 | if t.startswith('This review is from'): 163 | start = True 164 | # advance one more (the title) 165 | skip = True 166 | continue 167 | if not start or skip: 168 | skip = False 169 | continue 170 | 171 | if len(t): 172 | review_text += t 173 | 174 | review.text = review_text.strip() 175 | review.word_count = sum([len(s) for s in generate_ngrams.get_tokenized_sentences(review.text)]) 176 | 177 | # TODO: save token length 178 | reviews.append(review) 179 | helpfulness = False 180 | 181 | return reviews 182 | except: 183 | raise Exception('ReviewsNotFound') 184 | 185 | def process_book(url): 186 | ''' 187 | Pull it all together, 188 | 1) get the soup for the main product page 189 | 2) pull out some info about the book 190 | 3) get the URL page with the reviews 191 | 4) get the first page of the reviews 192 | 5) pull out all the reviews on that page 193 | 6) find the next link 194 | 7) go to the next review page 195 | 8) if more pages, go to 6) 196 | ''' 197 | book = Book() 198 | 199 | try: 200 | # 1) 201 | book.url = url 202 | 
main_page = get_soup(url) 203 | 204 | # 2) 205 | # save the Amazon Book ID 206 | m = re.match('.*/dp/(\d+).*', url) 207 | book.amazon_id = m.group(1) 208 | 209 | # get the description 210 | desc_div = main_page.find(id='postBodyPS') 211 | if desc_div: 212 | book.book_description = ' '.join(desc_div.find_all(text=True)).strip() 213 | else: 214 | book.book_description = '' 215 | 216 | # get the published date 217 | details = [d.text for d in main_page.find('h2', text=re.compile('Product Details')).find_all_next('li')] 218 | for d in details: 219 | # print d 220 | m=re.match(r'.*Publisher:\s*[^\(]+\(([^\)]+)\).*', d) 221 | if m: 222 | book.published_date = datetime.strptime(m.group(1), '%B %d, %Y').date() 223 | 224 | m = re.match(r'\s*Amazon Best Sellers Rank:\s+#([\d\,]+) in Books.*', d, re.MULTILINE) 225 | if m: 226 | book.rank = int(m.group(1).replace(',','')) 227 | 228 | # get the authors 229 | book.authors = [a.text for a in main_page.find_all(href=re.compile(r'.*field-author.*'))] 230 | 231 | # TODO: get subject categorization (where is this?) 232 | 233 | # 3) 234 | review_url = get_review_url(main_page) 235 | if not review_url: 236 | print 'Review URL not found: ' + url 237 | return False 238 | 239 | print review_url 240 | 241 | # 4) 242 | review_page = get_soup(review_url) 243 | 244 | book.num_each_rating = get_num_each_rating(review_page) 245 | print '%s reviews' % sum(book.num_each_rating) 246 | 247 | if not review_page: 248 | book.reviews = None 249 | print 'Review Page not found: ' + review_url 250 | return 251 | 252 | # 5) 253 | reviews = pull_out_reviews(review_page) 254 | 255 | while True: 256 | # 6) 257 | page_links = review_page.find('span', {'class': 'paging'}) 258 | if page_links and page_links.find_all('a')[-1].text.startswith('Next'): 259 | review_url = page_links.find_all('a')[-1].attrs['href'] 260 | if not review_url: 261 | print 'Review URL not found' 262 | return 263 | 264 | review_page = get_soup(review_url) 265 | if not review_page: 266 | print 'Review Page not found: ' + review_url 267 | 268 | # 7) 269 | reviews += pull_out_reviews(review_page) 270 | print len(reviews) 271 | else: 272 | break 273 | 274 | return book, reviews 275 | except: 276 | import traceback 277 | print traceback.format_exc() 278 | raise Exception('SomeOtherProblemFound') 279 | 280 | --------------------------------------------------------------------------------
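A minimal usage sketch (not part of the original repository): none of the included scripts appears to call scrape.process_book() end to end, so the snippet below shows one plausible way to tie scrape.find_book_url() and scrape.process_book() together. It assumes scrape.py is importable and that Amazon's markup still matches what the scraper was written against; the title string and the usage_example.py filename are illustrative only.

# usage_example.py -- illustrative sketch, not part of the original repository.
import scrape

title = 'The Structure of Scientific Revolutions'  # any book title; illustrative input
try:
    # keyword search on Amazon; checkTitle=True asks for a fuzzy title match
    url, amazon_title = scrape.find_book_url(title, checkTitle=True)
    print 'Matched "%s" -> %s' % (amazon_title, url)

    # fetch the product page, follow the review pages, collect Review objects
    result = scrape.process_book(url)
    if result:
        book, reviews = result
        print '%s reviews scraped for Amazon ID %s' % (len(reviews), book.amazon_id)
        for review in reviews[:3]:
            print review.date, review.reviewer_username, review.word_count
except Exception, e:
    # the scraper signals failures by raising Exception('NoResultsFound', ...),
    # Exception('TitleNotFound', ...), Exception('SomeOtherProblemFound'), etc.
    print 'Scrape failed:', e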