├── .gitignore ├── README.md ├── extract_ngram.py ├── find_amazon_id.py ├── generate_ngrams.py ├── jstor.EXAMPLE.cnf ├── process_reviews.py ├── read_reviews.py └── scrape.py /.gitignore: -------------------------------------------------------------------------------- 1 | jstor.cnf 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | amazon-review-scraper 2 | ===================== 3 | 4 | A simple scraper written in python, using BeautifulSoup to take the title and fetch all of the reviews that have been written about that book. 5 | 6 | Attempts have been made to rely on as little structure as possible, but all scrapers are inevitably tied to the markup. Scraper works as of last commit date, but it may not work anymore. 7 | 8 | License 9 | ======= 10 | Copyright Juan Pablo Alperin 11 | 12 | This program is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 | This program is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU General Public License for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program. If not, see . 24 | -------------------------------------------------------------------------------- /extract_ngram.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import shelve 3 | from collections import defaultdict 4 | 5 | import MySQLdb as mdb 6 | 7 | import ConfigParser 8 | Config = ConfigParser.ConfigParser() 9 | Config.read('jstor.cnf') 10 | 11 | con = None 12 | try: 13 | con = mdb.connect('localhost', Config.get('database', 'username'), Config.get('database', 'password'), Config.get('database', 'database')); 14 | 15 | cur = con.cursor() 16 | cur.execute("SELECT VERSION()") 17 | 18 | data = cur.fetchone() 19 | print "Database version : %s " % data 20 | 21 | except mdb.Error, e: 22 | print "Error %d: %s" % (e.args[0],e.args[1]) 23 | sys.exit(1) 24 | 25 | cur = con.cursor() 26 | 27 | try: 28 | discipline = sys.argv[1] + '-discipline' 29 | year = sys.argv[2] 30 | # cur.execute("SELECT DISTINCT r.doi FROM j_reviews r JOIN j_disciplines d ON (r.j_review_id = d.j_review_id) WHERE r.language = 'eng' AND r.num_reviewed_works = 1 AND r.reviewed_works != '' AND r.reviewed_works IS NOT NULL AND d.discipline = %s AND r.year = %s;", (discipline, int(year))) 31 | 32 | # joins with amazon data (only things in both) 33 | cur.execute("SELECT DISTINCT r.doi, tim.amazon_id FROM j_reviews r JOIN j_disciplines d ON (r.j_review_id = d.j_review_id) JOIN title_id_map tim ON (r.reviewed_works_hash = tim.title_hash) JOIN a_reviews ar ON (tim.amazon_id = ar.amazon_id) WHERE r.language = 'eng' AND r.num_reviewed_works = 1 AND r.reviewed_works != '' AND r.reviewed_works IS NOT NULL AND d.discipline = %s AND r.year = %s;", (discipline, int(year))) 34 | 35 | rowcount = cur.rowcount 36 | if rowcount == 0: 37 | print "No records found for %s (%s)" % (discipline, year) 38 | except mdb.Error, e: 39 | print "Error %d: %s" % (e.args[0],e.args[1]) 40 | sys.exit(1) 41 | 42 | which_ngram = sys.argv[3] 43 | 44 | print "Going after %s of %s (%s)" 
% (which_ngram, discipline, year) 45 | 46 | dataDir = Config.get('files', 'datadir') 47 | if dataDir[-1] != '/': dataDir = dataDir + '/' # ensure trailing slash 48 | 49 | ngramDir = dataDir + which_ngram + '/' 50 | outfile = "%sextracts/%s-%s-%s.txt" % (dataDir, discipline, year, which_ngram) 51 | 52 | dois = set([]) 53 | doi_amazon_id_map = {} 54 | for i in range(rowcount): 55 | row = cur.fetchone() 56 | dois.add(row[0]) 57 | if len(row) > 1: # allow for not joining with the amazon id 58 | doi_amazon_id_map[row[0]] = row[1] 59 | 60 | print "Looking for %s DOIs" % len(dois) 61 | 62 | o = open(outfile, 'wb') 63 | i=0 64 | dois_found=set([]) 65 | for part in os.listdir(ngramDir): 66 | f = open(ngramDir + part, 'rb') 67 | for line in f.readlines(): 68 | i+=1 69 | doi = line[1:].split()[0] 70 | if doi in dois: 71 | dois_found.add(doi) 72 | if len(doi_amazon_id_map): # allow for not joining with the amazon id 73 | o.write(doi_amazon_id_map[doi] + "\t" + line.replace('"', '')) 74 | else: 75 | o.write(line.replace('"', '')) 76 | f.close() 77 | o.close() 78 | print 'Found %s DOIs' % len(dois_found) 79 | print 'Checked %s lines' % i 80 | 81 | -------------------------------------------------------------------------------- /find_amazon_id.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import csv 3 | import re 4 | import shelve 5 | from collections import defaultdict 6 | 7 | import MySQLdb as mdb, sqlite3 8 | import ConfigParser 9 | Config = ConfigParser.ConfigParser() 10 | Config.read('jstor.cnf') 11 | 12 | import scrape 13 | 14 | con = None 15 | try: 16 | con = mdb.connect('localhost', Config.get('database', 'username'), Config.get('database', 'password'), Config.get('database', 'database'), use_unicode=True) 17 | 18 | cur = con.cursor() 19 | cur.execute("SELECT VERSION()") 20 | 21 | data = cur.fetchone() 22 | print "Database version : %s " % data 23 | 24 | except mdb.Error, e: 25 | print "Error %d: %s" % (e.args[0],e.args[1]) 26 | sys.exit(1) 27 | 28 | dataDir = Config.get('files', 'datadir') 29 | title_id_map_db = dataDir + 'title_id_map.db' 30 | 31 | new_file = os.path.isfile(title_id_map_db) 32 | 33 | scon = sqlite3.connect(title_id_map_db) 34 | scon.isolation_level = None 35 | scur = scon.cursor() 36 | 37 | if not new_file: 38 | scur.execute("""CREATE TABLE title_id_map ( 39 | title varchar(2048), 40 | amazon_id varchar(20), 41 | amazon_title varchar(2048) 42 | );""") 43 | scur.execute("""CREATE INDEX titles ON title_id_map (title);""") 44 | scur.execute("""CREATE INDEX amazon_ids ON title_id_map (amazon_id);""") 45 | 46 | cur.execute("""SELECT DISTINCT reviewed_works 47 | FROM j_reviews 48 | WHERE language = 'eng' 49 | AND num_reviewed_works = 1 50 | AND reviewed_works != '' 51 | AND reviewed_works IS NOT NULL 52 | AND year >= 2005 53 | ORDER BY year DESC""") 54 | 55 | for i in range(cur.rowcount): 56 | row = cur.fetchone() 57 | if i % 5000 == 0: 58 | print '=================================================================' 59 | print ' '.join(map(str, [i] + list(row))) 60 | print '=================================================================' 61 | 62 | 63 | title = row[0] 64 | # scur.execute("SELECT * FROM title_id_map WHERE title = ?", (title,)) 65 | # data=scur.fetchone() 66 | # if data is not None: 67 | # # we have fetched this title before 68 | # # print "Had this title already: %s" % title 69 | # continue 70 | 71 | try: 72 | (url, amazon_title) = scrape.find_book_url(title) 73 | except KeyboardInterrupt: 74 | raise 
KeyboardInterrupt 75 | except: 76 | scur.execute("INSERT INTO title_id_map (title, amazon_id, amazon_title) VALUES (?, ?, ?)", (title, None, None)) 77 | continue 78 | 79 | id_from_url_regex = re.compile('^.*/dp/([^/]+).*') 80 | m=id_from_url_regex.match(url) 81 | if m: 82 | amazon_id = m.group(1) 83 | else: 84 | amazon_id = None 85 | print "Amazon ID not found in %s:" % url 86 | 87 | try: 88 | scur.execute("INSERT INTO title_id_map (title, amazon_id, amazon_title) VALUES (?, ?, ?)", (title, amazon_id, amazon_title)) 89 | except KeyboardInterrupt: 90 | raise KeyboardInterrupt 91 | except: 92 | print "problem with DB (%s, %s)" % (title, amazon_id) 93 | 94 | con.close() 95 | scon.close() 96 | -------------------------------------------------------------------------------- /generate_ngrams.py: -------------------------------------------------------------------------------- 1 | import re, nltk 2 | 3 | # 4 | # Use the same tokenization as JSTOR 5 | # see https://github.com/ITHAKA-AT/ejc-mapreduce/blob/master/ngrams.py 6 | # 7 | 8 | #Helper function to generate ngrams from raw text using NLTK ngram generator 9 | def generate_ngrams(raw_text, n=1): 10 | tokenized_sentences = get_tokenized_sentences(raw_text) 11 | grams = {} 12 | for tokens in tokenized_sentences: 13 | if n == 1: 14 | for gram in tokens: 15 | if gram.isalpha(): 16 | grams[(gram,)] = grams.get((gram,),0) + 1 17 | else: 18 | for gram in nltk.ngrams(tokens, n): 19 | grams[gram] = grams.get(gram,0) + 1 20 | sorted_grams = [] 21 | for gram, count in grams.items(): 22 | sorted_grams.append([' '.join(gram), count]) 23 | sorted_grams.sort(lambda y, x: cmp(x[1],y[1])) 24 | return sorted_grams 25 | 26 | 27 | #Tokenize our text input using NLTK tokenizer. 28 | #We're not calling this directly, but rather letting generate_ngrams() call it for us. 29 | #note that some special tokens are inserted for sentence start/end, numbers are converted to a single token, punctuation is reduced to fewer tokens 30 | #if you wanted to apply stemming, spell checking, etc, this is probably where you'd do it. NLTK provides a lot of this type of functionality. 
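#For illustration (added comment, not in the original source): the input "The cat sat on the mat." tokenizes to
#[['#SENTENCE_START#', 'the', 'cat', 'sat', 'on', 'the', 'mat', '#SENTENCE_END#']],
#and generate_ngrams() on that text with n=2 then counts bigrams such as 'the cat' and 'cat sat'.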
31 | def get_tokenized_sentences(raw_text): 32 | tokenized_sentences = [] 33 | if raw_text: 34 | # normalize whitespace 35 | raw_text = re.sub('\s+', ' ', raw_text) 36 | raw_text = re.sub('-\s+', '', raw_text) 37 | for sentence in nltk.tokenize.sent_tokenize(raw_text): 38 | tokens = ['#SENTENCE_START#'] 39 | for token in sentence.lower().replace('.','').split(' '): 40 | if token: 41 | if (token.isalpha()): 42 | tokens.append(token) 43 | elif token.isdigit(): 44 | tokens.append('#NUMBER#') 45 | else: 46 | tokens.append('#NON_ALPHANUM#') 47 | tokens.append('#SENTENCE_END#') 48 | tokenized_sentences.append(tokens) 49 | return tokenized_sentences 50 | -------------------------------------------------------------------------------- /jstor.EXAMPLE.cnf: -------------------------------------------------------------------------------- 1 | [database] 2 | username: xxx 3 | password: xxx 4 | database: jstor 5 | 6 | [files] 7 | datadir: /some/data/dir/jstor/ 8 | -------------------------------------------------------------------------------- /process_reviews.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import re 3 | from datetime import datetime 4 | import generate_ngrams 5 | 6 | import MySQLdb as mdb 7 | 8 | import ConfigParser 9 | Config = ConfigParser.ConfigParser() 10 | Config.read('jstor.cnf') 11 | 12 | import pdb 13 | 14 | con = None 15 | try: 16 | con = mdb.connect('localhost', Config.get('database', 'username'), Config.get('database', 'password'), Config.get('database', 'database')); 17 | 18 | cur = con.cursor() 19 | cur.execute("SELECT VERSION()") 20 | 21 | data = cur.fetchone() 22 | print "Database version : %s " % data 23 | 24 | except mdb.Error, e: 25 | print "Error %d: %s" % (e.args[0],e.args[1]) 26 | sys.exit(1) 27 | 28 | cur = con.cursor() 29 | 30 | dataDir = Config.get('files', 'datadir') 31 | jstorDir = dataDir + 'amazon/' 32 | 33 | filename = sys.argv[1] 34 | 35 | def get_data(line): 36 | return line[line.find(": ") + 2:].strip().encode('utf8') 37 | 38 | f = open(jstorDir + filename) 39 | while True: 40 | line = f.readline() 41 | if not line: break 42 | 43 | if len(line.strip()) == 0: 44 | # print (amazon_id, reviewer_id, reviewer_name, helpfulness.split("/")[0], helpfulness.split("/")[1], score, datetime.fromtimestamp(float(review_date)).strftime("%Y-%m-%d %H:%M:%S"), review_title, review_text, len(re.split('\s*', review_text))) 45 | cur.execute("INSERT INTO a_reviews (amazon_id, reviewer_id, reviewer_name, helpfulness, helpfulness_out_of, score, review_date, review_title, review_text, review_word_count) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)", (amazon_id, reviewer_id, reviewer_name, helpfulness.split("/")[0], helpfulness.split("/")[1], score, datetime.fromtimestamp(float(review_date)).strftime("%Y-%m-%d %H:%M:%S"), review_title, review_text, len(re.split('\s*', review_text)))) 46 | print cur._last_executed 47 | else: 48 | amazon_id = get_data(line) 49 | reviewer_id = get_data(f.readline()) 50 | reviewer_name = get_data(f.readline()) 51 | helpfulness = get_data(f.readline()) 52 | score = get_data(f.readline()) 53 | review_date = get_data(f.readline()) 54 | review_title = get_data(f.readline()) 55 | review_text = get_data(f.readline()) 56 | 57 | f.close() 58 | cur.close() -------------------------------------------------------------------------------- /read_reviews.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3.0 3 | 4 | # 5 | 6 | import 
MySQLdb as mdb 7 | import csv 8 | import re 9 | import chardet, django.utils.encoding 10 | from collections import defaultdict 11 | 12 | import ConfigParser 13 | Config = ConfigParser.ConfigParser() 14 | Config.read('jstor.cnf') 15 | 16 | con = None 17 | try: 18 | con = mdb.connect('localhost', Config.get('database', 'username'), Config.get('database', 'password'), Config.get('database', 'database'), charset='utf8', use_unicode=True) 19 | 20 | cur = con.cursor() 21 | cur.execute("SELECT VERSION()") 22 | 23 | data = cur.fetchone() 24 | print "Database version : %s " % data 25 | 26 | except mdb.Error, e: 27 | print "Error %d: %s" % (e.args[0],e.args[1]) 28 | sys.exit(1) 29 | 30 | cur = con.cursor() 31 | 32 | # 33 | 34 | def clean_string(s): 35 | if len(s) == 0: 36 | return s 37 | s = s.decode('string-escape') 38 | return django.utils.encoding.smart_text(s) 39 | try: 40 | dec = s.decode('utf8') 41 | except: 42 | dec = s.decode(chardet.detect(s)['encoding']) 43 | return dec.encode('utf8') 44 | 45 | # 46 | 47 | reviewFile = Config.get('files', 'datadir') + 'reviews.csv' 48 | # reviewFile = Config.get('files', 'datadir') + 'chartest.csv' 49 | f=open(reviewFile, 'rb') 50 | csvReader = csv.reader(f) 51 | csvReader.next() 52 | 53 | i=0 54 | 55 | split_regex = re.compile(r'[^\\]\|') 56 | escaped_split_regex = re.compile(r'\\\|') 57 | 58 | all_disciplines = set() 59 | all_subjects = set() 60 | 61 | year_counts = {} 62 | 63 | for row in csvReader: 64 | i+=1 65 | 66 | doi = row[0] 67 | sn = row[1] # don't know what this is, but its always blank 68 | journal = row[2] 69 | vol = row[3] 70 | num = row[4] 71 | year = int(row[5][0:4]) 72 | pubdate = str(year) + '-' + row[5][4:6] + '-' + row[5][6:8] 73 | title = clean_string(row[6]) 74 | author = clean_string(row[7]) 75 | 76 | # TODO: check if I am not screwing up the encoding here 77 | rwi = clean_string(row[8]) 78 | 79 | rwi = split_regex.split(rwi) 80 | num_reviewed_works = len(rwi) 81 | rwi_tuples = [tuple(escaped_split_regex.split(r)) for r in rwi] 82 | rwi_titles = [] 83 | rwi_authors = [] 84 | for r in rwi_tuples: 85 | rwi_titles.append(r[0]) 86 | if len(r) < 2: 87 | rwi_authors.append('') 88 | else: 89 | rwi_authors.append(r[1]) 90 | 91 | # note: out of order in original file 92 | disciplines = row[10] 93 | subjects = row[12] 94 | keywords = row[9] 95 | 96 | language = row[11] 97 | page_count = row[13] 98 | publisher = row[14] 99 | 100 | review_id = 0 101 | try: 102 | cur.execute("INSERT INTO j_reviews (doi, journal, volume, number, year, publication_date, title, author, num_reviewed_works, reviewed_works, reviewed_authors, language, disciplines, subjects, keywords, page_count, publisher) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);", (doi, journal, vol, num, year, pubdate, title, author, num_reviewed_works, '|'.join(rwi_titles), '|'.join(rwi_authors), language, disciplines, subjects, keywords, page_count, publisher)) 103 | 104 | review_id = cur.lastrowid 105 | 106 | cur.executemany("INSERT INTO j_keywords (j_review_id, keyword) VALUES (%s, %s)", [(review_id, v) for v in keywords.split('|')]) 107 | cur.executemany("INSERT INTO j_disciplines (j_review_id, discipline) VALUES (%s, %s)", [(review_id, v) for v in disciplines.split('|')]) 108 | cur.executemany("INSERT INTO j_subjects (j_review_id, subject) VALUES (%s, %s)", [(review_id, v) for v in subjects.split('|')]) 109 | 110 | except mdb.Error, e: 111 | print "Error on review %d: %s" % (e.args[0],e.args[1]) 112 | print "id %s" % i 113 | 114 | 115 | # close the DB 
conncetion 116 | con.close() 117 | -------------------------------------------------------------------------------- /scrape.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup, Comment 2 | import re, itertools, random 3 | import urllib, urllib2 4 | from datetime import datetime 5 | 6 | import generate_ngrams 7 | import difflib 8 | 9 | from time import sleep 10 | 11 | AMAZON_ADV_SEARCH_BASE_URL = 'http://www.amazon.com/gp/search/ref=sr_adv_b/' 12 | 13 | class Book: 14 | ''' simple class to hold together properties''' 15 | pass 16 | 17 | class Review: 18 | ''' simple class to hold together properties''' 19 | pass 20 | 21 | def get_soup(url): 22 | ''' 23 | Open the URL and make a Soup with the content 24 | ''' 25 | # sleep(random.randrange(1,3)) # prevent too many requests at the same time 26 | try: 27 | content = urllib2.urlopen(url).read() 28 | except: 29 | raise Exception('UrlOpenFail', url) 30 | 31 | soup = BeautifulSoup(content, "html.parser") 32 | return soup 33 | 34 | 35 | def find_book_url(title, checkTitle = False): 36 | ''' 37 | Search for the title using the keywords field (more tolerant than title field) 38 | and return the URL of the first result 39 | ''' 40 | # try: 41 | data = urllib.urlencode({'search-alias': 'stripbooks', 'field-keywords': title}) 42 | search_page = get_soup(AMAZON_ADV_SEARCH_BASE_URL + '?' + data) 43 | result0=search_page.find(id='result_0') 44 | try: 45 | a = result0.find('div', {'class': 'productTitle'}).find('a') 46 | aTitle = a.text.strip() 47 | url = a.attrs['href'] 48 | except KeyboardInterrupt: 49 | raise KeyboardInterrupt 50 | except: 51 | raise Exception('NoResultsFound', title) 52 | 53 | if not checkTitle: 54 | return (url, aTitle) 55 | 56 | titleParts = re.split("[\:\(\!\,]+", title) 57 | aTitleParts = re.split("[\:\(\!\,]+", aTitle) 58 | 59 | if difflib.SequenceMatcher(None, title, aTitle).ratio() > .85: 60 | return (url, aTitle) 61 | elif len(titleParts) > 1 and len(aTitleParts) > 1 and difflib.SequenceMatcher(None, titleParts[0].strip(), aTitleParts[0].strip()).ratio() > .85: 62 | return (url, aTitle) 63 | elif len(titleParts) > 1 and difflib.SequenceMatcher(None, titleParts[0].strip(), aTitle).ratio() > .85: 64 | return (url, aTitle) 65 | elif len(aTitleParts) > 1 and difflib.SequenceMatcher(None, titleParts, aTitleParts[0].strip()).ratio() > .85: 66 | return (url, aTitle) 67 | else: 68 | raise Exception('TitleNotFound', title) 69 | 70 | def get_review_url(main_page): 71 | ''' 72 | Get the URL that has the reviews off the main product page 73 | Tries by the item id, falls back on a structure approach 74 | ''' 75 | # try by id (not always present) 76 | a=main_page.find(id="revSAR") # returns an "a" tag 77 | if a: 78 | review_url = a.attrs['href'] # pull out the href 79 | else: 80 | # back-up to by structure 81 | reviews_summary = main_page.find(id="revSum") 82 | all_a = reviews_summary.find_all(href=re.compile('product-reviews')) 83 | if len(all_a): 84 | review_url = all_a[-1].attrs['href'] 85 | else: 86 | print 'No reviews found' 87 | return False 88 | return review_url 89 | 90 | def get_num_each_rating(review_page): 91 | ''' 92 | how many reviews of each rating? 
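    Returns a list of integer review counts taken from the rating-histogram table,
    one per star level, in the order the rows appear (typically 5-star down to 1-star).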
93 | ''' 94 | try: 95 | product_summary_div = review_page.find(id="productSummary") 96 | s = product_summary_div.find('b').string 97 | num_reviews = int(s.split(' ')[0].replace(',','')) 98 | num_reviews_by_star = [] 99 | 100 | star_table = product_summary_div.find('table') 101 | for tr in star_table('tr'): 102 | s = tr('td')[-1].string.strip() # last td, take out white space 103 | if (len(s) > 2 and s[1:-1].isdigit()): 104 | n = s[1:-1].replace(',','') # take out ( ), strip comma 105 | num_reviews_by_star.append(int(n)) 106 | 107 | return num_reviews_by_star 108 | except: 109 | raise Exception('NoRatingCountsFound') 110 | 111 | def pull_out_reviews(review_page): 112 | ''' 113 | This method is likely to break over time as it relies on very 114 | specific structure for the review 115 | Particularly, it depends on the review being embedded between 116 | "This review is from .." and "Help other customers .. " 117 | ''' 118 | try: 119 | helpfulness_regex = re.compile(r'^\s*(\d+)\s+of\s+(\d+) people found the following review helpful\s*$') 120 | reviewer_href_regex = re.compile(r'/gp/pdp/profile/([^/])+') 121 | 122 | reviews = [] 123 | 124 | # get the part of the page wth the reviews 125 | product_reviews_section = review_page.find(id="productReviews").find('td') 126 | 127 | boundaries = product_reviews_section.find_all(text=lambda text:isinstance(text, Comment)) 128 | # dates = product_reviews_section.find_all('nobr') 129 | 130 | if (boundaries): 131 | for boundary in boundaries: 132 | review = Review() 133 | # get metadata 134 | 135 | date = boundary.find_next('nobr') 136 | try: 137 | # parse the date string 138 | review.date = datetime.strptime(date.text, '%B %d, %Y').date() 139 | except: 140 | raise Exception('CouldNotParseDate') 141 | 142 | reviewer = boundary.find_next('a', href=reviewer_href_regex) 143 | reviewer_href = reviewer.attrs['href'] 144 | # reviewer = (reviewer_id, reviewer_name, reviewer_url) 145 | review.reviewer_id = reviewer_href.split('/')[-1] 146 | review.reviewer_username = reviewer.text.strip('"') 147 | review.reviewer_url = reviewer_href 148 | 149 | texts = boundary.find_all_next(text=True) 150 | start = False 151 | skip = False 152 | review_text = '' 153 | for t in texts: 154 | t = t.strip() 155 | if start and t.startswith('Help other customers'): 156 | break 157 | 158 | helpfulness_match = helpfulness_regex.match(t) 159 | if helpfulness_match: 160 | helpfulness = (int(helpfulness_match.group(1)), int(helpfulness_match.group(2))) 161 | 162 | if t.startswith('This review is from'): 163 | start = True 164 | # advance one more (the title) 165 | skip = True 166 | continue 167 | if not start or skip: 168 | skip = False 169 | continue 170 | 171 | if len(t): 172 | review_text += t 173 | 174 | review.text = review_text.strip() 175 | review.word_count = sum([len(s) for s in generate_ngrams.get_tokenized_sentences(review.text)]) 176 | 177 | # TODO: save token length 178 | reviews.append(review) 179 | helpfulness = False 180 | 181 | return reviews 182 | except: 183 | raise Exception('ReviewsNotFound') 184 | 185 | def process_book(url): 186 | ''' 187 | Pull it all together, 188 | 1) get the soup for the main product page 189 | 2) pull out some info about the book 190 | 3) get the URL page with the reviews 191 | 4) get the first page of the reviews 192 | 5) pull out all the reviews on that page 193 | 6) find the next link 194 | 7) go to the next review page 195 | 8) if more pages, go to 6) 196 | ''' 197 | book = Book() 198 | 199 | try: 200 | # 1) 201 | book.url = url 202 | 
main_page = get_soup(url) 203 | 204 | # 2) 205 | # save the Amazon Book ID 206 | m = re.match('.*/dp/(\d+).*', url) 207 | book.amazon_id = m.group(1) 208 | 209 | # get the description 210 | desc_div = main_page.find(id='postBodyPS') 211 | if desc_div: 212 | book.book_description = ' '.join(desc_div.find_all(text=True)).strip() 213 | else: 214 | book.book_description = '' 215 | 216 | # get the published date 217 | details = [d.text for d in main_page.find('h2', text=re.compile('Product Details')).find_all_next('li')] 218 | for d in details: 219 | # print d 220 | m=re.match(r'.*Publisher:\s*[^\(]+\(([^\)]+)\).*', d) 221 | if m: 222 | book.published_date = datetime.strptime(m.group(1), '%B %d, %Y').date() 223 | 224 | m = re.match(r'\s*Amazon Best Sellers Rank:\s+#([\d\,]+) in Books.*', d, re.MULTILINE) 225 | if m: 226 | book.rank = int(m.group(1).replace(',','')) 227 | 228 | # get the authors 229 | book.authors = [a.text for a in main_page.find_all(href=re.compile(r'.*field-author.*'))] 230 | 231 | # TODO: get subject categorization (where is this?) 232 | 233 | # 3) 234 | review_url = get_review_url(main_page) 235 | if not review_url: 236 | print 'Review URL not found: ' + url 237 | return False 238 | 239 | print review_url 240 | 241 | # 4) 242 | review_page = get_soup(review_url) 243 | 244 | book.num_each_rating = get_num_each_rating(review_page) 245 | print '%s reviews' % sum(book.num_each_rating) 246 | 247 | if not review_page: 248 | book.reviews = None 249 | print 'Review Page not found: ' + review_url 250 | return 251 | 252 | # 5) 253 | reviews = pull_out_reviews(review_page) 254 | 255 | while True: 256 | # 6) 257 | page_links = review_page.find('span', {'class': 'paging'}) 258 | if page_links and page_links.find_all('a')[-1].text.startswith('Next'): 259 | review_url = page_links.find_all('a')[-1].attrs['href'] 260 | if not review_url: 261 | print 'Review URL not found' 262 | return 263 | 264 | review_page = get_soup(review_url) 265 | if not review_page: 266 | print 'Review Page not found: ' + review_url 267 | 268 | # 7) 269 | reviews += pull_out_reviews(review_page) 270 | print len(reviews) 271 | else: 272 | break 273 | 274 | return book, reviews 275 | except: 276 | import traceback 277 | print traceback.format_exc() 278 | raise Exception('SomeOtherProblemFound') 279 | 280 | --------------------------------------------------------------------------------
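A minimal usage sketch (not part of the original repository): none of the included scripts appears to call scrape.process_book() end to end, so the snippet below shows one plausible way to tie scrape.find_book_url() and scrape.process_book() together. It assumes scrape.py is importable and that Amazon's markup still matches what the scraper was written against; the title string and the usage_example.py filename are illustrative only.

# usage_example.py -- illustrative sketch, not part of the original repository.
import scrape

title = 'The Structure of Scientific Revolutions'  # any book title; illustrative input
try:
    # keyword search on Amazon; checkTitle=True asks for a fuzzy title match
    url, amazon_title = scrape.find_book_url(title, checkTitle=True)
    print 'Matched "%s" -> %s' % (amazon_title, url)

    # fetch the product page, follow the review pages, collect Review objects
    result = scrape.process_book(url)
    if result:
        book, reviews = result
        print '%s reviews scraped for Amazon ID %s' % (len(reviews), book.amazon_id)
        for review in reviews[:3]:
            print review.date, review.reviewer_username, review.word_count
except Exception, e:
    # the scraper signals failures by raising Exception('NoResultsFound', ...),
    # Exception('TitleNotFound', ...), Exception('SomeOtherProblemFound'), etc.
    print 'Scrape failed:', e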