├── README.md
├── app.py
└── hostel_review.py

/README.md:
--------------------------------------------------------------------------------
# Hostel Reviews with NLP
Hostel reviews analyzed with some NLP at @HotelReviewsBot

A full blog post describing the process is here
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 7 14:08:15 2015

@author: jay

TWITTER API CLIENT
Summary: Reads the bot's mentions timeline and replies to tweets
that contain a Hostelworld URL and an amenity keyword.
"""
import logging
import pickle

import numpy as np
import requests
import tweepy

from hostel_review import HostelReview
# os.chdir("path_to_HostelReview")  # uncomment (and import os) if running from another directory

logger = logging.getLogger(__name__)

def login():
    """Authenticate with the Twitter API using pickled OAuth keys."""
    with open("twitter_oauth.p", "rb") as f:
        consumer_key, consumer_secret, oauth_token, oauth_token_secret = pickle.load(f)
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(oauth_token, oauth_token_secret)
    return auth

def hostel_main(url, key):
    """Run the main analysis for a hostel.

    Scrapes the review pages at `url` and returns a dictionary of
    analysis results for the given amenity keyword (explained in the blog post).
    """
    ht = HostelReview(url)                 # instantiate the scraper/analyzer
    logger.info('Hostel instantiated')
    first_url = ht.url + "1?period=all"
    xml = ht.request_xml(first_url)        # fetch the first page
    pages = ht.find_end(xml)               # number of pages to scrape
    df = ht.scrape_to_df(ht.url, pages)    # scrape the pages into a dataframe
    df = ht.count_amenities(df, key)       # add a column for the keyword
    return ht.sentiment_analysis(key, df)  # dictionary of analysis results

def get_word(txt):
    """Return the first amenity keyword found in the tweet's words, else None."""
    for word in txt:
        for amen in ['wifi', 'breakfast', 'bathroom', 'shower', 'noise']:
            if amen in word.lower():
                return amen
    return None

def get_tweet_url(txt):
    """Find a t.co link in the tweet, resolve and validate it; else return None."""
    for word in txt:
        if 'https://t.co' in word:
            return check_url_format(requests.get(word).url)
    return None

def check_url_format(url):
    """Normalize a Hostelworld URL, or return the error flag "no"."""
    if "hostelworld.com" in url:
        if "www" not in url:
            url = url.replace("t.hostelworld.com", "www.hostelworld.com")
        return url.split('?')[0] + '/reviews/'
    return "no"  # sentinel checked by catch_errors()

def get_params(tweet):
    """Given a tweet object, return screen name, id, url, and keyword."""
    txt = tweet.text.split()
    return '@' + tweet.user.screen_name, tweet.id, get_tweet_url(txt), get_word(txt)

def compute_status(analysis, screen_name, key):
    """Build a reply that aggregates the analysis results without
    exceeding 140 characters.
    """
    hostel_key_rating = str(np.round(analysis['key_avg'], 0))
    tweet_status = screen_name + " " + key + " rating: " + hostel_key_rating + "/100. Positive: " + \
        str(analysis['positive']) + " Negative: " + str(analysis['negative']) + \
        ' "' + analysis['common_phrase']['phrase'] + '"'
    if len(tweet_status) > 140:
        tweet_status = tweet_status[:140]  # hard-truncate to the tweet limit
    return tweet_status
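# Illustrative example (hypothetical values, not from the original repo):
# given analysis = {'key_avg': 84.6, 'positive': 12, 'negative': 3,
#                   'common_phrase': {'phrase': 'great wifi in the lobby'}},
# compute_status(analysis, '@traveler', 'wifi') returns:
#   '@traveler wifi rating: 85.0/100. Positive: 12 Negative: 3 "great wifi in the lobby"'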
def get_unread_statuses(all_tweets):
    """Filter the mentions timeline down to tweets that have not been
    answered before (ids stored in seen_twitter.p).
    """
    with open("seen_twitter.p", "rb") as f:
        seen_ids = pickle.load(f)
    return [x for x in all_tweets if x.id not in seen_ids]

def store_tweet_ids(all_tweets):
    """Store all tweet ids on the timeline in a pickle file."""
    tweet_ids = [x.id for x in all_tweets]
    with open("seen_twitter.p", "wb") as f:
        pickle.dump(tweet_ids, f)

def catch_errors(url, key, screen_name):
    """Return an error message for a malformed mention, or None if it parsed cleanly."""
    if url is None:
        return screen_name + " Can't find a url buddy"
    elif url == 'no':
        return screen_name + " Not the right url buddy"
    elif key is None:
        return screen_name + " Can't find an amenity to search for buddy, or you are a bad speller"
    return None

def update_hostel_status(api, tweet):
    """Reply to a single mention with the analysis (or with an error message)."""
    screen_name, reply_id, url, key = get_params(tweet)
    status_errors = catch_errors(url, key, screen_name)
    if status_errors is not None:  # the mention was malformed
        tweet_status = status_errors
    else:
        try:
            analysis = hostel_main(url, key)  # run the scraping/analysis pipeline
            tweet_status = compute_status(analysis, screen_name, key)
        except Exception:
            logger.exception('Analysis failed for %s', url)
            tweet_status = screen_name + ' YOU BROKE SOMETHING'  # ad hoc failure message
    api.update_status(status=tweet_status, in_reply_to_status_id=reply_id)

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)  # initialize logging

    auth = login()
    api = tweepy.API(auth)
    all_tweets = api.mentions_timeline()      # grab all mentions from the timeline
    tweets = get_unread_statuses(all_tweets)  # keep only the unanswered mentions
    for tweet in tweets:
        update_hostel_status(api, tweet)      # reply to each mention
    store_tweet_ids(all_tweets)               # remember them so they aren't answered twice
--------------------------------------------------------------------------------
/hostel_review.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 4 00:28:16 2015

@author: jay

HOSTELWORLD real reviews

1. Average the numerical ratings of reviews that mention the amenities.
2. Run sentiment analysis for a specific keyword:
   split each review into sentences and count negative and positive words
   where the keyword is mentioned.
3. Wrap it in a Twitter bot / API.
4. Let users ask the bot "Hey hostelbot, how is the wifi and shower?"
   and parse the question to find "wifi" and "shower".
5. Return positive reviews with the most common words.
"""
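# NLTK's English stopword list (used in parse_reviews below) must be
# downloaded once before running:
#   python -c "import nltk; nltk.download('stopwords')"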
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
import requests
from lxml import html
from textblob import TextBlob

class HostelReview:
    # bags of words used to detect each amenity in a review
    amenities = {
        'wifi': ['wifi', 'internet', 'wi-fi', 'wi fi', 'wireless'],
        'breakfast': ['breakfast', 'breakfest', 'break fast', 'brunch'],
        'bathroom': ['bathroom', 'bath room', 'bath', 'restroom', 'toilet',
                     'urinal', 'lavatory', 'washroom', 'bathrooms'],
        'shower': ['shower', 'bathe', 'showers'],
        'noise': ['noise', 'noisy', 'quiet', 'loud', 'silent']
    }

    def __init__(self, url):
        """Initialize the class with the hostel's review URL."""
        self.url = url

    def request_xml(self, url):
        """Fetch a URL and return the parsed document tree."""
        response = requests.get(url)
        return html.fromstring(response.text)

    def find_end(self, xml):
        """Given the tree of the first page, return the number of pages of
        reviews to scrape (used as an exclusive bound by scrape_to_df).
        """
        end = int(xml.xpath("//div/div[@class='results']/text()")[0].split(' ')[0].split('(')[1])
        if end / 20 > 25:  # cap at 25 pages of reviews
            pages = 26
        else:
            pages = end // 20  # 20 reviews per page
        return pages

    def scrape_to_df(self, base_url, pages):
        """Scrape the hostel's review pages and return a dataframe with one
        row per review. Columns: rating, review, and page number.
        """
        rows = []
        for i in range(1, pages):
            url = base_url + str(i) + "?period=all"  # query string to get all reviews
            xml = self.request_xml(url)
            reviews = xml.xpath('//div[@class="microreviews rounded"]')  # list of review nodes
            for review in reviews:
                rows.append({
                    'rating': int(review.xpath('.//div/text()')[1].replace('%', '')),  # numerical rating
                    'review': ''.join(review.xpath('.//div/p/text()')).strip(),        # review text
                    'page': i                                                          # page number
                })
        return pd.DataFrame(rows)

    def count_amenities(self, hostel, key):
        """Add a column holding the sentence in which the key/amenity appears
        (None when the review never mentions it).
        """
        hostel[key] = hostel.apply(lambda x: self.get_key_sentence(x['review'], self.amenities[key]), axis=1)
        return hostel

    def get_key_sentence(self, x, key_list):
        """Given a review and the bag of words for a key, return the first
        sentence containing one of those words (lowercased), else None.
        """
        delimiters = ',', '.', ';', '!', '?'
        sentences = self.split(delimiters, x)
        for sent in sentences:                # loop through phrases in the review
            for word in sent.split():         # loop through words in the phrase
                if word.lower() in key_list:  # keyword match
                    return sent.lower().strip()
        return None

    def split(self, delimiters, string, maxsplit=0):
        """Split a paragraph into a list of phrases on any of the given delimiters."""
        regex_pattern = '|'.join(map(re.escape, delimiters))
        return re.split(regex_pattern, string, maxsplit=maxsplit)
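    # Illustrative example (hypothetical review text, not from the repo):
    #   self.split((',', '.', ';', '!', '?'), "Great wifi. Rooms were loud!")
    # returns ['Great wifi', ' Rooms were loud', ''], and get_key_sentence
    # would return 'great wifi' for the 'wifi' bag of words.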
    def count_words(self, word_freq, sent, stopwords, list_key):
        """Update a word-frequency dict with the words of a sentence,
        skipping stopwords and the key's own bag of words, to find the
        most common words in the reviews.
        """
        for word in sent.split():
            if word not in stopwords and word not in list_key:
                if word not in word_freq:
                    word_freq[word] = 1
                else:
                    word_freq[word] += 1

    def parse_reviews(self, subset, key):
        """Given a dataframe and a key, return a dict of word frequencies
        across the sentences where the key was found.
        """
        word_freq = {}
        stopwords = nltk.corpus.stopwords.words('english')
        for sent in subset[key]:  # loop through the key sentence of each review
            self.count_words(word_freq, sent, stopwords, self.amenities[key])
        return word_freq

    def sentiment_analysis(self, key, hostel):
        """Return a dictionary of summary sentiment statistics for the key."""
        subset = hostel.dropna().reset_index(drop=True)  # drop reviews not mentioning the key

        # TextBlob polarity in [-1, 1] for each key sentence
        subset['sentiment'] = subset[key].apply(lambda x: TextBlob(x).sentiment.polarity)

        word_freq = self.parse_reviews(subset, key)
        d = Counter(word_freq)
        phrase_words = [x[0] for x in d.most_common(3)]  # top 3 words describing the key
        summary = {
            'phrase_words': d.most_common(3),                  # top 3 common words and their counts
            'hotel_avg': np.mean(hostel['rating']),            # average rating of the hostel
            'key_avg': np.mean(subset['rating']),              # average rating of reviews mentioning the key
            'num': len(hostel),                                # number of reviews at the hostel
            'mean': np.mean(subset['sentiment']),              # average sentiment for the key
            'positive': len(subset[subset['sentiment'] > 0]),  # number of positive reviews
            'negative': len(subset[subset['sentiment'] < 0]),  # number of negative reviews
            'zero': len(subset[subset['sentiment'] == 0]),     # number of neutral reviews
            'max_val': {
                'num': subset.loc[subset['sentiment'].idxmax()]['sentiment'],  # best sentiment score
                'phrase': subset.loc[subset['sentiment'].idxmax()][key]        # best review sentence
            },
            'min_val': {
                'num': subset.loc[subset['sentiment'].idxmin()]['sentiment'],  # worst sentiment score
                'phrase': subset.loc[subset['sentiment'].idxmin()][key]        # worst review sentence
            },
            'common_phrase': {
                'phrase': '',
                'num': -1
            }
        }
        # Find the review sentence containing the most of the top common words
        for phrase in subset[key]:
            num_words = len([x for word in phrase.split() for x in phrase_words if x in word])
            if num_words > summary['common_phrase']['num']:
                summary['common_phrase'] = {
                    'phrase': phrase,
                    'num': num_words
                }

        return summary
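
# Minimal usage sketch (not part of the original bot); the URL is a
# hypothetical Hostelworld reviews URL of the form the scraper expects,
# i.e. ending in "/reviews/" so scrape_to_df can append the page number.
if __name__ == '__main__':
    url = "https://www.hostelworld.com/hosteldetails.php/Example-Hostel/London/12345/reviews/"  # hypothetical
    ht = HostelReview(url)
    first_page = ht.request_xml(ht.url + "1?period=all")
    pages = ht.find_end(first_page)           # number of pages to scrape
    df = ht.scrape_to_df(ht.url, pages)       # one row per review
    df = ht.count_amenities(df, 'wifi')       # add the 'wifi' sentence column
    print(ht.sentiment_analysis('wifi', df))  # summary dictionary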
--------------------------------------------------------------------------------