├── README.md
├── app.py
└── hostel_review.py

/README.md:
--------------------------------------------------------------------------------
# Hostel Reviews with NLP
Hostel reviews analyzed with some NLP at @HotelReviewsBot

A full blog post describing the process is here
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 7 14:08:15 2015

@author: jay

TWITTER API CLIENT
Summary: Reads the bot's mentions timeline and replies to tweets
that contain a Hostelworld URL and an amenity keyword.
"""
import logging
import pickle

import numpy as np
import requests
import tweepy

from hostel_review import HostelReview
# os.chdir("path_to_HostelReview")  # uncomment (and import os) if running from another directory

logger = logging.getLogger(__name__)

def login():
    """Authenticate with the Twitter API using pickled OAuth keys."""
    with open("twitter_oauth.p", "rb") as f:
        consumer_key, consumer_secret, oauth_token, oauth_token_secret = pickle.load(f)
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(oauth_token, oauth_token_secret)
    return auth

def hostel_main(url, key):
    """Run the main analysis for a hostel.

    Scrapes the review pages at `url` and returns a dictionary of
    analysis results for the given amenity keyword (explained in the blog post).
    """
    ht = HostelReview(url)                 # instantiate the scraper/analyzer
    logger.info('Hostel instantiated')
    first_url = ht.url + "1?period=all"
    xml = ht.request_xml(first_url)        # fetch the first page
    pages = ht.find_end(xml)               # number of pages to scrape
    df = ht.scrape_to_df(ht.url, pages)    # scrape the pages into a dataframe
    df = ht.count_amenities(df, key)       # add a column for the keyword
    return ht.sentiment_analysis(key, df)  # dictionary of analysis results

def get_word(txt):
    """Return the first amenity keyword found in the tweet's words, else None."""
    for word in txt:
        for amen in ['wifi', 'breakfast', 'bathroom', 'shower', 'noise']:
            if amen in word.lower():
                return amen
    return None

def get_tweet_url(txt):
    """Find a t.co link in the tweet, resolve and validate it; else return None."""
    for word in txt:
        if 'https://t.co' in word:
            return check_url_format(requests.get(word).url)
    return None

def check_url_format(url):
    """Normalize a Hostelworld URL, or return the error flag "no"."""
    if "hostelworld.com" in url:
        if "www" not in url:
            url = url.replace("t.hostelworld.com", "www.hostelworld.com")
        return url.split('?')[0] + '/reviews/'
    return "no"  # sentinel checked by catch_errors()

def get_params(tweet):
    """Given a tweet object, return screen name, id, url, and keyword."""
    txt = tweet.text.split()
    return '@' + tweet.user.screen_name, tweet.id, get_tweet_url(txt), get_word(txt)

def compute_status(analysis, screen_name, key):
    """Build a reply that aggregates the analysis results without
    exceeding 140 characters.
    """
    hostel_key_rating = str(np.round(analysis['key_avg'], 0))
    tweet_status = screen_name + " " + key + " rating: " + hostel_key_rating + "/100. Positive: " + \
        str(analysis['positive']) + " Negative: " + str(analysis['negative']) + \
        ' "' + analysis['common_phrase']['phrase'] + '"'
    if len(tweet_status) > 140:
        tweet_status = tweet_status[:140]  # hard-truncate to the tweet limit
    return tweet_status
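# Illustrative example (hypothetical values, not from the original repo):
# given analysis = {'key_avg': 84.6, 'positive': 12, 'negative': 3,
#                   'common_phrase': {'phrase': 'great wifi in the lobby'}},
# compute_status(analysis, '@traveler', 'wifi') returns:
#   '@traveler wifi rating: 85.0/100. Positive: 12 Negative: 3 "great wifi in the lobby"'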
def get_unread_statuses(all_tweets):
    """Filter the mentions timeline down to tweets that have not been
    answered before (ids stored in seen_twitter.p).
    """
    with open("seen_twitter.p", "rb") as f:
        seen_ids = pickle.load(f)
    return [x for x in all_tweets if x.id not in seen_ids]

def store_tweet_ids(all_tweets):
    """Store all tweet ids on the timeline in a pickle file."""
    tweet_ids = [x.id for x in all_tweets]
    with open("seen_twitter.p", "wb") as f:
        pickle.dump(tweet_ids, f)

def catch_errors(url, key, screen_name):
    """Return an error message for a malformed mention, or None if it parsed cleanly."""
    if url is None:
        return screen_name + " Can't find a url buddy"
    elif url == 'no':
        return screen_name + " Not the right url buddy"
    elif key is None:
        return screen_name + " Can't find an amenity to search for buddy, or you are a bad speller"
    return None

def update_hostel_status(api, tweet):
    """Reply to a single mention with the analysis (or with an error message)."""
    screen_name, reply_id, url, key = get_params(tweet)
    status_errors = catch_errors(url, key, screen_name)
    if status_errors is not None:  # the mention was malformed
        tweet_status = status_errors
    else:
        try:
            analysis = hostel_main(url, key)  # run the scraping/analysis pipeline
            tweet_status = compute_status(analysis, screen_name, key)
        except Exception:
            logger.exception('Analysis failed for %s', url)
            tweet_status = screen_name + ' YOU BROKE SOMETHING'  # ad hoc failure message
    api.update_status(status=tweet_status, in_reply_to_status_id=reply_id)

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)  # initialize logging

    auth = login()
    api = tweepy.API(auth)
    all_tweets = api.mentions_timeline()      # grab all mentions from the timeline
    tweets = get_unread_statuses(all_tweets)  # keep only the unanswered mentions
    for tweet in tweets:
        update_hostel_status(api, tweet)      # reply to each mention
    store_tweet_ids(all_tweets)               # remember them so they aren't answered twice
--------------------------------------------------------------------------------
/hostel_review.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 4 00:28:16 2015

@author: jay

HOSTELWORLD real reviews

1. Average the numerical ratings of reviews that mention the amenities.
2. Run sentiment analysis for a specific keyword:
   split each review into sentences and count negative and positive words
   where the keyword is mentioned.
3. Wrap it in a Twitter bot / API.
4. Let users ask the bot "Hey hostelbot, how is the wifi and shower?"
   and parse the question to find "wifi" and "shower".
5. Return positive reviews with the most common words.
"""
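# NLTK's English stopword list (used in parse_reviews below) must be
# downloaded once before running:
#   python -c "import nltk; nltk.download('stopwords')"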
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
import requests
from lxml import html
from textblob import TextBlob

class HostelReview:
    # bags of words used to detect each amenity in a review
    amenities = {
        'wifi': ['wifi', 'internet', 'wi-fi', 'wi fi', 'wireless'],
        'breakfast': ['breakfast', 'breakfest', 'break fast', 'brunch'],
        'bathroom': ['bathroom', 'bath room', 'bath', 'restroom', 'toilet',
                     'urinal', 'lavatory', 'washroom', 'bathrooms'],
        'shower': ['shower', 'bathe', 'showers'],
        'noise': ['noise', 'noisy', 'quiet', 'loud', 'silent']
    }

    def __init__(self, url):
        """Initialize the class with the hostel's review URL."""
        self.url = url

    def request_xml(self, url):
        """Fetch a URL and return the parsed document tree."""
        response = requests.get(url)
        return html.fromstring(response.text)

    def find_end(self, xml):
        """Given the tree of the first page, return the number of pages of
        reviews to scrape (used as an exclusive bound by scrape_to_df).
        """
        end = int(xml.xpath("//div/div[@class='results']/text()")[0].split(' ')[0].split('(')[1])
        if end / 20 > 25:  # cap at 25 pages of reviews
            pages = 26
        else:
            pages = end // 20  # 20 reviews per page
        return pages

    def scrape_to_df(self, base_url, pages):
        """Scrape the hostel's review pages and return a dataframe with one
        row per review. Columns: rating, review, and page number.
        """
        rows = []
        for i in range(1, pages):
            url = base_url + str(i) + "?period=all"  # query string to get all reviews
            xml = self.request_xml(url)
            reviews = xml.xpath('//div[@class="microreviews rounded"]')  # list of review nodes
            for review in reviews:
                rows.append({
                    'rating': int(review.xpath('.//div/text()')[1].replace('%', '')),  # numerical rating
                    'review': ''.join(review.xpath('.//div/p/text()')).strip(),        # review text
                    'page': i                                                          # page number
                })
        return pd.DataFrame(rows)

    def count_amenities(self, hostel, key):
        """Add a column holding the sentence in which the key/amenity appears
        (None when the review never mentions it).
        """
        hostel[key] = hostel.apply(lambda x: self.get_key_sentence(x['review'], self.amenities[key]), axis=1)
        return hostel

    def get_key_sentence(self, x, key_list):
        """Given a review and the bag of words for a key, return the first
        sentence containing one of those words (lowercased), else None.
        """
        delimiters = ',', '.', ';', '!', '?'
        sentences = self.split(delimiters, x)
        for sent in sentences:                # loop through phrases in the review
            for word in sent.split():         # loop through words in the phrase
                if word.lower() in key_list:  # keyword match
                    return sent.lower().strip()
        return None

    def split(self, delimiters, string, maxsplit=0):
        """Split a paragraph into a list of phrases on any of the given delimiters."""
        regex_pattern = '|'.join(map(re.escape, delimiters))
        return re.split(regex_pattern, string, maxsplit=maxsplit)
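    # Illustrative example (hypothetical review text, not from the repo):
    #   self.split((',', '.', ';', '!', '?'), "Great wifi. Rooms were loud!")
    # returns ['Great wifi', ' Rooms were loud', ''], and get_key_sentence
    # would return 'great wifi' for the 'wifi' bag of words.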
    def count_words(self, word_freq, sent, stopwords, list_key):
        """Update a word-frequency dict with the words of a sentence,
        skipping stopwords and the key's own bag of words, to find the
        most common words in the reviews.
        """
        for word in sent.split():
            if word not in stopwords and word not in list_key:
                if word not in word_freq:
                    word_freq[word] = 1
                else:
                    word_freq[word] += 1

    def parse_reviews(self, subset, key):
        """Given a dataframe and a key, return a dict of word frequencies
        across the sentences where the key was found.
        """
        word_freq = {}
        stopwords = nltk.corpus.stopwords.words('english')
        for sent in subset[key]:  # loop through the key sentence of each review
            self.count_words(word_freq, sent, stopwords, self.amenities[key])
        return word_freq

    def sentiment_analysis(self, key, hostel):
        """Return a dictionary of summary sentiment statistics for the key."""
        subset = hostel.dropna().reset_index(drop=True)  # drop reviews not mentioning the key

        # TextBlob polarity in [-1, 1] for each key sentence
        subset['sentiment'] = subset[key].apply(lambda x: TextBlob(x).sentiment.polarity)

        word_freq = self.parse_reviews(subset, key)
        d = Counter(word_freq)
        phrase_words = [x[0] for x in d.most_common(3)]  # top 3 words describing the key
        summary = {
            'phrase_words': d.most_common(3),                  # top 3 common words and their counts
            'hotel_avg': np.mean(hostel['rating']),            # average rating of the hostel
            'key_avg': np.mean(subset['rating']),              # average rating of reviews mentioning the key
            'num': len(hostel),                                # number of reviews at the hostel
            'mean': np.mean(subset['sentiment']),              # average sentiment for the key
            'positive': len(subset[subset['sentiment'] > 0]),  # number of positive reviews
            'negative': len(subset[subset['sentiment'] < 0]),  # number of negative reviews
            'zero': len(subset[subset['sentiment'] == 0]),     # number of neutral reviews
            'max_val': {
                'num': subset.loc[subset['sentiment'].idxmax()]['sentiment'],  # best sentiment score
                'phrase': subset.loc[subset['sentiment'].idxmax()][key]        # best review sentence
            },
            'min_val': {
                'num': subset.loc[subset['sentiment'].idxmin()]['sentiment'],  # worst sentiment score
                'phrase': subset.loc[subset['sentiment'].idxmin()][key]        # worst review sentence
            },
            'common_phrase': {
                'phrase': '',
                'num': -1
            }
        }
        # Find the review sentence containing the most of the top common words
        for phrase in subset[key]:
            num_words = len([x for word in phrase.split() for x in phrase_words if x in word])
            if num_words > summary['common_phrase']['num']:
                summary['common_phrase'] = {
                    'phrase': phrase,
                    'num': num_words
                }

        return summary
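
# Minimal usage sketch (not part of the original bot); the URL is a
# hypothetical Hostelworld reviews URL of the form the scraper expects,
# i.e. ending in "/reviews/" so scrape_to_df can append the page number.
if __name__ == '__main__':
    url = "https://www.hostelworld.com/hosteldetails.php/Example-Hostel/London/12345/reviews/"  # hypothetical
    ht = HostelReview(url)
    first_page = ht.request_xml(ht.url + "1?period=all")
    pages = ht.find_end(first_page)           # number of pages to scrape
    df = ht.scrape_to_df(ht.url, pages)       # one row per review
    df = ht.count_amenities(df, 'wifi')       # add the 'wifi' sentence column
    print(ht.sentiment_analysis('wifi', df))  # summary dictionary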
--------------------------------------------------------------------------------