├── README.md ├── real-time-twitter-trend-discovery.py ├── requirements.txt ├── sample_credentials.json ├── scrape_tweets.py ├── topic_20news.py ├── topic_tweets.py └── utils.py /README.md: -------------------------------------------------------------------------------- 1 | # Twitter analysis 2 | 3 | 4 | ## virtualenv 5 | 6 | First make sure `pip` and `virtualenv` are installed. Then create a virtual 7 | environment in the root dir by running: 8 | 9 | `virtualenv env` 10 | 11 | Then activate the virtual env with 12 | 13 | `source env/bin/activate` 14 | 15 | (to get out of the virtualenv, run `deactivate`) 16 | 17 | 18 | ## Dependencies 19 | 20 | Install all the dependencies with 21 | 22 | `pip install -r requirements.txt` 23 | 24 | Also make sure to download NLTK's corpora by running these lines in a Python 25 | interpreter: 26 | 27 | ```python 28 | import nltk 29 | nltk.download() 30 | ``` 31 | 32 | ## Credentials 33 | 34 | Rename `sample_credentials.json` to `credentials.json`, and fill in the four 35 | credentials from your Twitter app. 36 | 37 | 38 | ## Real-time Twitter trend discovery 39 | 40 | Run `bokeh serve --show real-time-twitter-trend-discovery.py --args <time_window> <top_n> 41 | <*save_history>`, where `<time_window>` and `<top_n>` are arguments 42 | representing within what time window we treat tweets as a batch, and how many 43 | words with the highest IDF scores to show, while `<*save_history>` is an optional 44 | boolean value indicating whether we want to dump the history. Make sure API 45 | credentials are properly stored in the credentials.json file. 46 | 47 | 48 | ## Topic modeling and t-SNE visualization: 20 Newsgroups 49 | 50 | To train a topic model and visualize the news in 2-D space, run 51 | `python topic_20news.py --n_topics <n_topics> --n_iter <n_iter> 52 | --top_n <top_n> --threshold <threshold>`, where `<n_topics>` is the number 53 | of topics we select (default 20), `<n_iter>` is the number of iterations 54 | for training an LDA model (default 500), `<top_n>` is the number of top 55 | keywords we display (default 5), and `<threshold>` is the threshold 56 | probability for topic assignment (default 0.0). 57 | 58 | 59 | ## Scrape tweets and save them to disk 60 | 61 | To scrape tweets and save them to disk for later use, run 62 | `python scrape_tweets.py`. If the script is interrupted, just re-run the same 63 | command so that new tweets continue to be collected. The script gets ~1,000 English tweets per min, 64 | or 1.5 million/day. 65 | 66 | Make sure API credentials are properly stored in the credentials.json file. 67 | 68 | 69 | ## Topic modeling and t-SNE visualization: tweets 70 | 71 | First make sure you have accumulated some tweets, then run `python topic_tweets.py 72 | --raw_tweet_dir <raw_tweet_dir> --num_train_tweet <num_train_tweet> 73 | --n_topics <n_topics> --n_iter <n_iter> --top_n <top_n> --threshold <threshold> 74 | --num_example <num_example>`, where `<raw_tweet_dir>` is a folder containing 75 | raw tweet files, `<num_train_tweet>` is the number of tweets we use for 76 | training an LDA model, `<n_topics>` is the number of topics we select 77 | (default 20), `<n_iter>` is the number of iterations for training an LDA 78 | model (default 500), `<top_n>` is the number of top keywords we display 79 | (default 5), `<threshold>` is the threshold probability for topic assignment 80 | (default 0.0), and `<num_example>` is the number of tweets to show on the plot 81 | (default 5000). -------------------------------------------------------------------------------- /real-time-twitter-trend-discovery.py: -------------------------------------------------------------------------------- 1 | """ 2 | Discover real-time Twitter trends.
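Usage sketch (argument order mirrors the sys.argv parsing below; the numbers are just examples): bokeh serve --show real-time-twitter-trend-discovery.py --args 2 10 True where 2 is the batch time window in minutes, 10 is how many top-IDF words to show per batch, and the optional third argument (any non-empty value) enables dumping the per-batch history to a JSON file.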
3 | """ 4 | 5 | import sys 6 | import json 7 | import time 8 | import datetime 9 | from twitter import Api 10 | from functools import partial 11 | from threading import Thread 12 | from bokeh.models import ColumnDataSource 13 | from bokeh.plotting import curdoc, figure 14 | from tornado import gen 15 | 16 | from utils import preprocess, get_tfidf 17 | 18 | 19 | # cli param 20 | tw = float(sys.argv[1]) # time window in min 21 | top_n = int(sys.argv[2]) 22 | try: 23 | store_history = sys.argv[3] # any non-empty third argument enables history dumping 24 | except IndexError: 25 | store_history = False # default: don't store history 26 | 27 | 28 | # twitter dev api 29 | with open('credentials.json') as j: 30 | cred = json.load(j) 31 | 32 | api = Api(cred['CONSUMER_KEY'], cred['CONSUMER_SECRET'], 33 | cred['ACCESS_TOKEN'], cred['ACCESS_TOKEN_SECRET']) 34 | 35 | # start time for the 1st batch 36 | batch_start_time = time.time() 37 | 38 | # bokeh setup 39 | source = ColumnDataSource(data=dict(x=[], y=[], text=[])) 40 | doc = curdoc() 41 | 42 | # bokeh update 43 | @gen.coroutine 44 | def update(x, y, text): 45 | source.stream(dict(x=[x], y=[y], text=[text]), 10) # last param controls right shift 46 | 47 | # get live tweets 48 | def get_tweets(): 49 | global batch_start_time 50 | processed_tweet = [] 51 | try: 52 | for line in api.GetStreamSample(): 53 | if 'text' in line and line['lang'] == u'en': 54 | text = line['text'].encode('utf-8').replace('\n', ' ') 55 | p_t = preprocess(text) # process tweets 56 | if p_t: 57 | processed_tweet += p_t, 58 | if time.time() - batch_start_time >= tw * 60: # time is over for this batch 59 | return processed_tweet 60 | return processed_tweet # server-side interruption 61 | except Exception: # stream error: return what was collected so far instead of None 62 | return processed_tweet 63 | 64 | # main logic for batch update 65 | def blocking_task(): 66 | global batch_start_time 67 | temp_batch_tweet = [] 68 | history = {} 69 | start_t = None 70 | 71 | while True: 72 | try: 73 | tweets = get_tweets() 74 | if temp_batch_tweet: # some leftover due to interruption 75 | temp_batch_tweet.extend(tweets) 76 | else: # no interruption in this batch 77 | temp_batch_tweet = tweets 78 | 79 | utc_t = datetime.datetime.utcfromtimestamp(batch_start_time) 80 | time_x = int('{}{}{}{}{}'.format( 81 | utc_t.year, utc_t.month, utc_t.day, utc_t.hour, utc_t.minute)) 82 | 83 | if start_t is None: # history file start time 84 | start_t = time_x 85 | 86 | if temp_batch_tweet: 87 | # get top features and idf scores 88 | top_feature_name, top_feature_idf = get_tfidf( 89 | temp_batch_tweet, top_n=top_n, max_features=int(5000./60*tw)) 90 | 91 | # maybe store history 92 | if store_history: 93 | history[time_x] = list(top_feature_name), list(top_feature_idf) 94 | 95 | # reset start time and container to hold the next batch 96 | batch_start_time = time.time() 97 | temp_batch_tweet = [] 98 | 99 | # feature names and scores (words with tied scores share a line) 100 | batch_dict = {} 101 | for feat, score in zip(top_feature_name, top_feature_idf): 102 | if score not in batch_dict: 103 | batch_dict[score] = feat 104 | else: 105 | batch_dict[score] += ', {}'.format(feat) 106 | for score, feat in batch_dict.iteritems(): # update 107 | doc.add_next_tick_callback( 108 | partial(update, x=time_x, y=score, text=feat)) 109 | 110 | except KeyboardInterrupt: # manual stop 111 | print 'KeyboardInterrupt; aborted' 112 | sys.exit(1) 113 | 114 | # maybe dump history 115 | if store_history and history: 116 | with open('{}_{}_{}.json'.format(start_t, len(history), tw), 'w') as f: 117 | json.dump(history, f) 118 | 119 | 120 | # bokeh figure 121 | p =
figure(plot_height=650, plot_width=1300, title='Twitter Trending Words', 122 | x_axis_label='UTC Time', y_axis_label='IDF Score') 123 | 124 | # title font size 125 | p.title.text_font_size='20pt' 126 | 127 | # no grids 128 | p.xgrid.grid_line_color = None 129 | p.ygrid.grid_line_color = None 130 | 131 | # no scientific representation 132 | p.xaxis[0].formatter.use_scientific = False 133 | 134 | # set x-tick min 1 135 | p.xaxis[0].ticker.desired_num_ticks=1 136 | 137 | l = p.text(x='x', y='y', text='text', text_font_size="10pt", 138 | text_baseline="middle", text_align='center', source=source) 139 | 140 | doc.add_root(p) 141 | thread = Thread(target=blocking_task) 142 | thread.start() 143 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | python-twitter 2 | scipy 3 | numpy 4 | sklearn 5 | nltk 6 | bokeh 7 | lda -------------------------------------------------------------------------------- /sample_credentials.json: -------------------------------------------------------------------------------- 1 | { 2 | "CONSUMER_KEY": "", 3 | "CONSUMER_SECRET": "", 4 | "ACCESS_TOKEN": "", 5 | "ACCESS_TOKEN_SECRET": "" 6 | } -------------------------------------------------------------------------------- /scrape_tweets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Get live tweets and save to disk 3 | """ 4 | 5 | import os 6 | import json 7 | import datetime 8 | from twitter import Api 9 | 10 | 11 | RAW_TWEET_DIR = 'raw_tweet' 12 | 13 | # maybe create raw_tweet dir 14 | if not os.path.exists(RAW_TWEET_DIR): 15 | os.makedirs(RAW_TWEET_DIR) 16 | 17 | # retrieve credentials 18 | with open('credentials.json') as j: 19 | cred = json.load(j) 20 | 21 | api = Api(cred['CONSUMER_KEY'], cred['CONSUMER_SECRET'], 22 | cred['ACCESS_TOKEN'], cred['ACCESS_TOKEN_SECRET']) 23 | 24 | 25 | def datetime_filename(prefix='output_'): 26 | """ 27 | creates filename with current datetime string suffix 28 | """ 29 | outputname = prefix + '{:%Y%m%d%H%M%S}utc.txt'.format( 30 | datetime.datetime.utcnow()) 31 | return outputname 32 | 33 | 34 | def scrape(tweets_per_file=100000): 35 | """ 36 | scrape live tweets. 
GetStreamSample() gets ~1,000 English 37 | tweets per min, or 1.5 million/day 38 | 39 | for easier reference, we save 100k tweets per file 40 | """ 41 | f = open(datetime_filename(prefix=RAW_TWEET_DIR+'/en_tweet_'), 'w') 42 | tweet_count = 0 43 | try: 44 | for line in api.GetStreamSample(): 45 | if 'text' in line and line['lang'] == u'en': 46 | text = line['text'].encode('utf-8').replace('\n', ' ') 47 | f.write('{}\n'.format(text)) 48 | tweet_count += 1 49 | if tweet_count % tweets_per_file == 0: # start new batch 50 | f.close() 51 | f = open(datetime_filename(prefix=RAW_TWEET_DIR+'/en_tweet_'), 'w') 52 | continue 53 | except KeyboardInterrupt: 54 | print 'Twitter stream collection aborted' 55 | finally: 56 | f.close() 57 | return tweet_count 58 | 59 | 60 | if __name__ == '__main__': 61 | tweet_count = scrape() 62 | print 'A total of {} tweets collected'.format(tweet_count) 63 | 64 | -------------------------------------------------------------------------------- /topic_20news.py: -------------------------------------------------------------------------------- 1 | """ 2 | Train an lDA model on 20 newsgroups (training + test sets) 3 | """ 4 | 5 | import os 6 | import argparse 7 | import time 8 | import lda 9 | import numpy as np 10 | from sklearn.feature_extraction.text import CountVectorizer 11 | from sklearn.datasets import fetch_20newsgroups 12 | from sklearn.manifold import TSNE 13 | import bokeh.plotting as bp 14 | from bokeh.plotting import save 15 | from bokeh.models import HoverTool 16 | 17 | 18 | if __name__ == '__main__': 19 | 20 | ############################################################################## 21 | # setup 22 | 23 | news_base_dir = '20newsgroups' 24 | if not os.path.exists(news_base_dir): 25 | os.makedirs(news_base_dir) 26 | 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('--n_topics', required=True, type=int, default=20, 29 | help='number of topics') 30 | parser.add_argument('--n_iter', required=True, type=int, default=500, 31 | help='number of iteration for LDA model training') 32 | parser.add_argument('--top_n', required=True, type=int, default=5, 33 | help='number of keywords to show for each topic') 34 | parser.add_argument('--threshold', required=True, type=float, default=0.0, 35 | help='threshold probability for topic assignment') 36 | args = parser.parse_args() 37 | 38 | # unpack 39 | n_topics = args.n_topics 40 | n_iter = args.n_iter 41 | n_top_words = args.top_n 42 | threshold = args.threshold 43 | 44 | t0 = time.time() 45 | 46 | ############################################################################## 47 | # train an LDA model 48 | 49 | remove = ('headers', 'footers', 'quotes') 50 | newsgroups_train = fetch_20newsgroups(subset='train', remove=remove) 51 | newsgroups_test = fetch_20newsgroups(subset='test', remove=remove) 52 | news = [' '.join(filter(unicode.isalpha, raw.lower().split())) for raw in 53 | newsgroups_train.data + newsgroups_test.data] 54 | 55 | cvectorizer = CountVectorizer(min_df=5, stop_words='english') 56 | cvz = cvectorizer.fit_transform(news) 57 | 58 | lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter) 59 | X_topics = lda_model.fit_transform(cvz) 60 | 61 | t1 = time.time() 62 | 63 | print '\n>>> LDA training done; took {} mins\n'.format((t1-t0)/60.) 
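# X_topics returned by lda.LDA.fit_transform is the document-topic matrix: one row per news post, one column per topic, each row a probability distribution over topics. The two np.save calls below persist it together with the topic-word matrix so later runs can np.load the arrays instead of retraining.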
64 | 65 | np.save('20newsgroups/lda_doc_topic_{}news_{}topics.npy'.format( 66 | X_topics.shape[0], X_topics.shape[1]), X_topics) 67 | 68 | np.save('20newsgroups/lda_topic_word_{}news_{}topics.npy'.format( 69 | X_topics.shape[0], X_topics.shape[1]), lda_model.topic_word_) 70 | 71 | print '\n>>> doc_topic & topic word written to disk\n' 72 | 73 | ############################################################################## 74 | # threshold and plot 75 | 76 | _idx = np.amax(X_topics, axis=1) > threshold # idx of news that > threshold 77 | _topics = X_topics[_idx] 78 | 79 | num_example = len(_topics) 80 | 81 | # t-SNE: 50 -> 2D 82 | tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, 83 | init='pca') 84 | tsne_lda = tsne_model.fit_transform(_topics[:num_example]) 85 | 86 | # find the most probable topic for each news 87 | _lda_keys = [] 88 | for i in xrange(_topics.shape[0]): 89 | _lda_keys += _topics[i].argmax(), 90 | 91 | # show topics and their top words 92 | topic_summaries = [] 93 | topic_word = lda_model.topic_word_ # get the topic words 94 | vocab = cvectorizer.get_feature_names() 95 | for i, topic_dist in enumerate(topic_word): 96 | topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1] 97 | topic_summaries.append(' '.join(topic_words)) 98 | 99 | # 20 colors 100 | colormap = np.array([ 101 | "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c", 102 | "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5", 103 | "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f", 104 | "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5" 105 | ]) 106 | 107 | # plot 108 | title = "[20 newsgroups] t-SNE visualization of LDA model trained on {} news, " \ 109 | "{} topics, thresholding at {} topic probability, {} iter ({} data " \ 110 | "points and top {} words)".format( 111 | X_topics.shape[0], n_topics, threshold, n_iter, num_example, n_top_words) 112 | 113 | plot_lda = bp.figure(plot_width=1400, plot_height=1100, 114 | title=title, 115 | tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave", 116 | x_axis_type=None, y_axis_type=None, min_border=1) 117 | 118 | plot_lda.scatter(x=tsne_lda[:, 0], y=tsne_lda[:, 1], 119 | color=colormap[_lda_keys][:num_example], 120 | source=bp.ColumnDataSource({ 121 | "content": news[:num_example], 122 | "topic_key": _lda_keys[:num_example] 123 | })) 124 | 125 | # randomly choose a news (in a topic) coordinate as the crucial words coordinate 126 | topic_coord = np.empty((X_topics.shape[1], 2)) * np.nan 127 | for topic_num in _lda_keys: 128 | if not np.isnan(topic_coord).any(): 129 | break 130 | topic_coord[topic_num] = tsne_lda[_lda_keys.index(topic_num)] 131 | 132 | # plot crucial words 133 | for i in xrange(X_topics.shape[1]): 134 | plot_lda.text(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]]) 135 | 136 | # hover tools 137 | hover = plot_lda.select(dict(type=HoverTool)) 138 | hover.tooltips = {"content": "@content - topic: @topic_key"} 139 | 140 | save(plot_lda, '20_news_tsne_lda_viz_{}_{}_{}_{}_{}_{}.html'.format( 141 | X_topics.shape[0], n_topics, threshold, n_iter, num_example, n_top_words)) 142 | 143 | t2 = time.time() 144 | print '\n>>> whole process done; took {} mins\n'.format((t2 - t0) / 60.) -------------------------------------------------------------------------------- /topic_tweets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Train LDA model using https://pypi.python.org/pypi/lda, 3 | and visualize in 2-D space with t-SNE. 
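Example invocation (a sketch; the flags mirror the argparse arguments defined below, and the values are placeholders): python topic_tweets.py --raw_tweet_dir raw_tweet --num_train_tweet 100000 --n_topics 20 --n_iter 500 --top_n 5 --threshold 0.0 --num_example 5000, where raw_tweet is the directory that scrape_tweets.py writes to.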
4 | 5 | """ 6 | 7 | import os 8 | import time 9 | import lda 10 | import random 11 | import argparse 12 | import numpy as np 13 | from sklearn.feature_extraction.text import CountVectorizer 14 | from sklearn.manifold import TSNE 15 | import bokeh.plotting as bp 16 | from bokeh.plotting import save 17 | from bokeh.models import HoverTool 18 | from utils import preprocess 19 | 20 | 21 | if __name__ == '__main__': 22 | 23 | lda_base = 'lda_simple' 24 | if not os.path.exists(lda_base): 25 | os.makedirs(lda_base) 26 | 27 | ############################################################################## 28 | # cli inputs 29 | 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument('--raw_tweet_dir', required=True, type=str, 32 | help='a directory of raw tweet files') 33 | parser.add_argument('--num_train_tweet', required=True, type=int, 34 | help='number of tweets used for training a LDA model') 35 | parser.add_argument('--n_topics', required=True, type=int, default=20, 36 | help='number of topics') 37 | parser.add_argument('--n_iter', required=True, type=int, default=500, 38 | help='number of iteration for LDA model training') 39 | parser.add_argument('--top_n', required=True, type=int, default=5, 40 | help='number of keywords to show for each topic') 41 | parser.add_argument('--threshold', required=True, type=float, default=0.0, 42 | help='threshold probability for topic assignment') 43 | parser.add_argument('--num_example', required=True, type=int, default=5000, 44 | help='number of tweets to show on the plot') 45 | args = parser.parse_args() 46 | 47 | # unpack 48 | raw_tweet_dir = args.raw_tweet_dir 49 | num_train_tweet = args.num_train_tweet 50 | n_topics = args.n_topics 51 | n_iter = args.n_iter 52 | n_top_words = args.top_n 53 | threshold = args.threshold 54 | num_example = args.num_example 55 | 56 | 57 | ############################################################################## 58 | # get training tweets 59 | 60 | num_scanned_tweet = 0 61 | num_qualified_tweet = 0 62 | 63 | raw_tweet_files = os.listdir(raw_tweet_dir) 64 | 65 | raw_tweet = [] 66 | processed_tweet = [] 67 | processed_tweet_set = set() # for quicker'item in?' check 68 | 69 | t0 = time.time() 70 | 71 | for f in raw_tweet_files: 72 | in_file = os.path.join(raw_tweet_dir, f) 73 | if not in_file.endswith('.txt'): # ignore non .txt file 74 | continue 75 | for t in open(in_file): 76 | num_scanned_tweet += 1 77 | p_t = preprocess(t) 78 | if p_t and p_t not in processed_tweet_set: # ignore duplicate tweets 79 | raw_tweet += t, 80 | processed_tweet += p_t, 81 | processed_tweet_set.add(p_t) 82 | num_qualified_tweet += 1 83 | 84 | if num_scanned_tweet % 1000000 == 0: # progress update 85 | print 'scanned {} tweets'.format(num_scanned_tweet) 86 | 87 | if num_qualified_tweet == num_train_tweet: # enough data for training 88 | break 89 | 90 | if num_qualified_tweet == num_train_tweet: # break outer loop 91 | break 92 | 93 | del processed_tweet_set # free memory 94 | 95 | t1 = time.time() 96 | print '\n>>> scanned {} tweets to find {} trainable; took {} mins\n'.format( 97 | num_scanned_tweet, num_train_tweet, (t1-t0)/60.) 
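# At this point raw_tweet holds up to num_train_tweet original tweet lines and processed_tweet holds their cleaned, deduplicated counterparts (space-joined tokens from utils.preprocess); the two lists stay index-aligned so each training document can be traced back to its raw text for the hover tooltips below.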
98 | 99 | ############################################################################## 100 | # train LDA 101 | 102 | # ignore terms that have a document frequency strictly lower than 5 103 | cvectorizer = CountVectorizer(min_df=5) 104 | cvz = cvectorizer.fit_transform(processed_tweet) 105 | 106 | lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter) 107 | X_topics = lda_model.fit_transform(cvz) 108 | 109 | t2 = time.time() 110 | print '\n>>> LDA training done; took {} mins\n'.format((t2-t1)/60.) 111 | 112 | np.save('lda_simple/lda_doc_topic_{}tweets_{}topics.npy'.format( 113 | X_topics.shape[0], X_topics.shape[1]), X_topics) 114 | np.save('lda_simple/lda_topic_word_{}tweets_{}topics.npy'.format( 115 | X_topics.shape[0], X_topics.shape[1]), lda_model.topic_word_) 116 | print '\n>>> doc_topic & topic word written to disk\n' 117 | 118 | ############################################################################## 119 | # threshold and plot 120 | 121 | _idx = np.amax(X_topics, axis=1) > threshold # idx of tweets that > threshold 122 | _topics = X_topics[_idx] 123 | _raw_tweet = np.array(raw_tweet)[_idx] 124 | _processed_tweet = np.array(processed_tweet)[_idx] 125 | 126 | # t-SNE: 50 -> 2D 127 | tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, 128 | init='pca') 129 | tsne_lda = tsne_model.fit_transform(_topics[:num_example]) 130 | 131 | t3 = time.time() 132 | print '\n>>> t-SNE transformation done; took {} mins\n'.format((t3-t2)/60.) 133 | 134 | # find the most probable topic for each tweet 135 | _lda_keys = [] 136 | for i, tweet in enumerate(_raw_tweet): 137 | _lda_keys += _topics[i].argmax(), 138 | 139 | # generate random hex color 140 | colormap = [] 141 | for i in xrange(X_topics.shape[1]): 142 | r = lambda: random.randint(0, 255) 143 | colormap += ('#%02X%02X%02X' % (r(), r(), r())), 144 | colormap = np.array(colormap) 145 | 146 | # show topics and their top words 147 | topic_summaries = [] 148 | topic_word = lda_model.topic_word_ # get the topic words 149 | vocab = cvectorizer.get_feature_names() 150 | for i, topic_dist in enumerate(topic_word): 151 | topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1] 152 | topic_summaries.append(' '.join(topic_words)) 153 | 154 | # use the coordinate of a random tweet as string topic string coordinate 155 | topic_coord = np.empty((X_topics.shape[1], 2)) * np.nan 156 | for topic_num in _lda_keys: 157 | if not np.isnan(topic_coord).any(): 158 | break 159 | topic_coord[topic_num] = tsne_lda[_lda_keys.index(topic_num)] 160 | 161 | # plot 162 | 163 | title = "t-SNE visualization of LDA model trained on {} tweets, {} topics, " \ 164 | "thresholding at {} topic probability, {} iter ({} data points and " \ 165 | "top {} words)".format(num_qualified_tweet, n_topics, threshold, 166 | n_iter, num_example, n_top_words) 167 | 168 | plot_lda = bp.figure(plot_width=900, plot_height=700, 169 | title=title, 170 | tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave", 171 | x_axis_type=None, y_axis_type=None, min_border=1) 172 | 173 | plot_lda.scatter(x=tsne_lda[:, 0], y=tsne_lda[:, 1], 174 | color=colormap[_lda_keys][:num_example], 175 | source=bp.ColumnDataSource({ 176 | "tweet": _raw_tweet[:num_example], 177 | "topic_key": _lda_keys[:num_example] 178 | })) 179 | 180 | # plot crucial words 181 | for i in xrange(X_topics.shape[1]): 182 | plot_lda.text(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]]) 183 | hover = plot_lda.select(dict(type=HoverTool)) 184 | hover.tooltips = {"tweet": "@tweet - topic: 
@topic_key"} 185 | 186 | save(plot_lda, 'tsne_lda_viz_{}_{}_{}_{}_{}_{}.html'.format( 187 | num_qualified_tweet, n_topics, threshold, n_iter, num_example, n_top_words)) 188 | 189 | 190 | t4 = time.time() 191 | print '\n>>> whole process done; took {} mins\n'.format((t4-t0)/60.) -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Some utilities. 3 | """ 4 | 5 | from nltk.corpus import stopwords 6 | import numpy as np 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | 9 | 10 | def preprocess(tweet, ascii=True, ignore_rt_char=True, ignore_url=True, 11 | ignore_mention=True, ignore_hashtag=True, 12 | letter_only=True, remove_stopwords=True, min_tweet_len=3): 13 | """ clean a raw tweet; return space-joined tokens, or '' if the tweet should be dropped """ 14 | sword = stopwords.words('english') 15 | 16 | if ascii: # maybe remove lines with ANY non-ascii character 17 | for c in tweet: 18 | if not (0 < ord(c) < 127): 19 | return '' 20 | 21 | tokens = tweet.lower().split() # to lower, split 22 | res = [] 23 | 24 | for token in tokens: 25 | if remove_stopwords and token in sword: 26 | continue 27 | if ignore_rt_char and token == 'rt': 28 | continue 29 | if ignore_url and token.startswith('https:'): 30 | continue 31 | if ignore_mention and token.startswith('@'): 32 | continue 33 | if ignore_hashtag and token.startswith('#'): 34 | continue 35 | if letter_only: 36 | if not token.isalpha(): 37 | continue 38 | elif token.isdigit(): 39 | token = '' 40 | 41 | res += token, 42 | 43 | if min_tweet_len and len(res) < min_tweet_len: 44 | return '' 45 | else: 46 | return ' '.join(res) 47 | 48 | 49 | def get_tfidf(tweet_list, top_n, max_features=5000): 50 | """ return the top n feature names and idf scores of a tweets list """ 51 | tfidf_vectorizer = TfidfVectorizer(max_features=max_features) 52 | tfidf_vectorizer.fit_transform(tweet_list) 53 | indices = np.argsort(tfidf_vectorizer.idf_)[::-1] 54 | features = tfidf_vectorizer.get_feature_names() 55 | top_feature_name = [features[i] for i in indices[:top_n]] 56 | top_feature_idf = tfidf_vectorizer.idf_[indices][:top_n] 57 | 58 | return top_feature_name, top_feature_idf 59 | 60 | 61 | 62 | --------------------------------------------------------------------------------
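For reference, here is a minimal usage sketch of the two helpers in `utils.py` outside the Bokeh app. The sample tweets and the `top_n`/`max_features` values are made up for illustration, and the syntax is kept valid under both Python 2 and 3 (the repository's scripts otherwise use Python 2 print statements).

```python
# Minimal usage sketch for utils.preprocess and utils.get_tfidf.
# The sample strings and parameter values are illustrative only.
from utils import preprocess, get_tfidf

raw = [
    "RT @user Bokeh makes interactive plots easy https://t.co/abc",
    "training an LDA topic model on a million tweets tonight",
    "topic models and idf scores give a quick first look at trends",
]

# preprocess() returns '' for tweets that fail the filters, so drop those
cleaned = [p for p in (preprocess(t) for t in raw) if p]

if cleaned:
    names, idf_scores = get_tfidf(cleaned, top_n=5, max_features=1000)
    for name, score in zip(names, idf_scores):
        print('{}: {:.3f}'.format(name, score))
```

Running it prints each high-IDF term with its score, which is the same word/score stream that real-time-twitter-trend-discovery.py feeds into the Bokeh plot batch by batch.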