├── README.md ├── real-time-twitter-trend-discovery.py ├── requirements.txt ├── sample_credentials.json ├── scrape_tweets.py ├── topic_20news.py ├── topic_tweets.py └── utils.py /README.md: -------------------------------------------------------------------------------- 1 | # Twitter analysis 2 | 3 | 4 | ## virtualenv 5 | 6 | First make sure `pip` and `virtualenv` are installed. Then create a virtual 7 | environment in the root dir by running: 8 | 9 | `virtualenv env` 10 | 11 | Then activate the virtual env with 12 | 13 | `source env/bin/activate` 14 | 15 | (to get out of the virtualenv, run `deactivate`) 16 | 17 | 18 | ## Dependencies 19 | 20 | Install all the dependencies with 21 | 22 | `pip install -r requirements.txt` 23 | 24 | Also make sure to download NLTK's corpora by running these lines in a Python 25 | interpreter: 26 | 27 | ```python 28 | import nltk 29 | nltk.download() 30 | ``` 31 | 32 | ## Credentials 33 | 34 | Rename `sample_credentials.json` to `credentials.json`, and fill in the four 35 | credentials from your Twitter app. 36 | 37 | 38 | ## Real-time Twitter trend discovery 39 | 40 | Run `bokeh serve --show real-time-twitter-trend-discovery.py --args <time_window> <top_n> 41 | <*save_history>`, where `<time_window>` and `<top_n>` are arguments 42 | representing within what time window we treat tweets as a batch, and how many 43 | words with the highest IDF scores to show, while `<*save_history>` is an optional 44 | boolean value indicating whether we want to dump the history. Make sure API 45 | credentials are properly stored in the credentials.json file. 46 | 47 | 48 | ## Topic modeling and t-SNE visualization: 20 Newsgroups 49 | 50 | To train a topic model and visualize the news in 2-D space, run 51 | `python topic_20news.py --n_topics <n_topics> --n_iter <n_iter> 52 | --top_n <top_n> --threshold <threshold>`, where `<n_topics>` is the number 53 | of topics we select (default 20), `<n_iter>` is the number of iterations 54 | for training an LDA model (default 500), `<top_n>` is the number of top 55 | keywords we display (default 5), and `<threshold>` is the threshold 56 | probability for topic assignment (default 0.0). 57 | 58 | 59 | ## Scrape tweets and save them to disk 60 | 61 | To scrape tweets and save them to disk for later use, run 62 | `python scrape_tweets.py`. If the script is interrupted, just re-run the same 63 | command so that new tweets continue to be collected. The script gets ~1,000 English tweets per min, 64 | or 1.5 million/day. 65 | 66 | Make sure API credentials are properly stored in the credentials.json file. 67 | 68 | 69 | ## Topic modeling and t-SNE visualization: tweets 70 | 71 | First make sure you have accumulated some tweets, then run `python topic_tweets.py 72 | --raw_tweet_dir <raw_tweet_dir> --num_train_tweet <num_train_tweet> 73 | --n_topics <n_topics> --n_iter <n_iter> --top_n <top_n> --threshold <threshold> 74 | --num_example <num_example>`, where `<raw_tweet_dir>` is a folder containing 75 | raw tweet files, `<num_train_tweet>` is the number of tweets we use for 76 | training an LDA model, `<n_topics>` is the number of topics we select 77 | (default 20), `<n_iter>` is the number of iterations for training an LDA 78 | model (default 500), `<top_n>` is the number of top keywords we display 79 | (default 5), `<threshold>` is the threshold probability for topic assignment 80 | (default 0.0), and `<num_example>` is the number of tweets to show on the plot 81 | (default 5000). -------------------------------------------------------------------------------- /real-time-twitter-trend-discovery.py: -------------------------------------------------------------------------------- 1 | """ 2 | Discover real-time Twitter trends.
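Usage sketch (argument order mirrors the sys.argv parsing below; the numbers are just examples): bokeh serve --show real-time-twitter-trend-discovery.py --args 2 10 True where 2 is the batch time window in minutes, 10 is how many top-IDF words to show per batch, and the optional third argument (any non-empty value) enables dumping the per-batch history to a JSON file.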
3 | """ 4 | 5 | import sys 6 | import json 7 | import time 8 | import datetime 9 | from twitter import Api 10 | from functools import partial 11 | from threading import Thread 12 | from bokeh.models import ColumnDataSource 13 | from bokeh.plotting import curdoc, figure 14 | from tornado import gen 15 | 16 | from utils import preprocess, get_tfidf 17 | 18 | 19 | # cli param 20 | tw = float(sys.argv[1]) # time window in min 21 | top_n = int(sys.argv[2]) 22 | try: 23 | store_history = sys.argv[3] # any non-empty third argument enables history dumping 24 | except IndexError: 25 | store_history = False # default: don't store history 26 | 27 | 28 | # twitter dev api 29 | with open('credentials.json') as j: 30 | cred = json.load(j) 31 | 32 | api = Api(cred['CONSUMER_KEY'], cred['CONSUMER_SECRET'], 33 | cred['ACCESS_TOKEN'], cred['ACCESS_TOKEN_SECRET']) 34 | 35 | # start time for the 1st batch 36 | batch_start_time = time.time() 37 | 38 | # bokeh setup 39 | source = ColumnDataSource(data=dict(x=[], y=[], text=[])) 40 | doc = curdoc() 41 | 42 | # bokeh update 43 | @gen.coroutine 44 | def update(x, y, text): 45 | source.stream(dict(x=[x], y=[y], text=[text]), 10) # last param controls right shift 46 | 47 | # get live tweets 48 | def get_tweets(): 49 | global batch_start_time 50 | processed_tweet = [] 51 | try: 52 | for line in api.GetStreamSample(): 53 | if 'text' in line and line['lang'] == u'en': 54 | text = line['text'].encode('utf-8').replace('\n', ' ') 55 | p_t = preprocess(text) # process tweets 56 | if p_t: 57 | processed_tweet += p_t, 58 | if time.time() - batch_start_time >= tw * 60: # time is over for this batch 59 | return processed_tweet 60 | return processed_tweet # server-side interruption 61 | except Exception: # stream error: return what was collected so far instead of None 62 | return processed_tweet 63 | 64 | # main logic for batch update 65 | def blocking_task(): 66 | global batch_start_time 67 | temp_batch_tweet = [] 68 | history = {} 69 | start_t = None 70 | 71 | while True: 72 | try: 73 | tweets = get_tweets() 74 | if temp_batch_tweet: # some leftover due to interruption 75 | temp_batch_tweet.extend(tweets) 76 | else: # no interruption in this batch 77 | temp_batch_tweet = tweets 78 | 79 | utc_t = datetime.datetime.utcfromtimestamp(batch_start_time) 80 | time_x = int('{}{}{}{}{}'.format( 81 | utc_t.year, utc_t.month, utc_t.day, utc_t.hour, utc_t.minute)) 82 | 83 | if start_t is None: # history file start time 84 | start_t = time_x 85 | 86 | if temp_batch_tweet: 87 | # get top features and idf scores 88 | top_feature_name, top_feature_idf = get_tfidf( 89 | temp_batch_tweet, top_n=top_n, max_features=int(5000./60*tw)) 90 | 91 | # maybe store history 92 | if store_history: 93 | history[time_x] = list(top_feature_name), list(top_feature_idf) 94 | 95 | # reset start time and container to hold the next batch 96 | batch_start_time = time.time() 97 | temp_batch_tweet = [] 98 | 99 | # feature names and scores (words with tied scores share a line) 100 | batch_dict = {} 101 | for feat, score in zip(top_feature_name, top_feature_idf): 102 | if score not in batch_dict: 103 | batch_dict[score] = feat 104 | else: 105 | batch_dict[score] += ', {}'.format(feat) 106 | for score, feat in batch_dict.iteritems(): # update 107 | doc.add_next_tick_callback( 108 | partial(update, x=time_x, y=score, text=feat)) 109 | 110 | except KeyboardInterrupt: # manual stop 111 | print 'KeyboardInterrupt; aborted' 112 | sys.exit(1) 113 | 114 | # maybe dump history 115 | if store_history and history: 116 | with open('{}_{}_{}.json'.format(start_t, len(history), tw), 'w') as f: 117 | json.dump(history, f) 118 | 119 | 120 | # bokeh figure 121 | p =
figure(plot_height=650, plot_width=1300, title='Twitter Trending Words', 122 | x_axis_label='UTC Time', y_axis_label='IDF Score') 123 | 124 | # title font size 125 | p.title.text_font_size='20pt' 126 | 127 | # no grids 128 | p.xgrid.grid_line_color = None 129 | p.ygrid.grid_line_color = None 130 | 131 | # no scientific representation 132 | p.xaxis[0].formatter.use_scientific = False 133 | 134 | # set x-tick min 1 135 | p.xaxis[0].ticker.desired_num_ticks=1 136 | 137 | l = p.text(x='x', y='y', text='text', text_font_size="10pt", 138 | text_baseline="middle", text_align='center', source=source) 139 | 140 | doc.add_root(p) 141 | thread = Thread(target=blocking_task) 142 | thread.start() 143 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | python-twitter 2 | scipy 3 | numpy 4 | sklearn 5 | nltk 6 | bokeh 7 | lda -------------------------------------------------------------------------------- /sample_credentials.json: -------------------------------------------------------------------------------- 1 | { 2 | "CONSUMER_KEY": "", 3 | "CONSUMER_SECRET": "", 4 | "ACCESS_TOKEN": "", 5 | "ACCESS_TOKEN_SECRET": "" 6 | } -------------------------------------------------------------------------------- /scrape_tweets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Get live tweets and save to disk 3 | """ 4 | 5 | import os 6 | import json 7 | import datetime 8 | from twitter import Api 9 | 10 | 11 | RAW_TWEET_DIR = 'raw_tweet' 12 | 13 | # maybe create raw_tweet dir 14 | if not os.path.exists(RAW_TWEET_DIR): 15 | os.makedirs(RAW_TWEET_DIR) 16 | 17 | # retrieve credentials 18 | with open('credentials.json') as j: 19 | cred = json.load(j) 20 | 21 | api = Api(cred['CONSUMER_KEY'], cred['CONSUMER_SECRET'], 22 | cred['ACCESS_TOKEN'], cred['ACCESS_TOKEN_SECRET']) 23 | 24 | 25 | def datetime_filename(prefix='output_'): 26 | """ 27 | creates filename with current datetime string suffix 28 | """ 29 | outputname = prefix + '{:%Y%m%d%H%M%S}utc.txt'.format( 30 | datetime.datetime.utcnow()) 31 | return outputname 32 | 33 | 34 | def scrape(tweets_per_file=100000): 35 | """ 36 | scrape live tweets. 
GetStreamSample() gets ~1,000 English 37 | tweets per min, or 1.5 million/day 38 | 39 | for easier reference, we save 100k tweets per file 40 | """ 41 | f = open(datetime_filename(prefix=RAW_TWEET_DIR+'/en_tweet_'), 'w') 42 | tweet_count = 0 43 | try: 44 | for line in api.GetStreamSample(): 45 | if 'text' in line and line['lang'] == u'en': 46 | text = line['text'].encode('utf-8').replace('\n', ' ') 47 | f.write('{}\n'.format(text)) 48 | tweet_count += 1 49 | if tweet_count % tweets_per_file == 0: # start new batch 50 | f.close() 51 | f = open(datetime_filename(prefix=RAW_TWEET_DIR+'/en_tweet_'), 'w') 52 | continue 53 | except KeyboardInterrupt: 54 | print 'Twitter stream collection aborted' 55 | finally: 56 | f.close() 57 | return tweet_count 58 | 59 | 60 | if __name__ == '__main__': 61 | tweet_count = scrape() 62 | print 'A total of {} tweets collected'.format(tweet_count) 63 | 64 | -------------------------------------------------------------------------------- /topic_20news.py: -------------------------------------------------------------------------------- 1 | """ 2 | Train an lDA model on 20 newsgroups (training + test sets) 3 | """ 4 | 5 | import os 6 | import argparse 7 | import time 8 | import lda 9 | import numpy as np 10 | from sklearn.feature_extraction.text import CountVectorizer 11 | from sklearn.datasets import fetch_20newsgroups 12 | from sklearn.manifold import TSNE 13 | import bokeh.plotting as bp 14 | from bokeh.plotting import save 15 | from bokeh.models import HoverTool 16 | 17 | 18 | if __name__ == '__main__': 19 | 20 | ############################################################################## 21 | # setup 22 | 23 | news_base_dir = '20newsgroups' 24 | if not os.path.exists(news_base_dir): 25 | os.makedirs(news_base_dir) 26 | 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('--n_topics', required=True, type=int, default=20, 29 | help='number of topics') 30 | parser.add_argument('--n_iter', required=True, type=int, default=500, 31 | help='number of iteration for LDA model training') 32 | parser.add_argument('--top_n', required=True, type=int, default=5, 33 | help='number of keywords to show for each topic') 34 | parser.add_argument('--threshold', required=True, type=float, default=0.0, 35 | help='threshold probability for topic assignment') 36 | args = parser.parse_args() 37 | 38 | # unpack 39 | n_topics = args.n_topics 40 | n_iter = args.n_iter 41 | n_top_words = args.top_n 42 | threshold = args.threshold 43 | 44 | t0 = time.time() 45 | 46 | ############################################################################## 47 | # train an LDA model 48 | 49 | remove = ('headers', 'footers', 'quotes') 50 | newsgroups_train = fetch_20newsgroups(subset='train', remove=remove) 51 | newsgroups_test = fetch_20newsgroups(subset='test', remove=remove) 52 | news = [' '.join(filter(unicode.isalpha, raw.lower().split())) for raw in 53 | newsgroups_train.data + newsgroups_test.data] 54 | 55 | cvectorizer = CountVectorizer(min_df=5, stop_words='english') 56 | cvz = cvectorizer.fit_transform(news) 57 | 58 | lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter) 59 | X_topics = lda_model.fit_transform(cvz) 60 | 61 | t1 = time.time() 62 | 63 | print '\n>>> LDA training done; took {} mins\n'.format((t1-t0)/60.) 
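# X_topics returned by lda.LDA.fit_transform is the document-topic matrix: one row per news post, one column per topic, each row a probability distribution over topics. The two np.save calls below persist it together with the topic-word matrix so later runs can np.load the arrays instead of retraining.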
64 | 65 | np.save('20newsgroups/lda_doc_topic_{}news_{}topics.npy'.format( 66 | X_topics.shape[0], X_topics.shape[1]), X_topics) 67 | 68 | np.save('20newsgroups/lda_topic_word_{}news_{}topics.npy'.format( 69 | X_topics.shape[0], X_topics.shape[1]), lda_model.topic_word_) 70 | 71 | print '\n>>> doc_topic & topic word written to disk\n' 72 | 73 | ############################################################################## 74 | # threshold and plot 75 | 76 | _idx = np.amax(X_topics, axis=1) > threshold # idx of news that > threshold 77 | _topics = X_topics[_idx] 78 | 79 | num_example = len(_topics) 80 | 81 | # t-SNE: 50 -> 2D 82 | tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, 83 | init='pca') 84 | tsne_lda = tsne_model.fit_transform(_topics[:num_example]) 85 | 86 | # find the most probable topic for each news 87 | _lda_keys = [] 88 | for i in xrange(_topics.shape[0]): 89 | _lda_keys += _topics[i].argmax(), 90 | 91 | # show topics and their top words 92 | topic_summaries = [] 93 | topic_word = lda_model.topic_word_ # get the topic words 94 | vocab = cvectorizer.get_feature_names() 95 | for i, topic_dist in enumerate(topic_word): 96 | topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1] 97 | topic_summaries.append(' '.join(topic_words)) 98 | 99 | # 20 colors 100 | colormap = np.array([ 101 | "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c", 102 | "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5", 103 | "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f", 104 | "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5" 105 | ]) 106 | 107 | # plot 108 | title = "[20 newsgroups] t-SNE visualization of LDA model trained on {} news, " \ 109 | "{} topics, thresholding at {} topic probability, {} iter ({} data " \ 110 | "points and top {} words)".format( 111 | X_topics.shape[0], n_topics, threshold, n_iter, num_example, n_top_words) 112 | 113 | plot_lda = bp.figure(plot_width=1400, plot_height=1100, 114 | title=title, 115 | tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave", 116 | x_axis_type=None, y_axis_type=None, min_border=1) 117 | 118 | plot_lda.scatter(x=tsne_lda[:, 0], y=tsne_lda[:, 1], 119 | color=colormap[_lda_keys][:num_example], 120 | source=bp.ColumnDataSource({ 121 | "content": news[:num_example], 122 | "topic_key": _lda_keys[:num_example] 123 | })) 124 | 125 | # randomly choose a news (in a topic) coordinate as the crucial words coordinate 126 | topic_coord = np.empty((X_topics.shape[1], 2)) * np.nan 127 | for topic_num in _lda_keys: 128 | if not np.isnan(topic_coord).any(): 129 | break 130 | topic_coord[topic_num] = tsne_lda[_lda_keys.index(topic_num)] 131 | 132 | # plot crucial words 133 | for i in xrange(X_topics.shape[1]): 134 | plot_lda.text(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]]) 135 | 136 | # hover tools 137 | hover = plot_lda.select(dict(type=HoverTool)) 138 | hover.tooltips = {"content": "@content - topic: @topic_key"} 139 | 140 | save(plot_lda, '20_news_tsne_lda_viz_{}_{}_{}_{}_{}_{}.html'.format( 141 | X_topics.shape[0], n_topics, threshold, n_iter, num_example, n_top_words)) 142 | 143 | t2 = time.time() 144 | print '\n>>> whole process done; took {} mins\n'.format((t2 - t0) / 60.) -------------------------------------------------------------------------------- /topic_tweets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Train LDA model using https://pypi.python.org/pypi/lda, 3 | and visualize in 2-D space with t-SNE. 
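Example invocation (a sketch; the flags mirror the argparse arguments defined below, and the values are placeholders): python topic_tweets.py --raw_tweet_dir raw_tweet --num_train_tweet 100000 --n_topics 20 --n_iter 500 --top_n 5 --threshold 0.0 --num_example 5000, where raw_tweet is the directory that scrape_tweets.py writes to.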
4 | 5 | """ 6 | 7 | import os 8 | import time 9 | import lda 10 | import random 11 | import argparse 12 | import numpy as np 13 | from sklearn.feature_extraction.text import CountVectorizer 14 | from sklearn.manifold import TSNE 15 | import bokeh.plotting as bp 16 | from bokeh.plotting import save 17 | from bokeh.models import HoverTool 18 | from utils import preprocess 19 | 20 | 21 | if __name__ == '__main__': 22 | 23 | lda_base = 'lda_simple' 24 | if not os.path.exists(lda_base): 25 | os.makedirs(lda_base) 26 | 27 | ############################################################################## 28 | # cli inputs 29 | 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument('--raw_tweet_dir', required=True, type=str, 32 | help='a directory of raw tweet files') 33 | parser.add_argument('--num_train_tweet', required=True, type=int, 34 | help='number of tweets used for training a LDA model') 35 | parser.add_argument('--n_topics', required=True, type=int, default=20, 36 | help='number of topics') 37 | parser.add_argument('--n_iter', required=True, type=int, default=500, 38 | help='number of iteration for LDA model training') 39 | parser.add_argument('--top_n', required=True, type=int, default=5, 40 | help='number of keywords to show for each topic') 41 | parser.add_argument('--threshold', required=True, type=float, default=0.0, 42 | help='threshold probability for topic assignment') 43 | parser.add_argument('--num_example', required=True, type=int, default=5000, 44 | help='number of tweets to show on the plot') 45 | args = parser.parse_args() 46 | 47 | # unpack 48 | raw_tweet_dir = args.raw_tweet_dir 49 | num_train_tweet = args.num_train_tweet 50 | n_topics = args.n_topics 51 | n_iter = args.n_iter 52 | n_top_words = args.top_n 53 | threshold = args.threshold 54 | num_example = args.num_example 55 | 56 | 57 | ############################################################################## 58 | # get training tweets 59 | 60 | num_scanned_tweet = 0 61 | num_qualified_tweet = 0 62 | 63 | raw_tweet_files = os.listdir(raw_tweet_dir) 64 | 65 | raw_tweet = [] 66 | processed_tweet = [] 67 | processed_tweet_set = set() # for quicker'item in?' check 68 | 69 | t0 = time.time() 70 | 71 | for f in raw_tweet_files: 72 | in_file = os.path.join(raw_tweet_dir, f) 73 | if not in_file.endswith('.txt'): # ignore non .txt file 74 | continue 75 | for t in open(in_file): 76 | num_scanned_tweet += 1 77 | p_t = preprocess(t) 78 | if p_t and p_t not in processed_tweet_set: # ignore duplicate tweets 79 | raw_tweet += t, 80 | processed_tweet += p_t, 81 | processed_tweet_set.add(p_t) 82 | num_qualified_tweet += 1 83 | 84 | if num_scanned_tweet % 1000000 == 0: # progress update 85 | print 'scanned {} tweets'.format(num_scanned_tweet) 86 | 87 | if num_qualified_tweet == num_train_tweet: # enough data for training 88 | break 89 | 90 | if num_qualified_tweet == num_train_tweet: # break outer loop 91 | break 92 | 93 | del processed_tweet_set # free memory 94 | 95 | t1 = time.time() 96 | print '\n>>> scanned {} tweets to find {} trainable; took {} mins\n'.format( 97 | num_scanned_tweet, num_train_tweet, (t1-t0)/60.) 
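# At this point raw_tweet holds up to num_train_tweet original tweet lines and processed_tweet holds their cleaned, deduplicated counterparts (space-joined tokens from utils.preprocess); the two lists stay index-aligned so each training document can be traced back to its raw text for the hover tooltips below.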
98 | 99 | ############################################################################## 100 | # train LDA 101 | 102 | # ignore terms that have a document frequency strictly lower than 5 103 | cvectorizer = CountVectorizer(min_df=5) 104 | cvz = cvectorizer.fit_transform(processed_tweet) 105 | 106 | lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter) 107 | X_topics = lda_model.fit_transform(cvz) 108 | 109 | t2 = time.time() 110 | print '\n>>> LDA training done; took {} mins\n'.format((t2-t1)/60.) 111 | 112 | np.save('lda_simple/lda_doc_topic_{}tweets_{}topics.npy'.format( 113 | X_topics.shape[0], X_topics.shape[1]), X_topics) 114 | np.save('lda_simple/lda_topic_word_{}tweets_{}topics.npy'.format( 115 | X_topics.shape[0], X_topics.shape[1]), lda_model.topic_word_) 116 | print '\n>>> doc_topic & topic word written to disk\n' 117 | 118 | ############################################################################## 119 | # threshold and plot 120 | 121 | _idx = np.amax(X_topics, axis=1) > threshold # idx of tweets that > threshold 122 | _topics = X_topics[_idx] 123 | _raw_tweet = np.array(raw_tweet)[_idx] 124 | _processed_tweet = np.array(processed_tweet)[_idx] 125 | 126 | # t-SNE: 50 -> 2D 127 | tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, 128 | init='pca') 129 | tsne_lda = tsne_model.fit_transform(_topics[:num_example]) 130 | 131 | t3 = time.time() 132 | print '\n>>> t-SNE transformation done; took {} mins\n'.format((t3-t2)/60.) 133 | 134 | # find the most probable topic for each tweet 135 | _lda_keys = [] 136 | for i, tweet in enumerate(_raw_tweet): 137 | _lda_keys += _topics[i].argmax(), 138 | 139 | # generate random hex color 140 | colormap = [] 141 | for i in xrange(X_topics.shape[1]): 142 | r = lambda: random.randint(0, 255) 143 | colormap += ('#%02X%02X%02X' % (r(), r(), r())), 144 | colormap = np.array(colormap) 145 | 146 | # show topics and their top words 147 | topic_summaries = [] 148 | topic_word = lda_model.topic_word_ # get the topic words 149 | vocab = cvectorizer.get_feature_names() 150 | for i, topic_dist in enumerate(topic_word): 151 | topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1] 152 | topic_summaries.append(' '.join(topic_words)) 153 | 154 | # use the coordinate of a random tweet as string topic string coordinate 155 | topic_coord = np.empty((X_topics.shape[1], 2)) * np.nan 156 | for topic_num in _lda_keys: 157 | if not np.isnan(topic_coord).any(): 158 | break 159 | topic_coord[topic_num] = tsne_lda[_lda_keys.index(topic_num)] 160 | 161 | # plot 162 | 163 | title = "t-SNE visualization of LDA model trained on {} tweets, {} topics, " \ 164 | "thresholding at {} topic probability, {} iter ({} data points and " \ 165 | "top {} words)".format(num_qualified_tweet, n_topics, threshold, 166 | n_iter, num_example, n_top_words) 167 | 168 | plot_lda = bp.figure(plot_width=900, plot_height=700, 169 | title=title, 170 | tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave", 171 | x_axis_type=None, y_axis_type=None, min_border=1) 172 | 173 | plot_lda.scatter(x=tsne_lda[:, 0], y=tsne_lda[:, 1], 174 | color=colormap[_lda_keys][:num_example], 175 | source=bp.ColumnDataSource({ 176 | "tweet": _raw_tweet[:num_example], 177 | "topic_key": _lda_keys[:num_example] 178 | })) 179 | 180 | # plot crucial words 181 | for i in xrange(X_topics.shape[1]): 182 | plot_lda.text(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]]) 183 | hover = plot_lda.select(dict(type=HoverTool)) 184 | hover.tooltips = {"tweet": "@tweet - topic: 
@topic_key"} 185 | 186 | save(plot_lda, 'tsne_lda_viz_{}_{}_{}_{}_{}_{}.html'.format( 187 | num_qualified_tweet, n_topics, threshold, n_iter, num_example, n_top_words)) 188 | 189 | 190 | t4 = time.time() 191 | print '\n>>> whole process done; took {} mins\n'.format((t4-t0)/60.) -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Some utilities. 3 | """ 4 | 5 | from nltk.corpus import stopwords 6 | import numpy as np 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | 9 | 10 | def preprocess(tweet, ascii=True, ignore_rt_char=True, ignore_url=True, 11 | ignore_mention=True, ignore_hashtag=True, 12 | letter_only=True, remove_stopwords=True, min_tweet_len=3): 13 | """ clean a raw tweet; return space-joined tokens, or '' if the tweet should be dropped """ 14 | sword = stopwords.words('english') 15 | 16 | if ascii: # maybe remove lines with ANY non-ascii character 17 | for c in tweet: 18 | if not (0 < ord(c) < 127): 19 | return '' 20 | 21 | tokens = tweet.lower().split() # to lower, split 22 | res = [] 23 | 24 | for token in tokens: 25 | if remove_stopwords and token in sword: 26 | continue 27 | if ignore_rt_char and token == 'rt': 28 | continue 29 | if ignore_url and token.startswith('https:'): 30 | continue 31 | if ignore_mention and token.startswith('@'): 32 | continue 33 | if ignore_hashtag and token.startswith('#'): 34 | continue 35 | if letter_only: 36 | if not token.isalpha(): 37 | continue 38 | elif token.isdigit(): 39 | token = '' 40 | 41 | res += token, 42 | 43 | if min_tweet_len and len(res) < min_tweet_len: 44 | return '' 45 | else: 46 | return ' '.join(res) 47 | 48 | 49 | def get_tfidf(tweet_list, top_n, max_features=5000): 50 | """ return the top n feature names and idf scores of a tweets list """ 51 | tfidf_vectorizer = TfidfVectorizer(max_features=max_features) 52 | tfidf_vectorizer.fit_transform(tweet_list) 53 | indices = np.argsort(tfidf_vectorizer.idf_)[::-1] 54 | features = tfidf_vectorizer.get_feature_names() 55 | top_feature_name = [features[i] for i in indices[:top_n]] 56 | top_feature_idf = tfidf_vectorizer.idf_[indices][:top_n] 57 | 58 | return top_feature_name, top_feature_idf 59 | 60 | 61 | 62 | --------------------------------------------------------------------------------
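For reference, here is a minimal usage sketch of the two helpers in `utils.py` outside the Bokeh app. The sample tweets and the `top_n`/`max_features` values are made up for illustration, and the syntax is kept valid under both Python 2 and 3 (the repository's scripts otherwise use Python 2 print statements).

```python
# Minimal usage sketch for utils.preprocess and utils.get_tfidf.
# The sample strings and parameter values are illustrative only.
from utils import preprocess, get_tfidf

raw = [
    "RT @user Bokeh makes interactive plots easy https://t.co/abc",
    "training an LDA topic model on a million tweets tonight",
    "topic models and idf scores give a quick first look at trends",
]

# preprocess() returns '' for tweets that fail the filters, so drop those
cleaned = [p for p in (preprocess(t) for t in raw) if p]

if cleaned:
    names, idf_scores = get_tfidf(cleaned, top_n=5, max_features=1000)
    for name, score in zip(names, idf_scores):
        print('{}: {:.3f}'.format(name, score))
```

Running it prints each high-IDF term with its score, which is the same word/score stream that real-time-twitter-trend-discovery.py feeds into the Bokeh plot batch by batch.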