├── .gitignore ├── images │   └── dashboard.png ├── requirements.txt ├── config.py ├── LICENSE ├── README.md ├── get_stockprice.py ├── parsing.py └── get_tweet_sentiment.py /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | __pycache__ 3 | *.json 4 | -------------------------------------------------------------------------------- /images/dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunxiaoshi/stock-insight-engine/HEAD/images/dashboard.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4 2 | elasticsearch 3 | nltk 4 | requests 5 | textblob 6 | tweepy 7 | vaderSentiment 8 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | """ 2 | file - config.py 3 | Place to store needed configurations 4 | """ 5 | 6 | # nltk tokens reqs 7 | nltk_tokens_ignored = ("win", "giveaway") 8 | nltk_tokens_required = ("jeff", "bezos", "jeff bezos", "#amazon", "@amazon", "amazon", "amzn", "#amzn", "alexa", "blue origin", "space") 9 | nltk_min_tokens = 1 10 | 11 | # elasticsearch 12 | elasticsearch_host = "localhost" 13 | elasticsearch_port = 9200 14 | 15 | # put your twitter access credentials here 16 | consumer_key = "" 17 | consumer_key_secret = "" 18 | access_token = "" 19 | access_token_secret = "" 20 | 21 | # sentiment url 22 | sentiment_url = 'http://text-processing.com/api/sentiment/' 23 | # yahoo stock url 24 | yahoo_stock_url = "https://query1.finance.yahoo.com/v8/finance/chart/SYMBOL?region=US&lang=en-US&includePrePost=false&interval=2m&range=5d&corsDomain=finance.yahoo.com&.tsrc=finance" 25 | # yahoo news url 26 | yahoo_news_url = 'https://finance.yahoo.com/quote/SYMBOL/?p=SYMBOL' 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) Copyright (c) 2020 Yunxiao Shi 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | 
9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Stock Insight Engine 2 | 3 | [![Python 3.6+](https://img.shields.io/badge/Python-3.6%2B-blue)](https://www.python.org/) 4 | [![MIT License](https://img.shields.io/badge/License-MIT-brightgreen)](./LICENSE) 5 | [![Elasticsearch 7.9.0](https://img.shields.io/badge/Elasticsearch-7.9.0-yellow)](https://www.elastic.co/elasticsearch/) 6 | 7 | Stock Insight Engine is a stock market analyzer that mines user tweets on Twitter and performs sentiment analysis on them to show how sentiment on Twitter relates to stock prices. It is built upon [tweepy](https://www.tweepy.org/), [Elasticsearch](https://www.elastic.co/elasticsearch/) and [NLTK](https://www.nltk.org/). 8 | 9 | ## Installation 10 | 11 | Before you can use Stock Insight Engine to mine tweets from Twitter, a few setup steps are needed. Namely, 12 | 13 | 1. Apply for a Twitter developer account [here](https://developer.twitter.com/en/apply-for-access), then [create a new Twitter application](https://developer.twitter.com/en/docs/basics/developer-portal/guides/apps.html) and generate your [access credentials](https://developer.twitter.com/en/docs/basics/authentication/guides/access-tokens.html) 14 | 15 | 2. Install Elasticsearch on your local machine. The following has been tested on Ubuntu 20.04 16 | 17 | ``` 18 | curl -fsSL https://artifacts.elastic.co/GPG-KEY-elasticsearch | sudo apt-key add - 19 | echo "deb https://artifacts.elastic.co/packages/7.x/apt stable main" | sudo tee -a /etc/apt/sources.list.d/elastic-7.x.list 20 | sudo apt update 21 | sudo apt install elasticsearch 22 | ``` 23 | 24 | Upon successful installation, start the Elasticsearch service 25 | 26 | ``` 27 | sudo systemctl start elasticsearch 28 | ``` 29 | 30 | To verify the installation, run 31 | ``` 32 | curl -X GET 'http://localhost:9200' 33 | ``` 34 | which should return some basic information about the installed Elasticsearch instance. 35 | 36 | 3. Install Kibana. If your Elasticsearch installation succeeded, you only need to do 37 | 38 | ``` 39 | sudo apt install kibana 40 | ``` 41 | 42 | and remember to start the service 43 | ``` 44 | sudo systemctl start kibana 45 | ``` 46 | Once the Kibana service is started, you should be able to access the dashboard via ```localhost:5601```. 47 | 48 | 4. Install Python dependencies 49 | 50 | It is recommended to use [conda](https://docs.conda.io/en/latest/) to manage your environment. For example, do 51 | 52 | ``` 53 | conda create -n stock python=3.8 54 | conda activate stock 55 | pip install -r requirements.txt 56 | ``` 57 | 58 | ## Usage 59 | 60 | 1. Put **your** Twitter consumer key/secret and access token/secret in ```config.py```. 61 | 62 | 2. Edit ```nltk_tokens_required``` and ```nltk_tokens_ignored``` in ```config.py``` to match the topics you want to mine. 63 | 64 | + ```nltk_tokens_required``` specifies the must-have tokens: a tweet is added to Elasticsearch only if it contains at least ```nltk_min_tokens``` of them, otherwise it is skipped, 65 | + ```nltk_tokens_ignored``` specifies the ignored tokens: a tweet containing any of them is skipped and not added to Elasticsearch, and 66 | + ```nltk_min_tokens``` sets the minimum number of required tokens a tweet must contain (see the sketch below). 67 | 
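For reference, this is a minimal sketch of the filtering rule that ```get_tweet_sentiment.py``` applies to each tweet's tokens (the helper name ```tweet_passes_filter``` is only illustrative, not part of the code base):

```
from config import nltk_tokens_ignored, nltk_tokens_required, nltk_min_tokens

def tweet_passes_filter(tokens):
    # skip the tweet if it contains any ignored token
    if any(t in tokens for t in nltk_tokens_ignored):
        return False
    # keep it only if enough required tokens are present
    found = sum(1 for t in nltk_tokens_required if t in tokens)
    return found >= nltk_min_tokens
```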
68 | 3. To mine tweets talking about ```Amazon``` and ```Jeff Bezos```, do 69 | 70 | ```python get_tweet_sentiment.py -s AMZN -k 'Jeff Bezos',Bezos,Amazon,Alexa,'Blue Origin' --quiet``` 71 | 72 | and, to also follow URL links in tweets and run sentiment analysis on the linked pages, 73 | 74 | ```python get_tweet_sentiment.py -s AMZN -k 'Jeff Bezos',Bezos,Amazon,Alexa,'Blue Origin' -l --quiet``` 75 | 76 | 4. To get the Amazon stock price from [yahoo finance](https://finance.yahoo.com/quote/AMZN/?p=AMZN), do 77 | 78 | ```python get_stockprice.py -s AMZN --quiet``` 79 | 80 | ## Visualization 81 | 82 | Follow the [Kibana Visualization Tutorial](https://www.elastic.co/guide/en/kibana/current/tutorial-visualizing.html) to customize your data visualizations. 83 | 
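Before building visualizations, it helps to confirm that documents are actually reaching Elasticsearch. Here is a quick check using the same client the scripts use, assuming the default index names ```stock-tweet``` and ```stock-price``` (this snippet is only a sanity check, not part of the code base):

```
from elasticsearch import Elasticsearch

from config import elasticsearch_host, elasticsearch_port

es = Elasticsearch(hosts=[{'host': elasticsearch_host, 'port': elasticsearch_port}])
for index in ('stock-tweet', 'stock-price'):
    # the indices only exist once a first document has been indexed
    if es.indices.exists(index=index):
        print(index, es.count(index=index)['count'])
    else:
        print(index, 'not created yet')
```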

84 | Here is mine as an illustration: 85 | ![Dashboard](images/dashboard.png)
86 | 87 | ## License 88 | 89 | MIT 90 | -------------------------------------------------------------------------------- /get_stockprice.py: -------------------------------------------------------------------------------- 1 | """ 2 | file - stockprice.py 3 | Get stock price from yahoo finance and add to Elasticsearch 4 | """ 5 | 6 | import argparse 7 | import json 8 | import logging 9 | import random 10 | import re 11 | import requests 12 | import sys 13 | import time 14 | 15 | from elasticsearch import Elasticsearch 16 | 17 | from config import elasticsearch_host, elasticsearch_port, yahoo_stock_url 18 | 19 | # create es instance 20 | es = Elasticsearch(hosts=[{'host': elasticsearch_host, 'port': elasticsearch_port}]) 21 | 22 | class Stock: 23 | 24 | def __init__(self): 25 | pass 26 | 27 | def get_stock_price(self, url, symbol): 28 | 29 | import re 30 | 31 | while True: 32 | logger.info('grabbing stock data for symbol %s...' % symbol) 33 | 34 | try: 35 | url = re.sub('SYMBOL', symbol, url) 36 | # get json stock data from url 37 | try: 38 | r = requests.get(url) 39 | data = r.json() 40 | except (requests.HTTPError, requests.ConnectionError, requests.ConnectTimeout) as re: 41 | logger.error('exception occurred when getting stock data from url caused by %s' % re) 42 | raise 43 | logger.debug(data) 44 | try: 45 | dict = {} 46 | dict['symbol'] = symbol 47 | dict['last'] = data['chart']['result'][0]['indicators']['quote'][0]['close'][-1] 48 | if dict['last'] is None: 49 | dict['last'] = data['chart']['result'][0]['indicators']['quote'][0]['close'][-2] 50 | dict['date'] = time.strftime('%Y-%m-%dT%H:%M:%S', time.gmtime()) 51 | try: 52 | dict['change'] = (data['chart']['result'][0]['indicators']['quote'][0]['close'][-1] - 53 | data['chart']['result'][0]['indicators']['quote'][0]['close'][-2]) / \ 54 | data['chart']['result'][0]['indicators']['quote'][0]['close'][-2] * 100 55 | except TypeError: 56 | dict['change'] = (data['chart']['result'][0]['indicators']['quote'][0]['close'][-2] - 57 | data['chart']['result'][0]['indicators']['quote'][0]['close'][-3]) / \ 58 | data['chart']['result'][0]['indicators']['quote'][0]['close'][-3] * 100 59 | pass 60 | dict['high'] = data['chart']['result'][0]['indicators']['quote'][0]['high'][-1] 61 | if dict['high'] is None: 62 | dict['high'] = data['chart']['result'][0]['indicators']['quote'][0]['high'][-2] 63 | 64 | dict['low'] = data['chart']['result'][0]['indicators']['quote'][0]['low'][-1] 65 | if dict['low'] is None: 66 | dict['low'] = data['chart']['result'][0]['indicators']['quote'][0]['low'][-2] 67 | 68 | dict['vol'] = data['chart']['result'][0]['indicators']['quote'][0]['volume'][-1] 69 | if dict['vol'] is None: 70 | dict['vol'] = data['chart']['result'][0]['indicators']['quote'][0]['volume'][-2] 71 | 72 | logger.debug(dict) 73 | except KeyError as e: 74 | logger.error('exception occurred when getting stock data caused by %s' % e) 75 | raise 76 | 77 | # sanity before sending to es 78 | if dict['last'] is not None and dict['high'] is not None and dict['low'] is not None: 79 | logger.info('adding stock data to Elasticsearch') 80 | es.index(index=args.index, doc_type='stock', 81 | body={ 82 | 'symbol': dict['symbol'], 83 | 'price_last': dict['last'], 84 | 'date': dict['date'], 85 | 'change': dict['change'], 86 | 'price_high': dict['high'], 87 | 'price_low': dict['low'], 88 | 'vol': dict['vol'] 89 | }) 90 | else: 91 | logger.warning('some stock data had null values, skipping') 92 | 93 | except Exception as e: 94 | logger.error('exception can\'t get stock data 
caused by %s, trying again later' % e) 95 | pass 96 | 97 | if __name__ == '__main__': 98 | 99 | parser = argparse.ArgumentParser() 100 | parser.add_argument('-i', '--index', default='stock-price', 101 | help='Index name for es') 102 | parser.add_argument('-s', '--symbol', type=str, help='Stock symbol, e.g. TSLA') 103 | parser.add_argument('-v', '--verbose', action='store_true', help='Increase output verbosity') 104 | parser.add_argument('--debug', action='store_true', help='Debug message output') 105 | parser.add_argument('-q', '--quiet', action='store_true', help='Run quiet with no msg output') 106 | 107 | args = parser.parse_args() 108 | 109 | # set up logging 110 | logger = logging.getLogger('stock-price') 111 | logger.setLevel(logging.INFO) 112 | 113 | logging.addLevelName( 114 | logging.INFO, "\033[1;32m%s\033[1;0m" 115 | % logging.getLevelName(logging.INFO)) 116 | logging.addLevelName( 117 | logging.WARNING, "\033[1;31m%s\033[1;0m" 118 | % logging.getLevelName(logging.WARNING)) 119 | logging.addLevelName( 120 | logging.ERROR, "\033[1;41m%s\033[1;0m" 121 | % logging.getLevelName(logging.ERROR)) 122 | logging.addLevelName( 123 | logging.DEBUG, "\033[1;33m%s\033[1;0m" 124 | % logging.getLevelName(logging.DEBUG)) 125 | log_format = '%(asctime)s [%(levelname)s][%(name)s] %(message)s' 126 | log_level = logging.INFO 127 | logging.basicConfig(format=log_format, level=log_level) 128 | 129 | if args.verbose: 130 | logger.setLevel(logging.INFO) 131 | if args.debug: 132 | logger.setLevel(logging.DEBUG) 133 | if args.quiet: 134 | logger.disabled = True 135 | 136 | if args.symbol is None: 137 | print('No stock symbol, see --help for help') 138 | sys.exit(1) 139 | 140 | # create instance of Stock 141 | stockprice = Stock() 142 | 143 | try: 144 | stockprice.get_stock_price(symbol=args.symbol, url=yahoo_stock_url) 145 | except Exception as e: 146 | logger.warning('Exception occurred when getting stock data caused by %s' % e) 147 | except KeyboardInterrupt: 148 | print('Ctrl-c keyboard interrupt, exiting...') 149 | sys.exit(0) 150 | -------------------------------------------------------------------------------- /parsing.py: -------------------------------------------------------------------------------- 1 | """ 2 | file - parsing.py 3 | Implements a utility class that cleans up text and performs sentiment analysis 4 | """ 5 | 6 | import re 7 | import requests 8 | import string 9 | import urllib.parse as urlparse 10 | 11 | from bs4 import BeautifulSoup 12 | import nltk 13 | from newspaper import Article, ArticleException 14 | from textblob import TextBlob 15 | from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 16 | 17 | from config import nltk_min_tokens, nltk_tokens_required, nltk_tokens_ignored 18 | 19 | class ParsingUtils: 20 | """ 21 | A utility class that computes sentiment for text 22 | """ 23 | def __init__(self, sentiment_url, logger, web_sentiment=False, 24 | verbose=False): 25 | """ 26 | sentiment_url: 'http://text-processing.com/api/sentiment/' for online sentiment parsing 27 | """ 28 | self.sentiment_url = sentiment_url 29 | self.logger = logger 30 | self.web_sentiment = web_sentiment 31 | self.verbose = verbose 32 | 33 | def clean_text(self, text): 34 | # clean up text 35 | text = text.replace('\n', ' ') 36 | text = re.sub(r'https?\S+', '', text) 37 | text = re.sub(r'&.*?;', '', text) 38 | text = re.sub(r'<.*?>', '', text) 39 | text = text.replace('RT', '') 40 | text = text.replace(u'...', '') 41 | text = text.strip() 42 | return text 43 | 44 | def 
clean_text_sentiment(self, text): 45 | # clean up text for sentiment analysis 46 | text = re.sub(r'[#|@]\S+', '', text) 47 | text = text.strip() 48 | return text 49 | 50 | def create_tokens_from_text(self, text): 51 | text_tokens = re.sub(r"[\%|\$|\.|\,|\!|\:|\@|\(|\)|\#|\+|(``)|('')|\?|\-]", "", text) 52 | tokens = nltk.word_tokenize(text_tokens) 53 | tokens = [w.lower() for w in tokens] 54 | table = str.maketrans('', '', string.punctuation) 55 | stripped = [w.translate(table) for w in tokens] 56 | tokens = [w for w in stripped if w.isalpha()] 57 | stop_words = set(nltk.corpus.stopwords.words('english')) 58 | tokens = [w for w in tokens if not w in stop_words] 59 | # remove words less than 3 characters 60 | tokens = [w for w in tokens if not len(w) < 3] 61 | return tokens 62 | 63 | def get_sentiment_from_url(self, text): 64 | # get sentiment from text processing website 65 | payload = {'text': text} 66 | 67 | try: 68 | self.logger.debug(text) 69 | post = requests.post(self.sentiment_url, data=payload) 70 | self.logger.debug(post.status_code) 71 | self.logger.debug(post.text) 72 | 73 | except requests.exceptions.RequestException as re: 74 | self.logger.error('Exception occurred when getting sentiment from %s caused by %s' % (self.sentiment_url, re)) 75 | raise 76 | 77 | # return None if getting throttled or other connection problem 78 | if post.status_code != 200: 79 | self.logger.warning('Can\'t get sentiment from %s caused by %s %s' % (self.sentiment_url, post.status_code, post.text)) 80 | return None 81 | 82 | response = post.json() 83 | 84 | neg = response['probability']['neg'] 85 | pos = response['probability']['pos'] 86 | neu = response['probability']['neutral'] 87 | label = response['label'] 88 | 89 | # determine if sentiment is positive, negative or neutral 90 | if label == 'neg': 91 | sentiment = 'negative' 92 | elif label == 'neutral': 93 | sentiment = 'neutral' 94 | else: 95 | sentiment = 'positive' 96 | 97 | return sentiment, neg, pos, neu 98 | 99 | def sentiment_analysis(self, text): 100 | """ 101 | utility leveraging TextBlob, VADERSentiment and sentiment from text-processing.com 102 | """ 103 | # pass text into sentiment url 104 | if self.web_sentiment: 105 | ret = self.get_sentiment_from_url(text) 106 | if not ret: 107 | sentiment_web = None 108 | else: 109 | sentiment_web, _, _, _ = ret 110 | else: 111 | sentiment_web = None 112 | 113 | # pass text into TextBlob 114 | if not isinstance(text, str): 115 | text = str(text) 116 | text_tb = TextBlob(text) 117 | 118 | # pass text into VADER sentiment 119 | analyzer = SentimentIntensityAnalyzer() 120 | text_vs = analyzer.polarity_scores(text) 121 | 122 | # determine sentiment 123 | if not sentiment_web: 124 | if text_tb.sentiment.polarity < 0 and text_vs['compound'] <= -0.05: 125 | sentiment = 'negative' 126 | elif text_tb.sentiment.polarity > 0 and text_vs['compound'] >= 0.05: 127 | sentiment = 'positive' 128 | else: 129 | sentiment = 'neutral' 130 | else: 131 | if text_tb.sentiment.polarity < 0 and text_vs['compound'] <= -0.05 and sentiment_web == 'negative': 132 | sentiment = 'negative' 133 | elif text_tb.sentiment.polarity > 0 and text_vs['compound'] >=0.05 and sentiment_web == 'positive': 134 | sentiment = 'positive' 135 | else: 136 | sentiment = 'neutral' 137 | 138 | # calculate average polarity from TextBlob and VADER 139 | polarity = (text_tb.sentiment.polarity + text_vs['compound']) / 2 140 | 141 | return polarity, text_tb.sentiment.subjectivity, sentiment 142 | 143 | def tweet_link_sentiment_analysis(self, url): 144 | # 
run sentiment analysis on tweet link text summary page 145 | try: 146 | self.logger.info('Following tweet link %s to get sentiment...' % url) 147 | article = Article(url) 148 | article.download() 149 | article.parse() 150 | if 'Tweet with a location' in article.text: 151 | self.logger.info('Link to a twitter web page, skipping') 152 | return None 153 | article.nlp() 154 | tokens = article.keywords 155 | 156 | if len(tokens) < 1: 157 | self.logger.info('Text does not have min number of tokens, skipping') 158 | return None 159 | # check ignored tokens from config 160 | for t in nltk_tokens_ignored: 161 | if t in tokens: 162 | self.logger.info('Text contains token from ignored list, skipping') 163 | return None 164 | # check required tokens from config 165 | tokens_passed = False 166 | tokens_found = 0 167 | for t in nltk_tokens_required: 168 | if t in tokens: 169 | tokens_found += 1 170 | if tokens_found == nltk_min_tokens: 171 | tokens_passed = True 172 | break 173 | if not tokens_passed: 174 | self.logger.info('Text does not contain any required token, skipping') 175 | return None 176 | 177 | summary = article.summary 178 | if not summary: 179 | self.logger.info('No text found in tweet link url page') 180 | return None 181 | 182 | summary_cleaned = self.clean_text(summary) 183 | summary_cleaned = self.clean_text_sentiment(summary_cleaned) 184 | polarity, subjectivity, sentiment = self.sentiment_analysis(summary_cleaned) 185 | 186 | return polarity, subjectivity, sentiment 187 | 188 | except ArticleException as e: 189 | self.logger.warning('Exception: error getting text on twitter link caused by %s' % e) 190 | return None 191 | 192 | def get_twitter_users_from_url(self, url): 193 | twitter_users = [] 194 | self.logger.info('grabbing twitter users from url %s' % url) 195 | try: 196 | twitter_urls = ('http://twitter.com/', 'http://www.twitter.com/', 197 | 'https://www.twitter.com', 'https://www.twitter.com') 198 | req = requests.get(url) 199 | html = req.text 200 | soup = BeautifulSoup(html, 'html.parser') 201 | html_links = [] 202 | for link in soup.findAll('a'): 203 | html_links.append(link.get('href')) 204 | if html_links: 205 | for link in html_links: 206 | # check if there is twitter url in link 207 | parsed_uri = urlparse.urljoin(link, '/') 208 | # get twitter user-name from link and add to list 209 | if parsed_uri in twitter_urls and '=' not in link and '?' 
not in link: 210 | user = link.split('/')[3] 211 | twitter_users.append(u'@' + user) 212 | self.logger.debug(twitter_users) 213 | except requests.exceptions.RequestException as re: 214 | self.logger.warning('Can\'t crawl web site caused by %s' % re) 215 | pass 216 | 217 | return twitter_users 218 | -------------------------------------------------------------------------------- /get_tweet_sentiment.py: -------------------------------------------------------------------------------- 1 | """ 2 | file - sentiment.py 3 | Analyze tweets with sentiment analysis and add to Elasticsearch 4 | """ 5 | 6 | import argparse 7 | import datetime 8 | import json 9 | import logging 10 | import nltk 11 | import re 12 | import requests 13 | import random 14 | import string 15 | import time 16 | import sys 17 | import urllib.parse as urlparse 18 | 19 | from bs4 import BeautifulSoup 20 | from elasticsearch import Elasticsearch 21 | from newspaper import Article, ArticleException 22 | from tweepy import API, Stream, OAuthHandler, TweepError 23 | from tweepy.streaming import StreamListener 24 | from textblob import TextBlob 25 | from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 26 | 27 | from parsing import ParsingUtils 28 | 29 | from config import nltk_min_tokens, nltk_tokens_required, nltk_tokens_ignored 30 | from config import consumer_key, consumer_key_secret, access_token, access_token_secret 31 | from config import elasticsearch_host, elasticsearch_port 32 | from config import sentiment_url, yahoo_news_url 33 | 34 | class TweetStreamListener(StreamListener): 35 | 36 | def __init__(self, parsing_utils, verbose=False): 37 | self.count = 0 38 | self.filtered_count = 0 39 | self.filtered_ratio = 0. 40 | self.tweet_ids = [] 41 | self.parsing_utils = parsing_utils 42 | self.verbose = verbose 43 | 44 | # on success 45 | def on_data(self, data): 46 | try: 47 | self.count += 1 48 | # decode json 49 | dict_data = json.loads(data) 50 | 51 | if self.verbose: 52 | print('################ tweets: %d | filtered: %d | filtered-ratio: %.2f' % ( 53 | self.count, self.filtered_count, self.filtered_count / self.count)) 54 | logger.debug('tweet data: %s' % str(dict_data)) 55 | 56 | text = dict_data['text'] 57 | if not text: 58 | logger.info('Tweet has no text, skipping') 59 | self.filtered_count += 1 60 | return True 61 | 62 | # extract html links from tweet 63 | tweet_urls = [] 64 | if args.link_sentiment: 65 | tweet_urls = re.findall(r'https?://[^\s]+', text) 66 | 67 | # clean up tweet text 68 | text_cleaned = self.parsing_utils.clean_text(text) 69 | 70 | if not text_cleaned: 71 | logger.info('Tweet does not contain any valid text, skipping') 72 | self.filtered_count += 1 73 | return True 74 | 75 | # get date when tweet was created 76 | created_date = time.strftime('%Y-%m-%dT%H:%M:%S', time.strptime(dict_data['created_at'], 77 | '%a %b %d %H:%M:%S +0000 %Y')) 78 | 79 | # unpack dict_data into separate vars 80 | screen_name = str(dict_data.get('user', {}).get('screen_name')) 81 | location = str(dict_data.get('user', {}).get('location')) 82 | language = str(dict_data.get('user', {}).get('lang')) 83 | friends = int(dict_data.get('user', {}).get('friends_count')) 84 | followers = int(dict_data.get('user', {}).get('followers_count')) 85 | statuses = int(dict_data.get('user', {}).get('statuses_count')) 86 | hashtags = str(dict_data.get('entities', {})['hashtags'][0]['text'].title() 87 | ) if len(dict_data.get('entities', {})['hashtags']) > 0 else "" 88 | filtered_text = str(text_cleaned) 89 | tweet_id = 
int(dict_data.get('id')) 90 | 91 | tokens = self.parsing_utils.create_tokens_from_text(filtered_text) 92 | 93 | # check for min token length 94 | if not tokens: 95 | logger.info('Empty tokens from tweet, skipping') 96 | self.filtered_count += 1 97 | return True 98 | # check ignored tokens from config 99 | for t in nltk_tokens_ignored: 100 | if t in tokens: 101 | logger.info('Tweet contains tokens from ignored list, skipping') 102 | self.filtered_count += 1 103 | return True 104 | # check required tokens from config 105 | tokens_passed = False 106 | tokens_found = 0 107 | for t in nltk_tokens_required: 108 | if t in tokens: 109 | tokens_found += 1 110 | if tokens_found == nltk_min_tokens: 111 | tokens_passed = True 112 | break 113 | if not tokens_passed: 114 | logger.info('Tweet does not contain tokens from required tokens list or min tokens required, skipping') 115 | self.filtered_count += 1 116 | return True 117 | 118 | # clean up text for sentiment analysis 119 | text_cleaned_for_sentiment = self.parsing_utils.clean_text_sentiment(filtered_text) 120 | if not text_cleaned_for_sentiment: 121 | logger.info('Tweet does not contain any valid text after cleaning, skipping') 122 | self.filtered_count += 1 123 | return True 124 | if self.verbose: 125 | print('Tweet cleaned for sentiment analysis: %s' % text_cleaned_for_sentiment) 126 | 127 | # get sentiment values 128 | polarity, subjectivity, sentiment = self.parsing_utils.sentiment_analysis(text_cleaned_for_sentiment) 129 | 130 | # add tweet_id to tweet_ids 131 | self.tweet_ids.append(dict_data['id']) 132 | 133 | # get sentiment for tweet links 134 | if tweet_urls: 135 | tweet_urls_polarity = 0 136 | tweet_urls_subjectivity = 0 137 | for url in tweet_urls: 138 | res = self.parsing_utils.tweet_link_sentiment_analysis(url) 139 | if not res: 140 | continue 141 | pol, sub, sen = res 142 | tweet_urls_polarity = (tweet_urls_polarity + pol) / 2 143 | tweet_urls_subjectivity = (tweet_urls_subjectivity + sub) / 2 144 | if sentiment == 'positive' or sen == 'positive': 145 | sentiment = 'positive' 146 | elif sentiment == 'negative' or sen == 'negative': 147 | sentiment = 'negative' 148 | else: 149 | sentiment = 'neutral' 150 | # calculate average polarity and subjectivity from tweet and tweet links 151 | if tweet_urls_polarity > 0: 152 | polarity = (polarity + tweet_urls_polarity) / 2 153 | if tweet_urls_subjectivity > 0: 154 | subjectivity = (subjectivity + tweet_urls_subjectivity) / 2 155 | 156 | logger.info('Adding tweet to elasticsearch') 157 | # add twitter data and sentiment info into elasticsearch 158 | es.index(index=args.index, 159 | doc_type='tweet', 160 | body={ 161 | 'author': screen_name, 162 | 'location': location, 163 | 'language': language, 164 | 'friends': friends, 165 | 'followers': followers, 166 | 'statuses': statuses, 167 | 'date': created_date, 168 | 'message': filtered_text, 169 | 'tweet_id': tweet_id, 170 | 'polarity': polarity, 171 | 'subjectivity': subjectivity, 172 | 'sentiment': sentiment, 173 | 'hashtags': hashtags 174 | }) 175 | return True 176 | 177 | except Exception as e: 178 | logger.warning('Exception: exception caused by: %s' % e) 179 | raise 180 | 181 | # on failure 182 | def on_error(self, status_code): 183 | logger.error('Got an error with status code: %s (will try again later)' % status_code) 184 | 185 | # on timeout 186 | def on_timeout(self): 187 | logger.warning('timeout... (will try again later)') 188 | 189 | # on exception 190 | def on_exception(self, exception): 191 | print(exception) 192 | return 193 | 194 | if __name__ == '__main__': 195 | 196 | parser = argparse.ArgumentParser() 197 | 198 | parser.add_argument('-i', '--index', default='stock-tweet', help='index name for elasticsearch') 199 | parser.add_argument('-s', '--symbol', required=True, help='Stock symbol to search for, e.g. TSLA') 200 | parser.add_argument('-k', '--keywords', required=True, 201 | help='Use keywords to search in tweets instead of feeds. ' 202 | 'Separated by commas, case sensitive, spaces are ANDs and commas are ORs. ' 203 | 'Example: TSLA,\'Elon Musk\',Musk,Tesla,SpaceX') 204 | parser.add_argument('-a', '--add_tokens', action='store_true', 205 | help='Add nltk tokens required from config to keywords') 206 | parser.add_argument('-u', '--url', help='Use twitter users from any links in web page at url') 207 | parser.add_argument('-l', '--link_sentiment', action='store_true', 208 | help='Follow any link url in tweets and analyze sentiments on web page') 209 | parser.add_argument('-w', '--web_sentiment', action='store_true', 210 | help='Get sentiment results from text processing website') 211 | parser.add_argument('--override_tokens_required', nargs='+', 212 | help='Override nltk required tokens from config, separate with space') 213 | parser.add_argument('--override_tokens_ignored', nargs='+', 214 | help='Override nltk ignored tokens from config, separate with space') 215 | parser.add_argument('-v', '--verbose', action='store_true', help='Increase output verbosity') 216 | parser.add_argument('-q', '--quiet', action='store_true', help='Run quiet without message output') 217 | parser.add_argument('--debug', action='store_true', help='debug message output') 218 | 219 | args = parser.parse_args() 220 | 221 | # set up logging 222 | logger = logging.getLogger('stock-tweets') 223 | logger.setLevel(logging.INFO) 224 | 225 | logging.addLevelName(logging.INFO, '\033[1;32m%s\033[1;0m' 226 | % logging.getLevelName(logging.INFO)) 227 | logging.addLevelName(logging.WARNING, '\033[1;31m%s\033[1;0m' 228 | % logging.getLevelName(logging.WARNING)) 229 | logging.addLevelName(logging.ERROR, '\033[1;41m%s\033[1;0m' 230 | % logging.getLevelName(logging.ERROR)) 231 | logging.addLevelName(logging.DEBUG, '\033[1;33m%s\033[1;0m' 232 | % logging.getLevelName(logging.DEBUG)) 233 | 234 | log_format = '%(asctime)s [%(levelname)s][%(name)s] %(message)s' 235 | log_level = logging.INFO 236 | logging.basicConfig(format=log_format, level=log_level) 237 | 238 | if args.verbose: 239 | logger.setLevel(logging.INFO) 240 | if args.debug: 241 | logger.setLevel(logging.DEBUG) 242 | if args.quiet: 243 | logger.disabled = True 244 | 245 | parsing_utils = ParsingUtils(sentiment_url=sentiment_url, logger=logger, 246 | web_sentiment=args.web_sentiment, verbose=args.verbose) 247 | 248 | # create instance of elasticsearch 249 | es = Elasticsearch(hosts=[{'host': elasticsearch_host, 'port': elasticsearch_port}]) 250 | 251 | # check if need to override any tokens 252 | if args.override_tokens_required: 253 | nltk_tokens_required = tuple(args.override_tokens_required) 254 | if args.override_tokens_ignored: 255 | nltk_tokens_ignored = tuple(args.override_tokens_ignored) 256 | 257 | # create instance of tweet listener 258 | tweet_listener = TweetStreamListener(parsing_utils=parsing_utils, verbose=args.verbose) 259 | 260 | # set twitter access keys/tokens 261 | auth = OAuthHandler(consumer_key, consumer_key_secret) 262 | 
auth.set_access_token(access_token, access_token_secret) 263 | api = API(auth) 264 | 265 | # create instance of the tweepy stream 266 | stream = Stream(auth, tweet_listener) 267 | 268 | # grab twitter users from links at url 269 | if args.url: 270 | twitter_users = parsing_utils.get_twitter_users_from_url(args.url) 271 | if len(twitter_users) > 0: 272 | twitter_feeds = twitter_users 273 | else: 274 | logger.info('No twitter user found in links at %s, exiting' % args.url) 275 | sys.exit(1) 276 | 277 | try: 278 | # search twitter for keywords 279 | logger.info('Stock symbol: %s' % args.symbol) 280 | logger.info('NLTK tokens required : %s' % str(nltk_tokens_required)) 281 | logger.info('NLTK tokens ignored: %s' % str(nltk_tokens_ignored)) 282 | logger.info('Listening for tweets (ctrl-c to exit)') 283 | keywords = args.keywords.split(',') 284 | if args.add_tokens: 285 | for f in nltk_tokens_required: 286 | keywords.append(f) 287 | logger.info('Searching twitter for keywords...') 288 | logger.info('Twitter keywords: %s' % keywords) 289 | stream.filter(track=keywords, languages=['en']) 290 | except TweepError as te: 291 | logger.debug('Tweepy exception: failed to get tweets caused by: %s' % te) 292 | except KeyboardInterrupt: 293 | print('ctrl-c keyboard interrupt, exiting...') 294 | stream.disconnect() 295 | sys.exit(0) 296 | --------------------------------------------------------------------------------