├── assets
│   ├── banner.png
│   ├── screenshot-0.png
│   ├── screenshot-1.png
│   ├── screenshot-2.png
│   └── screenshot-3.png
├── .gitignore
├── requirements.txt
├── test.py
├── server.py
├── news_utils.py
├── google_language.py
├── process_tweet.py
├── twitter_utils.py
├── README.md
└── determine_relevance.py

/assets/banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SuyashLakhotia/ContextNewsBot/HEAD/assets/banner.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
ENV/
__pycache__/

credentials.py
service-account-file.json
settings.cfg
--------------------------------------------------------------------------------
/assets/screenshot-0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SuyashLakhotia/ContextNewsBot/HEAD/assets/screenshot-0.png
--------------------------------------------------------------------------------
/assets/screenshot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SuyashLakhotia/ContextNewsBot/HEAD/assets/screenshot-1.png
--------------------------------------------------------------------------------
/assets/screenshot-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SuyashLakhotia/ContextNewsBot/HEAD/assets/screenshot-2.png
--------------------------------------------------------------------------------
/assets/screenshot-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SuyashLakhotia/ContextNewsBot/HEAD/assets/screenshot-3.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
aniso8601==2.0.0
cachetools==2.0.1
certifi==2018.1.18
chardet==3.0.4
click==6.7
Flask==0.12.2
Flask-RESTful==0.3.6
google-api-core==0.1.4
google-auth==1.3.0
google-cloud-language==1.0.0
googleapis-common-protos==1.5.3
grpcio==1.8.4
idna==2.6
itsdangerous==0.24
Jinja2==2.10
MarkupSafe==1.0
newsapi-python==0.0.2
protobuf==3.5.1
pyasn1==0.4.2
pyasn1-modules==0.2.1
pytz==2017.3
requests==2.18.4
rsa==3.4.2
six==1.11.0
urllib3==1.22
Werkzeug==0.14.1
wincertstore==0.2
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
from twitter_utils import TweetProcessor
from news_utils import NewsRetriever, pretty_print_news
from determine_relevance import get_relevant_news

tweet_processor = TweetProcessor()
news_retriever = NewsRetriever()

tweet = tweet_processor.get_tweet(957220637705109505)
tweet_entities = tweet_processor.extract_entities(tweet)

if tweet["user"]["verified"]:
    user_name = tweet["user"]["name"]
else:
    user_name = None

if tweet["place"] is not None:
    country = tweet["place"]["country"]
else:
    country = None

news_articles = news_retriever.get_articles(tweet_entities, country, user_name)

relevant_articles = get_relevant_news(tweet, tweet_entities, news_articles, 0)

pretty_print_news(relevant_articles)
--------------------------------------------------------------------------------
/server.py:
--------------------------------------------------------------------------------
import os
import credentials

# needs to be set before `import process_tweet`
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials.PATH_TO_GOOGLE_JSON

from flask import Flask, jsonify, request
from flask_restful import Api, Resource

from process_tweet import process_tweet


app = Flask(__name__)
APP_URL = "http://127.0.0.1:5000"


class Tweets(Resource):

    def post(self):
        data = request.get_json()
        if not data:
            data = {"response": "ERROR"}
            return jsonify(data)
        else:
            tweetID = data.get("id")
            data = process_tweet(tweetID)
            return jsonify(data)


api = Api(app)
api.add_resource(Tweets, "/tweet", endpoint="tweet")


if __name__ == "__main__":
    app.run(debug=True)
--------------------------------------------------------------------------------
/news_utils.py:
--------------------------------------------------------------------------------
from newsapi import NewsApiClient
from google_language import IMP_ENTITY_IDX

import credentials


class NewsRetriever:

    def __init__(self):
        self.newsapiClient = NewsApiClient(api_key=credentials.NEWS_API_KEY)
        self.list_of_sources = "buzzfeed,bbc-news,fox-news,cnn,the-new-york-times"

    def get_articles(self, entities, country=None, user_name=None):
        phrases = []
        for entity in entities:
            if entity.type in IMP_ENTITY_IDX:
                phrases.append(entity.name)

        if country is not None:
            phrases.append(country)

        if user_name is not None:
            phrases.append(user_name)

        # News API expects `q` as a single query string; a leading "+" marks
        # a term that must appear in the results
        response = self.newsapiClient.get_everything(q=" ".join("+" + phrase for phrase in phrases),
                                                     sources=self.list_of_sources,
                                                     language="en",
                                                     sort_by="relevancy",
                                                     page_size=10)
        if response["status"] != "ok":
            print("Error retrieving articles: {}".format(response))
            return []

        return response["articles"]


def pretty_print_news(articles):
    for i in range(len(articles)):
        item = articles[i]
        print("\n---\n")
        print(str(i) + ". " + str(item["relevance_score"]) + " - " + str(item["sentiment_score"]) + " - " +
              str(item["source"]["name"]) + " - " + item["title"] + " - " + item["description"])
    print("\n---\n")
" + str(item["relevance_score"]) + " - " + str(item["sentiment_score"]) + " - " + 42 | str(item["source"]["name"]) + " - " + item["title"] + " - " + item["description"]) 43 | print("\n---\n") 44 | -------------------------------------------------------------------------------- /google_language.py: -------------------------------------------------------------------------------- 1 | from google.cloud import language 2 | from google.cloud.language import enums 3 | from google.cloud.language import types 4 | 5 | 6 | ENTITY_TYPES = ["UNKNOWN", "PERSON", "LOCATION", "ORGANIZATION", "EVENT", "WORK_OF_ART", "CONSUMER_GOOD", 7 | "OTHER"] 8 | IMP_ENTITY_IDX = [1, 2, 3, 4, 5, 6] 9 | REALLY_IMP_ENTITY_IDX = [1, 2, 3, 4] 10 | 11 | 12 | class GoogleLanguage(object): 13 | 14 | def __init__(self): 15 | self.client = language.LanguageServiceClient() 16 | 17 | def get_entities(self, text): 18 | document = types.Document(content=text, 19 | type=enums.Document.Type.PLAIN_TEXT) 20 | response = self.client.analyze_entities(document=document, 21 | encoding_type=enums.EncodingType.UTF32) 22 | 23 | for entity in response.entities: 24 | if entity.mentions[0].type == enums.EntityMention.Type.COMMON: 25 | entity.salience = entity.salience * 0.5 26 | 27 | return response.entities 28 | 29 | def get_entities_sentiment(self, text): 30 | document = types.Document(content=text, 31 | type=enums.Document.Type.PLAIN_TEXT) 32 | response = self.client.analyze_entity_sentiment(document=document, 33 | encoding_type=enums.EncodingType.UTF32) 34 | return response.entities 35 | 36 | def get_document_sentiment(self, text): 37 | document = types.Document(content=text, 38 | type=enums.Document.Type.PLAIN_TEXT) 39 | sentiment = self.client.analyze_sentiment(document=document, 40 | encoding_type=enums.EncodingType.UTF32).document_sentiment 41 | return sentiment 42 | -------------------------------------------------------------------------------- /process_tweet.py: -------------------------------------------------------------------------------- 1 | from twitter_utils import TweetProcessor 2 | from news_utils import NewsRetriever, pretty_print_news 3 | from determine_relevance import get_relevant_news 4 | from google_language import GoogleLanguage 5 | 6 | 7 | def process_tweet(tweetID): 8 | tweet_processor = TweetProcessor() 9 | news_retriever = NewsRetriever() 10 | 11 | tweet = tweet_processor.get_tweet(tweetID) 12 | tweet_entities = tweet_processor.extract_entities(tweet) 13 | tweet_sentiment_score = get_tweet_sentiment(tweet["full_text"]) 14 | 15 | if len(tweet_entities) == 0: 16 | return {"relevant_articles": [], "tweet_sentiment_score": tweet_sentiment_score, "wiki_urls": []} 17 | 18 | if tweet["user"]["verified"]: 19 | user_name = tweet["user"]["name"] 20 | else: 21 | user_name = None 22 | 23 | if tweet["place"] is not None: 24 | country = tweet["place"]["country"] 25 | else: 26 | country = None 27 | 28 | news_articles = news_retriever.get_articles(tweet_entities, country, user_name) 29 | relevant_articles = get_relevant_news(tweet, tweet_entities, news_articles, 0) 30 | 31 | wiki_urls = get_wiki_links(tweet_entities) 32 | 33 | response = {"relevant_articles": relevant_articles, 34 | "tweet_sentiment_score": tweet_sentiment_score, 35 | "wiki_urls": wiki_urls} 36 | 37 | return response 38 | 39 | 40 | def get_tweet_sentiment(tweet): 41 | google_lang = GoogleLanguage() 42 | tweet_sentiment_score = google_lang.get_document_sentiment(tweet).score 43 | return tweet_sentiment_score 44 | 45 | 46 | def get_wiki_links(tweet_entities): 47 | 
    # surface at most one Wikipedia link, for the first highly salient entity
    wikipedia_urls = []
    for entity in tweet_entities:
        if entity.salience > 0.5 and "wikipedia_url" in entity.metadata.keys():
            wikipedia_urls.append({"entity_name": entity.name, "wiki_url": entity.metadata["wikipedia_url"]})
            break

    return wikipedia_urls
--------------------------------------------------------------------------------
/twitter_utils.py:
--------------------------------------------------------------------------------
import requests
import base64
import html
import re

import credentials
from google_language import GoogleLanguage
from google_language import ENTITY_TYPES

# set to True to also print sentiment diagnostics during entity extraction
DEBUG_SENTIMENT = False


class TweetProcessor(object):

    def __init__(self):
        self.base_url = "https://api.twitter.com/"

        access_token = self._authorize_twitter()
        self.query_headers = {
            "Authorization": "Bearer {}".format(access_token)
        }

        self.google_lang = GoogleLanguage()

    def _authorize_twitter(self):
        key_secret = "{}:{}".format(credentials.TWITTER_API_KEY,
                                    credentials.TWITTER_API_SECRET).encode("ascii")
        b64_encoded_key = base64.b64encode(key_secret).decode("ascii")

        auth_url = "{}oauth2/token".format(self.base_url)
        auth_headers = {
            "Authorization": "Basic {}".format(b64_encoded_key),
            "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8"
        }
        auth_data = {
            "grant_type": "client_credentials"
        }
        auth_resp = requests.post(auth_url, headers=auth_headers, data=auth_data)
        assert auth_resp.status_code == 200
        return auth_resp.json()["access_token"]

    def extract_entities(self, tweet):
        text = tweet["full_text"]
        # unescape html text
        text = html.unescape(text)
        # remove links
        text = re.sub(r"(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b", "", text)
        # remove hashtags
        text = re.sub(r"#[A-Za-z]+", "", text)
        # remove irrelevant characters (hyphen escaped so it is not read as a range)
        text = re.sub(r"[^a-zA-Z0-9.,?!/$&\"': \-_\n\s]", "", text)
        # collapse repeated whitespace
        text = re.sub(r"\s{2,}", " ", text)

        print("Text: {}".format(text))

        if DEBUG_SENTIMENT:
            sentiment = self.google_lang.get_document_sentiment(text)
            print("Sentiment: {}, {}".format(sentiment.score,
                                             sentiment.magnitude))

            entities = self.google_lang.get_entities_sentiment(text)
            for entity in entities:
                print("Entity: {}".format(entity.name))
                print("Sentiment: {}, {}".format(entity.sentiment.score, entity.sentiment.magnitude))

        entities = self.google_lang.get_entities(text)
        for entity in entities:
            print("Entity: {}".format(entity.name))
            print("Type: {}".format(ENTITY_TYPES[entity.type]))
            print("Salience: {}".format(entity.salience))

        return entities

    def get_tweet(self, tweet_id):
        query_params = {
            "id": tweet_id,
            "tweet_mode": "extended"
        }
        search_url = "{}1.1/statuses/show.json".format(self.base_url)
        search_resp = requests.get(search_url, headers=self.query_headers, params=query_params)
        tweet_data = search_resp.json()
        return tweet_data
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Context News Bot

> Social media algorithms are built around engagement & likes, not perspective, and we aim to fix that with **Context News Bot**.

![](assets/banner.png)

**tl;dr** Context News Bot is a Chrome extension that allows you to step out of your filter bubble by providing a diverse set of news articles and objective Wikipedia entries for tweets on your Twitter timeline. This repository contains the server-side code and works with the client-side [Chrome extension](https://github.com/SuyashLakhotia/ContextNewsBot-Client). Installation instructions can be found [here](#installation).


### Problems

- People struggle to filter information against their own biases, so false (or partially false) information proliferates through modern society, slowly pushing us toward a world with **only polarizing opinions**.

- Today's discovery algorithms trap us in personal **'filter bubbles'** by serving us content that agrees with our inherent biases, regardless of whether those biases are grounded in reality.

### Solution

We built a machine-learning-based solution that allows users to fact-check content on Twitter and gain a broader perspective on news items and events. The service counters bias by understanding the semantics of a tweet's content and suggesting diverse news articles and Wikipedia entries that provide a more rounded, objective perspective.

- Our natural language processing pipeline performs **entity and topic extraction** on the tweets to search for related news articles from reliable sources and discover relevant Wikipedia articles.

- We combine this information with **sentiment analysis** signals extracted from the tweet and the retrieved news articles to select articles that agree with the tweet as well as those that offer a different perspective (a sketch of the scoring follows this list).

- The selected news articles are then **integrated into Twitter's user interface**, where the user can choose to read further about the issue at hand.
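To make the scoring concrete, here is a minimal sketch of the salience-based relevance score implemented in `determine_relevance.py`. The `Entity` tuple is a simplified stand-in for the Google Natural Language API's entity objects, and the example values are made up:

```
from collections import namedtuple

# simplified stand-in for the Google NL API entity type
Entity = namedtuple("Entity", ["name", "type", "salience", "mentions"])

REALLY_IMP_ENTITY_IDX = [1, 2, 3, 4]  # PERSON, LOCATION, ORGANIZATION, EVENT

def relevance_score(tweet_entities, news_entities):
    """Score a news item by how saliently it mentions the tweet's entities."""
    names = [e.name for e in news_entities]
    score = 0.0
    for tweet_entity in tweet_entities:
        if tweet_entity.name not in names:
            continue
        match = news_entities[names.index(tweet_entity.name)]
        # important entity types (people, places, etc.) are boosted by 1.5x
        weight = 1.5 if match.type in REALLY_IMP_ENTITY_IDX else 1.0
        # cap the mention count at 3 so a repeated name cannot dominate
        score += match.salience * weight * min(3, len(match.mentions))
    return score

# example: a PERSON entity shared by the tweet and an article with 2 mentions
tweet_ents = [Entity("Some Politician", 1, 0.9, ["m1"])]
news_ents = [Entity("Some Politician", 1, 0.5, ["m1", "m2"])]
print(relevance_score(tweet_ents, news_ents))  # 0.5 * 1.5 * 2 = 1.5
```

Articles scoring above a threshold are then deduplicated by source, and the top three are returned to the client.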
### Installation

After obtaining (and setting up) the required credentials for the [Twitter API](https://developer.twitter.com/), [News API](https://newsapi.org) & [Google Cloud Natural Language API](https://cloud.google.com/natural-language/), start the server by running:

```
$ pip install -r requirements.txt
$ python server.py
```

Next, clone and install the companion [Chrome extension](https://github.com/SuyashLakhotia/ContextNewsBot-Client) in your Chrome browser. You should now see the Context News Bot button below the tweets on your Twitter timeline!
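The code expects a `credentials.py` module (gitignored above) exposing the API keys and the path to the Google service account file. A minimal sketch — the values here are placeholders, not working keys:

```
# credentials.py (gitignored) -- placeholder values, substitute your own
TWITTER_API_KEY = "your-twitter-api-key"
TWITTER_API_SECRET = "your-twitter-api-secret"
NEWS_API_KEY = "your-news-api-key"
PATH_TO_GOOGLE_JSON = "service-account-file.json"
```

Once the server is up, the extension POSTs a tweet ID to the `/tweet` endpoint. You can also exercise it directly; a sketch using `requests`, with the same example tweet ID as `test.py`:

```
import requests

# ask the local server to build context for a single tweet
resp = requests.post("http://127.0.0.1:5000/tweet", json={"id": 957220637705109505})
print(resp.json())  # {"relevant_articles": [...], "tweet_sentiment_score": ..., "wiki_urls": [...]}
```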
### Screenshots

#### Context News Button

![](assets/screenshot-0.png)

#### Context Panel

![](assets/screenshot-1.png)

![](assets/screenshot-2.png)

![](assets/screenshot-3.png)

### Team

1. [Suyash Lakhotia](https://github.com/SuyashLakhotia)
2. [Chaitanya Joshi](https://github.com/chaitjo)
3. [Nikhil Venkatesh](https://github.com/nikv96)
4. [Bobby Ranjan](https://github.com/bbbranjan)

---

> **NOTE:** Initially built for and during NUS Hack&Roll 2018. Won a Top 8 prize and Most Socially Useful Hack!
--------------------------------------------------------------------------------
/determine_relevance.py:
--------------------------------------------------------------------------------
from google_language import GoogleLanguage
from google_language import REALLY_IMP_ENTITY_IDX

from news_utils import pretty_print_news


google_lang = GoogleLanguage()


def get_relevant_news(tweet, tweet_entities, news_articles, threshold):
    relevant_news_articles = []

    for item in news_articles:
        relevance_score = relevance_score_google(tweet, tweet_entities,
                                                 item["title"] + ". " + item["description"])
        item["relevance_score"] = relevance_score
        if relevance_score >= threshold:
            relevant_news_articles.append(item)

    relevant_news_articles.sort(key=lambda x: x["relevance_score"], reverse=True)

    # keep at most one article per source so the final selection is diverse
    final_articles = []
    sources_covered = []
    for item in relevant_news_articles:
        if item["source"]["id"] not in sources_covered:
            final_articles.append(item)
            sources_covered.append(item["source"]["id"])

    for item in final_articles[:3]:
        news_item = item["title"] + ". " + item["description"]
        sentiment = google_lang.get_document_sentiment(news_item)
        item["sentiment_score"] = sentiment.score

    pretty_print_news(final_articles[:3])

    return final_articles[:3]


def relevance_score_google(tweet, tweet_entities, news_item):
    news_entities_names = []

    entities = google_lang.get_entities(news_item)
    for entity in entities:
        news_entities_names.append(entity.name)

    total_score = 0
    for i in range(len(tweet_entities)):
        if tweet_entities[i].name in news_entities_names:
            idx = news_entities_names.index(tweet_entities[i].name)

            if entities[idx].type in REALLY_IMP_ENTITY_IDX:
                total_score += (entities[idx].salience * 1.5) * min(3, len(entities[idx].mentions))
            else:
                total_score += entities[idx].salience * min(3, len(entities[idx].mentions))

    return total_score
" + item["description"] for item in news_articles] 66 | news_articles_tokenized = [[w.lower() for w in word_tokenize(item)] 67 | for item in news_articles_text] 68 | 69 | dictionary = gensim.corpora.Dictionary(news_articles_tokenized) 70 | corpus = [dictionary.doc2bow(item_tokenized) for item_tokenized in news_articles_tokenized] 71 | tf_idf = gensim.models.TfidfModel(corpus) 72 | sims = gensim.similarities.Similarity("", tf_idf[corpus], 73 | num_features=len(dictionary)) 74 | 75 | tweet_tokenized = [w.lower() for w in word_tokenize(tweet)] 76 | tweet_tokenized_bow = dictionary.doc2bow(tweet_tokenized) 77 | tweet_tokenized_tf_idf = tf_idf[tweet_tokenized_bow] 78 | 79 | relevant_news_articles = [] 80 | for idx, similarity_score in enumerate(sims[tweet_tokenized_tf_idf]): 81 | if similarity_score >= threshold: 82 | news_articles[idx]["relevance_score"] = similarity_score 83 | relevant_news_articles.append(news_articles[idx]) 84 | 85 | return relevant_news_articles 86 | 87 | 88 | def get_relevant_news_cosine(tweet, news_articles, threshold=0.5): 89 | import spacy 90 | 91 | nlp = spacy.load("en_core_web_sm") # need to download: python -m spacy download en_core_web_sm/_md/_lg 92 | news_articles_vectors = [nlp(item["title"] + ". " + item["description"]) for item in news_articles] 93 | tweet_vector = nlp(tweet) 94 | 95 | relevant_news_articles = [] 96 | for idx, item in enumerate(news_articles_vectors): 97 | similarity_score = tweet_vector.similarity(item) 98 | if similarity_score >= threshold: 99 | news_articles[idx]["relevance_score"] = similarity_score 100 | relevant_news_articles.append(news_articles[idx]) 101 | 102 | return relevant_news_articles 103 | --------------------------------------------------------------------------------