├── assets
│   ├── banner.png
│   ├── screenshot-0.png
│   ├── screenshot-1.png
│   ├── screenshot-2.png
│   └── screenshot-3.png
├── .gitignore
├── requirements.txt
├── test.py
├── server.py
├── news_utils.py
├── google_language.py
├── process_tweet.py
├── twitter_utils.py
├── README.md
└── determine_relevance.py
/assets/banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SuyashLakhotia/ContextNewsBot/HEAD/assets/banner.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ENV/
2 | __pycache__/
3 |
4 | credentials.py
5 | service-account-file.json
6 | settings.cfg
7 |
--------------------------------------------------------------------------------
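
`credentials.py` is gitignored. Based on the attributes referenced in `server.py`, `news_utils.py`, and `twitter_utils.py`, it is expected to look like this (a sketch with placeholder values):

```
# credentials.py (gitignored) -- fill in your own keys.
TWITTER_API_KEY = "<your Twitter API key>"
TWITTER_API_SECRET = "<your Twitter API secret>"
NEWS_API_KEY = "<your News API key>"
PATH_TO_GOOGLE_JSON = "service-account-file.json"  # path to the Google service-account file (also gitignored)
```
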
/assets/screenshot-0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SuyashLakhotia/ContextNewsBot/HEAD/assets/screenshot-0.png
--------------------------------------------------------------------------------
/assets/screenshot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SuyashLakhotia/ContextNewsBot/HEAD/assets/screenshot-1.png
--------------------------------------------------------------------------------
/assets/screenshot-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SuyashLakhotia/ContextNewsBot/HEAD/assets/screenshot-2.png
--------------------------------------------------------------------------------
/assets/screenshot-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SuyashLakhotia/ContextNewsBot/HEAD/assets/screenshot-3.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aniso8601==2.0.0
2 | cachetools==2.0.1
3 | certifi==2018.1.18
4 | chardet==3.0.4
5 | click==6.7
6 | Flask==0.12.2
7 | Flask-RESTful==0.3.6
8 | google-api-core==0.1.4
9 | google-auth==1.3.0
10 | google-cloud-language==1.0.0
11 | googleapis-common-protos==1.5.3
12 | grpcio==1.8.4
13 | idna==2.6
14 | itsdangerous==0.24
15 | Jinja2==2.10
16 | MarkupSafe==1.0
17 | newsapi-python==0.0.2
18 | protobuf==3.5.1
19 | pyasn1==0.4.2
20 | pyasn1-modules==0.2.1
21 | pytz==2017.3
22 | requests==2.18.4
23 | rsa==3.4.2
24 | six==1.11.0
25 | urllib3==1.22
26 | Werkzeug==0.14.1
27 | wincertstore==0.2
28 |
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | from twitter_utils import TweetProcessor
2 | from news_utils import NewsRetriever, pretty_print_news
3 | from determine_relevance import get_relevant_news
4 |
5 | tweet_processor = TweetProcessor()
6 | news_retriever = NewsRetriever()
7 |
8 | tweet = tweet_processor.get_tweet(957220637705109505)
9 | tweet_entities = tweet_processor.extract_entities(tweet)
10 |
11 | if tweet["user"]["verified"]:
12 | user_name = tweet["user"]["name"]
13 | else:
14 | user_name = None
15 |
16 | if tweet["place"] is not None:
17 | country = tweet["place"]["country"]
18 | else:
19 | country = None
20 |
21 | news_articles = news_retriever.get_articles(tweet_entities, country, user_name)
22 |
23 | relevant_articles = get_relevant_news(tweet, tweet_entities, news_articles, 0)
24 |
25 | pretty_print_news(relevant_articles)
26 |
--------------------------------------------------------------------------------
/server.py:
--------------------------------------------------------------------------------
1 | import os
2 | import credentials
3 |
4 | # needs to be before `import process_tweet`
5 | os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials.PATH_TO_GOOGLE_JSON
6 |
7 | from flask import Flask, jsonify, request
8 | from flask_restful import Api, Resource
9 |
10 | from process_tweet import process_tweet
11 |
12 |
13 | app = Flask(__name__)
14 | APP_URL = "http://127.0.0.1:5000"
15 |
16 |
17 | class Tweets(Resource):
18 |
19 | def post(self):
20 | data = request.get_json()
21 | if not data:
22 | data = {"response": "ERROR"}
23 | return jsonify(data)
24 | else:
25 | tweetID = data.get("id")
26 | data = process_tweet(tweetID)
27 | return jsonify(data)
28 |
29 | api = Api(app)
30 | api.add_resource(Tweets, "/tweet", endpoint="tweet")
31 |
32 |
33 | if __name__ == "__main__":
34 | app.run(debug=True)
35 |
--------------------------------------------------------------------------------
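
A minimal client sketch for exercising this endpoint, assuming the server is running locally on Flask's default port (the tweet ID is the sample one used in `test.py`):

```
import requests

# POST a tweet ID to the /tweet endpoint and print the JSON response.
resp = requests.post("http://127.0.0.1:5000/tweet",
                     json={"id": 957220637705109505})
print(resp.json())
```
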
/news_utils.py:
--------------------------------------------------------------------------------
1 | from newsapi import NewsApiClient
2 | from google_language import IMP_ENTITY_IDX
3 |
4 | import credentials
5 |
6 |
7 | class NewsRetriever:
8 |
9 | def __init__(self):
10 | self.newsapiClient = NewsApiClient(api_key=credentials.NEWS_API_KEY)
11 | self.list_of_sources = "buzzfeed,bbc-news,fox-news,cnn,the-new-york-times"
12 |
13 | def get_articles(self, entities, country=None, user_name=None):
14 | phrases = []
15 | for entity in entities:
16 | if entity.type in IMP_ENTITY_IDX:
17 | phrases.append(entity.name)
18 |
19 | if country is not None:
20 | phrases.append(country)
21 |
22 | if user_name is not None:
23 | phrases.append(user_name)
24 |
25 |         response = self.newsapiClient.get_everything(q=" ".join("+" + phrase for phrase in phrases),
26 | sources=self.list_of_sources,
27 | language="en",
28 | sort_by="relevancy",
29 | page_size=10)
30 | status = response["status"]
31 | if status != "ok":
32 |             print("News API request failed: {}".format(status))
33 |
34 | return response["articles"]
35 |
36 |
37 | def pretty_print_news(articles):
38 | for i in range(len(articles)):
39 | item = articles[i]
40 | print("\n---\n")
41 | print(str(i) + ". " + str(item["relevance_score"]) + " - " + str(item["sentiment_score"]) + " - " +
42 | str(item["source"]["name"]) + " - " + item["title"] + " - " + item["description"])
43 | print("\n---\n")
44 |
--------------------------------------------------------------------------------
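
To make the query construction above concrete: each extracted phrase is prefixed with News API's `+` (required-term) operator and the phrases are joined into a single query string. A standalone sketch with hypothetical entities:

```
# Hypothetical phrases extracted from a tweet's entities.
phrases = ["North Korea", "Olympics"]
query = " ".join("+" + phrase for phrase in phrases)
print(query)  # +North Korea +Olympics
```
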
/google_language.py:
--------------------------------------------------------------------------------
1 | from google.cloud import language
2 | from google.cloud.language import enums
3 | from google.cloud.language import types
4 |
5 |
6 | ENTITY_TYPES = ["UNKNOWN", "PERSON", "LOCATION", "ORGANIZATION", "EVENT", "WORK_OF_ART", "CONSUMER_GOOD",
7 | "OTHER"]
8 | IMP_ENTITY_IDX = [1, 2, 3, 4, 5, 6]
9 | REALLY_IMP_ENTITY_IDX = [1, 2, 3, 4]
10 |
11 |
12 | class GoogleLanguage(object):
13 |
14 | def __init__(self):
15 | self.client = language.LanguageServiceClient()
16 |
17 | def get_entities(self, text):
18 | document = types.Document(content=text,
19 | type=enums.Document.Type.PLAIN_TEXT)
20 | response = self.client.analyze_entities(document=document,
21 | encoding_type=enums.EncodingType.UTF32)
22 |
23 | for entity in response.entities:
24 |             if entity.mentions[0].type == enums.EntityMention.Type.COMMON:  # halve salience for common-noun mentions
25 | entity.salience = entity.salience * 0.5
26 |
27 | return response.entities
28 |
29 | def get_entities_sentiment(self, text):
30 | document = types.Document(content=text,
31 | type=enums.Document.Type.PLAIN_TEXT)
32 | response = self.client.analyze_entity_sentiment(document=document,
33 | encoding_type=enums.EncodingType.UTF32)
34 | return response.entities
35 |
36 | def get_document_sentiment(self, text):
37 | document = types.Document(content=text,
38 | type=enums.Document.Type.PLAIN_TEXT)
39 | sentiment = self.client.analyze_sentiment(document=document,
40 | encoding_type=enums.EncodingType.UTF32).document_sentiment
41 | return sentiment
42 |
--------------------------------------------------------------------------------
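
A minimal usage sketch for the wrapper above (assumes `GOOGLE_APPLICATION_CREDENTIALS` points at a valid service-account file, as `server.py` sets up; the sample sentence is made up):

```
from google_language import GoogleLanguage, ENTITY_TYPES

lang = GoogleLanguage()
for entity in lang.get_entities("Google was founded in California."):
    # entity.type is an integer enum value that indexes into ENTITY_TYPES
    print(entity.name, ENTITY_TYPES[entity.type], round(entity.salience, 3))
```
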
/process_tweet.py:
--------------------------------------------------------------------------------
1 | from twitter_utils import TweetProcessor
2 | from news_utils import NewsRetriever, pretty_print_news
3 | from determine_relevance import get_relevant_news
4 | from google_language import GoogleLanguage
5 |
6 |
7 | def process_tweet(tweetID):
8 | tweet_processor = TweetProcessor()
9 | news_retriever = NewsRetriever()
10 |
11 | tweet = tweet_processor.get_tweet(tweetID)
12 | tweet_entities = tweet_processor.extract_entities(tweet)
13 | tweet_sentiment_score = get_tweet_sentiment(tweet["full_text"])
14 |
15 | if len(tweet_entities) == 0:
16 | return {"relevant_articles": [], "tweet_sentiment_score": tweet_sentiment_score, "wiki_urls": []}
17 |
18 | if tweet["user"]["verified"]:
19 | user_name = tweet["user"]["name"]
20 | else:
21 | user_name = None
22 |
23 | if tweet["place"] is not None:
24 | country = tweet["place"]["country"]
25 | else:
26 | country = None
27 |
28 | news_articles = news_retriever.get_articles(tweet_entities, country, user_name)
29 | relevant_articles = get_relevant_news(tweet, tweet_entities, news_articles, 0)
30 |
31 | wiki_urls = get_wiki_links(tweet_entities)
32 |
33 | response = {"relevant_articles": relevant_articles,
34 | "tweet_sentiment_score": tweet_sentiment_score,
35 | "wiki_urls": wiki_urls}
36 |
37 | return response
38 |
39 |
40 | def get_tweet_sentiment(tweet):
41 | google_lang = GoogleLanguage()
42 | tweet_sentiment_score = google_lang.get_document_sentiment(tweet).score
43 | return tweet_sentiment_score
44 |
45 |
46 | def get_wiki_links(tweet_entities):
47 | wikipedia_urls = []
48 | for entity in tweet_entities:
49 |         if entity.salience > 0.5 and "wikipedia_url" in entity.metadata:
50 |             wikipedia_urls.append({"entity_name": entity.name, "wiki_url": entity.metadata["wikipedia_url"]})
51 |             break  # only keep the first (most salient) matching entity
52 |
53 | return wikipedia_urls
54 |
--------------------------------------------------------------------------------
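
For reference, the shape of the dictionary returned by `process_tweet` (all values here are illustrative):

```
# Illustrative return value of process_tweet(tweetID); values are made up.
response = {
    "relevant_articles": [            # top 3 articles, sorted by relevance_score
        {
            "source": {"id": "bbc-news", "name": "BBC News"},
            "title": "...",
            "description": "...",
            "relevance_score": 1.42,  # from relevance_score_google()
            "sentiment_score": -0.3,  # Google NL document sentiment, in [-1, 1]
        }
    ],
    "tweet_sentiment_score": 0.1,     # sentiment of the tweet itself
    "wiki_urls": [
        {"entity_name": "...", "wiki_url": "https://en.wikipedia.org/wiki/..."}
    ],
}
```
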
/twitter_utils.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import base64
3 | import html
4 | import re
5 |
6 | import credentials
7 | from google_language import GoogleLanguage
8 | from google_language import ENTITY_TYPES
9 |
10 |
11 | class TweetProcessor(object):
12 |
13 | def __init__(self):
14 | self.base_url = "https://api.twitter.com/"
15 |
16 | access_token = self._authorize_twitter()
17 | self.query_headers = {
18 | "Authorization": "Bearer {}".format(access_token)
19 | }
20 |
21 | self.google_lang = GoogleLanguage()
22 |
23 | def _authorize_twitter(self):
24 | key_secret = "{}:{}".format(credentials.TWITTER_API_KEY,
25 | credentials.TWITTER_API_SECRET).encode("ascii")
26 | b64_encoded_key = base64.b64encode(key_secret).decode("ascii")
27 |
28 | auth_url = "{}oauth2/token".format(self.base_url)
29 | auth_headers = {
30 | "Authorization": "Basic {}".format(b64_encoded_key),
31 | "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8"
32 | }
33 | auth_data = {
34 | "grant_type": "client_credentials"
35 | }
36 | auth_resp = requests.post(auth_url, headers=auth_headers, data=auth_data)
37 | assert auth_resp.status_code == 200
38 | return auth_resp.json()["access_token"]
39 |
40 | def extract_entities(self, tweet):
41 | text = tweet["full_text"]
42 | # unescape html text
43 | text = html.unescape(text)
44 | # remove links
45 | text = re.sub(r"(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b", "", text)
46 | # remove hashtags
47 | text = re.sub(r"#[A-Za-z]+", "", text)
48 | # remove irrelevant characters
49 |         text = re.sub(r"[^a-zA-Z0-9.,?!/$&\"':_\s-]", "", text)  # hyphen placed last so it is literal, not a range
50 | # remove repeated whitespaces
51 | text = re.sub(r"\s{2,}", " ", text)
52 |
53 | print("Text: {}".format(text))
54 |
55 |         if False:  # debug: document-level sentiment (disabled)
56 |             sentiment = self.google_lang.get_document_sentiment(text)
57 |             print("Sentiment: {}, {}".format(sentiment.score,
58 |                                              sentiment.magnitude))
59 | 
60 |         if False:  # debug: per-entity sentiment (disabled)
61 |             entities = self.google_lang.get_entities_sentiment(text)
62 |             for entity in entities:
63 |                 print("Entity: {}".format(entity.name))
64 |                 print("Sentiment: {}, {}".format(entity.sentiment.score, entity.sentiment.magnitude))
65 |
66 | entities = self.google_lang.get_entities(text)
67 | for entity in entities:
68 | print("Entity: {}".format(entity.name))
69 | print("Type: {}".format(ENTITY_TYPES[entity.type]))
70 | print("Salience: {}".format(entity.salience))
71 |
72 | return entities
73 |
74 | def get_tweet(self, tweet_id):
75 | query_params = {
76 | "id": tweet_id,
77 | "tweet_mode": "extended"
78 | }
79 | search_url = "{}1.1/statuses/show.json".format(self.base_url)
80 | search_resp = requests.get(search_url, headers=self.query_headers, params=query_params)
81 | tweet_data = search_resp.json()
82 | return tweet_data
83 |
--------------------------------------------------------------------------------
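
To make the cleaning steps in `extract_entities` concrete, a standalone sketch that applies the same substitutions to a made-up sample tweet:

```
import html
import re

text = "Breaking: B&amp;B prices up!! #economy https://t.co/abc123"
text = html.unescape(text)                                            # "B&amp;B" -> "B&B"
text = re.sub(r"(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b", "", text)  # strip links
text = re.sub(r"#[A-Za-z]+", "", text)                                # strip hashtags
text = re.sub(r"[^a-zA-Z0-9.,?!/$&\"':_\s-]", "", text)               # strip irrelevant characters
text = re.sub(r"\s{2,}", " ", text)                                   # collapse repeated whitespace
print(text.strip())  # Breaking: B&B prices up!!
```
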
/README.md:
--------------------------------------------------------------------------------
1 | # Context News Bot
2 |
3 | > Social media algorithms are built around engagement & likes, not perspective, and we aim to fix that with **Context News Bot**.
4 |
5 | 
6 |
7 | **tl;dr** Context News Bot is a Chrome extension that allows you to step out of your filter bubble by providing a diverse set of news articles and objective Wikipedia entries for tweets on your Twitter timeline. This repository contains the server-side code and works with the client-side [Chrome extension](https://github.com/SuyashLakhotia/ContextNewsBot-Client). Installation instructions can be found [here](#installation).
8 |
9 |
10 | ### Problems
11 |
12 | - People's inability to filter information against their own biases has allowed false (or partly false) information to proliferate through modern society, slowly leading us toward a world of **only polarizing opinions**.
13 |
14 | - Today's discovery algorithms trap us in personal **'filter bubbles'** by serving us content that agrees with our inherent biases, regardless of whether those biases are grounded in reality.
15 |
16 | ### Solution
17 |
18 | We built a machine-learning-based solution that lets users fact-check content on Twitter and gain a broader perspective on news items and events. The service counteracts bias by understanding the semantics of a tweet's content and suggesting diverse news articles and Wikipedia entries that aim to provide a more rounded and objective perspective.
19 |
20 | - Our natural language processing pipeline performs **entity and topic extraction** on the tweets to search for related news articles from reliable sources and discover relevant Wikipedia articles.
21 |
22 | - We combine this information with **sentiment analysis** signals extracted from the tweet and the retrieved news articles to select articles that agree with the tweet as well as those that have a different perspective.
23 |
24 | - The selected news articles are then **integrated into Twitter's user interface**, where the user can choose to read further about the issue at hand.
25 |
26 | ### Installation
27 |
28 | After obtaining (and setting up) the required credentials for the [Twitter API](https://developer.twitter.com/), [News API](https://newsapi.org) & [Google Cloud Natural Language API](https://cloud.google.com/natural-language/), start the server by running:
29 |
30 | ```
31 | $ pip install -r requirements.txt
32 | $ python server.py
33 | ```
34 |
35 | Next, clone and install the companion [Chrome extension](https://github.com/SuyashLakhotia/ContextNewsBot-Client) on your computer's Chrome browser. You should now see the Context News Bot button below the tweets on your Twitter timeline!
36 |
37 | ### Screenshots
38 |
39 | #### Context News Button
40 |
41 |
42 |
43 | #### Context Panel
44 |
45 |
46 |
47 |
48 |
49 | ### Team
50 |
51 | 1. [Suyash Lakhotia](https://github.com/SuyashLakhotia)
52 | 2. [Chaitanya Joshi](https://github.com/chaitjo)
53 | 3. [Nikhil Venkatesh](https://github.com/nikv96)
54 | 4. [Bobby Ranjan](https://github.com/bbbranjan)
55 |
56 | ---
57 |
58 | > **NOTE:** Initially built for and during NUS Hack&Roll 2018. Won a Top 8 prize and Most Socially Useful Hack!
59 |
--------------------------------------------------------------------------------
/determine_relevance.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 | from google_language import GoogleLanguage
4 | from google_language import REALLY_IMP_ENTITY_IDX
5 |
6 | from news_utils import pretty_print_news
7 |
8 |
9 | google_lang = GoogleLanguage()
10 |
11 |
12 | def get_relevant_news(tweet, tweet_entities, news_articles, threshold):
13 | relevant_news_articles = []
14 |
15 | for item in news_articles:
16 |         relevance_score = relevance_score_google(tweet, tweet_entities,
17 |                                                  item["title"] + ". " + (item["description"] or ""))
18 | item["relevance_score"] = relevance_score
19 | if relevance_score >= threshold:
20 | relevant_news_articles.append(item)
21 |
22 | relevant_news_articles.sort(key=lambda x: x["relevance_score"], reverse=True)
23 |
24 | final_articles = []
25 | sources_covered = []
26 | for item in relevant_news_articles:
27 | if item["source"]["id"] not in sources_covered:
28 | final_articles.append(item)
29 | sources_covered.append(item["source"]["id"])
30 |
31 | for item in final_articles[:3]:
32 |         news_item = item["title"] + ". " + (item["description"] or "")
33 | sentiment = google_lang.get_document_sentiment(news_item)
34 | item["sentiment_score"] = sentiment.score
35 |
36 | pretty_print_news(final_articles[:3])
37 |
38 | return final_articles[:3]
39 |
40 |
41 | def relevance_score_google(tweet, tweet_entities, news_item):
42 | news_entities_names = []
43 |
44 | entities = google_lang.get_entities(news_item)
45 | for entity in entities:
46 | news_entities_names.append(entity.name)
47 |
48 | total_score = 0
49 |     for tweet_entity in tweet_entities:
50 |         if tweet_entity.name in news_entities_names:
51 |             idx = news_entities_names.index(tweet_entity.name)
52 |
53 | if entities[idx].type in REALLY_IMP_ENTITY_IDX:
54 | total_score += (entities[idx].salience * 1.5) * min(3, len(entities[idx].mentions))
55 | else:
56 | total_score += entities[idx].salience * min(3, len(entities[idx].mentions))
57 |
58 | return total_score
59 |
60 |
61 | def get_relevant_news_tfidf(tweet, news_articles, threshold=0.5):
62 | import gensim
63 | from nltk.tokenize import word_tokenize
64 |
65 |     news_articles_text = [item["title"] + ". " + (item["description"] or "") for item in news_articles]
66 | news_articles_tokenized = [[w.lower() for w in word_tokenize(item)]
67 | for item in news_articles_text]
68 |
69 | dictionary = gensim.corpora.Dictionary(news_articles_tokenized)
70 | corpus = [dictionary.doc2bow(item_tokenized) for item_tokenized in news_articles_tokenized]
71 | tf_idf = gensim.models.TfidfModel(corpus)
72 |     sims = gensim.similarities.Similarity(None, tf_idf[corpus],  # None: gensim stores the index in a temp file
73 |                                           num_features=len(dictionary))
74 |
75 | tweet_tokenized = [w.lower() for w in word_tokenize(tweet)]
76 | tweet_tokenized_bow = dictionary.doc2bow(tweet_tokenized)
77 | tweet_tokenized_tf_idf = tf_idf[tweet_tokenized_bow]
78 |
79 | relevant_news_articles = []
80 | for idx, similarity_score in enumerate(sims[tweet_tokenized_tf_idf]):
81 | if similarity_score >= threshold:
82 | news_articles[idx]["relevance_score"] = similarity_score
83 | relevant_news_articles.append(news_articles[idx])
84 |
85 | return relevant_news_articles
86 |
87 |
88 | def get_relevant_news_cosine(tweet, news_articles, threshold=0.5):
89 | import spacy
90 |
91 | nlp = spacy.load("en_core_web_sm") # need to download: python -m spacy download en_core_web_sm/_md/_lg
92 |     news_articles_vectors = [nlp(item["title"] + ". " + (item["description"] or "")) for item in news_articles]
93 | tweet_vector = nlp(tweet)
94 |
95 | relevant_news_articles = []
96 | for idx, item in enumerate(news_articles_vectors):
97 | similarity_score = tweet_vector.similarity(item)
98 | if similarity_score >= threshold:
99 | news_articles[idx]["relevance_score"] = similarity_score
100 | relevant_news_articles.append(news_articles[idx])
101 |
102 | return relevant_news_articles
103 |
--------------------------------------------------------------------------------
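
To make the scoring rule in `relevance_score_google` concrete: each entity shared between the tweet and a news item contributes its salience times `min(3, mentions)`, with a 1.5x boost when the entity type is in `REALLY_IMP_ENTITY_IDX` (person, location, organization, or event). A worked example with made-up numbers:

```
# A hypothetical shared ORGANIZATION entity with salience 0.4, mentioned 5 times.
salience, mentions, entity_type = 0.4, 5, 3  # 3 = ORGANIZATION
REALLY_IMP_ENTITY_IDX = [1, 2, 3, 4]

boost = 1.5 if entity_type in REALLY_IMP_ENTITY_IDX else 1.0
score = (salience * boost) * min(3, mentions)  # mention count is capped at 3
print(score)  # ~1.8
```
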