├── .gitignore
├── LICENSE
├── README.md
├── link_counter.py
├── mufos.py
├── related_hashtags.py
├── test.py
└── twint_utils
    └── tweets
        └── media_downloader.py

/.gitignore:
--------------------------------------------------------------------------------
.idea/
__pycache__/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 TWINT Project

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# twint-utils
These are utilities built on Twint. Please check out our collaboration guidelines if you would like to contribute.

## Mutuals Detector
mufos.py uses Twint to detect a given seed account's mutual followers. It can be incorporated into other, more elaborate pipelines for social-network or milieu detection. It is currently too slow to be useful for accounts with very large follower/following counts. Here is a [hosted version](https://colab.research.google.com/drive/1AOXQxkOWbq7KEHWVBRiOrYhTOSg3QTqq) that you can open in playground mode to play with; there is also a short usage sketch at the end of this README.

## Media Downloader
Accepts `twint.output.tweets_list` as an argument.
```python
from twint_utils.tweets import media_downloader

tweets = twint.output.tweets_list
location = "./"
media_downloader.download_photos(tweets, location)
media_downloader.download_videos(tweets, location)
```

## Link Counter (link_counter.py)
This script takes a list of Twitter usernames, iterates over them to find tweets where they shared links, sums up the base URLs of everyone's links combined, and turns the result into a matplotlib bar graph. Please check the code documentation for usage guidance. The code takes a while to run depending on your tweet limit and how many accounts you pull. Non-coder-friendly hosted version [here](https://colab.research.google.com/drive/1AGgt2Qm2LThNAKeBsnbKRXgWgPc9kFN9).

## Related Hashtags Detector (related_hashtags.py)
This script finds the other hashtags most commonly found alongside a given hashtag and creates a bar graph of them. This can be used to track which hashtags a disinformation campaign or story is spreading alongside. Non-coder-friendly hosted version [here](https://colab.research.google.com/drive/1dNSxohTBgNox0IiaGwqv66eFyxKBABHx).
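
## Mutuals Detector usage sketch
A minimal sketch of how the Mutuals Detector can be called from your own code, as referenced above. It assumes mufos.py sits in your working directory and that its command-line part only runs under `if __name__ == "__main__":`; the username shown is just a placeholder.
```python
from mufos import mutuals  # assumes mufos.py is importable from the working directory

seed = "jack"  # any public username, without the @ sign
mufos_list = mutuals(seed)
print(seed + " has " + str(len(mufos_list)) + " mutual followers")
```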
--------------------------------------------------------------------------------
/link_counter.py:
--------------------------------------------------------------------------------
# This code takes a list of twitter usernames, iterates over them to find tweets where they shared links,
# then sums up the base URLs of everyone's links combined and turns the result into a matplotlib bar graph.
# There is plenty of documentation in the code itself to help you use it.
# The code does take a while to run depending on your tweet limit and how many accounts you pull.

import csv
import os
from urllib.parse import urlparse

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import twint  # you may need to install this first if you haven't!

# this prevents async problems / runtime errors
# https://markhneedham.com/blog/2019/05/10/jupyter-runtimeerror-this-event-loop-is-already-running/
import nest_asyncio
nest_asyncio.apply()

plt.rcdefaults()

# put accounts in between the brackets, comma separated, without the @ sign, e.g. ["jack", "realDonaldtrump", "Blacksocialists"]
sourceAccounts = ["PUT YOUR ACCOUNTS HERE", "DIRECTIONS ABOVE"]


if not os.path.isfile('all_urls.csv'):
    with open('all_urls.csv', 'w', newline='') as f:
        pass

for username in sourceAccounts:
    c = twint.Config()
    print("pulling tweets for " + str(username) + "...")
    c.Username = username
    c.Hide_output = True  # makes the command line less noisy
    c.Limit = 500  # maximum number of tweets to pull per account
    c.Store_object = True
    # only selects tweets that have links
    c.Links = "include"

    baseURLs = []
    twint.output.tweets_list = []  # clear twint's module-level results so earlier accounts' tweets aren't recounted
    twint.run.Search(c)
    tweets = twint.output.tweets_list
    for tweet in tweets:
        # urls is an attribute of the twint tweet objects; to see all attributes: dir(tweet)
        for URL in tweet.urls:
            parsed_uri = urlparse(URL)
            baseURL = str('{uri.netloc}'.format(uri=parsed_uri))  # gets the base URL
            if baseURL[:7] == 'twitter':  # ignores links back to twitter.com (e.g. RTs)
                pass
            elif baseURL[:4] == "www.":  # strips www for a e s t h e t i c
                baseURLs.append([username, baseURL[4:]])
            else:
                baseURLs.append([username, baseURL])

    # Results are written out after each account in case pulling gets slow, so you can stop at any point
    # and just edit your sourceAccounts list to remove the ones you've already done.
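    # The CSV is opened in append ('a') mode below, so links accumulate across runs;
    # delete all_urls.csv first if you want a completely fresh count.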
    with open('all_urls.csv', 'a', newline='') as f:
        writer = csv.writer(f)
        for baseURL in baseURLs:
            writer.writerow(baseURL)


all_urls = pd.read_csv('all_urls.csv', names=['username', 'URL'])

print("total links pulled: " + str(len(all_urls)))

countedURLs = all_urls['URL'].value_counts()
countedURLs.to_csv('countedURLs.csv')

top_urls = countedURLs.iloc[:10]
top_urls = top_urls[::-1]  # reversed so the most frequent link ends up at the top of the bar chart

y_pos = np.arange(len(top_urls))
performance = top_urls
print(performance)
baseURLs = top_urls.index
print(baseURLs)
plt.barh(y_pos, performance, align='center', alpha=0.5)
plt.yticks(y_pos, baseURLs)
plt.xlabel('Frequency of Links')
plt.title('Most Frequent External Links of all Handles Tested')

plt.show()
--------------------------------------------------------------------------------
/mufos.py:
--------------------------------------------------------------------------------
# This function detects mutual followers for a seed twitter account and
# can be incorporated into other, more elaborate pipelines for social networks or milieu detection.
# It is too slow to be useful right now for accounts with very large follower/following counts.
# Here is a hosted version that you can open in playground mode to play with:
# https://colab.research.google.com/drive/1AOXQxkOWbq7KEHWVBRiOrYhTOSg3QTqq

# install twint: pip3 install twint
# write the seed username below
import twint

username = "CHOOSE TARGET USERNAME HERE NO @ SIGN"


def mutuals(username):
    c = twint.Config()
    c.Hide_output = True
    c.Username = username
    c.Pandas = True
    print("finding followers...(ignore errors)")

    twint.run.Followers(c)

    Followers_df = twint.storage.panda.Follow_df
    list_of_followers = Followers_df['followers'][username]
    print("finding following...(ignore errors)")

    c.Pandas = True
    twint.run.Following(c)

    Following_df = twint.storage.panda.Follow_df
    list_of_following = Following_df['following'][username]
    print("finding mutuals...")

    def intersection(lst1, lst2):
        return list(set(lst1) & set(lst2))

    mufos = intersection(list_of_followers, list_of_following)
    return mufos


if __name__ == "__main__":
    mufos = mutuals(username)

    print(str(username) + "'s account has " + str(len(mufos)) + " mutual followers. Here they are:")
    print(mufos)
--------------------------------------------------------------------------------
/related_hashtags.py:
--------------------------------------------------------------------------------
# Related Hashtags Detector

# This script finds the other hashtags that are most commonly found with a given hashtag
# and creates a bar graph of them. It can be used to track which hashtags a disinformation
# campaign or story is spreading alongside. Be patient: it takes a while to pull the tweets,
# especially if you have a high limit.

seed_hashtag = "#Elections2019"  # change this to whatever seed hashtag you want
limit = 500  # this changes the number of tweets to pull

import twint  # may need to install first
import heapq
import matplotlib.pyplot as plt


print("pulling tweets... please wait...")
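# twint searches for tweets containing the seed hashtag; because Store_object is enabled,
# the results accumulate in twint.output.tweets_list, which the counting loop below reads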
c = twint.Config()
c.Hide_output = True  # makes the command line less noisy
c.Limit = limit  # maximum number of tweets to pull
c.Store_object = True
c.Search = seed_hashtag
twint.run.Search(c)
tweets = twint.output.tweets_list

# counts occurrences of hashtags
hashtags_dict = {}
for tweet in tweets:
    for hashtag in tweet.hashtags:
        if hashtag in hashtags_dict:
            hashtags_dict[hashtag] += 1
        else:
            hashtags_dict[hashtag] = 1

hashtags_dict.pop(seed_hashtag, None)  # gets rid of the seed hashtag itself; pop avoids a KeyError if it was stored under a different spelling
top_hashtags = heapq.nlargest(10, hashtags_dict, key=hashtags_dict.get)  # gets the most frequent hashtags

# makes a dictionary of just the most frequent ones
hashtags_ranked = {}
for hashtag in top_hashtags:
    hashtags_ranked[hashtag] = hashtags_dict[hashtag]

print("There will now be a pop-up with the bar chart.")
plt.barh(range(len(hashtags_ranked)), list(hashtags_ranked.values()), align='center', color='maroon')
plt.yticks(range(len(hashtags_ranked)), list(hashtags_ranked.keys()))
plt.gca().invert_yaxis()  # just to have the highest bar at the top
plt.title("Most Related Hashtags to " + seed_hashtag)
plt.show()
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
import twint

from twint_utils.tweets import media_downloader

account = "twitter"


def get_tweets(target):
    c = twint.Config()
    c.Username = target
    c.Store_object = True
    c.Hide_output = False
    c.Media = True
    twint.run.Search(c)
    return twint.output.tweets_list


if __name__ == "__main__":
    # media_downloader.download_photos(get_tweets(account), ".")
    media_downloader.download_videos(get_tweets(account), ".")
--------------------------------------------------------------------------------
/twint_utils/tweets/media_downloader.py:
--------------------------------------------------------------------------------
# Author: Philip Woldhek (11philip22)

from pathlib import Path
from re import compile as re_compile  # aliased so the built-in compile() is not shadowed
from time import sleep
from urllib.parse import urlparse

import youtube_dl
from bs4 import BeautifulSoup
from requests import get


def get_soup(html):
    if html is not None:
        soup = BeautifulSoup(html, 'lxml')
        return soup
    else:
        return None


def photo_downloader(urls, download_location):
    headers = {
        'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/74.0.3729.169 Safari/537.36'}
    for tweet in urls:
        try:
            result = get(tweet, headers=headers)  # headers must be passed as a keyword argument
        except Exception:
            continue
        if result.status_code == 200:
            content = result.content
            soup = get_soup(content)
            for link in soup.findAll('img', attrs={'src': re_compile("^https://pbs.twimg.com/media")}):
                photo_url = link['src']
                url_obj = urlparse(photo_url)
                file_name = url_obj.path.replace("/media/", "")
                path = str(Path(download_location, file_name))
                if not Path(path).is_file():
                    with open(path, "wb") as file:
                        file.write(get(photo_url).content)
        else:
            continue


def video_downloader(urls, download_location):
    for tweet in urls:
        try:
            download_path = str(Path(download_location, "%(id)s.%(ext)s"))
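            # %(id)s and %(ext)s are youtube-dl output-template fields, so each video is
            # saved under its own id with whatever file extension the downloader chooses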
            ydl_opts = {
                "outtmpl": download_path,
                "quiet": True,
            }
            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                ydl.download([tweet, ])
        except Exception:
            continue
        if len(urls) > 200:
            sleep(2)


def sorter(tweets_obj):
    photo_urls = []
    video_urls = []
    for item in tweets_obj:
        url = "https://twitter.com/statuses/{0}".format(item.id)
        if item.photos:
            photo_urls.append(url)
        if item.video:
            video_urls.append(url)
    return photo_urls, video_urls


def get_photo_urls(tweets_obj):
    photo_urls = []
    for item in tweets_obj:
        url = "https://twitter.com/statuses/{0}".format(item.id)
        if item.photos:
            photo_urls.append(url)
    return photo_urls


def get_video_urls(tweets_obj):
    video_urls = []
    for item in tweets_obj:
        url = "https://twitter.com/statuses/{0}".format(item.id)
        if item.video:
            video_urls.append(url)
    return video_urls


def download_photos(tweets_obj, download_location):
    photo_urls = get_photo_urls(tweets_obj)
    photo_downloader(photo_urls, download_location)


def download_videos(tweets_obj, download_location):
    video_urls = get_video_urls(tweets_obj)
    video_downloader(video_urls, download_location)
--------------------------------------------------------------------------------