├── .gitignore
├── LICENSE
├── README.md
├── link_counter.py
├── mufos.py
├── related_hashtags.py
├── test.py
└── twint_utils
    └── tweets
        └── media_downloader.py

/.gitignore:
--------------------------------------------------------------------------------
.idea/
__pycache__/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 TWINT Project

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# twint-utils
These are utilities built on Twint. Please check out our collaboration guidelines if you would like to contribute.

## Mutuals Detector
mufos.py uses Twint to detect a given seed account's mutual followers. It can be incorporated into other, more elaborate pipelines for social-network or milieu detection. It is currently too slow to be useful for accounts with very large follower/following counts. Here is a [hosted version](https://colab.research.google.com/drive/1AOXQxkOWbq7KEHWVBRiOrYhTOSg3QTqq) that you can open in playground mode to play with; there is also a short usage sketch at the end of this README.

## Media Downloader
Accepts `twint.output.tweets_list` as an argument.
```python
from twint_utils.tweets import media_downloader

tweets = twint.output.tweets_list
location = "./"
media_downloader.download_photos(tweets, location)
media_downloader.download_videos(tweets, location)
```

## Link Counter (link_counter.py)
This script takes a list of Twitter usernames, iterates over them to find tweets where they shared links, sums up the base URLs of everyone's links combined, and turns the result into a matplotlib bar graph. Please check the code documentation for usage guidance. The code takes a while to run depending on your tweet limit and how many accounts you pull. Non-coder-friendly hosted version [here](https://colab.research.google.com/drive/1AGgt2Qm2LThNAKeBsnbKRXgWgPc9kFN9).

## Related Hashtags Detector (related_hashtags.py)
This script finds the other hashtags most commonly found alongside a given hashtag and creates a bar graph of them. This can be used to track which hashtags a disinformation campaign or story is spreading alongside. Non-coder-friendly hosted version [here](https://colab.research.google.com/drive/1dNSxohTBgNox0IiaGwqv66eFyxKBABHx).
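
## Mutuals Detector usage sketch
A minimal sketch of how the Mutuals Detector can be called from your own code, as referenced above. It assumes mufos.py sits in your working directory and that its command-line part only runs under `if __name__ == "__main__":`; the username shown is just a placeholder.
```python
from mufos import mutuals  # assumes mufos.py is importable from the working directory

seed = "jack"  # any public username, without the @ sign
mufos_list = mutuals(seed)
print(seed + " has " + str(len(mufos_list)) + " mutual followers")
```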
--------------------------------------------------------------------------------
/link_counter.py:
--------------------------------------------------------------------------------
# This code takes a list of twitter usernames, iterates over them to find tweets where they shared links,
# then sums up the base URLs of everyone's links combined and turns the result into a matplotlib bar graph.
# There is plenty of documentation in the code itself to help you use it.
# The code does take a while to run depending on your tweet limit and how many accounts you pull.

import csv
import os
from urllib.parse import urlparse

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import twint  # you may need to install this first if you haven't!

# this prevents async problems / runtime errors
# https://markhneedham.com/blog/2019/05/10/jupyter-runtimeerror-this-event-loop-is-already-running/
import nest_asyncio
nest_asyncio.apply()

plt.rcdefaults()

# put accounts in between the brackets, comma separated, without the @ sign, e.g. ["jack", "realDonaldtrump", "Blacksocialists"]
sourceAccounts = ["PUT YOUR ACCOUNTS HERE", "DIRECTIONS ABOVE"]


if not os.path.isfile('all_urls.csv'):
    with open('all_urls.csv', 'w', newline='') as f:
        pass

for username in sourceAccounts:
    c = twint.Config()
    print("pulling tweets for " + str(username) + "...")
    c.Username = username
    c.Hide_output = True  # makes the command line less noisy
    c.Limit = 500  # maximum number of tweets to pull per account
    c.Store_object = True
    # only selects tweets that have links
    c.Links = "include"

    baseURLs = []
    twint.output.tweets_list = []  # clear twint's module-level results so earlier accounts' tweets aren't recounted
    twint.run.Search(c)
    tweets = twint.output.tweets_list
    for tweet in tweets:
        # urls is an attribute of the twint tweet objects; to see all attributes: dir(tweet)
        for URL in tweet.urls:
            parsed_uri = urlparse(URL)
            baseURL = str('{uri.netloc}'.format(uri=parsed_uri))  # gets the base URL
            if baseURL[:7] == 'twitter':  # ignores links back to twitter.com (e.g. RTs)
                pass
            elif baseURL[:4] == "www.":  # strips www for a e s t h e t i c
                baseURLs.append([username, baseURL[4:]])
            else:
                baseURLs.append([username, baseURL])

    # Results are written out after each account in case pulling gets slow, so you can stop at any point
    # and just edit your sourceAccounts list to remove the ones you've already done.
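    # The CSV is opened in append ('a') mode below, so links accumulate across runs;
    # delete all_urls.csv first if you want a completely fresh count.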
    with open('all_urls.csv', 'a', newline='') as f:
        writer = csv.writer(f)
        for baseURL in baseURLs:
            writer.writerow(baseURL)


all_urls = pd.read_csv('all_urls.csv', names=['username', 'URL'])

print("total links pulled: " + str(len(all_urls)))

countedURLs = all_urls['URL'].value_counts()
countedURLs.to_csv('countedURLs.csv')

top_urls = countedURLs.iloc[:10]
top_urls = top_urls[::-1]  # reversed so the most frequent link ends up at the top of the bar chart

y_pos = np.arange(len(top_urls))
performance = top_urls
print(performance)
baseURLs = top_urls.index
print(baseURLs)
plt.barh(y_pos, performance, align='center', alpha=0.5)
plt.yticks(y_pos, baseURLs)
plt.xlabel('Frequency of Links')
plt.title('Most Frequent External Links of all Handles Tested')

plt.show()
--------------------------------------------------------------------------------
/mufos.py:
--------------------------------------------------------------------------------
# This function detects mutual followers for a seed twitter account and
# can be incorporated into other, more elaborate pipelines for social networks or milieu detection.
# It is too slow to be useful right now for accounts with very large follower/following counts.
# Here is a hosted version that you can open in playground mode to play with:
# https://colab.research.google.com/drive/1AOXQxkOWbq7KEHWVBRiOrYhTOSg3QTqq

# install twint: pip3 install twint
# write the seed username below
import twint

username = "CHOOSE TARGET USERNAME HERE NO @ SIGN"


def mutuals(username):
    c = twint.Config()
    c.Hide_output = True
    c.Username = username
    c.Pandas = True
    print("finding followers...(ignore errors)")

    twint.run.Followers(c)

    Followers_df = twint.storage.panda.Follow_df
    list_of_followers = Followers_df['followers'][username]
    print("finding following...(ignore errors)")

    c.Pandas = True
    twint.run.Following(c)

    Following_df = twint.storage.panda.Follow_df
    list_of_following = Following_df['following'][username]
    print("finding mutuals...")

    def intersection(lst1, lst2):
        return list(set(lst1) & set(lst2))

    mufos = intersection(list_of_followers, list_of_following)
    return mufos


if __name__ == "__main__":
    mufos = mutuals(username)

    print(str(username) + "'s account has " + str(len(mufos)) + " mutual followers. Here they are:")
    print(mufos)
--------------------------------------------------------------------------------
/related_hashtags.py:
--------------------------------------------------------------------------------
# Related Hashtags Detector

# This script finds the other hashtags that are most commonly found with a given hashtag
# and creates a bar graph of them. It can be used to track which hashtags a disinformation
# campaign or story is spreading alongside. Be patient: it takes a while to pull the tweets,
# especially if you have a high limit.

seed_hashtag = "#Elections2019"  # change this to whatever seed hashtag you want
limit = 500  # this changes the number of tweets to pull

import twint  # may need to install first
import heapq
import matplotlib.pyplot as plt


print("pulling tweets... please wait...")
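# twint searches for tweets containing the seed hashtag; because Store_object is enabled,
# the results accumulate in twint.output.tweets_list, which the counting loop below reads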
c = twint.Config()
c.Hide_output = True  # makes the command line less noisy
c.Limit = limit  # maximum number of tweets to pull
c.Store_object = True
c.Search = seed_hashtag
twint.run.Search(c)
tweets = twint.output.tweets_list

# counts occurrences of hashtags
hashtags_dict = {}
for tweet in tweets:
    for hashtag in tweet.hashtags:
        if hashtag in hashtags_dict:
            hashtags_dict[hashtag] += 1
        else:
            hashtags_dict[hashtag] = 1

hashtags_dict.pop(seed_hashtag, None)  # gets rid of the seed hashtag itself; pop avoids a KeyError if it was stored under a different spelling
top_hashtags = heapq.nlargest(10, hashtags_dict, key=hashtags_dict.get)  # gets the most frequent hashtags

# makes a dictionary of just the most frequent ones
hashtags_ranked = {}
for hashtag in top_hashtags:
    hashtags_ranked[hashtag] = hashtags_dict[hashtag]

print("There will now be a pop-up with the bar chart.")
plt.barh(range(len(hashtags_ranked)), list(hashtags_ranked.values()), align='center', color='maroon')
plt.yticks(range(len(hashtags_ranked)), list(hashtags_ranked.keys()))
plt.gca().invert_yaxis()  # just to have the highest bar at the top
plt.title("Most Related Hashtags to " + seed_hashtag)
plt.show()
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
import twint

from twint_utils.tweets import media_downloader

account = "twitter"


def get_tweets(target):
    c = twint.Config()
    c.Username = target
    c.Store_object = True
    c.Hide_output = False
    c.Media = True
    twint.run.Search(c)
    return twint.output.tweets_list


if __name__ == "__main__":
    # media_downloader.download_photos(get_tweets(account), ".")
    media_downloader.download_videos(get_tweets(account), ".")
--------------------------------------------------------------------------------
/twint_utils/tweets/media_downloader.py:
--------------------------------------------------------------------------------
# Author: Philip Woldhek (11philip22)

from pathlib import Path
from re import compile as re_compile  # aliased so the built-in compile() is not shadowed
from time import sleep
from urllib.parse import urlparse

import youtube_dl
from bs4 import BeautifulSoup
from requests import get


def get_soup(html):
    if html is not None:
        soup = BeautifulSoup(html, 'lxml')
        return soup
    else:
        return None


def photo_downloader(urls, download_location):
    headers = {
        'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/74.0.3729.169 Safari/537.36'}
    for tweet in urls:
        try:
            result = get(tweet, headers=headers)  # headers must be passed as a keyword argument
        except Exception:
            continue
        if result.status_code == 200:
            content = result.content
            soup = get_soup(content)
            for link in soup.findAll('img', attrs={'src': re_compile("^https://pbs.twimg.com/media")}):
                photo_url = link['src']
                url_obj = urlparse(photo_url)
                file_name = url_obj.path.replace("/media/", "")
                path = str(Path(download_location, file_name))
                if not Path(path).is_file():
                    with open(path, "wb") as file:
                        file.write(get(photo_url).content)
        else:
            continue


def video_downloader(urls, download_location):
    for tweet in urls:
        try:
            download_path = str(Path(download_location, "%(id)s.%(ext)s"))
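            # %(id)s and %(ext)s are youtube-dl output-template fields, so each video is
            # saved under its own id with whatever file extension the downloader chooses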
            ydl_opts = {
                "outtmpl": download_path,
                "quiet": True,
            }
            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                ydl.download([tweet, ])
        except Exception:
            continue
        if len(urls) > 200:
            sleep(2)


def sorter(tweets_obj):
    photo_urls = []
    video_urls = []
    for item in tweets_obj:
        url = "https://twitter.com/statuses/{0}".format(item.id)
        if item.photos:
            photo_urls.append(url)
        if item.video:
            video_urls.append(url)
    return photo_urls, video_urls


def get_photo_urls(tweets_obj):
    photo_urls = []
    for item in tweets_obj:
        url = "https://twitter.com/statuses/{0}".format(item.id)
        if item.photos:
            photo_urls.append(url)
    return photo_urls


def get_video_urls(tweets_obj):
    video_urls = []
    for item in tweets_obj:
        url = "https://twitter.com/statuses/{0}".format(item.id)
        if item.video:
            video_urls.append(url)
    return video_urls


def download_photos(tweets_obj, download_location):
    photo_urls = get_photo_urls(tweets_obj)
    photo_downloader(photo_urls, download_location)


def download_videos(tweets_obj, download_location):
    video_urls = get_video_urls(tweets_obj)
    video_downloader(video_urls, download_location)
--------------------------------------------------------------------------------