├── .gitignore
├── README.md
├── requirements.txt
└── get_tweets.py

/.gitignore:
--------------------------------------------------------------------------------
*.sublime-project
*.sublime-workspace
api_keys.py
*.csv
*.pyc
usernames.txt
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Twitter Personas Using Python and Alchemy API

Sign up for a Twitter API key: https://apps.twitter.com/

Sign up for an Alchemy API key: http://www.alchemyapi.com/api/register.html

Add the Twitter handles you want to analyse to the file named usernames.txt

Add the API keys you created to the file named api_keys.py

Run the file named get_tweets.py
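get_tweets.py loads its credentials with `from api_keys import *` and expects five names: the Twitter consumer key/secret, the access token/secret, and the Alchemy (Watson) key. A minimal sketch of api_keys.py, with placeholder strings standing in for your own keys, might look like this:

```python
# api_keys.py -- listed in .gitignore so the keys never reach version control
twitter_ckey = "YOUR_TWITTER_CONSUMER_KEY"
twitter_csecret = "YOUR_TWITTER_CONSUMER_SECRET"
twitter_atoken = "YOUR_TWITTER_ACCESS_TOKEN"
twitter_asecret = "YOUR_TWITTER_ACCESS_TOKEN_SECRET"
watson_api_key = "YOUR_ALCHEMY_API_KEY"
```

usernames.txt is read with splitlines(), so it is plain text with one Twitter handle per line.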
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
appnope==0.1.0
decorator==4.0.4
gnureadline==6.3.3
ipython==4.0.0
ipython-genutils==0.1.0
oauthlib==1.0.3
path.py==8.1.2
pexpect==4.0.1
pickleshare==0.5
ptyprocess==0.5
requests==2.8.1
requests-oauthlib==0.5.0
simplegeneric==0.8.1
six==1.4.1
tld==0.7.4
traitlets==4.0.0
tweepy==3.4.0
wheel==0.24.0
--------------------------------------------------------------------------------
/get_tweets.py:
--------------------------------------------------------------------------------
import re
import csv
import datetime
import requests
import tweepy
from xml.etree import ElementTree
from collections import Counter
from tld import get_tld

from api_keys import *


# Twitter auth
auth = tweepy.OAuthHandler(twitter_ckey, twitter_csecret)
auth.set_access_token(twitter_atoken, twitter_asecret)
api = tweepy.API(auth)

# create some empty lists
links = []
domains = []
firstpass = []
topics = []
tags = []

# base URL for the Alchemy Concepts API; the link to analyse is appended per request
ConceptsAPI = "http://gateway-a.watsonplatform.net/calls/url/URLGetRankedConcepts?apikey=" + watson_api_key + "&url="

# read the Twitter usernames from usernames.txt, one handle per line
usernames = open('usernames.txt', 'r')
usernames = usernames.read().splitlines()


def CountDomains(usernames):

    print "Starting tweet collection..."
    print str(len(usernames)) + " usernames.\n"

    # for every username in the usernames file...
    for name in usernames:
        try:
            # fetch the 20 most recent tweets for this user
            public_tweets = api.user_timeline(name, count=20)
        except tweepy.TweepError:
            print "\tSkipping " + name + " (protected or unavailable profile)"
            continue
        # for every tweet in the public_tweets list...
        for tweet in public_tweets:

            # use a regex to find the links in the tweet body
            urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', tweet.text)

            # for every url in the list of urls that the regex found
            for url in urls:
                try:
                    # request each link with the requests library,
                    # following redirects to resolve shortened urls
                    link = requests.get(url, allow_redirects=True).url
                    # put the resolved url into the list named links
                    links.append(link)
                    # use the get_tld function from the tld library
                    # to pull out the domain
                    domain = get_tld(link)
                    # append the domain to the domains list
                    domains.append(domain)
                except Exception:
                    pass

        print "\tFinished: " + name

    print "\nStarting Alchemy analysis of links..."
    print str(len(links)) + " links.\n"

    # for each of the urls in the links list built above...
    for link in links:
        # build the request URL by concatenating the Concepts API base URL
        # (defined at the top) with the link we want concepts for
        apiurl = ConceptsAPI + link

        # fetch the ranked concepts for this link
        r = requests.get(apiurl)

        # parse the XML returned by the API call
        doc = ElementTree.fromstring(r.text)

        # every <text> element holds one concept name; keep a list of them all
        for tag in doc.findall('.//text'):
            tags.append(tag.text)
        print "\tFinished: " + link

    print "\nDone. :)"

    today = datetime.datetime.now()
    postfix = today.strftime('%Y-%m-%d-%H-%M')

    # write each domain and how many times it appeared
    with open("domains_" + postfix + ".csv", "a") as personas:
        personaswriter = csv.writer(personas)
        for domain, count in Counter(domains).items():
            personaswriter.writerow([domain, count])

    # write each concept and how many times it appeared
    with open("concepts_" + postfix + ".csv", "a") as concepts:
        conceptswriter = csv.writer(concepts)
        for tag, count in Counter(tags).items():
            conceptswriter.writerow([tag, count])

CountDomains(usernames)
--------------------------------------------------------------------------------
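For reference, the Alchemy Concepts call returns XML, and get_tweets.py keeps only the text of each ranked concept. Below is a small standalone sketch of that parsing step; the response snippet is a guess at the general shape of a URLGetRankedConcepts reply, not a verbatim AlchemyAPI response:

```python
from xml.etree import ElementTree

# hypothetical (assumed) shape of a URLGetRankedConcepts response --
# the real AlchemyAPI XML may carry more fields per concept
sample = """
<results>
  <status>OK</status>
  <concepts>
    <concept>
      <text>Machine learning</text>
      <relevance>0.91</relevance>
    </concept>
    <concept>
      <text>Open data</text>
      <relevance>0.78</relevance>
    </concept>
  </concepts>
</results>
"""

doc = ElementTree.fromstring(sample)

# the same lookup get_tweets.py uses: every <text> element, wherever it sits
concepts = [tag.text for tag in doc.findall('.//text')]
print(concepts)  # ['Machine learning', 'Open data']
```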