├── .gitignore
├── README.md
├── requirements.txt
└── get_tweets.py

/.gitignore:
--------------------------------------------------------------------------------
*.sublime-project
*.sublime-workspace
api_keys.py
*.csv
*.pyc
usernames.txt
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Twitter Personas Using Python and Alchemy API

Sign up for a Twitter API key: https://apps.twitter.com/

Sign up for an Alchemy API key: http://www.alchemyapi.com/api/register.html

Add the Twitter handles you want to analyse to the file named usernames.txt

Add the API keys you created to the file named api_keys.py

Run the file named get_tweets.py
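get_tweets.py loads its credentials with `from api_keys import *` and expects five names: the Twitter consumer key/secret, the access token/secret, and the Alchemy (Watson) key. A minimal sketch of api_keys.py, with placeholder strings standing in for your own keys, might look like this:

```python
# api_keys.py -- listed in .gitignore so the keys never reach version control
twitter_ckey = "YOUR_TWITTER_CONSUMER_KEY"
twitter_csecret = "YOUR_TWITTER_CONSUMER_SECRET"
twitter_atoken = "YOUR_TWITTER_ACCESS_TOKEN"
twitter_asecret = "YOUR_TWITTER_ACCESS_TOKEN_SECRET"
watson_api_key = "YOUR_ALCHEMY_API_KEY"
```

usernames.txt is read with splitlines(), so it is plain text with one Twitter handle per line.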
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
appnope==0.1.0
decorator==4.0.4
gnureadline==6.3.3
ipython==4.0.0
ipython-genutils==0.1.0
oauthlib==1.0.3
path.py==8.1.2
pexpect==4.0.1
pickleshare==0.5
ptyprocess==0.5
requests==2.8.1
requests-oauthlib==0.5.0
simplegeneric==0.8.1
six==1.4.1
tld==0.7.4
traitlets==4.0.0
tweepy==3.4.0
wheel==0.24.0
--------------------------------------------------------------------------------
/get_tweets.py:
--------------------------------------------------------------------------------
import re
import csv
import datetime
import requests
import tweepy
from xml.etree import ElementTree
from collections import Counter
from tld import get_tld

from api_keys import *


# Twitter auth
auth = tweepy.OAuthHandler(twitter_ckey, twitter_csecret)
auth.set_access_token(twitter_atoken, twitter_asecret)
api = tweepy.API(auth)

# create some empty lists
links = []
domains = []
firstpass = []
topics = []
tags = []

# base URL for the Alchemy Concepts API; the link to analyse is appended per request
ConceptsAPI = "http://gateway-a.watsonplatform.net/calls/url/URLGetRankedConcepts?apikey=" + watson_api_key + "&url="

# read the Twitter usernames from usernames.txt, one handle per line
usernames = open('usernames.txt', 'r')
usernames = usernames.read().splitlines()


def CountDomains(usernames):

    print "Starting tweet collection..."
    print str(len(usernames)) + " usernames.\n"

    # for every username in the usernames file...
    for name in usernames:
        try:
            # fetch the 20 most recent tweets for this user
            public_tweets = api.user_timeline(name, count=20)
        except tweepy.TweepError:
            print "\tSkipping " + name + " (protected or unavailable profile)"
            continue
        # for every tweet in the public_tweets list...
        for tweet in public_tweets:

            # use a regex to find the links in the tweet body
            urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', tweet.text)

            # for every url in the list of urls that the regex found
            for url in urls:
                try:
                    # request each link with the requests library,
                    # following redirects to resolve shortened urls
                    link = requests.get(url, allow_redirects=True).url
                    # put the resolved url into the list named links
                    links.append(link)
                    # use the get_tld function from the tld library
                    # to pull out the domain
                    domain = get_tld(link)
                    # append the domain to the domains list
                    domains.append(domain)
                except Exception:
                    pass

        print "\tFinished: " + name

    print "\nStarting Alchemy analysis of links..."
    print str(len(links)) + " links.\n"

    # for each of the urls in the links list built above...
    for link in links:
        # build the request URL by concatenating the Concepts API base URL
        # (defined at the top) with the link we want concepts for
        apiurl = ConceptsAPI + link

        # fetch the ranked concepts for this link
        r = requests.get(apiurl)

        # parse the XML returned by the API call
        doc = ElementTree.fromstring(r.text)

        # every <text> element holds one concept name; keep a list of them all
        for tag in doc.findall('.//text'):
            tags.append(tag.text)
        print "\tFinished: " + link

    print "\nDone. :)"

    today = datetime.datetime.now()
    postfix = today.strftime('%Y-%m-%d-%H-%M')

    # write each domain and how many times it appeared
    with open("domains_" + postfix + ".csv", "a") as personas:
        personaswriter = csv.writer(personas)
        for domain, count in Counter(domains).items():
            personaswriter.writerow([domain, count])

    # write each concept and how many times it appeared
    with open("concepts_" + postfix + ".csv", "a") as concepts:
        conceptswriter = csv.writer(concepts)
        for tag, count in Counter(tags).items():
            conceptswriter.writerow([tag, count])

CountDomains(usernames)
--------------------------------------------------------------------------------
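For reference, the Alchemy Concepts call returns XML, and get_tweets.py keeps only the text of each ranked concept. Below is a small standalone sketch of that parsing step; the response snippet is a guess at the general shape of a URLGetRankedConcepts reply, not a verbatim AlchemyAPI response:

```python
from xml.etree import ElementTree

# hypothetical (assumed) shape of a URLGetRankedConcepts response --
# the real AlchemyAPI XML may carry more fields per concept
sample = """
<results>
  <status>OK</status>
  <concepts>
    <concept>
      <text>Machine learning</text>
      <relevance>0.91</relevance>
    </concept>
    <concept>
      <text>Open data</text>
      <relevance>0.78</relevance>
    </concept>
  </concepts>
</results>
"""

doc = ElementTree.fromstring(sample)

# the same lookup get_tweets.py uses: every <text> element, wherever it sits
concepts = [tag.text for tag in doc.findall('.//text')]
print(concepts)  # ['Machine learning', 'Open data']
```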