├── .gitignore
├── README.md
├── dump.py
├── requirements.txt
├── scraper.py
└── settings.py


/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | .DS_Store
3 | *.pyc
4 | private.py
5 | __pycache__
6 | stream.db
7 | tweets.csv
8 | tweets.db


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Twitter Scrape
 2 | 
 3 | Scrape tweets from Twitter into a database, then convert the database to a CSV file.
 4 | 
 5 | ## Installation
 6 | 
 7 | * `pip install -r requirements.txt`
 8 | 
 9 | ## Setup
10 | 
11 | * Create a file called `private.py`.
12 | * Sign up for a Twitter [developer account](https://dev.twitter.com/).
13 | * Create an application [here](https://apps.twitter.com/).
14 | * Set the following keys in `private.py`.  You can get these values from the app you created:
15 |     * `TWITTER_KEY`
16 |     * `TWITTER_SECRET`
17 |     * `TWITTER_APP_KEY`
18 |     * `TWITTER_APP_SECRET`
19 | * Set the following key in `private.py`.
20 |     * `CONNECTION_STRING` -- the database URL.  Use `sqlite:///tweets.db` if you don't have a database server; PostgreSQL is recommended but not required.
21 | 
22 | ## Usage
23 | 
24 | * `python scraper.py` to scrape.  Use `Ctrl + C` to stop.
25 | * `python dump.py` to generate `tweets.csv`, which contains all the tweet data that was scraped.
26 | * If you want to edit behavior, change settings in `settings.py`.


--------------------------------------------------------------------------------
/dump.py:
--------------------------------------------------------------------------------
1 | import settings
2 | import tweepy
3 | import dataset
4 | from textblob import TextBlob
5 | 
6 | db = dataset.connect(settings.CONNECTION_STRING)
7 | 
8 | result = db[settings.TABLE_NAME].all()
9 | dataset.freeze(result, format='csv', filename=settings.CSV_NAME)


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tweepy
2 | ipython
3 | matplotlib
4 | scipy
5 | numpy
6 | pandas
7 | dataset
8 | psycopg2
9 | textblob


--------------------------------------------------------------------------------
/scraper.py:
--------------------------------------------------------------------------------
 1 | import settings
 2 | import tweepy
 3 | import dataset
 4 | from textblob import TextBlob
 5 | from sqlalchemy.exc import ProgrammingError
 6 | import json
 7 | 
 8 | db = dataset.connect(settings.CONNECTION_STRING)  # module-level handle; StreamListener.on_status inserts through it
 9 | 
10 | class StreamListener(tweepy.StreamListener):
11 | 
12 |     def on_status(self, status):
13 |         if status.retweeted:
14 |             return
15 | 
16 |         description = status.user.description
17 |         loc = status.user.location
18 |         text = status.text
19 |         coords = status.coordinates
20 |         geo = status.geo
21 |         name = status.user.screen_name
22 |         user_created = status.user.created_at
23 |         followers = status.user.followers_count
24 |         id_str = status.id_str
25 |         created = status.created_at
26 |         retweets = status.retweet_count
27 |         bg_color = status.user.profile_background_color
28 |         blob = TextBlob(text)
29 |         sent = blob.sentiment
30 | 
31 |         if geo is not None:
32 |             geo = json.dumps(geo)
33 | 
34 |         if coords is not None:
35 |             coords = json.dumps(coords)
36 | 
37 |         table = db[settings.TABLE_NAME]
38 |         try:
39 |             table.insert(dict(
40 |                 user_description=description,
41 |                 user_location=loc,
42 |                 coordinates=coords,
43 |                 text=text,
44 |                 geo=geo,
45 |                 user_name=name,
46 |                 user_created=user_created,
47 |                 user_followers=followers,
48 |                 id_str=id_str,
49 |                 created=created,
50 |                 retweet_count=retweets,
51 |                 user_bg_color=bg_color,
52 |                 polarity=sent.polarity,
53 |                 subjectivity=sent.subjectivity,
54 |             ))
55 |         except ProgrammingError as err:
56 |             print(err)
57 | 
58 |     def on_error(self, status_code):
59 |         if status_code == 420:
60 |             #returning False in on_data disconnects the stream
61 |             return False
62 | 
63 | auth = tweepy.OAuthHandler(settings.TWITTER_APP_KEY, settings.TWITTER_APP_SECRET)
64 | auth.set_access_token(settings.TWITTER_KEY, settings.TWITTER_SECRET)
65 | api = tweepy.API(auth)
66 | 
67 | stream_listener = StreamListener()
68 | stream = tweepy.Stream(auth=api.auth, listener=stream_listener)
69 | stream.filter(track=settings.TRACK_TERMS)


--------------------------------------------------------------------------------
/settings.py:
--------------------------------------------------------------------------------
1 | TRACK_TERMS = ["trump", "clinton", "sanders", "hillary clinton", "bernie", "donald trump"]
2 | CONNECTION_STRING = ""
3 | CSV_NAME = "tweets.csv"
4 | TABLE_NAME = "election"
5 | 
6 | try:
7 |     from private import *
8 | except Exception:
9 |     pass


--------------------------------------------------------------------------------