├── .gitignore ├── README.md ├── dump.py ├── requirements.txt ├── scraper.py └── settings.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | .DS_Store 3 | .pyc 4 | private.py 5 | __pycache__ 6 | stream.db 7 | tweets.csv 8 | tweets.db -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Twitter Scrape 2 | 3 | Scrape tweets from Twitter into a DB. Convert the DB to a CSV file. 4 | 5 | ## Installation 6 | 7 | * `pip install -r requirements.txt` 8 | 9 | ## Setup 10 | 11 | * Create a file called `private.py`. 12 | * Sign up for a Twitter [developer account](https://dev.twitter.com/). 13 | * Create an application [here](https://apps.twitter.com/). 14 | * Set the following keys in `private.py`. You can get these values from the app you created: 15 | * `TWITTER_KEY` 16 | * `TWITTER_SECRET` 17 | * `TWITTER_APP_KEY` 18 | * `TWITTER_APP_SECRET` 19 | * Set the following key in `private.py`. 20 | * `CONNECTION_STRING` -- use `sqlite:///tweets.db` as a default if you need to. It's recommended to use PostgreSQL, but not necessary. 21 | 22 | ## Usage 23 | 24 | * `python scraper.py` to scrape. Use `Ctrl + C` to stop. 25 | * `python dump.py` to generate `tweets.csv`, which contains all the tweet data that was scraped. 26 | * If you want to edit behavior, change settings in `settings.py`. 
-------------------------------------------------------------------------------- /dump.py: -------------------------------------------------------------------------------- 1 | import settings 2 | import tweepy 3 | import dataset 4 | from textblob import TextBlob 5 | 6 | db = dataset.connect(settings.CONNECTION_STRING) 7 | 8 | result = db[settings.TABLE_NAME].all() 9 | dataset.freeze(result, format='csv', filename=settings.CSV_NAME) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tweepy 2 | ipython 3 | matplotlib 4 | scipy 5 | numpy 6 | pandas 7 | dataset 8 | psycopg2 9 | textblob -------------------------------------------------------------------------------- /scraper.py: -------------------------------------------------------------------------------- 1 | import settings 2 | import tweepy 3 | import dataset 4 | from textblob import TextBlob 5 | from sqlalchemy.exc import ProgrammingError 6 | import json 7 | 8 | db = dataset.connect(settings.CONNECTION_STRING) 9 | 10 | class StreamListener(tweepy.StreamListener): 11 | 12 | def on_status(self, status): 13 | if status.retweeted: 14 | return 15 | 16 | description = status.user.description 17 | loc = status.user.location 18 | text = status.text 19 | coords = status.coordinates 20 | geo = status.geo 21 | name = status.user.screen_name 22 | user_created = status.user.created_at 23 | followers = status.user.followers_count 24 | id_str = status.id_str 25 | created = status.created_at 26 | retweets = status.retweet_count 27 | bg_color = status.user.profile_background_color 28 | blob = TextBlob(text) 29 | sent = blob.sentiment 30 | 31 | if geo is not None: 32 | geo = json.dumps(geo) 33 | 34 | if coords is not None: 35 | coords = json.dumps(coords) 36 | 37 | table = db[settings.TABLE_NAME] 38 | try: 39 | table.insert(dict( 40 | user_description=description, 41 | user_location=loc, 42 | 
coordinates=coords, 43 | text=text, 44 | geo=geo, 45 | user_name=name, 46 | user_created=user_created, 47 | user_followers=followers, 48 | id_str=id_str, 49 | created=created, 50 | retweet_count=retweets, 51 | user_bg_color=bg_color, 52 | polarity=sent.polarity, 53 | subjectivity=sent.subjectivity, 54 | )) 55 | except ProgrammingError as err: 56 | print(err) 57 | 58 | def on_error(self, status_code): 59 | if status_code == 420: 60 | #returning False in on_data disconnects the stream 61 | return False 62 | 63 | auth = tweepy.OAuthHandler(settings.TWITTER_APP_KEY, settings.TWITTER_APP_SECRET) 64 | auth.set_access_token(settings.TWITTER_KEY, settings.TWITTER_SECRET) 65 | api = tweepy.API(auth) 66 | 67 | stream_listener = StreamListener() 68 | stream = tweepy.Stream(auth=api.auth, listener=stream_listener) 69 | stream.filter(track=settings.TRACK_TERMS) -------------------------------------------------------------------------------- /settings.py: -------------------------------------------------------------------------------- 1 | TRACK_TERMS = ["trump", "clinton", "sanders", "hillary clinton", "bernie", "donald trump"] 2 | CONNECTION_STRING = "" 3 | CSV_NAME = "tweets.csv" 4 | TABLE_NAME = "election" 5 | 6 | try: 7 | from private import * 8 | except Exception: 9 | pass --------------------------------------------------------------------------------