├── .gitignore ├── README.md ├── bs_scraper.py ├── csv_scraper.py ├── data ├── .~lock.crunchbase.xlsx# ├── crunchbase.xlsx ├── schedule.csv └── simple.html ├── family_tree.py ├── json_scraper.py ├── requirements.txt ├── scrape_netflix.py ├── scraper.py ├── start_selenium.py ├── xlsx2csv.py ├── xlsx_scraper.py └── xpath_intro.py /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.pyc 3 | scrape_my_netflix.py 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | PyCon Introduction to Web and Data Scraping Tutorial 2 | =========================================== 3 | 4 | A tutorial-based introduction to web scraping with Python. 5 | 6 | Virtual Env 7 | ------------ 8 | 9 | If you'd like to use virtual environments, please follow the following instructions. It is not required for the tutorial but may be helpful. 10 | 11 | For more details on [virtual environments](http://www.doughellmann.com/projects/virtualenvwrapper/) 12 | 13 | If you don't have virtual env wrapper and/or pip: 14 | 15 | $ easy_install pip 16 | $ pip install virtualenvwrapper 17 | 18 | and read the additional instructions [here](http://virtualenvwrapper.readthedocs.org/en/latest/install.html) 19 | 20 | 21 | $ mkvirtualenv scraper_tutorial 22 | $ pip install -r requirements.txt 23 | 24 | 25 | LXML and Selenium 26 | ------------------------- 27 | You will need both [LXML](http://lxml.de/) and [Selenium](http://selenium-python.readthedocs.org/en/latest/index.html) to follow this tutorial in it's entirety. 28 | 29 | If you are using a Mac, I would highly recommend using [Homebrew](http://brew.sh/). It will help make pip install *very easy* for you to use. 30 | * [Homebrew and LXML Installation](http://geekforbrains.com/how-to-install-lxml-for-python-using-homebrew-and-pip-in-3-steps) 31 | * [More help on Installing LXML on Mac](http://lxml.de/installation.html#installation) 32 | * [And additional suggestions for LXML on Mac](http://stackoverflow.com/questions/1277124/how-do-you-install-lxml-on-os-x-leopard-without-using-macports-or-fink) 33 | 34 | If you are using Windows, it might be worth it to run this within a Linux Virtual Machine. If you are a Windows + Python guru, please follow these installation instructions. I can help as needed but I have not programmed on Windows in more than 5 years. 35 | * [Installing Selenium on Windows](http://selenium-python.readthedocs.org/en/latest/installation.html#detailed-instructions-for-windows-users) 36 | * [Installing LXML on Windows](http://lxml.de/installation.html#ms-windows) 37 | 38 | Please reach out to me if you have any questions on getting the initial requirements set up. Thanks! 39 | 40 | 41 | Firefox Web Browser 42 | --------------------- 43 | 44 | Firefox comes as the default web driver for Selenium. To use Selenium easily, please [download and install Firefox](http://www.mozilla.org/en-US/firefox/new/). 45 | 46 | Using PIP 47 | ------------ 48 | 49 | If you have never used PIP before you will need to sudo easy_install pip or brew install pip. PIP is a python package manager and it's really super so I highly advise using it! 50 | 51 | 52 | Questions? 53 | ---------- 54 | /msg kjam on freenode or @kjam on twitter 55 | 56 | -------------------------------------------------------------------------------- /bs_scraper.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | from email.MIMEText import MIMEText 3 | import smtplib 4 | from bs4 import BeautifulSoup 5 | 6 | GMAIL_LOGIN = 'pyladiestest@gmail.com' 7 | GMAIL_PASSWORD = 'YOU NO CAN HAZ' 8 | 9 | 10 | def send_email(subject, message, from_addr=GMAIL_LOGIN, to_addr=GMAIL_LOGIN): 11 | msg = MIMEText(message) 12 | msg['Subject'] = subject 13 | msg['From'] = from_addr 14 | msg['To'] = to_addr 15 | msg['Reply-To'] = 'happyhours@noreply.com' 16 | 17 | server = smtplib.SMTP('smtp.gmail.com', 587) # port 465 or 587 18 | server.ehlo() 19 | server.starttls() 20 | server.ehlo() 21 | server.login(GMAIL_LOGIN, GMAIL_PASSWORD) 22 | server.sendmail(from_addr, to_addr, msg.as_string()) 23 | server.close() 24 | 25 | 26 | def get_site_html(url): 27 | source = urllib2.urlopen(url).read() 28 | return source 29 | 30 | 31 | def get_tree(url): 32 | source = get_site_html(url) 33 | tree = BeautifulSoup(source) 34 | return tree 35 | 36 | 37 | if __name__ == '__main__': 38 | 39 | stuff_i_like = ['burger', 'wine', 'sushi', 'sweet potato fries', 'BBQ'] 40 | found_happy_hours = [] 41 | my_happy_hours = [] 42 | 43 | # First, I'm going to identify the areas of the page I want to look at 44 | tables = get_tree( 45 | 'http://www.downtownla.com/3_10_happyHours.asp?action=ALL') 46 | 47 | # Then, I'm going to sort out the *exact* parts of the page 48 | # that match what I'm looking for... 49 | for t in tables.findAll('p', {'class': 'calendar_EventTitle'}): 50 | text = t.text 51 | for s in t.findNextSiblings(): 52 | text += '\n' + s.text 53 | found_happy_hours.append(text) 54 | 55 | print "The scraper found %d happy hours!" % len(found_happy_hours) 56 | 57 | # Now I'm going to loop through the food I like 58 | # and see if any of the happy hour descriptions match 59 | for food in stuff_i_like: 60 | for hh in found_happy_hours: 61 | # checking for text AND making sure I don't have duplicates 62 | if food in hh and hh not in my_happy_hours: 63 | print "YAY! I found some %s!" % food 64 | my_happy_hours.append(hh) 65 | 66 | print "I think you might like %d of them, yipeeeee!" % len(my_happy_hours) 67 | 68 | # Now, let's make a mail message we can read: 69 | message = 'Hey Katharine,\n\n\n' 70 | message += 'OMG, I found some stuff for you in Downtown, take a look.\n\n' 71 | message += '==============================\n'.join(my_happy_hours) 72 | message = message.encode('utf-8') 73 | # To read more about encoding: 74 | # http://diveintopython.org/xml_processing/unicode.html 75 | message = message.replace('\t', '').replace('\r', '') 76 | message += '\n\nXOXO,\n Your Py Script' 77 | 78 | # And email it to ourselves! 79 | email = 'katharine@pyladies.com' 80 | send_email('Happy Hour Update', message, from_addr=GMAIL_LOGIN, 81 | to_addr=email) 82 | -------------------------------------------------------------------------------- /csv_scraper.py: -------------------------------------------------------------------------------- 1 | from csv import DictReader 2 | from datetime import datetime 3 | 4 | with open('data/schedule.csv') as document: 5 | reader = DictReader(document) 6 | for row in reader: 7 | day = datetime.strptime(row.get('START_DATE'), '%m/%d/%y') 8 | if 'PNC' in row.get('LOCATION') and day.weekday() > 4: 9 | print 'HOME WEEKEND GAME!! %s on %s' % ( 10 | row.get('SUBJECT'), row.get('START_DATE')) 11 | -------------------------------------------------------------------------------- /data/.~lock.crunchbase.xlsx#: -------------------------------------------------------------------------------- 1 | katharine ,katharine,kjamistan,03.04.2014 11:59,file:///home/katharine/.config/libreoffice/4; -------------------------------------------------------------------------------- /data/crunchbase.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/python-web-scraping-tutorial/65df43d244724bc49db64b12535ea2be46b3262b/data/crunchbase.xlsx -------------------------------------------------------------------------------- /data/schedule.csv: -------------------------------------------------------------------------------- 1 | START_DATE,START_TIME,START_TIME_ET,SUBJECT,LOCATION,DESCRIPTION,END_DATE,END_DATE_ET,END_TIME,END_TIME_ET,REMINDER_OFF,REMINDER_ON,REMINDER_DATE,REMINDER_TIME,REMINDER_TIME_ET,SHOWTIMEAS_FREE,SHOWTIMEAS_BUSY 2 | 02/26/14,01:05 PM,01:05 PM,Yankees at Pirates,McKechnie Field,"Local Radio: KDKA-FM 93.7",02/26/14,02/26/14,04:05 PM,04:05 PM,FALSE,TRUE,02/26/14,12:05 PM,12:05 PM,FREE,BUSY 3 | 02/27/14,01:05 PM,01:05 PM,Pirates at Yankees,George M. Steinbrenner Field,"Local TV: MLBN (delay) ----- Local Radio: MLB.com",02/27/14,02/27/14,04:05 PM,04:05 PM,FALSE,TRUE,02/27/14,12:05 PM,12:05 PM,FREE,BUSY 4 | 02/28/14,01:05 PM,01:05 PM,Pirates at Blue Jays,Florida Auto Exchange Stadium,"Local Radio: MLB.com",02/28/14,02/28/14,04:05 PM,04:05 PM,FALSE,TRUE,02/28/14,12:05 PM,12:05 PM,FREE,BUSY 5 | 03/01/14,01:05 PM,01:05 PM,Rays at Pirates,McKechnie Field,"Local Radio: MLB.com",03/01/14,03/01/14,04:05 PM,04:05 PM,FALSE,TRUE,03/01/14,12:05 PM,12:05 PM,FREE,BUSY 6 | 03/02/14,01:05 PM,01:05 PM,Pirates at Phillies,Bright House Field,"Local Radio: KDKA-FM 93.7",03/02/14,03/02/14,04:05 PM,04:05 PM,FALSE,TRUE,03/02/14,12:05 PM,12:05 PM,FREE,BUSY 7 | 03/03/14,01:05 PM,01:05 PM,Red Sox at Pirates,McKechnie Field,"Local Radio: MLB.com",03/03/14,03/03/14,04:05 PM,04:05 PM,FALSE,TRUE,03/03/14,12:05 PM,12:05 PM,FREE,BUSY 8 | 03/04/14,01:05 PM,01:05 PM,Pirates at Tigers,Joker Marchant Stadium,"Local Radio: KDKA-FM 93.7",03/04/14,03/04/14,04:05 PM,04:05 PM,FALSE,TRUE,03/04/14,12:05 PM,12:05 PM,FREE,BUSY 9 | 03/05/14,01:05 PM,01:05 PM,Pirates at Blue Jays,Florida Auto Exchange Stadium,"Local Radio: KDKA-FM 93.7",03/05/14,03/05/14,04:05 PM,04:05 PM,FALSE,TRUE,03/05/14,12:05 PM,12:05 PM,FREE,BUSY 10 | 03/06/14,01:05 PM,01:05 PM,Blue Jays at Pirates,McKechnie Field,"Local Radio: MLB.com",03/06/14,03/06/14,04:05 PM,04:05 PM,FALSE,TRUE,03/06/14,12:05 PM,12:05 PM,FREE,BUSY 11 | 03/07/14,01:05 PM,01:05 PM,Twins at Pirates,McKechnie Field,"Local Radio: MLB.com",03/07/14,03/07/14,04:05 PM,04:05 PM,FALSE,TRUE,03/07/14,12:05 PM,12:05 PM,FREE,BUSY 12 | 03/08/14,01:05 PM,01:05 PM,Pirates at Rays,Charlotte Sports Park,"Local Radio: MLB.com",03/08/14,03/08/14,04:05 PM,04:05 PM,FALSE,TRUE,03/08/14,12:05 PM,12:05 PM,FREE,BUSY 13 | 03/09/14,01:05 PM,01:05 PM,Pirates at Orioles,Ed Smith Stadium,"Local Radio: KDKA-FM 93.7",03/09/14,03/09/14,04:05 PM,04:05 PM,FALSE,TRUE,03/09/14,12:05 PM,12:05 PM,FREE,BUSY 14 | 03/09/14,01:05 PM,01:05 PM,Red Sox at Pirates,McKechnie Field,"Local TV: ROOT SPORTS",03/09/14,03/09/14,04:05 PM,04:05 PM,FALSE,TRUE,03/09/14,12:05 PM,12:05 PM,FREE,BUSY 15 | 03/10/14,01:05 PM,01:05 PM,Orioles at Pirates,McKechnie Field,"Local Radio: KDKA-FM 93.7",03/10/14,03/10/14,04:05 PM,04:05 PM,FALSE,TRUE,03/10/14,12:05 PM,12:05 PM,FREE,BUSY 16 | 03/12/14,01:05 PM,01:05 PM,Pirates at Twins,Hammond Stadium,"Local Radio: MLB.com",03/12/14,03/12/14,04:05 PM,04:05 PM,FALSE,TRUE,03/12/14,12:05 PM,12:05 PM,FREE,BUSY 17 | 03/13/14,01:05 PM,01:05 PM,Pirates at Rays,Charlotte Sports Park,"Local Radio: MLB.com",03/13/14,03/13/14,04:05 PM,04:05 PM,FALSE,TRUE,03/13/14,12:05 PM,12:05 PM,FREE,BUSY 18 | 03/14/14,01:05 PM,01:05 PM,Phillies at Pirates,McKechnie Field,"Local Radio: MLB.com",03/14/14,03/14/14,04:05 PM,04:05 PM,FALSE,TRUE,03/14/14,12:05 PM,12:05 PM,FREE,BUSY 19 | 03/15/14,01:05 PM,01:05 PM,Rays at Pirates,McKechnie Field,"Local Radio: KDKA-FM 93.7",03/15/14,03/15/14,04:05 PM,04:05 PM,FALSE,TRUE,03/15/14,12:05 PM,12:05 PM,FREE,BUSY 20 | 03/16/14,01:05 PM,01:05 PM,Pirates at Phillies,Bright House Field,"Local TV: MLBN ----- Local Radio: KDKA-FM 93.7",03/16/14,03/16/14,04:05 PM,04:05 PM,FALSE,TRUE,03/16/14,12:05 PM,12:05 PM,FREE,BUSY 21 | 03/17/14,01:05 PM,01:05 PM,Yankees at Pirates,McKechnie Field,"Local TV: MLBN (delay) -- ROOT SPORTS",03/17/14,03/17/14,04:05 PM,04:05 PM,FALSE,TRUE,03/17/14,12:05 PM,12:05 PM,FREE,BUSY 22 | 03/19/14,07:05 PM,07:05 PM,Pirates at Red Sox,JetBlue Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",03/19/14,03/19/14,10:05 PM,10:05 PM,FALSE,TRUE,03/19/14,06:05 PM,06:05 PM,FREE,BUSY 23 | 03/20/14,07:05 PM,07:05 PM,Orioles at Pirates,McKechnie Field,"Local Radio: KDKA-FM 93.7",03/20/14,03/20/14,10:05 PM,10:05 PM,FALSE,TRUE,03/20/14,06:05 PM,06:05 PM,FREE,BUSY 24 | 03/21/14,07:05 PM,07:05 PM,Pirates at Yankees,George M. Steinbrenner Field,"Local TV: MLBN -- ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",03/21/14,03/21/14,10:05 PM,10:05 PM,FALSE,TRUE,03/21/14,06:05 PM,06:05 PM,FREE,BUSY 25 | 03/22/14,01:05 PM,01:05 PM,Phillies at Pirates,McKechnie Field,"Local Radio: KDKA-FM 93.7",03/22/14,03/22/14,04:05 PM,04:05 PM,FALSE,TRUE,03/22/14,12:05 PM,12:05 PM,FREE,BUSY 26 | 03/23/14,01:05 PM,01:05 PM,Pirates at Orioles,Ed Smith Stadium,"Local Radio: KDKA-FM 93.7",03/23/14,03/23/14,04:05 PM,04:05 PM,FALSE,TRUE,03/23/14,12:05 PM,12:05 PM,FREE,BUSY 27 | 03/24/14,01:05 PM,01:05 PM,Tigers at Pirates,McKechnie Field,"Local TV: ESPN ----- Local Radio: MLB.com",03/24/14,03/24/14,04:05 PM,04:05 PM,FALSE,TRUE,03/24/14,12:05 PM,12:05 PM,FREE,BUSY 28 | 03/25/14,01:05 PM,01:05 PM,Blue Jays at Pirates,McKechnie Field,"Local Radio: KDKA-FM 93.7",03/25/14,03/25/14,04:05 PM,04:05 PM,FALSE,TRUE,03/25/14,12:05 PM,12:05 PM,FREE,BUSY 29 | 03/26/14,01:05 PM,01:05 PM,Pirates at Twins,Hammond Stadium,"Local Radio: MLB.com",03/26/14,03/26/14,04:05 PM,04:05 PM,FALSE,TRUE,03/26/14,12:05 PM,12:05 PM,FREE,BUSY 30 | 03/27/14,01:05 PM,01:05 PM,Yankees at Pirates,McKechnie Field,"Local TV: MLBN (delay) ----- Local Radio: MLB.com",03/27/14,03/27/14,04:05 PM,04:05 PM,FALSE,TRUE,03/27/14,12:05 PM,12:05 PM,FREE,BUSY 31 | 03/28/14,07:05 PM,07:05 PM,Pirates at Phillies,Citizens Bank Park,"Local Radio: KDKA-FM 93.7",03/28/14,03/28/14,10:05 PM,10:05 PM,FALSE,TRUE,03/28/14,06:05 PM,06:05 PM,FREE,BUSY 32 | 03/29/14,01:05 PM,01:05 PM,Pirates at Phillies,Citizens Bank Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",03/29/14,03/29/14,04:05 PM,04:05 PM,FALSE,TRUE,03/29/14,12:05 PM,12:05 PM,FREE,BUSY 33 | 03/31/14,01:05 PM,01:05 PM,Cubs at Pirates,PNC Park,"Local TV: ROOT SPORTS -- ESPN ----- Local Radio: KDKA-FM 93.7",03/31/14,03/31/14,04:05 PM,04:05 PM,FALSE,TRUE,03/31/14,12:05 PM,12:05 PM,FREE,BUSY 34 | 04/02/14,07:05 PM,07:05 PM,Cubs at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",04/02/14,04/02/14,10:05 PM,10:05 PM,FALSE,TRUE,04/02/14,06:05 PM,06:05 PM,FREE,BUSY 35 | 04/03/14,12:35 PM,12:35 PM,Cubs at Pirates,PNC Park,"Local TV: ROOT SPORTS -- MLBN ----- Local Radio: KDKA-FM 93.7",04/03/14,04/03/14,03:35 PM,03:35 PM,FALSE,TRUE,04/03/14,11:35 AM,11:35 AM,FREE,BUSY 36 | 04/04/14,07:05 PM,07:05 PM,Cardinals at Pirates,PNC Park,"Local TV: MLBN -- ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",04/04/14,04/04/14,10:05 PM,10:05 PM,FALSE,TRUE,04/04/14,06:05 PM,06:05 PM,FREE,BUSY 37 | 04/05/14,07:05 PM,07:05 PM,Cardinals at Pirates,PNC Park,"Local TV: MLBN ----- Local Radio: KDKA-FM 93.7",04/05/14,04/05/14,10:05 PM,10:05 PM,FALSE,TRUE,04/05/14,06:05 PM,06:05 PM,FREE,BUSY 38 | 04/06/14,01:35 PM,01:35 PM,Cardinals at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",04/06/14,04/06/14,04:35 PM,04:35 PM,FALSE,TRUE,04/06/14,12:35 PM,12:35 PM,FREE,BUSY 39 | 04/08/14,08:05 PM,08:05 PM,Pirates at Cubs,Wrigley Field,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",04/08/14,04/08/14,11:05 PM,11:05 PM,FALSE,TRUE,04/08/14,07:05 PM,07:05 PM,FREE,BUSY 40 | 04/09/14,08:05 PM,08:05 PM,Pirates at Cubs,Wrigley Field,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",04/09/14,04/09/14,11:05 PM,11:05 PM,FALSE,TRUE,04/09/14,07:05 PM,07:05 PM,FREE,BUSY 41 | 04/10/14,02:20 PM,02:20 PM,Pirates at Cubs,Wrigley Field,"Local TV: ROOT SPORTS -- MLBN ----- Local Radio: KDKA-FM 93.7",04/10/14,04/10/14,05:20 PM,05:20 PM,FALSE,TRUE,04/10/14,01:20 PM,01:20 PM,FREE,BUSY 42 | 04/11/14,08:10 PM,08:10 PM,Pirates at Brewers,Miller Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",04/11/14,04/11/14,11:10 PM,11:10 PM,FALSE,TRUE,04/11/14,07:10 PM,07:10 PM,FREE,BUSY 43 | 04/12/14,07:10 PM,07:10 PM,Pirates at Brewers,Miller Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",04/12/14,04/12/14,10:10 PM,10:10 PM,FALSE,TRUE,04/12/14,06:10 PM,06:10 PM,FREE,BUSY 44 | 04/13/14,02:10 PM,02:10 PM,Pirates at Brewers,Miller Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",04/13/14,04/13/14,05:10 PM,05:10 PM,FALSE,TRUE,04/13/14,01:10 PM,01:10 PM,FREE,BUSY 45 | 04/14/14,07:10 PM,07:10 PM,Pirates at Reds,Great American Ball Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",04/14/14,04/14/14,10:10 PM,10:10 PM,FALSE,TRUE,04/14/14,06:10 PM,06:10 PM,FREE,BUSY 46 | 04/15/14,07:10 PM,07:10 PM,Pirates at Reds,Great American Ball Park,"Local TV: ROOT SPORTS -- MLBN ----- Local Radio: KDKA-FM 93.7",04/15/14,04/15/14,10:10 PM,10:10 PM,FALSE,TRUE,04/15/14,06:10 PM,06:10 PM,FREE,BUSY 47 | 04/16/14,12:35 PM,12:35 PM,Pirates at Reds,Great American Ball Park,"Local TV: ROOT SPORTS -- MLBN ----- Local Radio: KDKA-FM 93.7",04/16/14,04/16/14,03:35 PM,03:35 PM,FALSE,TRUE,04/16/14,11:35 AM,11:35 AM,FREE,BUSY 48 | 04/17/14,07:05 PM,07:05 PM,Brewers at Pirates,PNC Park,"Local TV: MLBN -- ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",04/17/14,04/17/14,10:05 PM,10:05 PM,FALSE,TRUE,04/17/14,06:05 PM,06:05 PM,FREE,BUSY 49 | 04/18/14,07:05 PM,07:05 PM,Brewers at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",04/18/14,04/18/14,10:05 PM,10:05 PM,FALSE,TRUE,04/18/14,06:05 PM,06:05 PM,FREE,BUSY 50 | 04/19/14,07:05 PM,07:05 PM,Brewers at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",04/19/14,04/19/14,10:05 PM,10:05 PM,FALSE,TRUE,04/19/14,06:05 PM,06:05 PM,FREE,BUSY 51 | 04/20/14,01:35 PM,01:35 PM,Brewers at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",04/20/14,04/20/14,04:35 PM,04:35 PM,FALSE,TRUE,04/20/14,12:35 PM,12:35 PM,FREE,BUSY 52 | 04/21/14,07:05 PM,07:05 PM,Reds at Pirates,PNC Park,"Local Radio: KDKA-FM 93.7",04/21/14,04/21/14,10:05 PM,10:05 PM,FALSE,TRUE,04/21/14,06:05 PM,06:05 PM,FREE,BUSY 53 | 04/22/14,07:05 PM,07:05 PM,Reds at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",04/22/14,04/22/14,10:05 PM,10:05 PM,FALSE,TRUE,04/22/14,06:05 PM,06:05 PM,FREE,BUSY 54 | 04/23/14,07:05 PM,07:05 PM,Reds at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",04/23/14,04/23/14,10:05 PM,10:05 PM,FALSE,TRUE,04/23/14,06:05 PM,06:05 PM,FREE,BUSY 55 | 04/24/14,12:35 PM,12:35 PM,Reds at Pirates,PNC Park,"Local TV: ROOT SPORTS -- MLBN ----- Local Radio: KDKA-FM 93.7",04/24/14,04/24/14,03:35 PM,03:35 PM,FALSE,TRUE,04/24/14,11:35 AM,11:35 AM,FREE,BUSY 56 | 04/25/14,08:15 PM,08:15 PM,Pirates at Cardinals,Busch Stadium,"Local TV: ROOT SPORTS -- MLBN ----- Local Radio: KDKA-FM 93.7",04/25/14,04/25/14,11:15 PM,11:15 PM,FALSE,TRUE,04/25/14,07:15 PM,07:15 PM,FREE,BUSY 57 | 04/26/14,04:05 PM,04:05 PM,Pirates at Cardinals,Busch Stadium,"Local TV: FS1 -- ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",04/26/14,04/26/14,07:05 PM,07:05 PM,FALSE,TRUE,04/26/14,03:05 PM,03:05 PM,FREE,BUSY 58 | 04/27/14,02:15 PM,02:15 PM,Pirates at Cardinals,Busch Stadium,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",04/27/14,04/27/14,05:15 PM,05:15 PM,FALSE,TRUE,04/27/14,01:15 PM,01:15 PM,FREE,BUSY 59 | 04/29/14,07:05 PM,07:05 PM,Pirates at Orioles,Oriole Park at Camden Yards,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",04/29/14,04/29/14,10:05 PM,10:05 PM,FALSE,TRUE,04/29/14,06:05 PM,06:05 PM,FREE,BUSY 60 | 04/30/14,07:05 PM,07:05 PM,Pirates at Orioles,Oriole Park at Camden Yards,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",04/30/14,04/30/14,10:05 PM,10:05 PM,FALSE,TRUE,04/30/14,06:05 PM,06:05 PM,FREE,BUSY 61 | 05/02/14,07:05 PM,07:05 PM,Blue Jays at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",05/02/14,05/02/14,10:05 PM,10:05 PM,FALSE,TRUE,05/02/14,06:05 PM,06:05 PM,FREE,BUSY 62 | 05/03/14,07:05 PM,07:05 PM,Blue Jays at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",05/03/14,05/03/14,10:05 PM,10:05 PM,FALSE,TRUE,05/03/14,06:05 PM,06:05 PM,FREE,BUSY 63 | 05/04/14,01:35 PM,01:35 PM,Blue Jays at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",05/04/14,05/04/14,04:35 PM,04:35 PM,FALSE,TRUE,05/04/14,12:35 PM,12:35 PM,FREE,BUSY 64 | 05/05/14,07:05 PM,07:05 PM,Giants at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",05/05/14,05/05/14,10:05 PM,10:05 PM,FALSE,TRUE,05/05/14,06:05 PM,06:05 PM,FREE,BUSY 65 | 05/06/14,07:05 PM,07:05 PM,Giants at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",05/06/14,05/06/14,10:05 PM,10:05 PM,FALSE,TRUE,05/06/14,06:05 PM,06:05 PM,FREE,BUSY 66 | 05/07/14,12:35 PM,12:35 PM,Giants at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",05/07/14,05/07/14,03:35 PM,03:35 PM,FALSE,TRUE,05/07/14,11:35 AM,11:35 AM,FREE,BUSY 67 | 05/09/14,07:05 PM,07:05 PM,Cardinals at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",05/09/14,05/09/14,10:05 PM,10:05 PM,FALSE,TRUE,05/09/14,06:05 PM,06:05 PM,FREE,BUSY 68 | 05/10/14,07:05 PM,07:05 PM,Cardinals at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",05/10/14,05/10/14,10:05 PM,10:05 PM,FALSE,TRUE,05/10/14,06:05 PM,06:05 PM,FREE,BUSY 69 | 05/11/14,08:05 PM,08:05 PM,Cardinals at Pirates,PNC Park,"Local TV: ESPN ----- Local Radio: KDKA-FM 93.7",05/11/14,05/11/14,11:05 PM,11:05 PM,FALSE,TRUE,05/11/14,07:05 PM,07:05 PM,FREE,BUSY 70 | 05/13/14,08:10 PM,08:10 PM,Pirates at Brewers,Miller Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",05/13/14,05/13/14,11:10 PM,11:10 PM,FALSE,TRUE,05/13/14,07:10 PM,07:10 PM,FREE,BUSY 71 | 05/14/14,08:10 PM,08:10 PM,Pirates at Brewers,Miller Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",05/14/14,05/14/14,11:10 PM,11:10 PM,FALSE,TRUE,05/14/14,07:10 PM,07:10 PM,FREE,BUSY 72 | 05/15/14,01:10 PM,01:10 PM,Pirates at Brewers,Miller Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",05/15/14,05/15/14,04:10 PM,04:10 PM,FALSE,TRUE,05/15/14,12:10 PM,12:10 PM,FREE,BUSY 73 | 05/16/14,07:05 PM,07:05 PM,Pirates at Yankees,Yankee Stadium,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",05/16/14,05/16/14,10:05 PM,10:05 PM,FALSE,TRUE,05/16/14,06:05 PM,06:05 PM,FREE,BUSY 74 | 05/17/14,04:05 PM,04:05 PM,Pirates at Yankees,Yankee Stadium,"Local TV: FS1 -- ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",05/17/14,05/17/14,07:05 PM,07:05 PM,FALSE,TRUE,05/17/14,03:05 PM,03:05 PM,FREE,BUSY 75 | 05/18/14,01:05 PM,01:05 PM,Pirates at Yankees,Yankee Stadium,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",05/18/14,05/18/14,04:05 PM,04:05 PM,FALSE,TRUE,05/18/14,12:05 PM,12:05 PM,FREE,BUSY 76 | 05/20/14,07:05 PM,07:05 PM,Orioles at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",05/20/14,05/20/14,10:05 PM,10:05 PM,FALSE,TRUE,05/20/14,06:05 PM,06:05 PM,FREE,BUSY 77 | 05/21/14,07:05 PM,07:05 PM,Orioles at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",05/21/14,05/21/14,10:05 PM,10:05 PM,FALSE,TRUE,05/21/14,06:05 PM,06:05 PM,FREE,BUSY 78 | 05/22/14,07:05 PM,07:05 PM,Nationals at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",05/22/14,05/22/14,10:05 PM,10:05 PM,FALSE,TRUE,05/22/14,06:05 PM,06:05 PM,FREE,BUSY 79 | 05/23/14,07:05 PM,07:05 PM,Nationals at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",05/23/14,05/23/14,10:05 PM,10:05 PM,FALSE,TRUE,05/23/14,06:05 PM,06:05 PM,FREE,BUSY 80 | 05/24/14,07:15 PM,07:15 PM,Nationals at Pirates,PNC Park,"Local TV: FOX ----- Local Radio: KDKA-FM 93.7",05/24/14,05/24/14,10:15 PM,10:15 PM,FALSE,TRUE,05/24/14,06:15 PM,06:15 PM,FREE,BUSY 81 | 05/25/14,01:35 PM,01:35 PM,Nationals at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",05/25/14,05/25/14,04:35 PM,04:35 PM,FALSE,TRUE,05/25/14,12:35 PM,12:35 PM,FREE,BUSY 82 | 05/26/14,01:10 PM,01:10 PM,Pirates at Mets,Citi Field,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",05/26/14,05/26/14,04:10 PM,04:10 PM,FALSE,TRUE,05/26/14,12:10 PM,12:10 PM,FREE,BUSY 83 | 05/27/14,07:10 PM,07:10 PM,Pirates at Mets,Citi Field,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",05/27/14,05/27/14,10:10 PM,10:10 PM,FALSE,TRUE,05/27/14,06:10 PM,06:10 PM,FREE,BUSY 84 | 05/28/14,01:10 PM,01:10 PM,Pirates at Mets,Citi Field,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",05/28/14,05/28/14,04:10 PM,04:10 PM,FALSE,TRUE,05/28/14,12:10 PM,12:10 PM,FREE,BUSY 85 | 05/29/14,10:10 PM,10:10 PM,Pirates at Dodgers,Dodger Stadium,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",05/30/14,05/30/14,01:10 AM,01:10 AM,FALSE,TRUE,05/29/14,09:10 PM,09:10 PM,FREE,BUSY 86 | 05/30/14,10:10 PM,10:10 PM,Pirates at Dodgers,Dodger Stadium,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",05/31/14,05/31/14,01:10 AM,01:10 AM,FALSE,TRUE,05/30/14,09:10 PM,09:10 PM,FREE,BUSY 87 | 05/31/14,07:15 PM,07:15 PM,Pirates at Dodgers,Dodger Stadium,"Local TV: FOX ----- Local Radio: KDKA-FM 93.7",05/31/14,05/31/14,10:15 PM,10:15 PM,FALSE,TRUE,05/31/14,06:15 PM,06:15 PM,FREE,BUSY 88 | 06/01/14,03:33 AM,03:33 AM,Pirates at Dodgers,Dodger Stadium,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",06/01/14,06/01/14,06:33 AM,06:33 AM,FALSE,TRUE,06/01/14,02:33 AM,02:33 AM,FREE,BUSY 89 | 06/02/14,10:10 PM,10:10 PM,Pirates at Padres,Petco Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",06/03/14,06/03/14,01:10 AM,01:10 AM,FALSE,TRUE,06/02/14,09:10 PM,09:10 PM,FREE,BUSY 90 | 06/03/14,10:10 PM,10:10 PM,Pirates at Padres,Petco Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",06/04/14,06/04/14,01:10 AM,01:10 AM,FALSE,TRUE,06/03/14,09:10 PM,09:10 PM,FREE,BUSY 91 | 06/04/14,06:40 PM,06:40 PM,Pirates at Padres,Petco Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",06/04/14,06/04/14,09:40 PM,09:40 PM,FALSE,TRUE,06/04/14,05:40 PM,05:40 PM,FREE,BUSY 92 | 06/06/14,07:05 PM,07:05 PM,Brewers at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",06/06/14,06/06/14,10:05 PM,10:05 PM,FALSE,TRUE,06/06/14,06:05 PM,06:05 PM,FREE,BUSY 93 | 06/07/14,04:05 PM,04:05 PM,Brewers at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",06/07/14,06/07/14,07:05 PM,07:05 PM,FALSE,TRUE,06/07/14,03:05 PM,03:05 PM,FREE,BUSY 94 | 06/08/14,01:35 PM,01:35 PM,Brewers at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",06/08/14,06/08/14,04:35 PM,04:35 PM,FALSE,TRUE,06/08/14,12:35 PM,12:35 PM,FREE,BUSY 95 | 06/09/14,07:05 PM,07:05 PM,Cubs at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",06/09/14,06/09/14,10:05 PM,10:05 PM,FALSE,TRUE,06/09/14,06:05 PM,06:05 PM,FREE,BUSY 96 | 06/10/14,07:05 PM,07:05 PM,Cubs at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",06/10/14,06/10/14,10:05 PM,10:05 PM,FALSE,TRUE,06/10/14,06:05 PM,06:05 PM,FREE,BUSY 97 | 06/11/14,07:05 PM,07:05 PM,Cubs at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",06/11/14,06/11/14,10:05 PM,10:05 PM,FALSE,TRUE,06/11/14,06:05 PM,06:05 PM,FREE,BUSY 98 | 06/12/14,07:05 PM,07:05 PM,Cubs at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",06/12/14,06/12/14,10:05 PM,10:05 PM,FALSE,TRUE,06/12/14,06:05 PM,06:05 PM,FREE,BUSY 99 | 06/13/14,07:10 PM,07:10 PM,Pirates at Marlins,Marlins Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",06/13/14,06/13/14,10:10 PM,10:10 PM,FALSE,TRUE,06/13/14,06:10 PM,06:10 PM,FREE,BUSY 100 | 06/14/14,04:10 PM,04:10 PM,Pirates at Marlins,Marlins Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",06/14/14,06/14/14,07:10 PM,07:10 PM,FALSE,TRUE,06/14/14,03:10 PM,03:10 PM,FREE,BUSY 101 | 06/15/14,01:10 PM,01:10 PM,Pirates at Marlins,Marlins Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",06/15/14,06/15/14,04:10 PM,04:10 PM,FALSE,TRUE,06/15/14,12:10 PM,12:10 PM,FREE,BUSY 102 | 06/17/14,07:05 PM,07:05 PM,Reds at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",06/17/14,06/17/14,10:05 PM,10:05 PM,FALSE,TRUE,06/17/14,06:05 PM,06:05 PM,FREE,BUSY 103 | 06/18/14,07:05 PM,07:05 PM,Reds at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",06/18/14,06/18/14,10:05 PM,10:05 PM,FALSE,TRUE,06/18/14,06:05 PM,06:05 PM,FREE,BUSY 104 | 06/19/14,12:35 PM,12:35 PM,Reds at Pirates,PNC Park,"Local Radio: KDKA-FM 93.7",06/19/14,06/19/14,03:35 PM,03:35 PM,FALSE,TRUE,06/19/14,11:35 AM,11:35 AM,FREE,BUSY 105 | 06/20/14,04:05 PM,04:05 PM,Pirates at Cubs,Wrigley Field,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",06/20/14,06/20/14,07:05 PM,07:05 PM,FALSE,TRUE,06/20/14,03:05 PM,03:05 PM,FREE,BUSY 106 | 06/21/14,07:15 PM,07:15 PM,Pirates at Cubs,Wrigley Field,"Local TV: FOX ----- Local Radio: KDKA-FM 93.7",06/21/14,06/21/14,10:15 PM,10:15 PM,FALSE,TRUE,06/21/14,06:15 PM,06:15 PM,FREE,BUSY 107 | 06/22/14,02:20 PM,02:20 PM,Pirates at Cubs,Wrigley Field,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",06/22/14,06/22/14,05:20 PM,05:20 PM,FALSE,TRUE,06/22/14,01:20 PM,01:20 PM,FREE,BUSY 108 | 06/23/14,07:10 PM,07:10 PM,Pirates at Rays,Tropicana Field,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",06/23/14,06/23/14,10:10 PM,10:10 PM,FALSE,TRUE,06/23/14,06:10 PM,06:10 PM,FREE,BUSY 109 | 06/24/14,07:10 PM,07:10 PM,Pirates at Rays,Tropicana Field,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",06/24/14,06/24/14,10:10 PM,10:10 PM,FALSE,TRUE,06/24/14,06:10 PM,06:10 PM,FREE,BUSY 110 | 06/25/14,12:10 PM,12:10 PM,Pirates at Rays,Tropicana Field,"Local Radio: KDKA-FM 93.7",06/25/14,06/25/14,03:10 PM,03:10 PM,FALSE,TRUE,06/25/14,11:10 AM,11:10 AM,FREE,BUSY 111 | 06/26/14,07:05 PM,07:05 PM,Mets at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",06/26/14,06/26/14,10:05 PM,10:05 PM,FALSE,TRUE,06/26/14,06:05 PM,06:05 PM,FREE,BUSY 112 | 06/27/14,07:05 PM,07:05 PM,Mets at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",06/27/14,06/27/14,10:05 PM,10:05 PM,FALSE,TRUE,06/27/14,06:05 PM,06:05 PM,FREE,BUSY 113 | 06/28/14,04:05 PM,04:05 PM,Mets at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",06/28/14,06/28/14,07:05 PM,07:05 PM,FALSE,TRUE,06/28/14,03:05 PM,03:05 PM,FREE,BUSY 114 | 06/29/14,01:35 PM,01:35 PM,Mets at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",06/29/14,06/29/14,04:35 PM,04:35 PM,FALSE,TRUE,06/29/14,12:35 PM,12:35 PM,FREE,BUSY 115 | 07/01/14,07:05 PM,07:05 PM,D-backs at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",07/01/14,07/01/14,10:05 PM,10:05 PM,FALSE,TRUE,07/01/14,06:05 PM,06:05 PM,FREE,BUSY 116 | 07/02/14,07:05 PM,07:05 PM,D-backs at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",07/02/14,07/02/14,10:05 PM,10:05 PM,FALSE,TRUE,07/02/14,06:05 PM,06:05 PM,FREE,BUSY 117 | 07/03/14,07:05 PM,07:05 PM,D-backs at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",07/03/14,07/03/14,10:05 PM,10:05 PM,FALSE,TRUE,07/03/14,06:05 PM,06:05 PM,FREE,BUSY 118 | 07/04/14,05:05 PM,05:05 PM,Phillies at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",07/04/14,07/04/14,08:05 PM,08:05 PM,FALSE,TRUE,07/04/14,04:05 PM,04:05 PM,FREE,BUSY 119 | 07/05/14,04:05 PM,04:05 PM,Phillies at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",07/05/14,07/05/14,07:05 PM,07:05 PM,FALSE,TRUE,07/05/14,03:05 PM,03:05 PM,FREE,BUSY 120 | 07/06/14,01:35 PM,01:35 PM,Phillies at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",07/06/14,07/06/14,04:35 PM,04:35 PM,FALSE,TRUE,07/06/14,12:35 PM,12:35 PM,FREE,BUSY 121 | 07/07/14,08:15 PM,08:15 PM,Pirates at Cardinals,Busch Stadium,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",07/07/14,07/07/14,11:15 PM,11:15 PM,FALSE,TRUE,07/07/14,07:15 PM,07:15 PM,FREE,BUSY 122 | 07/08/14,08:15 PM,08:15 PM,Pirates at Cardinals,Busch Stadium,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",07/08/14,07/08/14,11:15 PM,11:15 PM,FALSE,TRUE,07/08/14,07:15 PM,07:15 PM,FREE,BUSY 123 | 07/09/14,08:15 PM,08:15 PM,Pirates at Cardinals,Busch Stadium,"Local Radio: KDKA-FM 93.7",07/09/14,07/09/14,11:15 PM,11:15 PM,FALSE,TRUE,07/09/14,07:15 PM,07:15 PM,FREE,BUSY 124 | 07/10/14,07:15 PM,07:15 PM,Pirates at Cardinals,Busch Stadium,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",07/10/14,07/10/14,10:15 PM,10:15 PM,FALSE,TRUE,07/10/14,06:15 PM,06:15 PM,FREE,BUSY 125 | 07/11/14,07:10 PM,07:10 PM,Pirates at Reds,Great American Ball Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",07/11/14,07/11/14,10:10 PM,10:10 PM,FALSE,TRUE,07/11/14,06:10 PM,06:10 PM,FREE,BUSY 126 | 07/12/14,07:15 PM,07:15 PM,Pirates at Reds,Great American Ball Park,"Local TV: FOX ----- Local Radio: KDKA-FM 93.7",07/12/14,07/12/14,10:15 PM,10:15 PM,FALSE,TRUE,07/12/14,06:15 PM,06:15 PM,FREE,BUSY 127 | 07/13/14,01:10 PM,01:10 PM,Pirates at Reds,Great American Ball Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",07/13/14,07/13/14,04:10 PM,04:10 PM,FALSE,TRUE,07/13/14,12:10 PM,12:10 PM,FREE,BUSY 128 | 07/15/14,03:33 AM,03:33 AM,NL All-Stars at AL All-Stars,Target Field,,07/15/14,07/15/14,06:33 AM,06:33 AM,FALSE,TRUE,07/15/14,02:33 AM,02:33 AM,FREE,BUSY 129 | 07/18/14,07:05 PM,07:05 PM,Rockies at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",07/18/14,07/18/14,10:05 PM,10:05 PM,FALSE,TRUE,07/18/14,06:05 PM,06:05 PM,FREE,BUSY 130 | 07/19/14,07:05 PM,07:05 PM,Rockies at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",07/19/14,07/19/14,10:05 PM,10:05 PM,FALSE,TRUE,07/19/14,06:05 PM,06:05 PM,FREE,BUSY 131 | 07/20/14,01:35 PM,01:35 PM,Rockies at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",07/20/14,07/20/14,04:35 PM,04:35 PM,FALSE,TRUE,07/20/14,12:35 PM,12:35 PM,FREE,BUSY 132 | 07/21/14,07:05 PM,07:05 PM,Dodgers at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",07/21/14,07/21/14,10:05 PM,10:05 PM,FALSE,TRUE,07/21/14,06:05 PM,06:05 PM,FREE,BUSY 133 | 07/22/14,07:05 PM,07:05 PM,Dodgers at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",07/22/14,07/22/14,10:05 PM,10:05 PM,FALSE,TRUE,07/22/14,06:05 PM,06:05 PM,FREE,BUSY 134 | 07/23/14,07:05 PM,07:05 PM,Dodgers at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",07/23/14,07/23/14,10:05 PM,10:05 PM,FALSE,TRUE,07/23/14,06:05 PM,06:05 PM,FREE,BUSY 135 | 07/25/14,08:40 PM,08:40 PM,Pirates at Rockies,Coors Field,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",07/25/14,07/25/14,11:40 PM,11:40 PM,FALSE,TRUE,07/25/14,07:40 PM,07:40 PM,FREE,BUSY 136 | 07/26/14,08:10 PM,08:10 PM,Pirates at Rockies,Coors Field,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",07/26/14,07/26/14,11:10 PM,11:10 PM,FALSE,TRUE,07/26/14,07:10 PM,07:10 PM,FREE,BUSY 137 | 07/27/14,04:10 PM,04:10 PM,Pirates at Rockies,Coors Field,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",07/27/14,07/27/14,07:10 PM,07:10 PM,FALSE,TRUE,07/27/14,03:10 PM,03:10 PM,FREE,BUSY 138 | 07/28/14,10:15 PM,10:15 PM,Pirates at Giants,AT&T Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",07/29/14,07/29/14,01:15 AM,01:15 AM,FALSE,TRUE,07/28/14,09:15 PM,09:15 PM,FREE,BUSY 139 | 07/29/14,10:15 PM,10:15 PM,Pirates at Giants,AT&T Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",07/30/14,07/30/14,01:15 AM,01:15 AM,FALSE,TRUE,07/29/14,09:15 PM,09:15 PM,FREE,BUSY 140 | 07/30/14,03:45 PM,03:45 PM,Pirates at Giants,AT&T Park,"Local Radio: KDKA-FM 93.7",07/30/14,07/30/14,06:45 PM,06:45 PM,FALSE,TRUE,07/30/14,02:45 PM,02:45 PM,FREE,BUSY 141 | 07/31/14,09:40 PM,09:40 PM,Pirates at D-backs,Chase Field,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",08/01/14,08/01/14,12:40 AM,12:40 AM,FALSE,TRUE,07/31/14,08:40 PM,08:40 PM,FREE,BUSY 142 | 08/01/14,09:40 PM,09:40 PM,Pirates at D-backs,Chase Field,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",08/02/14,08/02/14,12:40 AM,12:40 AM,FALSE,TRUE,08/01/14,08:40 PM,08:40 PM,FREE,BUSY 143 | 08/02/14,08:10 PM,08:10 PM,Pirates at D-backs,Chase Field,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",08/02/14,08/02/14,11:10 PM,11:10 PM,FALSE,TRUE,08/02/14,07:10 PM,07:10 PM,FREE,BUSY 144 | 08/03/14,04:10 PM,04:10 PM,Pirates at D-backs,Chase Field,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",08/03/14,08/03/14,07:10 PM,07:10 PM,FALSE,TRUE,08/03/14,03:10 PM,03:10 PM,FREE,BUSY 145 | 08/05/14,07:05 PM,07:05 PM,Marlins at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",08/05/14,08/05/14,10:05 PM,10:05 PM,FALSE,TRUE,08/05/14,06:05 PM,06:05 PM,FREE,BUSY 146 | 08/06/14,07:05 PM,07:05 PM,Marlins at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",08/06/14,08/06/14,10:05 PM,10:05 PM,FALSE,TRUE,08/06/14,06:05 PM,06:05 PM,FREE,BUSY 147 | 08/07/14,07:05 PM,07:05 PM,Marlins at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",08/07/14,08/07/14,10:05 PM,10:05 PM,FALSE,TRUE,08/07/14,06:05 PM,06:05 PM,FREE,BUSY 148 | 08/08/14,07:05 PM,07:05 PM,Padres at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",08/08/14,08/08/14,10:05 PM,10:05 PM,FALSE,TRUE,08/08/14,06:05 PM,06:05 PM,FREE,BUSY 149 | 08/09/14,07:05 PM,07:05 PM,Padres at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",08/09/14,08/09/14,10:05 PM,10:05 PM,FALSE,TRUE,08/09/14,06:05 PM,06:05 PM,FREE,BUSY 150 | 08/10/14,01:35 PM,01:35 PM,Padres at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",08/10/14,08/10/14,04:35 PM,04:35 PM,FALSE,TRUE,08/10/14,12:35 PM,12:35 PM,FREE,BUSY 151 | 08/11/14,07:05 PM,07:05 PM,Tigers at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",08/11/14,08/11/14,10:05 PM,10:05 PM,FALSE,TRUE,08/11/14,06:05 PM,06:05 PM,FREE,BUSY 152 | 08/12/14,07:05 PM,07:05 PM,Tigers at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",08/12/14,08/12/14,10:05 PM,10:05 PM,FALSE,TRUE,08/12/14,06:05 PM,06:05 PM,FREE,BUSY 153 | 08/13/14,07:08 PM,07:08 PM,Pirates at Tigers,Comerica Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",08/13/14,08/13/14,10:08 PM,10:08 PM,FALSE,TRUE,08/13/14,06:08 PM,06:08 PM,FREE,BUSY 154 | 08/14/14,01:08 PM,01:08 PM,Pirates at Tigers,Comerica Park,"Local Radio: KDKA-FM 93.7",08/14/14,08/14/14,04:08 PM,04:08 PM,FALSE,TRUE,08/14/14,12:08 PM,12:08 PM,FREE,BUSY 155 | 08/15/14,07:05 PM,07:05 PM,Pirates at Nationals,Nationals Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",08/15/14,08/15/14,10:05 PM,10:05 PM,FALSE,TRUE,08/15/14,06:05 PM,06:05 PM,FREE,BUSY 156 | 08/16/14,07:05 PM,07:05 PM,Pirates at Nationals,Nationals Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",08/16/14,08/16/14,10:05 PM,10:05 PM,FALSE,TRUE,08/16/14,06:05 PM,06:05 PM,FREE,BUSY 157 | 08/17/14,05:05 PM,05:05 PM,Pirates at Nationals,Nationals Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",08/17/14,08/17/14,08:05 PM,08:05 PM,FALSE,TRUE,08/17/14,04:05 PM,04:05 PM,FREE,BUSY 158 | 08/18/14,07:05 PM,07:05 PM,Braves at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",08/18/14,08/18/14,10:05 PM,10:05 PM,FALSE,TRUE,08/18/14,06:05 PM,06:05 PM,FREE,BUSY 159 | 08/19/14,07:05 PM,07:05 PM,Braves at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",08/19/14,08/19/14,10:05 PM,10:05 PM,FALSE,TRUE,08/19/14,06:05 PM,06:05 PM,FREE,BUSY 160 | 08/20/14,07:05 PM,07:05 PM,Braves at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",08/20/14,08/20/14,10:05 PM,10:05 PM,FALSE,TRUE,08/20/14,06:05 PM,06:05 PM,FREE,BUSY 161 | 08/22/14,08:10 PM,08:10 PM,Pirates at Brewers,Miller Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",08/22/14,08/22/14,11:10 PM,11:10 PM,FALSE,TRUE,08/22/14,07:10 PM,07:10 PM,FREE,BUSY 162 | 08/23/14,07:10 PM,07:10 PM,Pirates at Brewers,Miller Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",08/23/14,08/23/14,10:10 PM,10:10 PM,FALSE,TRUE,08/23/14,06:10 PM,06:10 PM,FREE,BUSY 163 | 08/24/14,02:10 PM,02:10 PM,Pirates at Brewers,Miller Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",08/24/14,08/24/14,05:10 PM,05:10 PM,FALSE,TRUE,08/24/14,01:10 PM,01:10 PM,FREE,BUSY 164 | 08/25/14,07:05 PM,07:05 PM,Cardinals at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",08/25/14,08/25/14,10:05 PM,10:05 PM,FALSE,TRUE,08/25/14,06:05 PM,06:05 PM,FREE,BUSY 165 | 08/26/14,07:05 PM,07:05 PM,Cardinals at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",08/26/14,08/26/14,10:05 PM,10:05 PM,FALSE,TRUE,08/26/14,06:05 PM,06:05 PM,FREE,BUSY 166 | 08/27/14,12:35 PM,12:35 PM,Cardinals at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",08/27/14,08/27/14,03:35 PM,03:35 PM,FALSE,TRUE,08/27/14,11:35 AM,11:35 AM,FREE,BUSY 167 | 08/29/14,07:05 PM,07:05 PM,Reds at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",08/29/14,08/29/14,10:05 PM,10:05 PM,FALSE,TRUE,08/29/14,06:05 PM,06:05 PM,FREE,BUSY 168 | 08/30/14,04:05 PM,04:05 PM,Reds at Pirates,PNC Park,"Local TV: ROOT SPORTS -- FS1 ----- Local Radio: KDKA-FM 93.7",08/30/14,08/30/14,07:05 PM,07:05 PM,FALSE,TRUE,08/30/14,03:05 PM,03:05 PM,FREE,BUSY 169 | 08/31/14,01:35 PM,01:35 PM,Reds at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",08/31/14,08/31/14,04:35 PM,04:35 PM,FALSE,TRUE,08/31/14,12:35 PM,12:35 PM,FREE,BUSY 170 | 09/01/14,02:15 PM,02:15 PM,Pirates at Cardinals,Busch Stadium,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",09/01/14,09/01/14,05:15 PM,05:15 PM,FALSE,TRUE,09/01/14,01:15 PM,01:15 PM,FREE,BUSY 171 | 09/02/14,08:15 PM,08:15 PM,Pirates at Cardinals,Busch Stadium,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",09/02/14,09/02/14,11:15 PM,11:15 PM,FALSE,TRUE,09/02/14,07:15 PM,07:15 PM,FREE,BUSY 172 | 09/03/14,01:45 PM,01:45 PM,Pirates at Cardinals,Busch Stadium,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",09/03/14,09/03/14,04:45 PM,04:45 PM,FALSE,TRUE,09/03/14,12:45 PM,12:45 PM,FREE,BUSY 173 | 09/05/14,02:20 PM,02:20 PM,Pirates at Cubs,Wrigley Field,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",09/05/14,09/05/14,05:20 PM,05:20 PM,FALSE,TRUE,09/05/14,01:20 PM,01:20 PM,FREE,BUSY 174 | 09/06/14,04:05 PM,04:05 PM,Pirates at Cubs,Wrigley Field,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",09/06/14,09/06/14,07:05 PM,07:05 PM,FALSE,TRUE,09/06/14,03:05 PM,03:05 PM,FREE,BUSY 175 | 09/07/14,02:20 PM,02:20 PM,Pirates at Cubs,Wrigley Field,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",09/07/14,09/07/14,05:20 PM,05:20 PM,FALSE,TRUE,09/07/14,01:20 PM,01:20 PM,FREE,BUSY 176 | 09/08/14,07:05 PM,07:05 PM,Pirates at Phillies,Citizens Bank Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",09/08/14,09/08/14,10:05 PM,10:05 PM,FALSE,TRUE,09/08/14,06:05 PM,06:05 PM,FREE,BUSY 177 | 09/09/14,07:05 PM,07:05 PM,Pirates at Phillies,Citizens Bank Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",09/09/14,09/09/14,10:05 PM,10:05 PM,FALSE,TRUE,09/09/14,06:05 PM,06:05 PM,FREE,BUSY 178 | 09/10/14,07:05 PM,07:05 PM,Pirates at Phillies,Citizens Bank Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",09/10/14,09/10/14,10:05 PM,10:05 PM,FALSE,TRUE,09/10/14,06:05 PM,06:05 PM,FREE,BUSY 179 | 09/11/14,07:05 PM,07:05 PM,Pirates at Phillies,Citizens Bank Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",09/11/14,09/11/14,10:05 PM,10:05 PM,FALSE,TRUE,09/11/14,06:05 PM,06:05 PM,FREE,BUSY 180 | 09/12/14,07:05 PM,07:05 PM,Cubs at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",09/12/14,09/12/14,10:05 PM,10:05 PM,FALSE,TRUE,09/12/14,06:05 PM,06:05 PM,FREE,BUSY 181 | 09/13/14,07:05 PM,07:05 PM,Cubs at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",09/13/14,09/13/14,10:05 PM,10:05 PM,FALSE,TRUE,09/13/14,06:05 PM,06:05 PM,FREE,BUSY 182 | 09/14/14,01:35 PM,01:35 PM,Cubs at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",09/14/14,09/14/14,04:35 PM,04:35 PM,FALSE,TRUE,09/14/14,12:35 PM,12:35 PM,FREE,BUSY 183 | 09/16/14,07:05 PM,07:05 PM,Red Sox at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",09/16/14,09/16/14,10:05 PM,10:05 PM,FALSE,TRUE,09/16/14,06:05 PM,06:05 PM,FREE,BUSY 184 | 09/17/14,07:05 PM,07:05 PM,Red Sox at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",09/17/14,09/17/14,10:05 PM,10:05 PM,FALSE,TRUE,09/17/14,06:05 PM,06:05 PM,FREE,BUSY 185 | 09/18/14,07:05 PM,07:05 PM,Red Sox at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",09/18/14,09/18/14,10:05 PM,10:05 PM,FALSE,TRUE,09/18/14,06:05 PM,06:05 PM,FREE,BUSY 186 | 09/19/14,07:05 PM,07:05 PM,Brewers at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",09/19/14,09/19/14,10:05 PM,10:05 PM,FALSE,TRUE,09/19/14,06:05 PM,06:05 PM,FREE,BUSY 187 | 09/20/14,07:05 PM,07:05 PM,Brewers at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",09/20/14,09/20/14,10:05 PM,10:05 PM,FALSE,TRUE,09/20/14,06:05 PM,06:05 PM,FREE,BUSY 188 | 09/21/14,01:35 PM,01:35 PM,Brewers at Pirates,PNC Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",09/21/14,09/21/14,04:35 PM,04:35 PM,FALSE,TRUE,09/21/14,12:35 PM,12:35 PM,FREE,BUSY 189 | 09/22/14,07:10 PM,07:10 PM,Pirates at Braves,Turner Field,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",09/22/14,09/22/14,10:10 PM,10:10 PM,FALSE,TRUE,09/22/14,06:10 PM,06:10 PM,FREE,BUSY 190 | 09/23/14,07:10 PM,07:10 PM,Pirates at Braves,Turner Field,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",09/23/14,09/23/14,10:10 PM,10:10 PM,FALSE,TRUE,09/23/14,06:10 PM,06:10 PM,FREE,BUSY 191 | 09/24/14,07:10 PM,07:10 PM,Pirates at Braves,Turner Field,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",09/24/14,09/24/14,10:10 PM,10:10 PM,FALSE,TRUE,09/24/14,06:10 PM,06:10 PM,FREE,BUSY 192 | 09/25/14,07:10 PM,07:10 PM,Pirates at Braves,Turner Field,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",09/25/14,09/25/14,10:10 PM,10:10 PM,FALSE,TRUE,09/25/14,06:10 PM,06:10 PM,FREE,BUSY 193 | 09/26/14,07:10 PM,07:10 PM,Pirates at Reds,Great American Ball Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",09/26/14,09/26/14,10:10 PM,10:10 PM,FALSE,TRUE,09/26/14,06:10 PM,06:10 PM,FREE,BUSY 194 | 09/27/14,04:10 PM,04:10 PM,Pirates at Reds,Great American Ball Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",09/27/14,09/27/14,07:10 PM,07:10 PM,FALSE,TRUE,09/27/14,03:10 PM,03:10 PM,FREE,BUSY 195 | 09/28/14,01:10 PM,01:10 PM,Pirates at Reds,Great American Ball Park,"Local TV: ROOT SPORTS ----- Local Radio: KDKA-FM 93.7",09/28/14,09/28/14,04:10 PM,04:10 PM,FALSE,TRUE,09/28/14,12:10 PM,12:10 PM,FREE,BUSY -------------------------------------------------------------------------------- /data/simple.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | test 5 | 64 | 65 | 66 |
67 |
68 | 71 | 85 |
86 |
87 |

Lorem ipsum dolor sit amet, ...

88 |
89 |
90 |

Nunc cursus, justo eget elementum dictum, ...

91 |
92 |
93 |
94 | 95 |
96 | 97 | 98 | 99 | 100 | 159 | -------------------------------------------------------------------------------- /family_tree.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | 3 | # Let's grab the simple page source. 4 | simple_page = open('data/simple.html').read() 5 | 6 | # Let's open it with BS so we can iterate over the family tree. 7 | simple_soup = BeautifulSoup(simple_page) 8 | 9 | # Let's highlight our current element and return it so we can play around! 10 | current_elem = simple_soup.findAll('div', {'class': 'navblock'})[1] 11 | -------------------------------------------------------------------------------- /json_scraper.py: -------------------------------------------------------------------------------- 1 | import json 2 | from urllib import urlopen 3 | 4 | ip_info = urlopen('http://freegeoip.net/json/').read() 5 | 6 | my_ip = json.loads(ip_info) 7 | 8 | print "I think you're at: %f lat, %f long and in %s" % ( 9 | my_ip.get('latitude'), my_ip.get('longitude'), my_ip.get('city')) 10 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | argparse==1.2.1 2 | beautifulsoup4==4.3.2 3 | cssselect==0.9.1 4 | lxml==3.3.3 5 | selenium==2.41.0 6 | wsgiref==0.1.2 7 | xlrd==0.9.2 8 | -------------------------------------------------------------------------------- /scrape_netflix.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.keys import Keys 3 | from selenium.webdriver.common.action_chains import ActionChains 4 | from selenium.webdriver.common.by import By 5 | from selenium.webdriver.support.ui import WebDriverWait 6 | from selenium.webdriver.support import expected_conditions as EC 7 | from time import sleep 8 | 9 | MY_EMAIL = '' 10 | MY_PASSWORD = '' 11 | MY_PROFILE_NAME = '' 12 | 13 | browser = webdriver.Firefox() 14 | browser.get('http://netflix.com') 15 | browser.find_element_by_link_text('Sign In').click() 16 | email = browser.find_element_by_css_selector('input#email') 17 | email.send_keys(MY_EMAIL) 18 | pw = browser.find_element_by_css_selector('input#password') 19 | pw.send_keys(MY_PASSWORD, Keys.RETURN) 20 | browser.implicitly_wait(10) # seconds 21 | browser.find_element_by_link_text(MY_PROFILE_NAME).click() 22 | browser.maximize_window() 23 | rows = browser.find_elements_by_css_selector('div.mrow') 24 | 25 | for r in rows: 26 | if 'Top Picks' in r.text: 27 | top_pix = r 28 | break 29 | 30 | movie_recs = top_pix.find_elements_by_css_selector( 31 | 'div.agMovieSet div.agMovie') 32 | 33 | 34 | first_movie = movie_recs[0].location 35 | scroll_down = ActionChains(browser).move_by_offset( 36 | 10, first_movie.get('y') - 10) 37 | scroll_down.perform() 38 | 39 | movie_dict = {} 40 | 41 | for movie in movie_recs: 42 | movie_link = movie.find_element_by_css_selector('a.bobbable') 43 | try: 44 | arrow = top_pix.find_element_by_css_selector('div.next.sliderButton') 45 | if arrow.location.get('x') - movie_link.location.get('x') < 80: 46 | hover = ActionChains(browser).move_to_element(arrow) 47 | hover.perform() 48 | sleep(4) 49 | hover = ActionChains(browser).move_to_element(movie_link) 50 | hover.perform() 51 | except Exception, e: 52 | print e 53 | hover = ActionChains(browser).move_to_element(arrow) 54 | hover.perform() 55 | sleep(5) 56 | move_off_arrow = ActionChains(browser).move_by_offset(-450, -130) 57 | move_off_arrow.perform() 58 | hover = ActionChains(browser).move_to_element(movie_link) 59 | hover.perform() 60 | try: 61 | movie_info = WebDriverWait(browser, 10).until( 62 | EC.element_to_be_clickable((By.ID, 'BobMovie'))) 63 | title = movie_info.find_element_by_class_name('title').text 64 | link = movie_info.find_element_by_class_name( 65 | 'mdpLink').get_attribute('href') 66 | desc = movie_info.find_element_by_class_name( 67 | 'bobMovieContent').text.split('\n')[0] 68 | cast = movie_info.find_element_by_tag_name('dd').text 69 | movie_dict[title] = {'link': link, 'title': title, 70 | 'desc': desc, 'cast': cast} 71 | except: 72 | print "taking too long!" 73 | scroll_off = ActionChains(browser).move_by_offset(30, -130) 74 | scroll_off.perform() 75 | sleep(2) 76 | -------------------------------------------------------------------------------- /scraper.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | from lxml import html 3 | from email.MIMEText import MIMEText 4 | import smtplib 5 | 6 | GMAIL_LOGIN = 'pyladiestest@gmail.com' 7 | GMAIL_PASSWORD = 'YOU NO CAN HAZ' 8 | 9 | def send_email(subject, message, from_addr=GMAIL_LOGIN, to_addr=GMAIL_LOGIN): 10 | msg = MIMEText(message) 11 | msg['Subject'] = subject 12 | msg['From'] = from_addr 13 | msg['To'] = to_addr 14 | msg['Reply-To'] = 'happyhours@noreply.com' 15 | 16 | server = smtplib.SMTP('smtp.gmail.com',587) #port 465 or 587 17 | server.ehlo() 18 | server.starttls() 19 | server.ehlo() 20 | server.login(GMAIL_LOGIN,GMAIL_PASSWORD) 21 | server.sendmail(from_addr, to_addr, msg.as_string()) 22 | server.close() 23 | 24 | 25 | def get_site_html(url): 26 | source = urllib2.urlopen(url).read() 27 | return source 28 | 29 | 30 | def get_all_tags(url,tag): 31 | source = get_site_html(url) 32 | tree = html.document_fromstring(source) 33 | return tree.cssselect(tag) 34 | 35 | 36 | if __name__ == '__main__': 37 | 38 | stuff_i_like = ['burger','wine','sushi','sweet potato fries','BBQ'] 39 | found_happy_hours = [] 40 | my_happy_hours = [] 41 | 42 | # First, I'm going to identify the areas of the page I want to look at 43 | tables = get_all_tags( 44 | 'http://www.downtownla.com/3_10_happyHours.asp?action=ALL', 45 | 'table table div table td') 46 | 47 | 48 | # Then, I'm going to sort out the *exact* parts of the page 49 | # that match what I'm looking for... 50 | for t in tables: 51 | if t.cssselect('p.calendar_EventTitle'): 52 | found_happy_hours.append(t.text_content()) 53 | 54 | print "The scraper found %d happy hours!" % len(found_happy_hours) 55 | 56 | # Now I'm going to loop through the food I like 57 | # and see if any of the happy hour descriptions match 58 | for food in stuff_i_like: 59 | for hh in found_happy_hours: 60 | # checking for text AND making sure I don't have duplicates 61 | if food in hh and hh not in my_happy_hours: 62 | print "YAY! I found some %s!" % food 63 | my_happy_hours.append(hh) 64 | 65 | print "I think you might like %d of them, yipeeeee!" % len(my_happy_hours) 66 | 67 | #Now, let's make a mail message we can read: 68 | message = 'Hey Katharine,\n\n\n' 69 | message += 'OMG, I found some stuff for you in Downtown, take a look.\n\n' 70 | message += '==============================\n'.join(my_happy_hours) 71 | message = message.encode('utf-8') 72 | # To read more about encoding: 73 | # http://diveintopython.org/xml_processing/unicode.html 74 | message = message.replace('\t','').replace('\r','') 75 | message += '\n\nXOXO,\n Your Py Script' 76 | 77 | #And email it to ourselves! 78 | email = 'katharine@pyladies.com' 79 | send_email('Happy Hour Update',message, 80 | from_addr=GMAIL_LOGIN, to_addr=email) 81 | 82 | -------------------------------------------------------------------------------- /start_selenium.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | 3 | browser = webdriver.Firefox() 4 | -------------------------------------------------------------------------------- /xlsx2csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright information 4 | # 5 | # Copyright (C) 2010-2012 Dilshod Temirkhodjaev 6 | # 7 | # License 8 | # 9 | # This program is free software; you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation; either version 2 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | 22 | __author__ = "Dilshod Temirkhodjaev " 23 | __license__ = "GPL-2+" 24 | 25 | import csv, datetime, zipfile, sys, os 26 | import xml.parsers.expat 27 | from xml.dom import minidom 28 | from optparse import OptionParser 29 | from tempfile import mkstemp,TemporaryFile 30 | import logging 31 | 32 | # see also ruby-roo lib at: http://github.com/hmcgowan/roo 33 | FORMATS = { 34 | 'general' : 'float', 35 | '0' : 'float', 36 | '0.00' : 'float', 37 | '#,##0' : 'float', 38 | '#,##0.00' : 'float', 39 | '0%' : 'percentage', 40 | '0.00%' : 'percentage', 41 | '0.00e+00' : 'float', 42 | 'mm-dd-yy' : 'date', 43 | 'd-mmm-yy' : 'date', 44 | 'd-mmm' : 'date', 45 | 'mmm-yy' : 'date', 46 | 'h:mm am/pm' : 'date', 47 | 'h:mm:ss am/pm' : 'date', 48 | 'h:mm' : 'time', 49 | 'h:mm:ss' : 'time', 50 | 'm/d/yy h:mm' : 'date', 51 | '#,##0 ;(#,##0)' : 'float', 52 | '#,##0 ;[red](#,##0)' : 'float', 53 | '#,##0.00;(#,##0.00)' : 'float', 54 | '#,##0.00;[red](#,##0.00)' : 'float', 55 | 'mm:ss' : 'time', 56 | '[h]:mm:ss' : 'time', 57 | 'mmss.0' : 'time', 58 | '##0.0e+0' : 'float', 59 | '@' : 'float', 60 | 'yyyy\\-mm\\-dd' : 'date', 61 | 'dd/mm/yy' : 'date', 62 | 'hh:mm:ss' : 'time', 63 | "dd/mm/yy\\ hh:mm" : 'date', 64 | 'dd/mm/yyyy hh:mm:ss' : 'date', 65 | 'yy-mm-dd' : 'date', 66 | 'd-mmm-yyyy' : 'date', 67 | 'm/d/yy' : 'date', 68 | 'm/d/yyyy' : 'date', 69 | 'dd-mmm-yyyy' : 'date', 70 | 'dd/mm/yyyy' : 'date', 71 | 'mm/dd/yy hh:mm am/pm' : 'date', 72 | 'mm/dd/yyyy hh:mm:ss' : 'date', 73 | 'yyyy-mm-dd hh:mm:ss' : 'date', 74 | } 75 | STANDARD_FORMATS = { 76 | 0 : 'general', 77 | 1 : '0', 78 | 2 : '0.00', 79 | 3 : '#,##0', 80 | 4 : '#,##0.00', 81 | 9 : '0%', 82 | 10 : '0.00%', 83 | 11 : '0.00e+00', 84 | 12 : '# ?/?', 85 | 13 : '# ??/??', 86 | 14 : 'mm-dd-yy', 87 | 15 : 'd-mmm-yy', 88 | 16 : 'd-mmm', 89 | 17 : 'mmm-yy', 90 | 18 : 'h:mm am/pm', 91 | 19 : 'h:mm:ss am/pm', 92 | 20 : 'h:mm', 93 | 21 : 'h:mm:ss', 94 | 22 : 'm/d/yy h:mm', 95 | 37 : '#,##0 ;(#,##0)', 96 | 38 : '#,##0 ;[red](#,##0)', 97 | 39 : '#,##0.00;(#,##0.00)', 98 | 40 : '#,##0.00;[red](#,##0.00)', 99 | 45 : 'mm:ss', 100 | 46 : '[h]:mm:ss', 101 | 47 : 'mmss.0', 102 | 48 : '##0.0e+0', 103 | 49 : '@', 104 | } 105 | 106 | # 107 | # usage: xlsx2csv("test.xslx", open("test.csv", "w+")) 108 | # parameters: 109 | # sheetid - sheet no to convert (0 for all sheets) 110 | # dateformat - override date/time format 111 | # delimiter - csv columns delimiter symbol 112 | # sheet_delimiter - sheets delimiter used when processing all sheets 113 | # skip_empty_lines - skip empty lines 114 | # 115 | def xlsx2csv(infilepath, outfile, sheetid=1, dateformat=None, delimiter=",", sheetdelimiter="--------", skip_empty_lines=False): 116 | tmp = TemporaryFile() 117 | writer = csv.writer(tmp, quoting=csv.QUOTE_MINIMAL, delimiter=delimiter) 118 | ziphandle = zipfile.ZipFile(infilepath) 119 | try: 120 | shared_strings = parse(ziphandle, SharedStrings, "xl/sharedStrings.xml") 121 | styles = parse(ziphandle, Styles, "xl/styles.xml") 122 | workbook = parse(ziphandle, Workbook, "xl/workbook.xml") 123 | 124 | if sheetid > 0: 125 | sheet = None 126 | for s in workbook.sheets: 127 | if s['id'] == sheetid: 128 | sheet = Sheet(workbook, shared_strings, styles, ziphandle.read("xl/worksheets/sheet%i.xml" %s['id'])) 129 | break 130 | if not sheet: 131 | raise Exception("Sheet %i Not Found" %sheetid) 132 | sheet.set_dateformat(dateformat) 133 | sheet.set_skip_empty_lines(skip_empty_lines) 134 | sheet.to_csv(writer) 135 | else: 136 | for s in workbook.sheets: 137 | if sheetdelimiter != "": 138 | outfile.write(sheetdelimiter + " " + str(s['id']) + " - " + s['name'].encode('utf-8') + "\r\n") 139 | sheet = Sheet(workbook, shared_strings, styles, ziphandle.read("xl/worksheets/sheet%i.xml" %s['id'])) 140 | sheet.set_dateformat(dateformat) 141 | sheet.set_skip_empty_lines(skip_empty_lines) 142 | sheet.to_csv(writer) 143 | finally: 144 | ziphandle.close() 145 | return tmp 146 | 147 | 148 | 149 | def parse(ziphandle, klass, filename): 150 | instance = klass() 151 | if filename in ziphandle.namelist(): 152 | instance.parse(ziphandle.read(filename)) 153 | return instance 154 | 155 | class Workbook: 156 | def __init__(self): 157 | self.sheets = [] 158 | self.date1904 = False 159 | 160 | def parse(self, data): 161 | workbookDoc = minidom.parseString(data) 162 | if len(workbookDoc.firstChild.getElementsByTagName("fileVersion")) == 0: 163 | self.appName = 'unknown' 164 | else: 165 | self.appName = workbookDoc.firstChild.getElementsByTagName("fileVersion")[0]._attrs['appName'].value 166 | try: 167 | self.date1904 = workbookDoc.firstChild.getElementsByTagName("workbookPr")[0]._attrs['date1904'].value.lower().strip() != "false" 168 | except: 169 | pass 170 | 171 | sheets = workbookDoc.firstChild.getElementsByTagName("sheets")[0] 172 | for sheetNode in sheets.getElementsByTagName("sheet"): 173 | attrs = sheetNode._attrs 174 | name = attrs["name"].value 175 | if self.appName == 'xl': 176 | if attrs.has_key('r:id'): id = int(attrs["r:id"].value[3:]) 177 | else: id = int(attrs['sheetId'].value) 178 | else: 179 | if attrs.has_key('sheetId'): id = int(attrs["sheetId"].value) 180 | else: id = int(attrs['r:id'].value[3:]) 181 | self.sheets.append({'name': name, 'id': id}) 182 | 183 | class Styles: 184 | def __init__(self): 185 | self.numFmts = {} 186 | self.cellXfs = [] 187 | 188 | def parse(self, data): 189 | styles = minidom.parseString(data).firstChild 190 | # numFmts 191 | numFmtsElement = styles.getElementsByTagName("numFmts") 192 | if len(numFmtsElement) == 1: 193 | for numFmt in numFmtsElement[0].childNodes: 194 | numFmtId = int(numFmt._attrs['numFmtId'].value) 195 | formatCode = numFmt._attrs['formatCode'].value.lower().replace('\\', '') 196 | self.numFmts[numFmtId] = formatCode 197 | # cellXfs 198 | cellXfsElement = styles.getElementsByTagName("cellXfs") 199 | if len(cellXfsElement) == 1: 200 | for cellXfs in cellXfsElement[0].childNodes: 201 | if (cellXfs.nodeName != "xf"): 202 | continue 203 | numFmtId = int(cellXfs._attrs['numFmtId'].value) 204 | self.cellXfs.append(numFmtId) 205 | 206 | class SharedStrings: 207 | def __init__(self): 208 | self.parser = None 209 | self.strings = [] 210 | self.si = False 211 | self.t = False 212 | self.rPh = False 213 | self.value = "" 214 | 215 | def parse(self, data): 216 | self.parser = xml.parsers.expat.ParserCreate() 217 | self.parser.CharacterDataHandler = self.handleCharData 218 | self.parser.StartElementHandler = self.handleStartElement 219 | self.parser.EndElementHandler = self.handleEndElement 220 | self.parser.Parse(data) 221 | 222 | def handleCharData(self, data): 223 | if self.t: 224 | self.value+= data 225 | 226 | def handleStartElement(self, name, attrs): 227 | if name == 'si': 228 | self.si = True 229 | self.value = "" 230 | elif name == 't' and self.rPh: 231 | self.t = False 232 | elif name == 't' and self.si: 233 | self.t = True 234 | elif name == 'rPh': 235 | self.rPh = True 236 | 237 | def handleEndElement(self, name): 238 | if name == 'si': 239 | self.si = False 240 | self.strings.append(self.value) 241 | elif name == 't': 242 | self.t = False 243 | elif name == 'rPh': 244 | self.rPh = False 245 | 246 | class Sheet: 247 | def __init__(self, workbook, sharedString, styles, data): 248 | self.parser = None 249 | self.writer = None 250 | self.sharedString = None 251 | self.styles = None 252 | 253 | self.in_sheet = False 254 | self.in_row = False 255 | self.in_cell = False 256 | self.in_cell_value = False 257 | self.in_cell_formula = False 258 | 259 | self.columns = {} 260 | self.rowNum = None 261 | self.colType = None 262 | self.s_attr = None 263 | self.data = None 264 | 265 | self.dateformat = None 266 | self.skip_empty_lines = False 267 | 268 | self.data = data 269 | self.workbook = workbook 270 | self.sharedStrings = sharedString.strings 271 | self.styles = styles 272 | 273 | def set_dateformat(self, dateformat): 274 | self.dateformat = dateformat 275 | 276 | def set_skip_empty_lines(self, skip): 277 | self.skip_empty_lines = skip 278 | 279 | def to_csv(self, writer): 280 | self.writer = writer 281 | self.parser = xml.parsers.expat.ParserCreate() 282 | self.parser.CharacterDataHandler = self.handleCharData 283 | self.parser.StartElementHandler = self.handleStartElement 284 | self.parser.EndElementHandler = self.handleEndElement 285 | self.parser.Parse(self.data) 286 | 287 | def handleCharData(self, data): 288 | if self.in_cell_value: 289 | self.data = data # default value 290 | if self.colType == "s": # shared string 291 | self.data = self.sharedStrings[int(data)] 292 | elif self.colType == "b": # boolean 293 | self.data = (int(data) == 1 and "TRUE") or (int(data) == 0 and "FALSE") or data 294 | elif self.s_attr: 295 | s = int(self.s_attr) 296 | 297 | # get cell format 298 | format = None 299 | xfs_numfmt = self.styles.cellXfs[s] 300 | if self.styles.numFmts.has_key(xfs_numfmt): 301 | format = self.styles.numFmts[xfs_numfmt] 302 | elif STANDARD_FORMATS.has_key(xfs_numfmt): 303 | format = STANDARD_FORMATS[xfs_numfmt] 304 | # get format type 305 | if format and FORMATS.has_key(format): 306 | format_type = FORMATS[format] 307 | 308 | if format_type == 'date': # date/time 309 | try: 310 | if self.workbook.date1904: 311 | date = datetime.datetime(1904, 01, 01) + datetime.timedelta(float(data)) 312 | else: 313 | date = datetime.datetime(1899, 12, 30) + datetime.timedelta(float(data)) 314 | if self.dateformat: 315 | # str(dateformat) - python2.5 bug, see: http://bugs.python.org/issue2782 316 | self.data = date.strftime(str(self.dateformat)) 317 | else: 318 | dateformat = format.replace("yyyy", "%Y").replace("yy", "%y"). \ 319 | replace("hh:mm", "%H:%M").replace("h", "%H").replace("%H%H", "%H").replace("ss", "%S"). \ 320 | replace("d", "%e").replace("%e%e", "%d"). \ 321 | replace("mmmm", "%B").replace("mmm", "%b").replace(":mm", ":%M").replace("m", "%m").replace("%m%m", "%m"). \ 322 | replace("am/pm", "%p") 323 | self.data = date.strftime(str(dateformat)).strip() 324 | except (ValueError, OverflowError): 325 | # invalid date format 326 | self.data = data 327 | elif format_type == 'time': # time 328 | self.data = str(float(data) * 24*60*60) 329 | # does not support it 330 | #elif self.in_cell_formula: 331 | # self.formula = data 332 | 333 | def handleStartElement(self, name, attrs): 334 | if self.in_row and name == 'c': 335 | self.colType = attrs.get("t") 336 | self.s_attr = attrs.get("s") 337 | cellId = attrs.get("r") 338 | if cellId: 339 | self.colNum = cellId[:len(cellId)-len(self.rowNum)] 340 | self.colIndex = 0 341 | else: 342 | self.colIndex+= 1 343 | #self.formula = None 344 | self.data = "" 345 | self.in_cell = True 346 | elif self.in_cell and name == 'v': 347 | self.in_cell_value = True 348 | #elif self.in_cell and name == 'f': 349 | # self.in_cell_formula = True 350 | elif self.in_sheet and name == 'row' and attrs.has_key('r'): 351 | self.rowNum = attrs['r'] 352 | self.in_row = True 353 | self.columns = {} 354 | self.spans = None 355 | if attrs.has_key('spans'): 356 | self.spans = [int(i) for i in attrs['spans'].split(":")] 357 | elif name == 'sheetData': 358 | self.in_sheet = True 359 | 360 | def handleEndElement(self, name): 361 | if self.in_cell and name == 'v': 362 | self.in_cell_value = False 363 | #elif self.in_cell and name == 'f': 364 | # self.in_cell_formula = False 365 | elif self.in_cell and name == 'c': 366 | t = 0 367 | for i in self.colNum: t = t*26 + ord(i) - 64 368 | self.columns[t - 1 + self.colIndex] = self.data 369 | self.in_cell = False 370 | if self.in_row and name == 'row': 371 | if len(self.columns.keys()) > 0: 372 | d = [""] * (max(self.columns.keys()) + 1) 373 | for k in self.columns.keys(): 374 | d[k] = self.columns[k].encode("utf-8") 375 | if self.spans: 376 | l = self.spans[0] + self.spans[1] - 1 377 | if len(d) < l: 378 | d+= (l - len(d)) * [''] 379 | # write line to csv 380 | if not self.skip_empty_lines or d.count('') != len(d): 381 | self.writer.writerow(d) 382 | self.in_row = False 383 | elif self.in_sheet and name == 'sheetData': 384 | self.in_sheet = False 385 | 386 | def convert_recursive(path, kwargs): 387 | for name in os.listdir(path): 388 | fullpath = os.path.join(path, name) 389 | if os.path.isdir(fullpath): 390 | convert_recursive(fullpath, kwargs) 391 | else: 392 | if fullpath.lower().endswith(".xlsx"): 393 | outfilepath = fullpath[:-4] + 'csv' 394 | print("Converting %s to %s" %(fullpath, outfilepath)) 395 | f = open(outfilepath, 'w+b') 396 | try: 397 | xlsx2csv(fullpath, f, **kwargs) 398 | except zipfile.BadZipfile: 399 | print("File is not a zip file") 400 | f.close() 401 | 402 | if __name__ == "__main__": 403 | parser = OptionParser(usage = "%prog [options] infile [outfile]", version="0.11") 404 | parser.add_option("-d", "--delimiter", dest="delimiter", default=",", 405 | help="delimiter - csv columns delimiter, 'tab' or 'x09' for tab (comma is default)") 406 | parser.add_option("-f", "--dateformat", dest="dateformat", 407 | help="override date/time format (ex. %Y/%m/%d)") 408 | parser.add_option("-i", "--ignoreempty", dest="skip_empty_lines", default=False, action="store_true", 409 | help="skip empty lines") 410 | parser.add_option("-p", "--sheetdelimiter", dest="sheetdelimiter", default="--------", 411 | help="sheets delimiter used to separate sheets, pass '' if you don't want delimiters (default '--------')") 412 | parser.add_option("-r", "--recursive", dest="recursive", default=False, action="store_true", 413 | help="convert recursively") 414 | parser.add_option("-s", "--sheet", dest="sheetid", default=1, type="int", 415 | help="sheet no to convert (0 for all sheets)") 416 | 417 | (options, args) = parser.parse_args() 418 | 419 | if len(options.delimiter) == 1: 420 | delimiter = options.delimiter 421 | elif options.delimiter == 'tab': 422 | delimiter = '\t' 423 | elif options.delimiter == 'comma': 424 | delimiter = ',' 425 | elif options.delimiter[0] == 'x': 426 | delimiter = chr(int(options.delimiter[1:])) 427 | else: 428 | raise Exception("Invalid delimiter") 429 | 430 | kwargs = { 431 | 'sheetid' : options.sheetid, 432 | 'delimiter' : delimiter, 433 | 'sheetdelimiter' : options.sheetdelimiter, 434 | 'dateformat' : options.dateformat, 435 | 'skip_empty_lines' : options.skip_empty_lines 436 | } 437 | 438 | if options.recursive: 439 | if len(args) == 1: 440 | convert_recursive(args[0], kwargs) 441 | else: 442 | parser.print_help() 443 | else: 444 | if len(args) < 1: 445 | parser.print_help() 446 | else: 447 | if len(args) > 1: 448 | outfile = open(args[1], 'w+b') 449 | xlsx2csv(args[0], outfile, **kwargs) 450 | outfile.close() 451 | else: 452 | xlsx2csv(args[0], sys.stdout, **kwargs) 453 | -------------------------------------------------------------------------------- /xlsx_scraper.py: -------------------------------------------------------------------------------- 1 | import xlrd 2 | from datetime import datetime 3 | 4 | wb = xlrd.open_workbook('data/crunchbase.xlsx') 5 | companies = wb.sheet_by_name('Companies') 6 | 7 | # let's see the first row 8 | print companies.row(0) 9 | 10 | # let's iterate the rest of the rows in a generator 11 | rows = [companies.row(index) for index in range(companies.nrows)] 12 | for r in rows: 13 | if r[3].value == 'news': 14 | date = datetime(*xlrd.xldate_as_tuple(r[16].value, 0)[:3]) 15 | print 'News company: %s from %s raised %s with last funding %s' % ( 16 | r[1].value, r[9].value, r[4].value, date.strftime('%m/%d/%Y')) 17 | -------------------------------------------------------------------------------- /xpath_intro.py: -------------------------------------------------------------------------------- 1 | from lxml import html 2 | 3 | # Let's grab the simple page source. 4 | simple_page = open('data/simple.html').read() 5 | 6 | # Let's open it with LXML so we can play around with xpath. 7 | simple_tree = html.document_fromstring(simple_page) 8 | 9 | --------------------------------------------------------------------------------