├── .gitignore ├── README.md ├── 02-salaries-basic.py ├── 01-jailscrape.py ├── 03-salaries-mechanize.py └── 04-salaries-full.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.csv -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyCon Web Scraping Tutorial 2 | 3 | Here are all the scripts students will be able to write after attending the novice-level [web scraping tutorial](https://us.pycon.org/2015/schedule/presentation/318/) at PyCon 2015 in Montreal. 4 | -------------------------------------------------------------------------------- /02-salaries-basic.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import csv 3 | from BeautifulSoup import BeautifulSoup 4 | 5 | ########## STEP 1: Open and read the URL ########## 6 | 7 | url = 'http://mapyourtaxes.mo.gov/MAP/Employees/Employee/SearchResults.aspx?last=%25&first=%25&year=2013&agency=931' 8 | response = requests.get(url) 9 | html = response.content 10 | 11 | ########## STEP 2: Parse HTML with BeautifulSoup ########## 12 | 13 | soup = BeautifulSoup(html) 14 | results_table = soup.find('table', attrs={'id': 'grdEmployees'}) 15 | 16 | ########## STEP 3: Iterate through the results and write to an output list ########## 17 | 18 | output = [] 19 | 20 | for row in results_table.findAll('tr'): 21 | 22 | output_row = [] 23 | 24 | for cell in row.findAll('td'): 25 | output_row.append(cell.text) 26 | 27 | output.append(output_row) 28 | 29 | ########## STEP 4: Write results to file ########## 30 | 31 | print(output) 32 | 33 | handle = open('out-basic.csv', 'a') 34 | outfile = csv.writer(handle) 35 | outfile.writerows(output) -------------------------------------------------------------------------------- /01-jailscrape.py: 
-------------------------------------------------------------------------------- 1 | import requests 2 | import csv 3 | from BeautifulSoup import BeautifulSoup 4 | 5 | url = 'http://www.showmeboone.com/sheriff/JailResidents/JailResidents.asp' 6 | 7 | # Open the HTML file and turn it into a BeautifulSoup object for parsing 8 | response = requests.get(url) 9 | html = response.content 10 | soup = BeautifulSoup(html) 11 | 12 | # The scrape actually starts here. 13 | # Let's get the table that contains the results. 14 | results_table = soup.find('table', attrs={'class': 'resultsTable'}) 15 | 16 | output = [] # The list that's going to store all of our output rows 17 | 18 | # First we need to loop through all the rows in the table 19 | for row in results_table.findAll('tr'): 20 | 21 | # We'll store all of the values for each given row in a list 22 | output_rows = [] 23 | 24 | for cell in row.findAll('td'): 25 | # Delete annoying tab character 26 | output_rows.append(cell.text.replace(' ', '')) 27 | 28 | # And we'll add that list to our broader list of results 29 | output.append(output_rows) 30 | 31 | # Finally, we'll write our results to a file 32 | 33 | print(output) 34 | 35 | handle = open('out-using-requests.csv', 'a') 36 | outfile = csv.writer(handle) 37 | outfile.writerows(output) 38 | -------------------------------------------------------------------------------- /03-salaries-mechanize.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from mechanize import Browser 3 | from BeautifulSoup import BeautifulSoup 4 | 5 | ########## STEP 1: Open and read the URL ########## 6 | 7 | url = 'http://mapyourtaxes.mo.gov/MAP/Employees/Employee/searchemployees.aspx' 8 | 9 | # Create a new browser object and open the URL 10 | br = Browser() 11 | br.open(url) 12 | 13 | ########## STEP 2: Select and fill out the appropriate form ########## 14 | 15 | # Select the appropriate form, which we'll find by looking in Chrome 16 | 
br.select_form("ctl01") 17 | 18 | # Each control can be set. Dropdown lists are handled as lists, text fields take text 19 | br.form['SearchEmployees1$CalendarYear1$ddlCalendarYear'] = ['2013'] 20 | br.form['SearchEmployees1$ddlAgencies'] = ['931'] 21 | br.form['SearchEmployees1$txtLastName'] = '%' 22 | 23 | # Submit the form 24 | br.submit() 25 | 26 | ########## STEP 3: Grab and parse the HTML ########## 27 | 28 | soup = BeautifulSoup(br.response()) 29 | results_table = soup.find('table', attrs={'id': 'grdEmployees'}) 30 | 31 | ########## STEP 4: Iterate through the results and write to an output list ########## 32 | 33 | output = [] 34 | 35 | for row in results_table.findAll('tr'): 36 | 37 | output_row = [] 38 | 39 | for cell in row.findAll('td'): 40 | output_row.append(cell.text) 41 | 42 | output.append(output_row) 43 | 44 | ########## STEP 5: Write results to file ########## 45 | 46 | print(output) 47 | 48 | handle = open('out-mechanize.csv', 'a') 49 | outfile = csv.writer(handle) 50 | outfile.writerows(output) 51 | -------------------------------------------------------------------------------- /04-salaries-full.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from mechanize import Browser 3 | from BeautifulSoup import BeautifulSoup 4 | 5 | # How many pages do you want to retrieve? 6 | NUMBER_OF_PAGES = 4 7 | 8 | ########## STEP 1: Open and read the URL ########## 9 | 10 | url = 'http://mapyourtaxes.mo.gov/MAP/Employees/Employee/searchemployees.aspx' 11 | 12 | # Create a new browser object and open the URL 13 | br = Browser() 14 | br.open(url) 15 | 16 | ########## STEP 2: Select and fill out the appropriate form ########## 17 | 18 | # Select the appropriate form, which we'll find by looking in Chrome 19 | br.select_form("ctl01") 20 | 21 | # Each control can be set. 
Dropdown lists are handled as lists, text fields take text 22 | br.form['SearchEmployees1$CalendarYear1$ddlCalendarYear'] = ['2013'] 23 | br.form['SearchEmployees1$ddlAgencies'] = ['200'] 24 | br.form['SearchEmployees1$txtLastName'] = '%' 25 | 26 | # Submit the form 27 | br.submit() 28 | 29 | ########## STEP 3: Loop through each page in the result set ########## 30 | 31 | output = [] 32 | 33 | for i in range(NUMBER_OF_PAGES): 34 | 35 | ########## GO TO THE PROPER PAGE ########## 36 | 37 | # First we need to be sure we're on the correct page, which corresponds to 38 | # the i in our for loop. 39 | 40 | # We'll select the appropriate form, just like we did before. 41 | br.select_form("ctl01") 42 | 43 | # Now we just need to navigate to the page corresponding to i and repeat the process 44 | br.form['MozillaPager1$ddlPageNumber'] = [str(i)] # Typecast i to string 45 | br.submit('MozillaPager1$btnPageNumber') # Use the bottom submit button! 46 | 47 | ########## GRAB AND PARSE THE HTML ########## 48 | 49 | # We'll grab and parse the HTML to get the appropriate table rows, just like we did before. 50 | soup = BeautifulSoup(br.response()) 51 | results_table = soup.find('table', attrs={'id': 'grdEmployees'}) 52 | 53 | ########## LOOP OVER ROWS AND CELLS ########## 54 | 55 | # This is the same as the equivalent chunk in salaries-mechanize, only we're doing 56 | # it for multiple pages, rather than just one. 57 | for row in results_table.findAll('tr'): 58 | 59 | output_row = [] 60 | 61 | for cell in row.findAll('td'): 62 | output_row.append(cell.text) 63 | 64 | output.append(output_row) 65 | 66 | ########## STEP 4: Write results to file ########## 67 | 68 | print(output) 69 | 70 | handle = open('out-mechanize.csv', 'a') 71 | outfile = csv.writer(handle) 72 | outfile.writerows(output) 73 | --------------------------------------------------------------------------------