├── .gitignore ├── README.md ├── 02-salaries-basic.py ├── 01-jailscrape.py ├── 03-salaries-mechanize.py └── 04-salaries-full.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.csv -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyCon Web Scraping Tutorial 2 | 3 | Here are all the scripts students will be able to write after attending the novice-level [web scraping tutorial](https://us.pycon.org/2015/schedule/presentation/318/) at PyCon 2015 in Montreal. 4 | -------------------------------------------------------------------------------- /02-salaries-basic.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import csv 3 | from BeautifulSoup import BeautifulSoup 4 | 5 | ########## STEP 1: Open and read the URL ########## 6 | 7 | url = 'http://mapyourtaxes.mo.gov/MAP/Employees/Employee/SearchResults.aspx?last=%25&first=%25&year=2013&agency=931' 8 | response = requests.get(url) 9 | html = response.content 10 | 11 | ########## STEP 2: Parse HTML with BeautifulSoup ########## 12 | 13 | soup = BeautifulSoup(html) 14 | results_table = soup.find('table', attrs={'id': 'grdEmployees'}) 15 | 16 | ########## STEP 3: Iterate through the results and write to an output list ########## 17 | 18 | output = [] 19 | 20 | for row in results_table.findAll('tr'): 21 | 22 | output_row = [] 23 | 24 | for cell in row.findAll('td'): 25 | output_row.append(cell.text) 26 | 27 | output.append(output_row) 28 | 29 | ########## STEP 4: Write results to file ########## 30 | 31 | print(output) 32 | 33 | handle = open('out-basic.csv', 'a') 34 | outfile = csv.writer(handle) 35 | outfile.writerows(output) -------------------------------------------------------------------------------- /01-jailscrape.py: 
-------------------------------------------------------------------------------- 1 | import requests 2 | import csv 3 | from BeautifulSoup import BeautifulSoup 4 | 5 | url = 'http://www.showmeboone.com/sheriff/JailResidents/JailResidents.asp' 6 | 7 | # Open the HTML file and turn it into a BeautifulSoup object for parsing 8 | response = requests.get(url) 9 | html = response.content 10 | soup = BeautifulSoup(html) 11 | 12 | # The scrape actually starts here. 13 | # Let's get the table that contains the results. 14 | results_table = soup.find('table', attrs={'class': 'resultsTable'}) 15 | 16 | output = [] # The list that's going to store all of our output rows 17 | 18 | # First we need to loop through all the rows in the table 19 | for row in results_table.findAll('tr'): 20 | 21 | # We'll store all of the values for each given row in a list 22 | output_rows = [] 23 | 24 | for cell in row.findAll('td'): 25 | # Delete annoying tab character 26 | output_rows.append(cell.text.replace(' ', '')) 27 | 28 | # And we'll add that list to our broader list of results 29 | output.append(output_rows) 30 | 31 | # Finally, we'll write our results to a file 32 | 33 | print(output) 34 | 35 | handle = open('out-using-requests.csv', 'a') 36 | outfile = csv.writer(handle) 37 | outfile.writerows(output) 38 | -------------------------------------------------------------------------------- /03-salaries-mechanize.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from mechanize import Browser 3 | from BeautifulSoup import BeautifulSoup 4 | 5 | ########## STEP 1: Open and read the URL ########## 6 | 7 | url = 'http://mapyourtaxes.mo.gov/MAP/Employees/Employee/searchemployees.aspx' 8 | 9 | # Create a new browser object and open the URL 10 | br = Browser() 11 | br.open(url) 12 | 13 | ########## STEP 2: Select and fill out the appropriate form ########## 14 | 15 | # Select the appropriate form, which we'll find by looking in Chrome 16 | 
br.select_form("ctl01") 17 | 18 | # Each control can be set. Dropdown lists are handled as lists, text fields take text 19 | br.form['SearchEmployees1$CalendarYear1$ddlCalendarYear'] = ['2013'] 20 | br.form['SearchEmployees1$ddlAgencies'] = ['931'] 21 | br.form['SearchEmployees1$txtLastName'] = '%' 22 | 23 | # Submit the form 24 | br.submit() 25 | 26 | ########## STEP 3: Grab and parse the HTML ########## 27 | 28 | soup = BeautifulSoup(br.response()) 29 | results_table = soup.find('table', attrs={'id': 'grdEmployees'}) 30 | 31 | ########## STEP 4: Iterate through the results and write to an output list ########## 32 | 33 | output = [] 34 | 35 | for row in results_table.findAll('tr'): 36 | 37 | output_row = [] 38 | 39 | for cell in row.findAll('td'): 40 | output_row.append(cell.text) 41 | 42 | output.append(output_row) 43 | 44 | ########## STEP 5: Write results to file ########## 45 | 46 | print(output) 47 | 48 | handle = open('out-mechanize.csv', 'a') 49 | outfile = csv.writer(handle) 50 | outfile.writerows(output) 51 | -------------------------------------------------------------------------------- /04-salaries-full.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from mechanize import Browser 3 | from BeautifulSoup import BeautifulSoup 4 | 5 | # How many pages do you want to retrieve? 6 | NUMBER_OF_PAGES = 4 7 | 8 | ########## STEP 1: Open and read the URL ########## 9 | 10 | url = 'http://mapyourtaxes.mo.gov/MAP/Employees/Employee/searchemployees.aspx' 11 | 12 | # Create a new browser object and open the URL 13 | br = Browser() 14 | br.open(url) 15 | 16 | ########## STEP 2: Select and fill out the appropriate form ########## 17 | 18 | # Select the appropriate form, which we'll find by looking in Chrome 19 | br.select_form("ctl01") 20 | 21 | # Each control can be set. 
Dropdown lists are handled as lists, text fields take text 22 | br.form['SearchEmployees1$CalendarYear1$ddlCalendarYear'] = ['2013'] 23 | br.form['SearchEmployees1$ddlAgencies'] = ['200'] 24 | br.form['SearchEmployees1$txtLastName'] = '%' 25 | 26 | # Submit the form 27 | br.submit() 28 | 29 | ########## STEP 3: Loop through each page in the result set ########## 30 | 31 | output = [] 32 | 33 | for i in range(NUMBER_OF_PAGES): 34 | 35 | ########## GO TO THE PROPER PAGE ########## 36 | 37 | # First we need to be sure we're on the correct page, which corresponds to 38 | # the i in our for loop. 39 | 40 | # We'll select the appropriate form, just like we did before. 41 | br.select_form("ctl01") 42 | 43 | # Now we just need to navigate to the page corresponding to i and repeat the process 44 | br.form['MozillaPager1$ddlPageNumber'] = [str(i)] # Typecast i to string 45 | br.submit('MozillaPager1$btnPageNumber') # Use the bottom submit button! 46 | 47 | ########## GRAB AND PARSE THE HTML ########## 48 | 49 | # We'll grab and parse the HTML to get the appropriate table rows, just like we did before. 50 | soup = BeautifulSoup(br.response()) 51 | results_table = soup.find('table', attrs={'id': 'grdEmployees'}) 52 | 53 | ########## LOOP OVER ROWS AND CELLS ########## 54 | 55 | # This is the same as the equivalent chunk in salaries-mechanize, only we're doing 56 | # it for multiple pages, rather than just one. 57 | for row in results_table.findAll('tr'): 58 | 59 | output_row = [] 60 | 61 | for cell in row.findAll('td'): 62 | output_row.append(cell.text) 63 | 64 | output.append(output_row) 65 | 66 | ########## STEP 4: Write results to file ########## 67 | 68 | print(output) 69 | 70 | handle = open('out-mechanize.csv', 'a') 71 | outfile = csv.writer(handle) 72 | outfile.writerows(output) 73 | --------------------------------------------------------------------------------