#!/usr/bin/env python3
"""Scrape per-Assembly-District election result pages and emit one CSV to stdout.

Usage: scrape.py URL

The index page at URL links to one page per Assembly District (AD); each
AD page carries a results table with class "underline" whose first data
column is the election district ("ED n").  Output rows get a combined
district code: two-digit AD + zero-padded three-digit ED (e.g. '65003').
"""
import csv
import re
import sys
import urllib.parse
from argparse import ArgumentParser

import requests
from bs4 import BeautifulSoup

# Compiled once at module level: matched repeatedly across pages/rows.
AD_RE = re.compile(r'AD (\d\d)')
ED_RE = re.compile(r'ED\s+(\d+)')


def Soup(text):
    """Parse an HTML string with the pure-Python stdlib-backed parser."""
    return BeautifulSoup(text, 'html.parser')


def get(session, url):
    """GET *url* via *session* and return the response body text.

    Raises:
        RuntimeError: on any non-200 status.
    """
    r = session.get(url)
    if r.status_code != 200:
        raise RuntimeError('problem connecting to ' + url)
    return r.text


def findlinks(soup):
    '''Find links that aren't to Home or Back.'''
    return soup.select('a[title!=Home][title!=Back]')


def findtitle(soup):
    """Return the two-digit Assembly District number from an 'AD NN'
    text node in *soup*, or None when no such text node exists."""
    matches = soup(text=AD_RE)
    if not matches:
        return None
    return AD_RE.search(matches[0]).group(1)


def parsetable(soup):
    """Parse a results <table> into a list of rows (lists of cell text).

    Blank cells are dropped.  Rows that end up empty (e.g. <th>-only
    header rows or spacer rows) are skipped, as are the 'Total' and
    'Reported ...' summary rows.
    """
    output = []
    for tr in soup.find_all('tr'):
        row = [col.text.strip() for col in tr.find_all('td') if col.text.strip()]
        # `row and ...`: the previous version indexed row[0] unguarded
        # and raised IndexError on rows with no non-blank <td> cells.
        if row and row[0] != 'Total' and not row[0].startswith('Reported'):
            output.append(row)
    return output


def main():
    """Fetch the index page, follow each AD link, and write merged CSV."""
    parser = ArgumentParser()
    parser.add_argument('url')
    args = parser.parse_args()

    with requests.Session() as session:
        soup = Soup(get(session, args.url))

        writer = csv.writer(sys.stdout)
        first = True

        for a in findlinks(soup):
            link = urllib.parse.urljoin(args.url, a.attrs['href'])
            print('getting', link, file=sys.stderr)
            adsoup = Soup(get(session, link))
            district = findtitle(adsoup)

            table = adsoup.find('table', {'class': 'underline'})
            if table is None:
                # Fail with a clear message instead of the opaque
                # AttributeError that parsetable(None) would raise.
                raise RuntimeError('no results table found at ' + link)
            results = parsetable(table)

            if first:
                # Emit the header once, prefixed with the two derived columns.
                results[0].insert(0, 'reporting')
                results[0].insert(0, 'ED')
                writer.writerow(results[0])
                first = False

            for row in results[1:]:
                # Rewrite 'ED n' into the combined AADDD code,
                # e.g. AD 65 / ED 3 -> '65003'.
                ed = ED_RE.search(row[0]).group(1)
                row[0] = '{}{:03d}'.format(district, int(ed))

            writer.writerows(results[1:])


if __name__ == '__main__':
    main()