#!/usr/bin/env python3
"""Scrape per-Assembly-District election result pages and emit one CSV to stdout.

Usage: scrape.py URL

The index page at URL links to one page per Assembly District (AD); each
AD page carries a results table with class "underline" whose first data
column is the election district ("ED n").  Output rows get a combined
district code: two-digit AD + zero-padded three-digit ED (e.g. '65003').
"""
import csv
import re
import sys
import urllib.parse
from argparse import ArgumentParser

import requests
from bs4 import BeautifulSoup

# Compiled once at module level: matched repeatedly across pages/rows.
AD_RE = re.compile(r'AD (\d\d)')
ED_RE = re.compile(r'ED\s+(\d+)')


def Soup(text):
    """Parse an HTML string with the pure-Python stdlib-backed parser."""
    return BeautifulSoup(text, 'html.parser')


def get(session, url):
    """GET *url* via *session* and return the response body text.

    Raises:
        RuntimeError: on any non-200 status.
    """
    r = session.get(url)
    if r.status_code != 200:
        raise RuntimeError('problem connecting to ' + url)
    return r.text


def findlinks(soup):
    '''Find links that aren't to Home or Back.'''
    return soup.select('a[title!=Home][title!=Back]')


def findtitle(soup):
    """Return the two-digit Assembly District number from an 'AD NN'
    text node in *soup*, or None when no such text node exists."""
    matches = soup(text=AD_RE)
    if not matches:
        return None
    return AD_RE.search(matches[0]).group(1)


def parsetable(soup):
    """Parse a results <table> into a list of rows (lists of cell text).

    Blank cells are dropped.  Rows that end up empty (e.g. <th>-only
    header rows or spacer rows) are skipped, as are the 'Total' and
    'Reported ...' summary rows.
    """
    output = []
    for tr in soup.find_all('tr'):
        row = [col.text.strip() for col in tr.find_all('td') if col.text.strip()]
        # `row and ...`: the previous version indexed row[0] unguarded
        # and raised IndexError on rows with no non-blank <td> cells.
        if row and row[0] != 'Total' and not row[0].startswith('Reported'):
            output.append(row)
    return output


def main():
    """Fetch the index page, follow each AD link, and write merged CSV."""
    parser = ArgumentParser()
    parser.add_argument('url')
    args = parser.parse_args()

    with requests.Session() as session:
        soup = Soup(get(session, args.url))

        writer = csv.writer(sys.stdout)
        first = True

        for a in findlinks(soup):
            link = urllib.parse.urljoin(args.url, a.attrs['href'])
            print('getting', link, file=sys.stderr)
            adsoup = Soup(get(session, link))
            district = findtitle(adsoup)

            table = adsoup.find('table', {'class': 'underline'})
            if table is None:
                # Fail with a clear message instead of the opaque
                # AttributeError that parsetable(None) would raise.
                raise RuntimeError('no results table found at ' + link)
            results = parsetable(table)

            if first:
                # Emit the header once, prefixed with the two derived columns.
                results[0].insert(0, 'reporting')
                results[0].insert(0, 'ED')
                writer.writerow(results[0])
                first = False

            for row in results[1:]:
                # Rewrite 'ED n' into the combined AADDD code,
                # e.g. AD 65 / ED 3 -> '65003'.
                ed = ED_RE.search(row[0]).group(1)
                row[0] = '{}{:03d}'.format(district, int(ed))

            writer.writerows(results[1:])


if __name__ == '__main__':
    main()