├── .gitignore
├── requirements.txt
├── nyc.py
├── README.md
├── common.py
├── base_scraper.py
├── gis_scrapers.py
├── north_bay.py
├── github_read_write.py
├── LICENSE
├── irma_shelters.py
└── irma.py

/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | BeautifulSoup==3.2.1
2 | requests==2.20.0
3 | Geohash==1.0
4 | pyproj==1.9.5.1
5 | 
--------------------------------------------------------------------------------
/nyc.py:
--------------------------------------------------------------------------------
1 | # In case a hurricane hits New York...
2 | from base_scraper import BaseDeltaScraper
3 | 
4 | import requests
5 | import csv
6 | from pyproj import Proj, transform
7 | 
8 | 
9 | class NewYorkShelters(BaseDeltaScraper):
10 |     record_key = 'BLDG_ID'
11 |     filepath = 'new-york-shelters.json'
12 |     url = 'https://maps.nyc.gov/hurricane/data/center.csv'
13 |     source_url = 'https://maps.nyc.gov/hurricane/'
14 |     noun = 'shelter'
15 | 
16 |     def display_record(self, record):
17 |         display = []
18 |         display.append(' %s' % record['BLDG_ADD'])
19 |         display.append(' Accessible: %s' % record['ACCESSIBLE'])
20 |         if record['ACC_FEAT']:
21 |             display.append(' %s' % record['ACC_FEAT'])
22 |         display.append('')
23 |         return '\n'.join(display)
24 | 
25 |     def fetch_data(self):
26 |         data = requests.get(self.url).content
27 |         rows = csv.reader(data.split('\r\n'))
28 |         headers = next(rows)
29 |         shelters = []
30 |         from_projection = Proj(init='epsg:2263', preserve_units=True)
31 |         to_projection = Proj(proj='latlong', ellps='WGS84', datum='WGS84')
32 |         for row in rows:
33 |             shelter = dict(zip(headers, row))
34 |             if not shelter:
35 |                 continue
36 |             # Convert from epsg:2263 - preserve_units=True because this is in feet
37 |             x, y = shelter['X'], shelter['Y']
38 |             longitude, latitude = transform(
39 |                 from_projection, to_projection, x, y
40 |             )
41 |             shelter['longitude'] = longitude
42 |             shelter['latitude'] = latitude
43 |             shelters.append(shelter)
44 |         return shelters
45 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # irma-scrapers
2 | 
3 | Screen scrapers relating to Hurricane Irma. See their output in
4 | https://github.com/simonw/disaster-data/
5 | 
6 | ## Irma Response
7 | 
8 | The Irma Response project at https://www.irmaresponse.org/ is a team of
9 | volunteers working together to make information available during and after the
10 | storm. There is a huge amount of information out there, on many different
11 | websites. The Irma API at https://irma-api.herokuapp.com/ is an attempt to
12 | gather key information in one place, verify it and publish it in a reusable
13 | way.
14 | 
15 | To aid this effort, I've built a collection of screen scrapers that pull data
16 | from a number of different websites and APIs. That data is then stored in a
17 | Git repository, providing a clear history of changes made to the various
18 | sources that are being tracked.
19 | 
20 | Some of the scrapers also publish their findings to Slack in a format designed
21 | to make it obvious when key events happen, such as new shelters being added or
22 | removed from public listings.
23 | 24 | ## Tracking changes over time 25 | 26 | A key goal of this screen scraping mechanism is to allow changes to the 27 | underlying data sources to be tracked over time. This is achieved using git, 28 | via the GitHub API. Each scraper pulls down data from a source (an API or a 29 | website) and reformats that data into a sanitized JSON format. That JSON is 30 | then written to the git repository. If the data has changed since the last 31 | time the scraper ran, those changes will be captured by git and made available 32 | in the commit log. 33 | 34 | Recent changes tracked by the scraper collection can be seen here: 35 | https://github.com/simonw/disaster-data/commits/master 36 | 37 | ## Generating useful commit messages 38 | 39 | The most complex code for most of the scrapers isn't in fetching the data: 40 | it's in generating useful, human-readable commit messages that summarize the 41 | underlying change. For example, here is a commit message generated by the 42 | scraper that tracks the http://www.floridadisaster.org/shelters/summary.aspx 43 | page: 44 | 45 | florida-shelters.json: 2 shelters added 46 | 47 | Added shelter: Atwater Elementary School (Sarasota County) 48 | Added shelter: DEBARY ELEMENTARY SCHOOL (Volusia County) 49 | Change detected on http://www.floridadisaster.org/shelters/summary.aspx 50 | 51 | The full commit also shows the changes to the underlying JSON, but the human- 52 | readable message provides enough information that people who are not JSON- 53 | literate programmers can still derive value from the commit. 54 | 55 | https://github.com/simonw/disaster-data/commit/7919aeff0913ec26d1bea8dc 56 | 57 | ## Publishing to Slack 58 | 59 | The Irma Response team use Slack to co-ordinate their efforts. You can join 60 | their Slack here: https://irma-response-slack.herokuapp.com/ 61 | 62 | Some of the scrapers publish detected changes in their data source to Slack, 63 | as links to the commits generated for each change. The human-readable message 64 | is posted directly to the channel. 
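## Example scraper

To make this mechanism concrete, here is a minimal sketch of what a scraper in
this collection looks like (the class name, URL and token below are invented
for illustration; the real scrapers live in files like irma.py and nyc.py):

    from base_scraper import BaseScraper
    import requests


    class ExampleFeed(BaseScraper):
        # Hypothetical scraper - not part of the real collection
        filepath = 'example-feed.json'
        url = 'https://example.com/feed.json'

        def fetch_data(self):
            # Reshape the source into sanitized, JSON-serializable data
            return requests.get(self.url).json()


    # Diffs the fresh data against the copy last committed to the repo and,
    # if anything changed, writes a new commit (and optionally posts to Slack)
    ExampleFeed(github_token='...').scrape_and_store()

Subclasses implement fetch_data(), plus optionally create_message() and
update_message() for the human-readable commit messages; the shared Scraper
base class in common.py takes care of reading and writing the JSON through the
GitHub API.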
65 | 
--------------------------------------------------------------------------------
/common.py:
--------------------------------------------------------------------------------
1 | from github_read_write import GithubContent
2 | 
3 | import requests
4 | import json
5 | 
6 | 
7 | class Scraper(object):
8 |     owner = None
9 |     repo = None
10 |     filepath = None
11 |     committer = None
12 |     slack_channel = None
13 |     slack_botname = None
14 |     test_mode = False
15 | 
16 |     def __init__(self, github_token, slack_token=None):
17 |         self.last_data = None
18 |         self.last_sha = None
19 |         self.github_token = github_token
20 |         self.slack_token = slack_token
21 | 
22 |     def post_to_slack(self, message, commit_hash):
23 |         if not (self.slack_channel and self.slack_token):
24 |             return
25 |         headline = message.split('\n')[0]
26 |         try:
27 |             body = message.split('\n', 1)[1]
28 |         except IndexError:
29 |             body = ''
30 |         github_url = 'https://github.com/%s/%s/commit/%s' % (
31 |             self.owner, self.repo, commit_hash
32 |         )
33 |         requests.post('https://slack.com/api/chat.postMessage', {
34 |             'token': self.slack_token,
35 |             'channel': self.slack_channel,
36 |             'attachments': json.dumps([{
37 |                 'fallback': github_url,
38 |                 'pretext': headline,
39 |                 'title': '%s: %s' % (self.filepath, commit_hash[:8]),
40 |                 'title_link': github_url,
41 |                 'text': body.strip(),
42 |             }]),
43 |             'icon_emoji': ':robot_face:',
44 |             'username': self.slack_botname,
45 |         }).json()
46 | 
47 |     def create_message(self, new_data):
48 |         return 'Created %s' % self.filepath
49 | 
50 |     def update_message(self, old_data, new_data):
51 |         return 'Updated %s' % self.filepath
52 | 
53 |     def fetch_data(self):
54 |         return []
55 | 
56 |     def scrape_and_store(self):
57 |         data = self.fetch_data()
58 |         if data is None:
59 |             print '%s: Data was None' % self.filepath
60 |             return
61 | 
62 |         if self.test_mode and not self.github_token:
63 |             print json.dumps(data, indent=2)
64 |             return
65 | 
66 |         # We need to store the data
67 |         github = GithubContent(self.owner, self.repo, self.github_token)
68 |         if not self.last_data or not self.last_sha:
69 |             # Check and see if it exists yet
70 |             try:
71 |                 content, sha = github.read(self.filepath)
72 |                 self.last_data = json.loads(content)
73 |                 self.last_sha = sha
74 |             except GithubContent.NotFound:
75 |                 pass
76 | 
77 |         if self.last_data == data:
78 |             print '%s: Nothing changed' % self.filepath
79 |             return
80 | 
81 |         if self.last_sha:
82 |             print 'Updating %s' % self.filepath
83 |             message = self.update_message(self.last_data, data)
84 |         else:
85 |             print 'Creating %s' % self.filepath
86 |             message = self.create_message(data)
87 | 
88 |         if self.test_mode:
89 |             print message
90 |             print
91 |             print json.dumps(data, indent=2)
92 |             return
93 | 
94 |         content_sha, commit_sha = github.write(
95 |             filepath=self.filepath,
96 |             content=json.dumps(data, indent=2),
97 |             sha=self.last_sha,
98 |             commit_message=message,
99 |             committer=self.committer,
100 |         )
101 | 
102 |         self.last_sha = content_sha
103 |         self.last_data = data
104 | 
105 |         self.post_to_slack(message, commit_sha)
106 |         print 'https://github.com/%s/%s/commit/%s' % (
107 |             self.owner, self.repo, commit_sha
108 |         )
109 | 
--------------------------------------------------------------------------------
/base_scraper.py:
--------------------------------------------------------------------------------
1 | from common import Scraper
2 | 
3 | 
4 | class BaseScraper(Scraper):
5 |     owner = 'simonw'
6 |     repo = 'disaster-data'
7 |     committer = {
8 |         'name': 'irma-scraper',
9 |         'email': 'irma-scraper@example.com',
10 |     }
11 |     slack_botname = 'Irma Scraper'
12 |     slack_channel = '#shelter_scraper_data'
13 | 
14 | 
15 | class BaseDeltaScraper(BaseScraper):
16 |     """
17 |     The fetch_data() method should return a list of dicts. Each dict
18 |     should have a record_key field whose value uniquely identifies it.
19 | 
20 |     Then you define a display_record(record) method that returns a string.
21 |     """
22 |     record_key = None
23 |     show_changes = False
24 |     noun = 'record'
25 |     source_url = None
26 |     slack_channel = None
27 | 
28 |     @property
29 |     def display_name(self):
30 |         return self.filepath.replace('.json', '')
31 | 
32 |     @property
33 |     def noun_plural(self):
34 |         return self.noun + 's'
35 | 
36 |     def create_message(self, new_records):
37 |         return self.update_message([], new_records, 'Created')
38 | 
39 |     def update_message(self, old_records, new_records, verb='Updated'):
40 |         previous_ids = [
41 |             record[self.record_key] for record in old_records
42 |         ]
43 |         current_ids = [
44 |             record[self.record_key] for record in new_records
45 |         ]
46 |         added_ids = [id for id in current_ids if id not in previous_ids]
47 |         removed_ids = [id for id in previous_ids if id not in current_ids]
48 | 
49 |         message_blocks = []
50 |         if added_ids:
51 |             messages = []
52 |             messages.append('%d new %s:' % (
53 |                 len(added_ids), self.noun if len(added_ids) == 1 else self.noun_plural
54 |             ))
55 |             for id in added_ids:
56 |                 record = [r for r in new_records if r[self.record_key] == id][0]
57 |                 messages.append(self.display_record(record))
58 |             message_blocks.append(messages)
59 | 
60 |         if removed_ids:
61 |             messages = []
62 |             messages.append('%d %s removed:' % (
63 |                 len(removed_ids), self.noun if len(removed_ids) == 1 else self.noun_plural
64 |             ))
65 |             for id in removed_ids:
66 |                 record = [r for r in old_records if r[self.record_key] == id][0]
67 |                 messages.append(self.display_record(record))
68 |             message_blocks.append(messages)
69 | 
70 |         # Add useful rendering of CHANGED records as well
71 |         changed_records = []
72 |         for new_record in new_records:
73 |             try:
74 |                 old_record = [
75 |                     r for r in old_records
76 |                     if r[self.record_key] == new_record[self.record_key]
77 |                 ][0]
78 |             except IndexError:
79 |                 continue
80 |             if old_record != new_record:
81 |                 changed_records.append((old_record, new_record))
82 | 
83 |         if self.show_changes and changed_records:
84 |             messages = []
85 |             messages.append('%d %s changed:' % (
86 |                 len(changed_records), self.noun if len(changed_records) == 1 else self.noun_plural
87 |             ))
88 |             for old_record, new_record in changed_records:
89 |                 messages.append(self.display_changes(old_record, new_record))
90 |             message_blocks.append(messages)
91 | 
92 |         blocks = []
93 |         for message_block in message_blocks:
94 |             block = '\n'.join(message_block)
95 |             blocks.append(block.strip())
96 | 
97 |         if self.source_url:
98 |             blocks.append('Detected on %s' % self.source_url)
99 | 
100 |         body = '\n\n'.join(blocks)
101 | 
102 |         summary = []
103 |         if added_ids:
104 |             summary.append('%d %s added' % (
105 |                 len(added_ids), self.noun if len(added_ids) == 1 else self.noun_plural
106 |             ))
107 |         if removed_ids:
108 |             summary.append('%d %s removed' % (
109 |                 len(removed_ids), self.noun if len(removed_ids) == 1 else self.noun_plural
110 |             ))
111 |         if changed_records:
112 |             summary.append('%d %s changed' % (
113 |                 len(changed_records), self.noun if len(changed_records) == 1 else self.noun_plural
114 |             ))
115 |         if summary:
116 |             summary_text = self.display_name + ': ' + (', '.join(summary))
117 |         else:
118 |             summary_text = '%s %s' % (verb, self.display_name)
119 |         return summary_text + '\n\n' + body
120 | 
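A minimal hypothetical subclass makes this contract concrete (the class, URL
and response shape below are invented for illustration and are not part of the
repository):

    from base_scraper import BaseDeltaScraper
    import requests


    class ExampleShelters(BaseDeltaScraper):
        filepath = 'example-shelters.json'  # display_name: 'example-shelters'
        record_key = 'id'  # field that uniquely identifies each record
        noun = 'shelter'
        source_url = 'https://example.com/shelters'  # placeholder
        url = 'https://example.com/shelters.json'  # placeholder

        def fetch_data(self):
            # Must return a list of dicts, each containing the record_key field
            return requests.get(self.url).json()['shelters']

        def display_record(self, record):
            # Rendered beneath headings like '2 new shelters:'
            return ' %(name)s (%(county)s County)\n' % record

update_message() then diffs the old and new lists on the 'id' key and produces
a commit message such as 'example-shelters: 2 shelters added, 1 shelter
removed', with the display_record() output for each affected record as the
body.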
-------------------------------------------------------------------------------- /gis_scrapers.py: -------------------------------------------------------------------------------- 1 | from base_scraper import BaseScraper 2 | import requests 3 | 4 | 5 | def objectid(d): 6 | # Different datasets represent objectid in different ways 7 | return d.get('OBJECTID') or d['ObjectID'] 8 | 9 | 10 | def shelter_name(d): 11 | return d.get('SHELTER_NAME') or d['label'] 12 | 13 | 14 | def shelter_county(d): 15 | return d.get('COUNTY_PARISH') or d['county'] 16 | 17 | 18 | class BaseGisScraper(BaseScraper): 19 | source_url = None 20 | 21 | def create_message(self, new_data): 22 | return self.update_message([], new_data, verb='Created') 23 | 24 | def update_message(self, old_data, new_data, verb='Updated'): 25 | new_objects = [o for o in new_data if not any(o2 for o2 in old_data if objectid(o2) == objectid(o))] 26 | removed_objects = [o for o in old_data if not any(o2 for o2 in new_data if objectid(o2) == objectid(o))] 27 | message = [] 28 | 29 | def name(row): 30 | if 'COUNTY_PARISH' in row or 'county' in row: 31 | s = '%s (%s County)' % (shelter_name(row), shelter_county(row).title()) 32 | elif 'CITY' in row and 'STATE' in row: 33 | s = '%s (%s, %s)' % (shelter_name(row), row['CITY'].title(), row['STATE']) 34 | else: 35 | s = shelter_name(row) 36 | return s.replace('County County', 'County') 37 | 38 | for new_object in new_objects: 39 | message.append('Added shelter: %s' % name(new_object)) 40 | if new_objects: 41 | message.append('') 42 | for removed_object in removed_objects: 43 | message.append('Removed shelter: %s' % name(removed_object)) 44 | if removed_objects: 45 | message.append('') 46 | num_updated = 0 47 | for new_object in new_data: 48 | old_object = [o for o in old_data if objectid(o) == objectid(new_object)] 49 | if not old_object: 50 | continue 51 | old_object = old_object[0] 52 | if new_object != old_object: 53 | message.append('Updated shelter: %s' % name(new_object)) 54 | num_updated += 1 55 | body = '\n'.join(message) 56 | summary = [] 57 | if new_objects: 58 | summary.append('%d shelter%s added' % ( 59 | len(new_objects), '' if len(new_objects) == 1 else 's', 60 | )) 61 | if removed_objects: 62 | summary.append('%d shelter%s removed' % ( 63 | len(removed_objects), '' if len(removed_objects) == 1 else 's', 64 | )) 65 | if num_updated: 66 | summary.append('%d shelter%s updated' % ( 67 | num_updated, '' if num_updated == 1 else 's', 68 | )) 69 | if summary: 70 | summary_text = self.filepath + ': ' + (', '.join(summary)) 71 | else: 72 | summary_text = '%s %s' % (verb, self.filepath) 73 | if self.source_url: 74 | body += '\nChange detected on %s' % self.source_url 75 | return summary_text + '\n\n' + body 76 | 77 | def fetch_data(self): 78 | data = requests.get(self.url).json() 79 | shelters = [feature['attributes'] for feature in data['features']] 80 | shelters.sort(key=lambda s: objectid(s)) 81 | return shelters 82 | 83 | 84 | class FemaOpenShelters(BaseGisScraper): 85 | filepath = 'fema-open-shelters.json' 86 | url = 'https://gis.fema.gov/REST/services/NSS/OpenShelters/MapServer/0/query?f=json&returnGeometry=true&spatialRel=esriSpatialRelIntersects&geometry=%7B%22xmin%22%3A-10018754.171396945%2C%22ymin%22%3A2504688.5428529754%2C%22xmax%22%3A-7514065.628548954%2C%22ymax%22%3A5009377.085700965%2C%22spatialReference%22%3A%7B%22wkid%22%3A102100%7D%7D&geometryType=esriGeometryEnvelope&inSR=102100&outFields=*&outSR=102100' 87 | 88 | 89 | class FemaNSS(BaseGisScraper): 90 | filepath = 
'fema-nss-usa.json' 91 | url = 'https://gis.fema.gov/REST/services/NSS/FEMA_NSS/MapServer/0/query?f=json&returnGeometry=true&spatialRel=esriSpatialRelIntersects&geometry=%7B%22xmin%22%3A+-14404742.108649602%2C+%22ymin%22%3A+-55660.4518654215%2C+%22ymax%22%3A+6782064.328749425%2C+%22xmax%22%3A+-5988988.6046781195%2C+%22spatialReference%22%3A+%7B%22wkid%22%3A+102100%7D%7D&geometryType=esriGeometryEnvelope&inSR=102100&outFields=*&outSR=102100' 92 | 93 | 94 | class GemaAnimalShelters(BaseGisScraper): 95 | filepath = 'georgia-gema-animal-shelters.json' 96 | url = 'https://services1.arcgis.com/2iUE8l8JKrP2tygQ/arcgis/rest/services/AnimalShelters/FeatureServer/0/query?f=json&where=status%20%3D%20%27OPEN%27&returnGeometry=true&spatialRel=esriSpatialRelIntersects&outFields=*&outSR=102100&resultOffset=0&resultRecordCount=1000' 97 | source_url = 'https://gema-soc.maps.arcgis.com/apps/webappviewer/index.html?id=279ef7cfc1da45edb640723c12b02b18' 98 | 99 | 100 | class GemaActiveShelters(BaseGisScraper): 101 | filepath = 'georgia-gema-active-shelters.json' 102 | url = 'https://services1.arcgis.com/2iUE8l8JKrP2tygQ/arcgis/rest/services/SheltersActive/FeatureServer/0/query?f=json&where=shelter_information_shelter_type%20%3C%3E%20%27Reception%20Care%20Ctr.%27&returnGeometry=true&spatialRel=esriSpatialRelIntersects&outFields=*&outSR=102100&resultOffset=0&resultRecordCount=1000' 103 | source_url = 'https://gema-soc.maps.arcgis.com/apps/webappviewer/index.html?id=279ef7cfc1da45edb640723c12b02b18' 104 | -------------------------------------------------------------------------------- /north_bay.py: -------------------------------------------------------------------------------- 1 | from base_scraper import BaseScraper, BaseDeltaScraper 2 | from BeautifulSoup import Comment, BeautifulSoup as Soup 3 | from xml.etree import ElementTree 4 | import requests 5 | import re 6 | 7 | 8 | class PGEOutagesIndividual(BaseDeltaScraper): 9 | url = 'https://apim.pge.com/cocoutage/outages/getOutagesRegions?regionType=city&expand=true' 10 | filepath = 'pge-outages-individual.json' 11 | slack_channel = None 12 | record_key = 'outageNumber' 13 | noun = 'outage' 14 | 15 | def fetch_data(self): 16 | data = requests.get( 17 | self.url, 18 | timeout=10, 19 | ).json() 20 | # Flatten into a list of outages 21 | outages = [] 22 | for region in data['outagesRegions']: 23 | for outage in region['outages']: 24 | outage['regionName'] = region['regionName'] 25 | outages.append(outage) 26 | return outages 27 | 28 | def display_record(self, outage): 29 | display = [] 30 | display.append(' %(outageNumber)s in %(regionName)s affecting %(estCustAffected)s' % outage) 31 | display.append(' https://www.google.com/maps/search/%(latitude)s,%(longitude)s' % outage) 32 | display.append(' %(cause)s - %(crewCurrentStatus)s' % outage) 33 | display.append('') 34 | return '\n'.join(display) 35 | 36 | 37 | class SantaRosaEmergencyInformation(BaseScraper): 38 | url = 'https://srcity.org/610/Emergency-Information' 39 | filepath = 'santa-rosa-emergency.json' 40 | slack_channel = None 41 | 42 | def fetch_data(self): 43 | html = requests.get(self.url).content 44 | soup = Soup(html) 45 | main_content = soup.find('div', {'data-cprole': 'mainContentContainer'}) 46 | # Remove scripts 47 | [s.extract() for s in main_content.findAll('script')] 48 | # Remove source comments 49 | comments = soup.findAll(text=lambda text: isinstance(text, Comment)) 50 | [comment.extract() for comment in comments] 51 | # Remove almost all attributes 52 | for tag in 
main_content.recursiveChildGenerator():
53 |             try:
54 |                 tag.attrs = [
55 |                     (key, value) for key, value in tag.attrs
56 |                     if key in ('href', 'src')
57 |                     and not value.startswith('#')
58 |                 ]
59 |             except AttributeError:
60 |                 pass
61 | 
62 |         return {
63 |             'html_lines': unicode(main_content).split(u'\n'),
64 |         }
65 | 
66 | 
67 | class SonomaRoadConditions(BaseScraper):
68 |     url = 'http://roadconditions.sonoma-county.org/'
69 |     filepath = 'sonoma-road-conditions.json'
70 |     slack_channel = None
71 | 
72 |     def fetch_data(self):
73 |         soup = Soup(requests.get(self.url).content)
74 |         road_closures = {}
75 |         for id in ('divTableCounty', 'divTableCity'):
76 |             name = {'divTableCounty': 'county_roads', 'divTableCity': 'city_roads'}[id]
77 |             div = soup.find('div', {'id': id})
78 |             table = div.find('table')
79 |             headers = [th.text for th in table.findAll('th')]
80 |             closures = []
81 |             for tr in table.find('tbody').findAll('tr'):
82 |                 values = [td.text for td in tr.findAll('td')]
83 |                 closures.append(dict(zip(headers, values)))
84 |             road_closures[name] = closures
85 |         return road_closures
86 | 
87 | 
88 | class CaliforniaDOTRoadInfo(BaseScraper):
89 |     url = 'http://www.dot.ca.gov/hq/roadinfo/Hourly'
90 |     filepath = 'dot-ca-roadinfo-hourly.json'
91 |     slack_channel = None
92 | 
93 |     def fetch_data(self):
94 |         text = requests.get(self.url).content
95 |         return {
96 |             'text_lines': [l.rstrip('\r') for l in text.split('\n')],
97 |         }
98 | 
99 | 
100 | class CaliforniaHighwayPatrolIncidents(BaseDeltaScraper):
101 |     url = 'http://quickmap.dot.ca.gov/data/chp-only.kml'
102 |     filepath = 'chp-incidents.json'
103 |     slack_channel = None
104 |     record_key = 'name'
105 |     noun = 'incident'
106 | 
107 |     def display_record(self, incident):
108 |         display = []
109 |         display.append(' %s' % incident['name'])
110 |         display.append(' https://www.google.com/maps/search/%(latitude)s,%(longitude)s' % incident)
111 |         display.append(' ' + incident['description'])
112 |         display.append('')
113 |         return '\n'.join(display)
114 | 
115 |     def fetch_data(self):
116 |         kml = requests.get(self.url).content
117 |         et = ElementTree.fromstring(kml)
118 |         incidents = []
119 |         for placemark in et.findall('.//{http://www.opengis.net/kml/2.2}Placemark'):
120 |             coords = placemark.find('.//{http://www.opengis.net/kml/2.2}coordinates').text.strip()
121 |             longitude, latitude, _ = map(float, coords.split(','))  # KML order is lon,lat,alt
122 |             description = placemark.find('{http://www.opengis.net/kml/2.2}description').text.strip()
123 |             name = placemark.find('{http://www.opengis.net/kml/2.2}name').text.strip()
124 |             incidents.append({
125 |                 'name': name,
126 |                 'description': strip_tags(description),
127 |                 'latitude': latitude,
128 |                 'longitude': longitude,
129 |             })
130 |         return incidents
131 | 
132 | 
133 | tag_re = re.compile('<.*?>')
134 | 
135 | 
136 | def strip_tags(s):
137 |     return tag_re.sub('', s)
138 | 
--------------------------------------------------------------------------------
/github_read_write.py:
--------------------------------------------------------------------------------
1 | """
2 | This class knows how to read and write LARGE files to GitHub. The regular
3 | GitHub Contents API can't handle files larger than 1MB - this class knows how
4 | to spot that problem and switch to the large-file-supporting low-level Git Data
5 | API instead.
6 | 7 | https://developer.github.com/v3/repos/contents/ 8 | https://developer.github.com/v3/git/ 9 | """ 10 | import requests 11 | 12 | 13 | class GithubContent(object): 14 | class NotFound(Exception): 15 | pass 16 | 17 | class UnknownError(Exception): 18 | pass 19 | 20 | def __init__(self, owner, repo, token): 21 | self.owner = owner 22 | self.repo = repo 23 | self.token = token 24 | 25 | def base_url(self): 26 | return 'https://api.github.com/repos/%s/%s' % ( 27 | self.owner, self.repo 28 | ) 29 | 30 | def read(self, filepath): 31 | # Try reading using content API 32 | content_url = self.base_url() + '/contents/%s' % filepath 33 | response = requests.get( 34 | content_url, 35 | headers={ 36 | 'Authorization': 'token %s' % self.token 37 | } 38 | ) 39 | if response.status_code == 200: 40 | data = response.json() 41 | return data['content'].decode('base64'), data['sha'] 42 | elif response.status_code == 404: 43 | raise self.NotFound(filepath) 44 | elif response.status_code == 403: 45 | # It's probably too large 46 | if response.json()['errors'][0]['code'] != 'too_large': 47 | raise self.UnknownError(response.content) 48 | else: 49 | return self.read_large(filepath) 50 | else: 51 | raise self.UnknownError(response.content) 52 | 53 | def read_large(self, filepath): 54 | master = requests.get( 55 | self.base_url() + '/git/trees/master?recursive=1', 56 | headers={ 57 | 'Authorization': 'token %s' % self.token 58 | } 59 | ).json() 60 | try: 61 | tree_entry = [t for t in master['tree'] if t['path'] == filepath][0] 62 | except IndexError: 63 | raise self.NotFound(filepath) 64 | data = requests.get( 65 | tree_entry['url'], 66 | headers={ 67 | 'Authorization': 'token %s' % self.token 68 | } 69 | ).json() 70 | return data['content'].decode('base64'), data['sha'] 71 | 72 | def write(self, filepath, content, sha=None, commit_message=None, committer=None): 73 | github_url = self.base_url() + '/contents/%s' % filepath 74 | payload = { 75 | 'path': filepath, 76 | 'content': content.encode('base64'), 77 | 'message': commit_message, 78 | } 79 | if sha: 80 | payload['sha'] = sha 81 | if committer: 82 | payload['committer'] = committer 83 | 84 | response = requests.put( 85 | github_url, 86 | json=payload, 87 | headers={ 88 | 'Authorization': 'token %s' % self.token 89 | } 90 | ) 91 | if response.status_code == 403 and response.json()['errors'][0]['code'] == 'too_large': 92 | return self.write_large(filepath, content, commit_message, committer) 93 | elif sha is None and response.status_code == 422 and 'sha' in response.json().get('message', ''): 94 | # Missing sha - we need to figure out the sha and try again 95 | old_content, old_sha = self.read(filepath) 96 | return self.write( 97 | filepath, 98 | content, 99 | sha=old_sha, 100 | commit_message=commit_message, 101 | committer=committer, 102 | ) 103 | elif response.status_code in (201, 200): 104 | updated = response.json() 105 | return updated['content']['sha'], updated['commit']['sha'] 106 | else: 107 | raise self.UnknownError(str(response.status_code) + ':' + response.content) 108 | 109 | def write_large(self, filepath, content, commit_message=None, committer=None): 110 | # Create a new blob with the file contents 111 | created_blob = requests.post(self.base_url() + '/git/blobs', json={ 112 | 'encoding': 'utf8', 113 | 'content': content, 114 | }, headers={'Authorization': 'token %s' % self.token}).json() 115 | # Retrieve master tree sha 116 | master_sha = requests.get( 117 | self.base_url() + '/git/trees/master?recursive=1', 118 | headers={ 119 | 
'Authorization': 'token %s' % self.token 120 | } 121 | ).json()['sha'] 122 | # Construct a new tree 123 | created_tree = requests.post( 124 | self.base_url() + '/git/trees', 125 | json={ 126 | 'base_tree': master_sha, 127 | 'tree': [{ 128 | 'mode': '100644', # file (blob), 129 | 'path': filepath, 130 | 'type': 'blob', 131 | 'sha': created_blob['sha'], 132 | }] 133 | }, 134 | headers={'Authorization': 'token %s' % self.token} 135 | ).json() 136 | # Create a commit which references the new tree 137 | payload = { 138 | 'message': commit_message, 139 | 'parents': [master_sha], 140 | 'tree': created_tree['sha'], 141 | } 142 | if committer: 143 | payload['committer'] = committer 144 | created_commit = requests.post( 145 | self.base_url() + '/git/commits', 146 | json=payload, 147 | headers={'Authorization': 'token %s' % self.token} 148 | ).json() 149 | # Move HEAD reference on master to the new commit 150 | requests.patch( 151 | self.base_url() + '/git/refs/heads/master', 152 | json={'sha': created_commit['sha']}, 153 | headers={'Authorization': 'token %s' % self.token} 154 | ).json() 155 | return created_blob['sha'], created_commit['sha'] 156 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /irma_shelters.py: -------------------------------------------------------------------------------- 1 | from base_scraper import BaseScraper 2 | import requests 3 | import Geohash 4 | import re 5 | 6 | IGNORE_DUPE_IDS = { 7 | 456, # Hialeah Middle School 8 | 442, # Amelia Earhart Elementary 9 | } 10 | 11 | GEOHASH_PRECISION = 7 12 | 13 | 14 | class IrmaShelters(BaseScraper): 15 | filepath = 'irma-shelters.json' 16 | url = 'https://irma-api.herokuapp.com/api/v1/shelters' 17 | slack_channel = None 18 | 19 | def update_message(self, old_data, new_data): 20 | def name(n): 21 | return '%s (%s)' % (n['shelter'], n['county']) 22 | 23 | current_ids = [n['id'] for n in new_data] 24 | previous_ids = [n['id'] for n in old_data] 25 | 26 | added_ids = [id for id in current_ids if id not in previous_ids] 27 | removed_ids = [id for id in previous_ids if id not in current_ids] 28 | 29 | message = [] 30 | for id in added_ids: 31 | shelter = [n for n in new_data if n['id'] == id][0] 32 | message.append('Added shelter: %s' % name(shelter)) 33 | if added_ids: 34 | message.append('') 35 | for id in removed_ids: 36 | shelter = [n for n in old_data if n['id'] == id][0] 37 | message.append('Removed shelter: %s' % name(shelter)) 38 | body = '\n'.join(message) 39 | summary = [] 40 | if added_ids: 41 | summary.append('%d shelter%s added' % ( 42 | len(added_ids), '' if len(added_ids) == 1 else 's', 43 | )) 44 | if removed_ids: 45 | summary.append('%d shelter%s removed' % ( 46 | len(removed_ids), '' if len(removed_ids) == 1 else 's', 47 | )) 48 | if summary: 49 | summary_text = self.filepath + ': ' + (', '.join(summary)) 50 | else: 51 | summary_text = 'Updated %s' % self.filepath 52 | return summary_text + '\n\n' + body 53 | 54 | def fetch_data(self): 55 | data = requests.get(self.url).json() 56 | shelters = data['shelters'] 57 | shelters.sort(key=lambda s: s['shelter']) 58 | return shelters 59 | 60 | 61 | class IrmaShelterDupes(BaseScraper): 62 | # Detect possible dupes in irma-api 63 | filepath = 'irma-shelters-dupes.json' 64 | url = 'https://irma-api.herokuapp.com/api/v1/shelters' 65 | 66 | def update_message(self, old_data, new_data): 67 | previous_geohashes = [ 68 | dupe_group['geohash'] for dupe_group in old_data['dupe_groups'] 69 | ] 70 | current_geohashes = [ 71 | dupe_group['geohash'] for dupe_group in new_data['dupe_groups'] 72 | ] 73 | added_geohashes = [ 74 | geohash for geohash in current_geohashes if geohash not in previous_geohashes 75 | ] 76 | removed_geohashes = [ 77 | geohash for geohash in previous_geohashes if geohash not in current_geohashes 78 | ] 79 | 80 | message = [] 81 | for geohash in added_geohashes: 82 | dupe_group = [group for group in new_data['dupe_groups'] if group['geohash'] == geohash][0] 83 | message.append('New potential duplicates:') 84 | for shelter in dupe_group['shelters']: 85 | message.append(' ' + shelter['name']) 86 | if shelter.get('address'): 87 | message.append(' ' + shelter['address']) 88 | message.append(' ' + shelter['google_maps']) 89 | message.append(' ' + shelter['view_url']) 90 | message.append('') 91 | 92 | if added_geohashes and removed_geohashes: 93 | message.append('') 94 | 95 | for geohash in removed_geohashes: 96 | dupe_group = [group for group in old_data['dupe_groups'] if group['geohash'] == geohash][0] 97 | message.append('This previous duplicate looks to be resolved:') 98 | for shelter in dupe_group['shelters']: 99 | message.append(' ' + shelter['name']) 100 | if 
shelter.get('address'): 101 | message.append(' ' + shelter['address']) 102 | message.append(' ' + shelter['google_maps']) 103 | message.append(' ' + shelter['view_url']) 104 | message.append('') 105 | 106 | current_no_latlon_ids = [ 107 | shelter['id'] for shelter in new_data['no_latitude_longitude'] 108 | ] 109 | # Older data in our repo doesn't have the 'id' property, so we 110 | # have to allow it to be None here 111 | previous_no_latlon_ids = [ 112 | shelter.get('id') for shelter in old_data['no_latitude_longitude'] 113 | ] 114 | 115 | new_no_latlon_ids = [ 116 | id for id in current_no_latlon_ids 117 | if id not in previous_no_latlon_ids 118 | ] 119 | resolved_no_latlon_ids = [ 120 | id for id in previous_no_latlon_ids 121 | if id not in current_no_latlon_ids 122 | and id is not None 123 | ] 124 | 125 | if new_no_latlon_ids: 126 | message.append('') 127 | message.append('New shelters detected with no latitude/longitude:') 128 | for id in new_no_latlon_ids: 129 | shelter = [ 130 | s for s in new_data['no_latitude_longitude'] 131 | if s['id'] == id 132 | ][0] 133 | message.append(' ' + shelter['name']) 134 | if shelter.get('address'): 135 | message.append(' ' + shelter['address']) 136 | message.append(' ' + shelter['view_url']) 137 | message.append('') 138 | 139 | if resolved_no_latlon_ids: 140 | message.append('') 141 | message.append('Fixed shelters that had no latitude/longitude:') 142 | for id in resolved_no_latlon_ids: 143 | shelter = [ 144 | s for s in old_data['no_latitude_longitude'] 145 | if s['id'] == id 146 | ][0] 147 | message.append(' ' + shelter['name']) 148 | message.append(' ' + (shelter.get('address') or '')) 149 | message.append(' ' + shelter['view_url']) 150 | 151 | body = '\n'.join(message) 152 | summary = [] 153 | if added_geohashes: 154 | summary.append('%d new dupe%s detected' % ( 155 | len(added_geohashes), '' if len(added_geohashes) == 1 else 's', 156 | )) 157 | if removed_geohashes: 158 | summary.append('%d dupe%s resolved' % ( 159 | len(removed_geohashes), '' if len(removed_geohashes) == 1 else 's', 160 | )) 161 | if new_no_latlon_ids: 162 | summary.append('%d new no-lat-lon shelter%s' % ( 163 | len(new_no_latlon_ids), '' if len(new_no_latlon_ids) == 1 else 's', 164 | )) 165 | if resolved_no_latlon_ids: 166 | summary.append('%d fixed no-lat-lon shelter%s' % ( 167 | len(resolved_no_latlon_ids), '' if len(resolved_no_latlon_ids) == 1 else 's', 168 | )) 169 | if summary: 170 | summary_text = self.filepath + ': ' + (', '.join(summary)) 171 | else: 172 | summary_text = 'Updated %s' % self.filepath 173 | return summary_text + '\n\n' + body 174 | 175 | def fetch_data(self): 176 | data = requests.get(self.url).json() 177 | shelters = data['shelters'] 178 | # Scan for potential dupes by lat/lon (using geohash) 179 | by_geohash = {} 180 | for shelter in shelters: 181 | if shelter['id'] in IGNORE_DUPE_IDS: 182 | continue 183 | geohash = Geohash.encode( 184 | shelter['latitude'], 185 | shelter['longitude'], 186 | precision=GEOHASH_PRECISION, 187 | ) 188 | by_geohash.setdefault(geohash, []).append(shelter) 189 | dupe_groups = [ 190 | pair for pair in by_geohash.items() 191 | if ( 192 | # More than one shelter in this group 193 | len(pair[1]) > 1 194 | # Group is not invalid lat/lon 195 | and pair[0] != ('0' * GEOHASH_PRECISION) 196 | ) 197 | ] 198 | no_latlons = by_geohash.get('0' * GEOHASH_PRECISION) or [] 199 | return { 200 | 'dupe_groups': [{ 201 | 'geohash': dupe_group[0], 202 | 'shelters': [{ 203 | 'id': shelter['id'], 204 | 'name': shelter['shelter'], 205 | 
'address': shelter['address'], 206 | 'latitude': shelter['latitude'], 207 | 'longitude': shelter['longitude'], 208 | 'google_maps': 'https://www.google.com/maps/search/%(latitude)s,%(longitude)s' % shelter, 209 | 'view_url': 'https://irma-api.herokuapp.com/shelters/%s' % shelter['id'], 210 | } for shelter in dupe_group[1]], 211 | } for dupe_group in dupe_groups], 212 | 'no_latitude_longitude': [{ 213 | 'id': shelter['id'], 214 | 'name': shelter['shelter'], 215 | 'address': shelter['address'], 216 | 'view_url': 'https://irma-api.herokuapp.com/shelters/%s' % shelter['id'], 217 | } for shelter in no_latlons] 218 | } 219 | 220 | 221 | map_url_re = re.compile( 222 | r'http://maps.google.com/maps\?saddr=&daddr=-?\d+\.\d+,-?\d+\.\d+' 223 | ) 224 | 225 | 226 | class IrmaSheltersFloridaMissing(BaseScraper): 227 | filepath = 'florida-shelters-missing.json' 228 | our_url = 'https://raw.githubusercontent.com/simonw/disaster-data/master/irma-shelters.json' 229 | their_url = 'https://raw.githubusercontent.com/simonw/disaster-data/master/florida-shelters.json' 230 | issue_comments_url = 'https://api.github.com/repos/simonw/disaster-data/issues/2/comments' 231 | 232 | def create_message(self, new_data): 233 | return self.update_message([], new_data, 'Created') 234 | 235 | def update_message(self, old_data, new_data, verb='Updated'): 236 | previous_map_urls = [ 237 | d['map_url'] for d in old_data 238 | ] 239 | current_map_urls = [ 240 | d['map_url'] for d in new_data 241 | ] 242 | added_map_urls = [ 243 | map_url for map_url in current_map_urls 244 | if map_url not in previous_map_urls 245 | ] 246 | removed_map_urls = [ 247 | map_url for map_url in previous_map_urls 248 | if map_url not in current_map_urls 249 | ] 250 | 251 | message = [] 252 | 253 | if added_map_urls: 254 | message.append('New potentially missing shelters:') 255 | 256 | for map_url in added_map_urls: 257 | shelter = [s for s in new_data if s['map_url'] == map_url][0] 258 | message.append(' %s (%s County)' % (shelter['name'], shelter['county'])) 259 | message.append(' Type: ' + shelter['type']) 260 | message.append(' ' + shelter['address']) 261 | message.append(' ' + shelter['city']) 262 | message.append(' ' + shelter['map_url']) 263 | message.append('') 264 | 265 | if added_map_urls and removed_map_urls: 266 | message.append('') 267 | 268 | if removed_map_urls: 269 | message.append('Previous missing shelters now resolved:') 270 | 271 | for map_url in removed_map_urls: 272 | shelter = [s for s in old_data if s['map_url'] == map_url][0] 273 | message.append(' %s (%s County)' % (shelter['name'], shelter['county'])) 274 | 275 | body = '\n'.join(message) 276 | summary = [] 277 | if added_map_urls: 278 | summary.append('%d potentially missing shelter%s detected' % ( 279 | len(added_map_urls), '' if len(added_map_urls) == 1 else 's', 280 | )) 281 | if removed_map_urls: 282 | summary.append('%d shelter%s resolved' % ( 283 | len(removed_map_urls), '' if len(removed_map_urls) == 1 else 's', 284 | )) 285 | if current_map_urls: 286 | summary.append('%d total' % ( 287 | len(current_map_urls) 288 | )) 289 | if summary: 290 | summary_text = self.filepath + ': ' + (', '.join(summary)) 291 | else: 292 | summary_text = '%s %s' % (verb, self.filepath) 293 | return summary_text + '\n\n' + body 294 | 295 | def fetch_data(self): 296 | our_shelters = requests.get(self.our_url).json() 297 | their_shelters = requests.get(self.their_url).json() 298 | our_geohashes = set([ 299 | Geohash.encode(s['latitude'], s['longitude'], 6) 300 | for s in our_shelters 301 | ]) 
302 | for shelter in their_shelters: 303 | coords = shelter['map_url'].split('daddr=')[1] 304 | latitude, longitude = map(float, coords.split(',')) 305 | geohash = Geohash.encode(latitude, longitude, 6) 306 | shelter['geohash'] = geohash 307 | maybe_missing_shelters = [ 308 | s for s in their_shelters 309 | if s['geohash'] not in our_geohashes 310 | ] 311 | ignore_map_urls = [] 312 | for comment in all_comments(self.issue_comments_url, self.github_token): 313 | ignore_map_urls.extend(map_url_re.findall(comment['body'])) 314 | maybe_missing_shelters = [ 315 | s for s in maybe_missing_shelters 316 | if s['map_url'] not in ignore_map_urls 317 | ] 318 | return maybe_missing_shelters 319 | 320 | 321 | def all_comments(issue_comments_url, github_token): 322 | # Paginate through all comments on an issue 323 | while issue_comments_url: 324 | response = requests.get( 325 | issue_comments_url, 326 | headers={ 327 | 'Authorization': 'token %s' % github_token, 328 | }) 329 | try: 330 | issue_comments_url = response.links['next']['url'] 331 | except KeyError: 332 | issue_comments_url = None 333 | for item in response.json(): 334 | yield item 335 | -------------------------------------------------------------------------------- /irma.py: -------------------------------------------------------------------------------- 1 | from base_scraper import BaseScraper 2 | from irma_shelters import ( 3 | IrmaShelters, 4 | IrmaShelterDupes, 5 | IrmaSheltersFloridaMissing, 6 | ) 7 | from gis_scrapers import ( 8 | FemaOpenShelters, 9 | FemaNSS, 10 | GemaAnimalShelters, 11 | GemaActiveShelters, 12 | ) 13 | from nyc import ( 14 | NewYorkShelters, 15 | ) 16 | from north_bay import ( 17 | CaliforniaDOTRoadInfo, 18 | SantaRosaEmergencyInformation, 19 | SonomaRoadConditions, 20 | CaliforniaHighwayPatrolIncidents, 21 | PGEOutagesIndividual, 22 | ) 23 | from BeautifulSoup import BeautifulSoup as Soup 24 | import requests 25 | import os 26 | import sys 27 | import time 28 | import json 29 | import datetime 30 | import zipfile 31 | import StringIO 32 | from xml.etree import ElementTree 33 | 34 | 35 | class GoogleCrisisKmlScraper(BaseScraper): 36 | url = 'https://www.google.com/maps/d/u/1/kml?mid=1fJ4NZ21YW1Ru856hehpufId79CA&ll=22.47126398588183%2C-60.6005859375&z=5&cm.ttl=600' 37 | source_url = 'http://google.org/crisismap/2017-irma' 38 | filepath = 'google-crisis-irma-2017.json' 39 | 40 | def create_message(self, new_data): 41 | return self.update_message([], new_data, verb='Created') 42 | 43 | def update_message(self, old_data, new_data, verb='Updated'): 44 | def name(n): 45 | if 'Name' not in n: 46 | return None 47 | return ('%s (%s)' % ( 48 | n['Name'], n.get('City, State/Province') or '' 49 | )).replace(' ()', '') 50 | 51 | current_names = [name(n) for n in new_data if name(n)] 52 | previous_names = [name(n) for n in old_data if name(n)] 53 | message = update_message_from_names( 54 | current_names, 55 | previous_names, 56 | self.filepath, 57 | verb=verb 58 | ) 59 | message += '\nChange detected on %s' % self.source_url 60 | return message 61 | 62 | def fetch_data(self): 63 | zipped = requests.get(self.url).content 64 | zipdata = zipfile.ZipFile(StringIO.StringIO(zipped)) 65 | kml = zipdata.open('doc.kml').read() 66 | et = ElementTree.fromstring(kml) 67 | shelters = [] 68 | for placemark in et.findall('.//{http://www.opengis.net/kml/2.2}Placemark'): 69 | shelter = {} 70 | for data in placemark.findall('{http://www.opengis.net/kml/2.2}ExtendedData/{http://www.opengis.net/kml/2.2}Data'): 71 | key = data.attrib['name'] 72 | 
value = ''.join(s.strip() for s in data.itertext())
73 |                 shelter[key] = value
74 |             coords = placemark.find('.//{http://www.opengis.net/kml/2.2}coordinates').text.strip()
75 |             longitude, latitude, _ = coords.split(',')
76 |             shelter.update({
77 |                 'latitude': latitude,
78 |                 'longitude': longitude,
79 |             })
80 |             if 'Phone' in shelter:
81 |                 # They come through in scientific number format for some reason
82 |                 shelter['Phone'] = shelter['Phone'].replace('.', '').replace('E9', '')
83 |             shelters.append(shelter)
84 |         return shelters
85 | 
86 | 
87 | class SouthCarolinaShelters(BaseScraper):
88 |     url = 'http://scemd.org/ShelterStatus.html'
89 |     filepath = 'scemd-shelters.json'
90 | 
91 |     def create_message(self, new_data):
92 |         return self.update_message([], new_data, verb='Created')
93 | 
94 |     def update_message(self, old_data, new_data, verb='Updated'):
95 |         def name(n):
96 |             return '%s (%s County, SC)' % (
97 |                 n['Shelter Name'], n['County']
98 |             )
99 | 
100 |         current_names = [name(n) for n in new_data]
101 |         previous_names = [name(n) for n in old_data]
102 |         message = update_message_from_names(
103 |             current_names,
104 |             previous_names,
105 |             self.filepath,
106 |             verb=verb
107 |         )
108 |         message += '\nChange detected on %s' % self.url
109 |         return message
110 | 
111 |     def fetch_data(self):
112 |         s = Soup(requests.get(self.url).content)
113 |         table = s.find('table')
114 |         trs = table.findAll('tr')
115 |         headings = [
116 |             th.getText()
117 |             for th in trs[0].findAll('th')
118 |         ]
119 |         shelters = []
120 |         for tr in trs[1:]:
121 |             content = [td.getText() for td in tr.findAll('td')]
122 |             shelters.append(dict(zip(headings, content)))
123 |         return shelters
124 | 
125 | 
126 | class ZeemapsScraper(BaseScraper):
127 |     url = 'https://zeemaps.com/emarkers?g=2682928'
128 |     filepath = 'zeemaps-2682928.json'
129 |     slack_channel = None
130 | 
131 |     def fetch_data(self):
132 |         data = requests.get(self.url).json()
133 |         data.sort(key=lambda d: d['nm'])
134 |         return data
135 | 
136 | 
137 | class FplStormOutages(BaseScraper):
138 |     filepath = 'fpl-storm-outages.json'
139 |     url = 'https://www.fplmaps.com/data/storm-outages.js'
140 |     slack_channel = None
141 | 
142 |     def fetch_data(self):
143 |         content = requests.get(
144 |             self.url,
145 |             timeout=10,
146 |         ).content
147 |         # Strip the 'define(' and ');' wrapper
148 |         if content.startswith('define('):
149 |             content = content.split('define(')[1]
150 |         if content.endswith(');'):
151 |             content = content.rsplit(');', 1)[0]
152 |         return json.loads(content)
153 | 
154 | 
155 | class FplCountyOutages(BaseScraper):
156 |     filepath = 'fpl-county-outages.json'
157 |     url = 'https://www.fplmaps.com/customer/outage/CountyOutages.json'
158 |     slack_channel = None
159 | 
160 |     def fetch_data(self):
161 |         return requests.get(
162 |             self.url,
163 |             timeout=10,
164 |         ).json()
165 | 
166 | 
167 | class ScegOutages(BaseScraper):
168 |     filepath = 'sceg-outages.json'
169 |     url = 'https://www.sceg.com/scanapublicservice/outagemapdata/gismapdataonly.aspx?gisUrl=OUTAGE_EX/Outage_EX&gisMapLayer=6'
170 |     source_url = 'https://www.sceg.com/outages-emergencies/power-outages/outage-map'
171 |     slack_channel = None
172 | 
173 |     def fetch_data(self):
174 |         data = requests.get(self.url).json()
175 |         return [feature['attributes'] for feature in data['features']]
176 | 
177 | 
178 | class GeorgiaOutages(BaseScraper):
179 |     filepath = 'georgiapower-outages.json'
180 |     url = 'http://outagemap.georgiapower.com/external/data/interval_generation_data/2017_09_12_00_59_50/thematic/thematic_areas.js?timestamp='
181 |     slack_channel = None

    def fetch_data(self):
        # Append the current epoch time as a cache-buster
        url = self.url + str(int(time.time()))
        return requests.get(url).json()


class NorthGeorgiaOutages(BaseScraper):
    filepath = 'north-georgia-outages.json'
    url = 'http://www2.ngemc.com:81/api/weboutageviewer/get_live_data'
    slack_channel = None

    def fetch_data(self):
        return requests.get(self.url).json()


class TampaElectricOutages(BaseScraper):
    filepath = 'tampa-electric-outages.json'
    url = 'http://www.tampaelectric.com/residential/outages/outagemap/datafilereader/index.cfm'
    slack_channel = None

    def fetch_data(self):
        return requests.get(
            self.url,
            headers={
                'Referer': 'http://www.tampaelectric.com/residential/outages/outagemap/',
            }
        ).json()['markers']


class JemcOutages(BaseScraper):
    filepath = 'jemc-outages.json'
    url = 'https://jemc.maps.sienatech.com/data/outages.xml'
    slack_channel = None

    def fetch_data(self):
        et = ElementTree.fromstring(requests.get(self.url).content)
        reports = et.find('reports').findall('report')
        data = {}
        for report in reports:
            id = report.attrib['id']
            keys = [d.attrib['key'] for d in report.findall('dimension/dim')]
            rows = report.findall('dataset/t')
            results = [
                dict(zip(keys, [e.text for e in row]))
                for row in rows
            ]
            data[id] = results
        return data


class BaseDukeScraper(BaseScraper):
    slack_channel = None

    def fetch_data(self):
        metadata_url = 'https://s3.amazonaws.com/outagemap.duke-energy.com/data/%s/external/interval_generation_data/metadata.xml?timestamp=%d' % (
            self.state_code, int(time.time())
        )
        metadata = requests.get(metadata_url).content
        # metadata.xml is assumed to name the current data directory in a
        # <directory>...</directory> element; crudely split it out
        directory = metadata.split('<directory>')[1].split('</directory>')[0]
        data_url = 'https://s3.amazonaws.com/outagemap.duke-energy.com/data/%s/external/interval_generation_data/%s/thematic/thematic_areas.js?timestamp=%d' % (
            self.state_code, directory, int(time.time())
        )
        return requests.get(data_url).json()


class DukeFloridaOutages(BaseDukeScraper):
    filepath = 'duke-fl-outages.json'
    state_code = 'fl'


class DukeCarolinasOutages(BaseDukeScraper):
    filepath = 'duke-ncsc-outages.json'
    state_code = 'ncsc'


class PascoCounty(BaseScraper):
    # From http://www.pascocountyfl.net/index.aspx?NID=2816
    # in particular this iframe:
    # https://secure.pascocountyfl.net/sheltersdisplay
    filepath = 'pascocountyfl.json'
    url = 'https://secure.pascocountyfl.net/SheltersDisplay/Home/GetShelterInfo'

    def create_message(self, new_data):
        return self.update_message([], new_data, verb='Created')

    def update_message(self, old_data, new_data, verb='Updated'):
        def name(n):
            return '%s (Pasco County FL)' % n['Name']

        current_names = [name(n) for n in new_data]
        previous_names = [name(n) for n in old_data]
        message = update_message_from_names(
            current_names,
            previous_names,
            self.filepath,
            verb=verb
        )
        message += '\nChange detected on http://www.pascocountyfl.net/index.aspx?NID=2816'
        return message

    def fetch_data(self):
        data = requests.post(self.url).json()
        data.sort(key=lambda d: d['Name'])
        return data


class LedgerPolkCounty(BaseScraper):
    filepath = 'ledger-polk-county.json'
    url = 'http://www.ledgerdata.com/hurricane-guide/shelter/'

    def create_message(self, new_data):
        return self.update_message([], new_data, verb='Created')

    def update_message(self, old_data, new_data, verb='Updated'):
        current_names = [n['name'] for n in new_data]
        previous_names = [n['name'] for n in old_data]

        added_names = [name for name in current_names if name not in previous_names]
        removed_names = [name for name in previous_names if name not in current_names]

        message = []
        for name in added_names:
            shelter = [n for n in new_data if n['name'] == name][0]
            message.append('Added shelter: %s, %s' % (
                shelter['name'], shelter['city']
            ))
            message.append(' %s' % shelter['url'])
        if added_names and removed_names:
            message.append('')
        for name in removed_names:
            shelter = [n for n in old_data if n['name'] == name][0]
            message.append('Removed shelter: %s, %s' % (
                shelter['name'], shelter['city']
            ))
        body = '\n'.join(message)
        summary = []
        if added_names:
            summary.append('%d shelter%s added' % (
                len(added_names), '' if len(added_names) == 1 else 's',
            ))
        if removed_names:
            summary.append('%d shelter%s removed' % (
                len(removed_names), '' if len(removed_names) == 1 else 's',
            ))
        if summary:
            summary_text = '%s %s: %s' % (
                verb, self.filepath, (', '.join(summary))
            )
        else:
            summary_text = '%s %s' % (verb, self.filepath)
        return '%s\n\n%s\nChange detected on %s' % (
            summary_text, body, self.url
        )

    def fetch_data(self):
        s = Soup(requests.get(self.url).content)
        trs = s.find('table').findAll('tr')[1:]
        shelters = []
        for tr in trs:
            tds = tr.findAll('td')
            shelters.append({
                'name': tds[1].getText(),
                'url': 'http://www.ledgerdata.com/' + tds[1].find('a')['href'],
                'city': tds[2].getText(),
                'type': tds[3].getText(),
            })
        return shelters


class HernandoCountyShelters(BaseScraper):
    filepath = 'hernando-county.json'
    url = 'http://www.hernandocounty.us/em/shelter-information'

    def create_message(self, new_data):
        return self.update_message([], new_data, verb='Created')

    def update_message(self, old_data, new_data, verb='Updated'):
        current_names = [n['name'] for n in new_data]
        previous_names = [n['name'] for n in old_data]

        added_names = [name for name in current_names if name not in previous_names]
        removed_names = [name for name in previous_names if name not in current_names]

        message = []
        for name in added_names:
            shelter = [n for n in new_data if n['name'] == name][0]
            message.append('Added shelter: %s, Hernando County' % (
                shelter['name']
            ))
            message.append(' %s, %s' % (
                shelter['type'], shelter['status']
            ))
            message.append(' %s' % shelter['address'])
        if added_names and removed_names:
            message.append('')
        for name in removed_names:
            shelter = [n for n in old_data if n['name'] == name][0]
            message.append('Removed shelter: %s, Hernando County' % (
                shelter['name']
            ))
        body = '\n'.join(message)
        summary = []
        if added_names:
            summary.append('%d shelter%s added' % (
                len(added_names), '' if len(added_names) == 1 else 's',
            ))
        if removed_names:
            summary.append('%d shelter%s removed' % (
                len(removed_names), '' if len(removed_names) == 1 else 's',
            ))
        if summary:
            summary_text = '%s %s: %s' % (
                verb, self.filepath, (', '.join(summary))
            )
        else:
            summary_text = '%s %s' % (verb, self.filepath)
        return '%s\n\n%s\nChange detected on %s' % (
            summary_text, body, self.url
        )

    def fetch_data(self):
        s = Soup(requests.get(self.url).content)
        shelters = []
        for tr in s.find('table').findAll('tr'):
            tds = tr.findAll('td')
            img = tds[1].find('img')
            if img is not None:
                shelter_type = img['alt'].title()
            else:
                shelter_type = 'General'
            shelters.append({
                'name': tds[2].getText(),
                'type': shelter_type,
                'address': tds[3].getText(),
                'status': tds[4].getText(),
            })
        return shelters


def update_message_from_names(current_names, previous_names, filepath, verb='Updated'):
    added_names = [n for n in current_names if n not in previous_names]
    removed_names = [n for n in previous_names if n not in current_names]
    message = []
    for name in added_names:
        message.append('Added shelter: %s' % name)
    if added_names:
        message.append('')
    for name in removed_names:
        message.append('Removed shelter: %s' % name)
    body = '\n'.join(message)
    summary = []
    if added_names:
        summary.append('%d shelter%s added' % (
            len(added_names), '' if len(added_names) == 1 else 's',
        ))
    if removed_names:
        summary.append('%d shelter%s removed' % (
            len(removed_names), '' if len(removed_names) == 1 else 's',
        ))
    if summary:
        summary_text = filepath + ': ' + (', '.join(summary))
    else:
        summary_text = '%s %s' % (verb, filepath)
    return summary_text + '\n\n' + body
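

# A worked example of the helper above, with illustrative data (the names
# and filename are made up, not from a real scrape):
#
#   update_message_from_names(['A', 'B'], ['B', 'C'], 'x.json')
#
# returns the commit message:
#
#   x.json: 1 shelter added, 1 shelter removed
#
#   Added shelter: A
#
#   Removed shelter: C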


def is_heading(tr):
    return tr.findAll('td')[1].text == 'Shelter Name'


def is_shelter(tr):
    return len(tr.findAll('td')) == 4 and not is_heading(tr)


def is_county_heading(tr):
    # County rows span the full table and have a grey (#d4d4d4) background
    if tr.find('td').get('colspan') == '5' and (u'#d4d4d4' in tr.find('td').get('style', '')) and tr.text != ' ':
        return tr.text
    else:
        return None


class FloridaDisasterShelters(BaseScraper):
    filepath = 'florida-shelters.json'
    url = 'http://www.floridadisaster.org/shelters/summary.aspx'

    def update_message(self, old_data, new_data):
        def name(n):
            return '%s (%s County)' % (n['name'], n['county'])

        current_names = [name(n) for n in new_data]
        previous_names = [name(n) for n in old_data]
        message = update_message_from_names(current_names, previous_names, self.filepath)
        message += '\nChange detected on %s' % self.url
        return message

    def fetch_data(self):
        r = requests.get(self.url)
        if r.status_code != 200:
            print "Oh no - status code = %d" % r.status_code
            return None
        table = Soup(r.content).findAll('table')[9]
        current_county = None
        shelters = []
        for tr in table.findAll('tr'):
            heading = is_county_heading(tr)
            if heading:
                current_county = heading
            if is_shelter(tr):
                shelters.append({
                    'type': tr.findAll('td')[0].text,
                    'county': current_county.title(),
                    'name': tr.findAll('td')[1].text,
                    'address': tr.findAll('td')[2].text,
                    'map_url': tr.findAll('td')[2].find('a')['href'].split(' ')[0],
                    'city': tr.findAll('td')[3].text,
                })
        shelters.sort(key=lambda s: (s['county'], s['name']))
        return shelters


class CrowdSourceRescue(BaseScraper):
    filepath = 'crowdsourcerescue.json'
    owner = 'simonw'
    repo = 'private-irma-data'
    slack_channel = None
    url = 'https://crowdsourcerescue.com/rescuees/searchApi/'

    def fetch_data(self):
        # Bounding box roughly covering Florida
        return requests.post(self.url, {
            'needstring': '',
            'lat_min': '23.882475192722612',
            'lat_max': '29.761185051094046',
            'lng_min': '-86.76083325000002',
            'lng_max': '-77.97177075000002',
            'status': '0',
        }).json()


if __name__ == '__main__':
    test_mode = ('--test' in sys.argv)
    github_token = os.environ.get('GITHUB_API_TOKEN', '')
    slack_token = os.environ.get('SLACK_TOKEN', '')
    scrapers = [
        klass(github_token, slack_token)
        for klass in (
            SantaRosaEmergencyInformation,
            SonomaRoadConditions,
            GoogleCrisisKmlScraper,
            SouthCarolinaShelters,
            FemaOpenShelters,
            FemaNSS,
            IrmaShelters,
            IrmaShelterDupes,
            FloridaDisasterShelters,
            ZeemapsScraper,
            PascoCounty,
            CrowdSourceRescue,
            LedgerPolkCounty,
            HernandoCountyShelters,
            FplStormOutages,
            FplCountyOutages,
            GemaAnimalShelters,
            GemaActiveShelters,
            ScegOutages,
            IrmaSheltersFloridaMissing,
            GeorgiaOutages,
            DukeFloridaOutages,
            DukeCarolinasOutages,
            NorthGeorgiaOutages,
            TampaElectricOutages,
            JemcOutages,
            NewYorkShelters,
            CaliforniaDOTRoadInfo,
            CaliforniaHighwayPatrolIncidents,
            PGEOutagesIndividual,
        )
    ]
    while True:
        print datetime.datetime.now()
        for scraper in scrapers:
            if test_mode and not scraper.test_mode:
                continue
            try:
                scraper.scrape_and_store()
            except Exception, e:
                print "!!!! %s: %s !!!!!" % (
                    scraper.__class__.__name__, e
                )
                if test_mode:
                    import pdb; pdb.post_mortem()

        time.sleep(120)
--------------------------------------------------------------------------------