├── .gitignore
├── requirements.txt
├── nyc.py
├── README.md
├── common.py
├── base_scraper.py
├── gis_scrapers.py
├── north_bay.py
├── github_read_write.py
├── LICENSE
├── irma_shelters.py
└── irma.py

/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | BeautifulSoup==3.2.1
2 | requests==2.20.0
3 | Geohash==1.0
4 | pyproj==1.9.5.1
5 | 
--------------------------------------------------------------------------------
/nyc.py:
--------------------------------------------------------------------------------
1 | # In case a hurricane hits New York...
2 | from base_scraper import BaseDeltaScraper
3 | 
4 | import requests
5 | import csv
6 | from pyproj import Proj, transform
7 | 
8 | 
9 | class NewYorkShelters(BaseDeltaScraper):
10 |     record_key = 'BLDG_ID'
11 |     filepath = 'new-york-shelters.json'
12 |     url = 'https://maps.nyc.gov/hurricane/data/center.csv'
13 |     source_url = 'https://maps.nyc.gov/hurricane/'
14 |     noun = 'shelter'
15 | 
16 |     def display_record(self, record):
17 |         display = []
18 |         display.append(' %s' % record['BLDG_ADD'])
19 |         display.append(' Accessible: %s' % record['ACCESSIBLE'])
20 |         if record['ACC_FEAT']:
21 |             display.append(' %s' % record['ACC_FEAT'])
22 |         display.append('')
23 |         return '\n'.join(display)
24 | 
25 |     def fetch_data(self):
26 |         data = requests.get(self.url).content
27 |         rows = csv.reader(data.split('\r\n'))
28 |         headers = next(rows)
29 |         shelters = []
30 |         from_projection = Proj(init='epsg:2263', preserve_units=True)
31 |         to_projection = Proj(proj='latlong', ellps='WGS84', datum='WGS84')
32 |         for row in rows:
33 |             shelter = dict(zip(headers, row))
34 |             if not shelter:
35 |                 continue
36 |             # Convert from epsg:2263 - preserve_units=True because this is in feet
37 |             x, y = shelter['X'], shelter['Y']
38 |             longitude, latitude = transform(
39 |                 from_projection, to_projection, x, y
40 |             )
41 |             shelter['longitude'] = longitude
42 |             shelter['latitude'] = latitude
43 |             shelters.append(shelter)
44 |         return shelters
45 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # irma-scrapers
2 | 
3 | Screen scrapers relating to Hurricane Irma. See their output in
4 | https://github.com/simonw/disaster-data/
5 | 
6 | ## Irma Response
7 | 
8 | The Irma Response project at https://www.irmaresponse.org/ is a team of
9 | volunteers working together to make information available during and after the
10 | storm. There is a huge amount of information out there, on many different
11 | websites. The Irma API at https://irma-api.herokuapp.com/ is an attempt to
12 | gather key information in one place, verify it and publish it in a reusable
13 | way.
14 | 
15 | To aid this effort, I've built a collection of screen scrapers that pull data
16 | from a number of different websites and APIs. That data is then stored in a
17 | Git repository, providing a clear history of changes made to the various
18 | sources that are being tracked.
19 | 
20 | Some of the scrapers also publish their findings to Slack in a format designed
21 | to make it obvious when key events happen, such as new shelters being added or
22 | removed from public listings.
23 | 24 | ## Tracking changes over time 25 | 26 | A key goal of this screen scraping mechanism is to allow changes to the 27 | underlying data sources to be tracked over time. This is achieved using git, 28 | via the GitHub API. Each scraper pulls down data from a source (an API or a 29 | website) and reformats that data into a sanitized JSON format. That JSON is 30 | then written to the git repository. If the data has changed since the last 31 | time the scraper ran, those changes will be captured by git and made available 32 | in the commit log. 33 | 34 | Recent changes tracked by the scraper collection can be seen here: 35 | https://github.com/simonw/disaster-data/commits/master 36 | 37 | ## Generating useful commit messages 38 | 39 | The most complex code for most of the scrapers isn't in fetching the data: 40 | it's in generating useful, human-readable commit messages that summarize the 41 | underlying change. For example, here is a commit message generated by the 42 | scraper that tracks the http://www.floridadisaster.org/shelters/summary.aspx 43 | page: 44 | 45 | florida-shelters.json: 2 shelters added 46 | 47 | Added shelter: Atwater Elementary School (Sarasota County) 48 | Added shelter: DEBARY ELEMENTARY SCHOOL (Volusia County) 49 | Change detected on http://www.floridadisaster.org/shelters/summary.aspx 50 | 51 | The full commit also shows the changes to the underlying JSON, but the human- 52 | readable message provides enough information that people who are not JSON- 53 | literate programmers can still derive value from the commit. 54 | 55 | https://github.com/simonw/disaster-data/commit/7919aeff0913ec26d1bea8dc 56 | 57 | ## Publishing to Slack 58 | 59 | The Irma Response team use Slack to co-ordinate their efforts. You can join 60 | their Slack here: https://irma-response-slack.herokuapp.com/ 61 | 62 | Some of the scrapers publish detected changes in their data source to Slack, 63 | as links to the commits generated for each change. The human-readable message 64 | is posted directly to the channel. 
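## Example scraper

To make this mechanism concrete, here is a minimal sketch of what a scraper in
this collection looks like (the class name, URL and token below are invented
for illustration; the real scrapers live in files like irma.py and nyc.py):

    from base_scraper import BaseScraper
    import requests


    class ExampleFeed(BaseScraper):
        # Hypothetical scraper - not part of the real collection
        filepath = 'example-feed.json'
        url = 'https://example.com/feed.json'

        def fetch_data(self):
            # Reshape the source into sanitized, JSON-serializable data
            return requests.get(self.url).json()


    # Diffs the fresh data against the copy last committed to the repo and,
    # if anything changed, writes a new commit (and optionally posts to Slack)
    ExampleFeed(github_token='...').scrape_and_store()

Subclasses implement fetch_data(), plus optionally create_message() and
update_message() for the human-readable commit messages; the shared Scraper
base class in common.py takes care of reading and writing the JSON through the
GitHub API.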
65 | 
--------------------------------------------------------------------------------
/common.py:
--------------------------------------------------------------------------------
1 | from github_read_write import GithubContent
2 | 
3 | import requests
4 | import json
5 | 
6 | 
7 | class Scraper(object):
8 |     owner = None
9 |     repo = None
10 |     filepath = None
11 |     committer = None
12 |     slack_channel = None
13 |     slack_botname = None
14 |     test_mode = False
15 | 
16 |     def __init__(self, github_token, slack_token=None):
17 |         self.last_data = None
18 |         self.last_sha = None
19 |         self.github_token = github_token
20 |         self.slack_token = slack_token
21 | 
22 |     def post_to_slack(self, message, commit_hash):
23 |         if not (self.slack_channel and self.slack_token):
24 |             return
25 |         headline = message.split('\n')[0]
26 |         try:
27 |             body = message.split('\n', 1)[1]
28 |         except IndexError:
29 |             body = ''
30 |         github_url = 'https://github.com/%s/%s/commit/%s' % (
31 |             self.owner, self.repo, commit_hash
32 |         )
33 |         requests.post('https://slack.com/api/chat.postMessage', {
34 |             'token': self.slack_token,
35 |             'channel': self.slack_channel,
36 |             'attachments': json.dumps([{
37 |                 'fallback': github_url,
38 |                 'pretext': headline,
39 |                 'title': '%s: %s' % (self.filepath, commit_hash[:8]),
40 |                 'title_link': github_url,
41 |                 'text': body.strip(),
42 |             }]),
43 |             'icon_emoji': ':robot_face:',
44 |             'username': self.slack_botname,
45 |         }).json()
46 | 
47 |     def create_message(self, new_data):
48 |         return 'Created %s' % self.filepath
49 | 
50 |     def update_message(self, old_data, new_data):
51 |         return 'Updated %s' % self.filepath
52 | 
53 |     def fetch_data(self):
54 |         return []
55 | 
56 |     def scrape_and_store(self):
57 |         data = self.fetch_data()
58 |         if data is None:
59 |             print '%s: Data was None' % self.filepath
60 |             return
61 | 
62 |         if self.test_mode and not self.github_token:
63 |             print json.dumps(data, indent=2)
64 |             return
65 | 
66 |         # We need to store the data
67 |         github = GithubContent(self.owner, self.repo, self.github_token)
68 |         if not self.last_data or not self.last_sha:
69 |             # Check and see if it exists yet
70 |             try:
71 |                 content, sha = github.read(self.filepath)
72 |                 self.last_data = json.loads(content)
73 |                 self.last_sha = sha
74 |             except GithubContent.NotFound:
75 |                 pass
76 | 
77 |         if self.last_data == data:
78 |             print '%s: Nothing changed' % self.filepath
79 |             return
80 | 
81 |         if self.last_sha:
82 |             print 'Updating %s' % self.filepath
83 |             message = self.update_message(self.last_data, data)
84 |         else:
85 |             print 'Creating %s' % self.filepath
86 |             message = self.create_message(data)
87 | 
88 |         if self.test_mode:
89 |             print message
90 |             print
91 |             print json.dumps(data, indent=2)
92 |             return
93 | 
94 |         content_sha, commit_sha = github.write(
95 |             filepath=self.filepath,
96 |             content=json.dumps(data, indent=2),
97 |             sha=self.last_sha,
98 |             commit_message=message,
99 |             committer=self.committer,
100 |         )
101 | 
102 |         self.last_sha = content_sha
103 |         self.last_data = data
104 | 
105 |         self.post_to_slack(message, commit_sha)
106 |         print 'https://github.com/%s/%s/commit/%s' % (
107 |             self.owner, self.repo, commit_sha
108 |         )
109 | 
--------------------------------------------------------------------------------
/base_scraper.py:
--------------------------------------------------------------------------------
1 | from common import Scraper
2 | 
3 | 
4 | class BaseScraper(Scraper):
5 |     owner = 'simonw'
6 |     repo = 'disaster-data'
7 |     committer = {
8 |         'name': 'irma-scraper',
9 |         'email': 'irma-scraper@example.com',
10 |     }
11 |     slack_botname = 'Irma Scraper'
12 |     slack_channel = '#shelter_scraper_data'
13 | 
14 | 
15 | class BaseDeltaScraper(BaseScraper):
16 |     """
17 |     The fetch_data() method should return a list of dicts. Each dict
18 |     should have a record_key field whose value uniquely identifies it.
19 | 
20 |     Then you define a display_record(record) method that returns a string.
21 |     """
22 |     record_key = None
23 |     show_changes = False
24 |     noun = 'record'
25 |     source_url = None
26 |     slack_channel = None
27 | 
28 |     @property
29 |     def display_name(self):
30 |         return self.filepath.replace('.json', '')
31 | 
32 |     @property
33 |     def noun_plural(self):
34 |         return self.noun + 's'
35 | 
36 |     def create_message(self, new_records):
37 |         return self.update_message([], new_records, 'Created')
38 | 
39 |     def update_message(self, old_records, new_records, verb='Updated'):
40 |         previous_ids = [
41 |             record[self.record_key] for record in old_records
42 |         ]
43 |         current_ids = [
44 |             record[self.record_key] for record in new_records
45 |         ]
46 |         added_ids = [id for id in current_ids if id not in previous_ids]
47 |         removed_ids = [id for id in previous_ids if id not in current_ids]
48 | 
49 |         message_blocks = []
50 |         if added_ids:
51 |             messages = []
52 |             messages.append('%d new %s:' % (
53 |                 len(added_ids), self.noun if len(added_ids) == 1 else self.noun_plural
54 |             ))
55 |             for id in added_ids:
56 |                 record = [r for r in new_records if r[self.record_key] == id][0]
57 |                 messages.append(self.display_record(record))
58 |             message_blocks.append(messages)
59 | 
60 |         if removed_ids:
61 |             messages = []
62 |             messages.append('%d %s removed:' % (
63 |                 len(removed_ids), self.noun if len(removed_ids) == 1 else self.noun_plural
64 |             ))
65 |             for id in removed_ids:
66 |                 record = [r for r in old_records if r[self.record_key] == id][0]
67 |                 messages.append(self.display_record(record))
68 |             message_blocks.append(messages)
69 | 
70 |         # Add useful rendering of CHANGED records as well
71 |         changed_records = []
72 |         for new_record in new_records:
73 |             try:
74 |                 old_record = [
75 |                     r for r in old_records
76 |                     if r[self.record_key] == new_record[self.record_key]
77 |                 ][0]
78 |             except IndexError:
79 |                 continue
80 |             if old_record != new_record:
81 |                 changed_records.append((old_record, new_record))
82 | 
83 |         if self.show_changes and changed_records:
84 |             messages = []
85 |             messages.append('%d %s changed:' % (
86 |                 len(changed_records), self.noun if len(changed_records) == 1 else self.noun_plural
87 |             ))
88 |             for old_record, new_record in changed_records:
89 |                 messages.append(self.display_changes(old_record, new_record))
90 |             message_blocks.append(messages)
91 | 
92 |         blocks = []
93 |         for message_block in message_blocks:
94 |             block = '\n'.join(message_block)
95 |             blocks.append(block.strip())
96 | 
97 |         if self.source_url:
98 |             blocks.append('Detected on %s' % self.source_url)
99 | 
100 |         body = '\n\n'.join(blocks)
101 | 
102 |         summary = []
103 |         if added_ids:
104 |             summary.append('%d %s added' % (
105 |                 len(added_ids), self.noun if len(added_ids) == 1 else self.noun_plural
106 |             ))
107 |         if removed_ids:
108 |             summary.append('%d %s removed' % (
109 |                 len(removed_ids), self.noun if len(removed_ids) == 1 else self.noun_plural
110 |             ))
111 |         if changed_records:
112 |             summary.append('%d %s changed' % (
113 |                 len(changed_records), self.noun if len(changed_records) == 1 else self.noun_plural
114 |             ))
115 |         if summary:
116 |             summary_text = self.display_name + ': ' + (', '.join(summary))
117 |         else:
118 |             summary_text = '%s %s' % (verb, self.display_name)
119 |         return summary_text + '\n\n' + body
120 | 
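A minimal hypothetical subclass makes this contract concrete (the class, URL
and response shape below are invented for illustration and are not part of the
repository):

    from base_scraper import BaseDeltaScraper
    import requests


    class ExampleShelters(BaseDeltaScraper):
        filepath = 'example-shelters.json'  # display_name: 'example-shelters'
        record_key = 'id'  # field that uniquely identifies each record
        noun = 'shelter'
        source_url = 'https://example.com/shelters'  # placeholder
        url = 'https://example.com/shelters.json'  # placeholder

        def fetch_data(self):
            # Must return a list of dicts, each containing the record_key field
            return requests.get(self.url).json()['shelters']

        def display_record(self, record):
            # Rendered beneath headings like '2 new shelters:'
            return ' %(name)s (%(county)s County)\n' % record

update_message() then diffs the old and new lists on the 'id' key and produces
a commit message such as 'example-shelters: 2 shelters added, 1 shelter
removed', with the display_record() output for each affected record as the
body.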
-------------------------------------------------------------------------------- /gis_scrapers.py: -------------------------------------------------------------------------------- 1 | from base_scraper import BaseScraper 2 | import requests 3 | 4 | 5 | def objectid(d): 6 | # Different datasets represent objectid in different ways 7 | return d.get('OBJECTID') or d['ObjectID'] 8 | 9 | 10 | def shelter_name(d): 11 | return d.get('SHELTER_NAME') or d['label'] 12 | 13 | 14 | def shelter_county(d): 15 | return d.get('COUNTY_PARISH') or d['county'] 16 | 17 | 18 | class BaseGisScraper(BaseScraper): 19 | source_url = None 20 | 21 | def create_message(self, new_data): 22 | return self.update_message([], new_data, verb='Created') 23 | 24 | def update_message(self, old_data, new_data, verb='Updated'): 25 | new_objects = [o for o in new_data if not any(o2 for o2 in old_data if objectid(o2) == objectid(o))] 26 | removed_objects = [o for o in old_data if not any(o2 for o2 in new_data if objectid(o2) == objectid(o))] 27 | message = [] 28 | 29 | def name(row): 30 | if 'COUNTY_PARISH' in row or 'county' in row: 31 | s = '%s (%s County)' % (shelter_name(row), shelter_county(row).title()) 32 | elif 'CITY' in row and 'STATE' in row: 33 | s = '%s (%s, %s)' % (shelter_name(row), row['CITY'].title(), row['STATE']) 34 | else: 35 | s = shelter_name(row) 36 | return s.replace('County County', 'County') 37 | 38 | for new_object in new_objects: 39 | message.append('Added shelter: %s' % name(new_object)) 40 | if new_objects: 41 | message.append('') 42 | for removed_object in removed_objects: 43 | message.append('Removed shelter: %s' % name(removed_object)) 44 | if removed_objects: 45 | message.append('') 46 | num_updated = 0 47 | for new_object in new_data: 48 | old_object = [o for o in old_data if objectid(o) == objectid(new_object)] 49 | if not old_object: 50 | continue 51 | old_object = old_object[0] 52 | if new_object != old_object: 53 | message.append('Updated shelter: %s' % name(new_object)) 54 | num_updated += 1 55 | body = '\n'.join(message) 56 | summary = [] 57 | if new_objects: 58 | summary.append('%d shelter%s added' % ( 59 | len(new_objects), '' if len(new_objects) == 1 else 's', 60 | )) 61 | if removed_objects: 62 | summary.append('%d shelter%s removed' % ( 63 | len(removed_objects), '' if len(removed_objects) == 1 else 's', 64 | )) 65 | if num_updated: 66 | summary.append('%d shelter%s updated' % ( 67 | num_updated, '' if num_updated == 1 else 's', 68 | )) 69 | if summary: 70 | summary_text = self.filepath + ': ' + (', '.join(summary)) 71 | else: 72 | summary_text = '%s %s' % (verb, self.filepath) 73 | if self.source_url: 74 | body += '\nChange detected on %s' % self.source_url 75 | return summary_text + '\n\n' + body 76 | 77 | def fetch_data(self): 78 | data = requests.get(self.url).json() 79 | shelters = [feature['attributes'] for feature in data['features']] 80 | shelters.sort(key=lambda s: objectid(s)) 81 | return shelters 82 | 83 | 84 | class FemaOpenShelters(BaseGisScraper): 85 | filepath = 'fema-open-shelters.json' 86 | url = 'https://gis.fema.gov/REST/services/NSS/OpenShelters/MapServer/0/query?f=json&returnGeometry=true&spatialRel=esriSpatialRelIntersects&geometry=%7B%22xmin%22%3A-10018754.171396945%2C%22ymin%22%3A2504688.5428529754%2C%22xmax%22%3A-7514065.628548954%2C%22ymax%22%3A5009377.085700965%2C%22spatialReference%22%3A%7B%22wkid%22%3A102100%7D%7D&geometryType=esriGeometryEnvelope&inSR=102100&outFields=*&outSR=102100' 87 | 88 | 89 | class FemaNSS(BaseGisScraper): 90 | filepath = 
'fema-nss-usa.json' 91 | url = 'https://gis.fema.gov/REST/services/NSS/FEMA_NSS/MapServer/0/query?f=json&returnGeometry=true&spatialRel=esriSpatialRelIntersects&geometry=%7B%22xmin%22%3A+-14404742.108649602%2C+%22ymin%22%3A+-55660.4518654215%2C+%22ymax%22%3A+6782064.328749425%2C+%22xmax%22%3A+-5988988.6046781195%2C+%22spatialReference%22%3A+%7B%22wkid%22%3A+102100%7D%7D&geometryType=esriGeometryEnvelope&inSR=102100&outFields=*&outSR=102100' 92 | 93 | 94 | class GemaAnimalShelters(BaseGisScraper): 95 | filepath = 'georgia-gema-animal-shelters.json' 96 | url = 'https://services1.arcgis.com/2iUE8l8JKrP2tygQ/arcgis/rest/services/AnimalShelters/FeatureServer/0/query?f=json&where=status%20%3D%20%27OPEN%27&returnGeometry=true&spatialRel=esriSpatialRelIntersects&outFields=*&outSR=102100&resultOffset=0&resultRecordCount=1000' 97 | source_url = 'https://gema-soc.maps.arcgis.com/apps/webappviewer/index.html?id=279ef7cfc1da45edb640723c12b02b18' 98 | 99 | 100 | class GemaActiveShelters(BaseGisScraper): 101 | filepath = 'georgia-gema-active-shelters.json' 102 | url = 'https://services1.arcgis.com/2iUE8l8JKrP2tygQ/arcgis/rest/services/SheltersActive/FeatureServer/0/query?f=json&where=shelter_information_shelter_type%20%3C%3E%20%27Reception%20Care%20Ctr.%27&returnGeometry=true&spatialRel=esriSpatialRelIntersects&outFields=*&outSR=102100&resultOffset=0&resultRecordCount=1000' 103 | source_url = 'https://gema-soc.maps.arcgis.com/apps/webappviewer/index.html?id=279ef7cfc1da45edb640723c12b02b18' 104 | -------------------------------------------------------------------------------- /north_bay.py: -------------------------------------------------------------------------------- 1 | from base_scraper import BaseScraper, BaseDeltaScraper 2 | from BeautifulSoup import Comment, BeautifulSoup as Soup 3 | from xml.etree import ElementTree 4 | import requests 5 | import re 6 | 7 | 8 | class PGEOutagesIndividual(BaseDeltaScraper): 9 | url = 'https://apim.pge.com/cocoutage/outages/getOutagesRegions?regionType=city&expand=true' 10 | filepath = 'pge-outages-individual.json' 11 | slack_channel = None 12 | record_key = 'outageNumber' 13 | noun = 'outage' 14 | 15 | def fetch_data(self): 16 | data = requests.get( 17 | self.url, 18 | timeout=10, 19 | ).json() 20 | # Flatten into a list of outages 21 | outages = [] 22 | for region in data['outagesRegions']: 23 | for outage in region['outages']: 24 | outage['regionName'] = region['regionName'] 25 | outages.append(outage) 26 | return outages 27 | 28 | def display_record(self, outage): 29 | display = [] 30 | display.append(' %(outageNumber)s in %(regionName)s affecting %(estCustAffected)s' % outage) 31 | display.append(' https://www.google.com/maps/search/%(latitude)s,%(longitude)s' % outage) 32 | display.append(' %(cause)s - %(crewCurrentStatus)s' % outage) 33 | display.append('') 34 | return '\n'.join(display) 35 | 36 | 37 | class SantaRosaEmergencyInformation(BaseScraper): 38 | url = 'https://srcity.org/610/Emergency-Information' 39 | filepath = 'santa-rosa-emergency.json' 40 | slack_channel = None 41 | 42 | def fetch_data(self): 43 | html = requests.get(self.url).content 44 | soup = Soup(html) 45 | main_content = soup.find('div', {'data-cprole': 'mainContentContainer'}) 46 | # Remove scripts 47 | [s.extract() for s in main_content.findAll('script')] 48 | # Remove source comments 49 | comments = soup.findAll(text=lambda text: isinstance(text, Comment)) 50 | [comment.extract() for comment in comments] 51 | # Remove almost all attributes 52 | for tag in 
main_content.recursiveChildGenerator():
53 |             try:
54 |                 tag.attrs = [
55 |                     (key, value) for key, value in tag.attrs
56 |                     if key in ('href', 'src')
57 |                     and not value.startswith('#')
58 |                 ]
59 |             except AttributeError:
60 |                 pass
61 | 
62 |         return {
63 |             'html_lines': unicode(main_content).split(u'\n'),
64 |         }
65 | 
66 | 
67 | class SonomaRoadConditions(BaseScraper):
68 |     url = 'http://roadconditions.sonoma-county.org/'
69 |     filepath = 'sonoma-road-conditions.json'
70 |     slack_channel = None
71 | 
72 |     def fetch_data(self):
73 |         soup = Soup(requests.get(self.url).content)
74 |         road_closures = {}
75 |         for id in ('divTableCounty', 'divTableCity'):
76 |             name = {'divTableCounty': 'county_roads', 'divTableCity': 'city_roads'}[id]
77 |             div = soup.find('div', {'id': id})
78 |             table = div.find('table')
79 |             headers = [th.text for th in table.findAll('th')]
80 |             closures = []
81 |             for tr in table.find('tbody').findAll('tr'):
82 |                 values = [td.text for td in tr.findAll('td')]
83 |                 closures.append(dict(zip(headers, values)))
84 |             road_closures[name] = closures
85 |         return road_closures
86 | 
87 | 
88 | class CaliforniaDOTRoadInfo(BaseScraper):
89 |     url = 'http://www.dot.ca.gov/hq/roadinfo/Hourly'
90 |     filepath = 'dot-ca-roadinfo-hourly.json'
91 |     slack_channel = None
92 | 
93 |     def fetch_data(self):
94 |         text = requests.get(self.url).content
95 |         return {
96 |             'text_lines': [l.rstrip('\r') for l in text.split('\n')],
97 |         }
98 | 
99 | 
100 | class CaliforniaHighwayPatrolIncidents(BaseDeltaScraper):
101 |     url = 'http://quickmap.dot.ca.gov/data/chp-only.kml'
102 |     filepath = 'chp-incidents.json'
103 |     slack_channel = None
104 |     record_key = 'name'
105 |     noun = 'incident'
106 | 
107 |     def display_record(self, incident):
108 |         display = []
109 |         display.append(' %s' % incident['name'])
110 |         display.append(' https://www.google.com/maps/search/%(latitude)s,%(longitude)s' % incident)
111 |         display.append(' ' + incident['description'])
112 |         display.append('')
113 |         return '\n'.join(display)
114 | 
115 |     def fetch_data(self):
116 |         kml = requests.get(self.url).content
117 |         et = ElementTree.fromstring(kml)
118 |         incidents = []
119 |         for placemark in et.findall('.//{http://www.opengis.net/kml/2.2}Placemark'):
120 |             coords = placemark.find('.//{http://www.opengis.net/kml/2.2}coordinates').text.strip()
121 |             longitude, latitude, _ = map(float, coords.split(','))  # KML order is lon,lat,alt
122 |             description = placemark.find('{http://www.opengis.net/kml/2.2}description').text.strip()
123 |             name = placemark.find('{http://www.opengis.net/kml/2.2}name').text.strip()
124 |             incidents.append({
125 |                 'name': name,
126 |                 'description': strip_tags(description),
127 |                 'latitude': latitude,
128 |                 'longitude': longitude,
129 |             })
130 |         return incidents
131 | 
132 | 
133 | tag_re = re.compile('<.*?>')
134 | 
135 | 
136 | def strip_tags(s):
137 |     return tag_re.sub('', s)
138 | 
--------------------------------------------------------------------------------
/github_read_write.py:
--------------------------------------------------------------------------------
1 | """
2 | This class knows how to read and write LARGE files to GitHub. The regular
3 | GitHub Contents API can't handle files larger than 1MB - this class knows how
4 | to spot that problem and switch to the large-file-supporting low-level Git Data
5 | API instead.
6 | 7 | https://developer.github.com/v3/repos/contents/ 8 | https://developer.github.com/v3/git/ 9 | """ 10 | import requests 11 | 12 | 13 | class GithubContent(object): 14 | class NotFound(Exception): 15 | pass 16 | 17 | class UnknownError(Exception): 18 | pass 19 | 20 | def __init__(self, owner, repo, token): 21 | self.owner = owner 22 | self.repo = repo 23 | self.token = token 24 | 25 | def base_url(self): 26 | return 'https://api.github.com/repos/%s/%s' % ( 27 | self.owner, self.repo 28 | ) 29 | 30 | def read(self, filepath): 31 | # Try reading using content API 32 | content_url = self.base_url() + '/contents/%s' % filepath 33 | response = requests.get( 34 | content_url, 35 | headers={ 36 | 'Authorization': 'token %s' % self.token 37 | } 38 | ) 39 | if response.status_code == 200: 40 | data = response.json() 41 | return data['content'].decode('base64'), data['sha'] 42 | elif response.status_code == 404: 43 | raise self.NotFound(filepath) 44 | elif response.status_code == 403: 45 | # It's probably too large 46 | if response.json()['errors'][0]['code'] != 'too_large': 47 | raise self.UnknownError(response.content) 48 | else: 49 | return self.read_large(filepath) 50 | else: 51 | raise self.UnknownError(response.content) 52 | 53 | def read_large(self, filepath): 54 | master = requests.get( 55 | self.base_url() + '/git/trees/master?recursive=1', 56 | headers={ 57 | 'Authorization': 'token %s' % self.token 58 | } 59 | ).json() 60 | try: 61 | tree_entry = [t for t in master['tree'] if t['path'] == filepath][0] 62 | except IndexError: 63 | raise self.NotFound(filepath) 64 | data = requests.get( 65 | tree_entry['url'], 66 | headers={ 67 | 'Authorization': 'token %s' % self.token 68 | } 69 | ).json() 70 | return data['content'].decode('base64'), data['sha'] 71 | 72 | def write(self, filepath, content, sha=None, commit_message=None, committer=None): 73 | github_url = self.base_url() + '/contents/%s' % filepath 74 | payload = { 75 | 'path': filepath, 76 | 'content': content.encode('base64'), 77 | 'message': commit_message, 78 | } 79 | if sha: 80 | payload['sha'] = sha 81 | if committer: 82 | payload['committer'] = committer 83 | 84 | response = requests.put( 85 | github_url, 86 | json=payload, 87 | headers={ 88 | 'Authorization': 'token %s' % self.token 89 | } 90 | ) 91 | if response.status_code == 403 and response.json()['errors'][0]['code'] == 'too_large': 92 | return self.write_large(filepath, content, commit_message, committer) 93 | elif sha is None and response.status_code == 422 and 'sha' in response.json().get('message', ''): 94 | # Missing sha - we need to figure out the sha and try again 95 | old_content, old_sha = self.read(filepath) 96 | return self.write( 97 | filepath, 98 | content, 99 | sha=old_sha, 100 | commit_message=commit_message, 101 | committer=committer, 102 | ) 103 | elif response.status_code in (201, 200): 104 | updated = response.json() 105 | return updated['content']['sha'], updated['commit']['sha'] 106 | else: 107 | raise self.UnknownError(str(response.status_code) + ':' + response.content) 108 | 109 | def write_large(self, filepath, content, commit_message=None, committer=None): 110 | # Create a new blob with the file contents 111 | created_blob = requests.post(self.base_url() + '/git/blobs', json={ 112 | 'encoding': 'utf8', 113 | 'content': content, 114 | }, headers={'Authorization': 'token %s' % self.token}).json() 115 | # Retrieve master tree sha 116 | master_sha = requests.get( 117 | self.base_url() + '/git/trees/master?recursive=1', 118 | headers={ 119 | 
'Authorization': 'token %s' % self.token 120 | } 121 | ).json()['sha'] 122 | # Construct a new tree 123 | created_tree = requests.post( 124 | self.base_url() + '/git/trees', 125 | json={ 126 | 'base_tree': master_sha, 127 | 'tree': [{ 128 | 'mode': '100644', # file (blob), 129 | 'path': filepath, 130 | 'type': 'blob', 131 | 'sha': created_blob['sha'], 132 | }] 133 | }, 134 | headers={'Authorization': 'token %s' % self.token} 135 | ).json() 136 | # Create a commit which references the new tree 137 | payload = { 138 | 'message': commit_message, 139 | 'parents': [master_sha], 140 | 'tree': created_tree['sha'], 141 | } 142 | if committer: 143 | payload['committer'] = committer 144 | created_commit = requests.post( 145 | self.base_url() + '/git/commits', 146 | json=payload, 147 | headers={'Authorization': 'token %s' % self.token} 148 | ).json() 149 | # Move HEAD reference on master to the new commit 150 | requests.patch( 151 | self.base_url() + '/git/refs/heads/master', 152 | json={'sha': created_commit['sha']}, 153 | headers={'Authorization': 'token %s' % self.token} 154 | ).json() 155 | return created_blob['sha'], created_commit['sha'] 156 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /irma_shelters.py: -------------------------------------------------------------------------------- 1 | from base_scraper import BaseScraper 2 | import requests 3 | import Geohash 4 | import re 5 | 6 | IGNORE_DUPE_IDS = { 7 | 456, # Hialeah Middle School 8 | 442, # Amelia Earhart Elementary 9 | } 10 | 11 | GEOHASH_PRECISION = 7 12 | 13 | 14 | class IrmaShelters(BaseScraper): 15 | filepath = 'irma-shelters.json' 16 | url = 'https://irma-api.herokuapp.com/api/v1/shelters' 17 | slack_channel = None 18 | 19 | def update_message(self, old_data, new_data): 20 | def name(n): 21 | return '%s (%s)' % (n['shelter'], n['county']) 22 | 23 | current_ids = [n['id'] for n in new_data] 24 | previous_ids = [n['id'] for n in old_data] 25 | 26 | added_ids = [id for id in current_ids if id not in previous_ids] 27 | removed_ids = [id for id in previous_ids if id not in current_ids] 28 | 29 | message = [] 30 | for id in added_ids: 31 | shelter = [n for n in new_data if n['id'] == id][0] 32 | message.append('Added shelter: %s' % name(shelter)) 33 | if added_ids: 34 | message.append('') 35 | for id in removed_ids: 36 | shelter = [n for n in old_data if n['id'] == id][0] 37 | message.append('Removed shelter: %s' % name(shelter)) 38 | body = '\n'.join(message) 39 | summary = [] 40 | if added_ids: 41 | summary.append('%d shelter%s added' % ( 42 | len(added_ids), '' if len(added_ids) == 1 else 's', 43 | )) 44 | if removed_ids: 45 | summary.append('%d shelter%s removed' % ( 46 | len(removed_ids), '' if len(removed_ids) == 1 else 's', 47 | )) 48 | if summary: 49 | summary_text = self.filepath + ': ' + (', '.join(summary)) 50 | else: 51 | summary_text = 'Updated %s' % self.filepath 52 | return summary_text + '\n\n' + body 53 | 54 | def fetch_data(self): 55 | data = requests.get(self.url).json() 56 | shelters = data['shelters'] 57 | shelters.sort(key=lambda s: s['shelter']) 58 | return shelters 59 | 60 | 61 | class IrmaShelterDupes(BaseScraper): 62 | # Detect possible dupes in irma-api 63 | filepath = 'irma-shelters-dupes.json' 64 | url = 'https://irma-api.herokuapp.com/api/v1/shelters' 65 | 66 | def update_message(self, old_data, new_data): 67 | previous_geohashes = [ 68 | dupe_group['geohash'] for dupe_group in old_data['dupe_groups'] 69 | ] 70 | current_geohashes = [ 71 | dupe_group['geohash'] for dupe_group in new_data['dupe_groups'] 72 | ] 73 | added_geohashes = [ 74 | geohash for geohash in current_geohashes if geohash not in previous_geohashes 75 | ] 76 | removed_geohashes = [ 77 | geohash for geohash in previous_geohashes if geohash not in current_geohashes 78 | ] 79 | 80 | message = [] 81 | for geohash in added_geohashes: 82 | dupe_group = [group for group in new_data['dupe_groups'] if group['geohash'] == geohash][0] 83 | message.append('New potential duplicates:') 84 | for shelter in dupe_group['shelters']: 85 | message.append(' ' + shelter['name']) 86 | if shelter.get('address'): 87 | message.append(' ' + shelter['address']) 88 | message.append(' ' + shelter['google_maps']) 89 | message.append(' ' + shelter['view_url']) 90 | message.append('') 91 | 92 | if added_geohashes and removed_geohashes: 93 | message.append('') 94 | 95 | for geohash in removed_geohashes: 96 | dupe_group = [group for group in old_data['dupe_groups'] if group['geohash'] == geohash][0] 97 | message.append('This previous duplicate looks to be resolved:') 98 | for shelter in dupe_group['shelters']: 99 | message.append(' ' + shelter['name']) 100 | if 
shelter.get('address'): 101 | message.append(' ' + shelter['address']) 102 | message.append(' ' + shelter['google_maps']) 103 | message.append(' ' + shelter['view_url']) 104 | message.append('') 105 | 106 | current_no_latlon_ids = [ 107 | shelter['id'] for shelter in new_data['no_latitude_longitude'] 108 | ] 109 | # Older data in our repo doesn't have the 'id' property, so we 110 | # have to allow it to be None here 111 | previous_no_latlon_ids = [ 112 | shelter.get('id') for shelter in old_data['no_latitude_longitude'] 113 | ] 114 | 115 | new_no_latlon_ids = [ 116 | id for id in current_no_latlon_ids 117 | if id not in previous_no_latlon_ids 118 | ] 119 | resolved_no_latlon_ids = [ 120 | id for id in previous_no_latlon_ids 121 | if id not in current_no_latlon_ids 122 | and id is not None 123 | ] 124 | 125 | if new_no_latlon_ids: 126 | message.append('') 127 | message.append('New shelters detected with no latitude/longitude:') 128 | for id in new_no_latlon_ids: 129 | shelter = [ 130 | s for s in new_data['no_latitude_longitude'] 131 | if s['id'] == id 132 | ][0] 133 | message.append(' ' + shelter['name']) 134 | if shelter.get('address'): 135 | message.append(' ' + shelter['address']) 136 | message.append(' ' + shelter['view_url']) 137 | message.append('') 138 | 139 | if resolved_no_latlon_ids: 140 | message.append('') 141 | message.append('Fixed shelters that had no latitude/longitude:') 142 | for id in resolved_no_latlon_ids: 143 | shelter = [ 144 | s for s in old_data['no_latitude_longitude'] 145 | if s['id'] == id 146 | ][0] 147 | message.append(' ' + shelter['name']) 148 | message.append(' ' + (shelter.get('address') or '')) 149 | message.append(' ' + shelter['view_url']) 150 | 151 | body = '\n'.join(message) 152 | summary = [] 153 | if added_geohashes: 154 | summary.append('%d new dupe%s detected' % ( 155 | len(added_geohashes), '' if len(added_geohashes) == 1 else 's', 156 | )) 157 | if removed_geohashes: 158 | summary.append('%d dupe%s resolved' % ( 159 | len(removed_geohashes), '' if len(removed_geohashes) == 1 else 's', 160 | )) 161 | if new_no_latlon_ids: 162 | summary.append('%d new no-lat-lon shelter%s' % ( 163 | len(new_no_latlon_ids), '' if len(new_no_latlon_ids) == 1 else 's', 164 | )) 165 | if resolved_no_latlon_ids: 166 | summary.append('%d fixed no-lat-lon shelter%s' % ( 167 | len(resolved_no_latlon_ids), '' if len(resolved_no_latlon_ids) == 1 else 's', 168 | )) 169 | if summary: 170 | summary_text = self.filepath + ': ' + (', '.join(summary)) 171 | else: 172 | summary_text = 'Updated %s' % self.filepath 173 | return summary_text + '\n\n' + body 174 | 175 | def fetch_data(self): 176 | data = requests.get(self.url).json() 177 | shelters = data['shelters'] 178 | # Scan for potential dupes by lat/lon (using geohash) 179 | by_geohash = {} 180 | for shelter in shelters: 181 | if shelter['id'] in IGNORE_DUPE_IDS: 182 | continue 183 | geohash = Geohash.encode( 184 | shelter['latitude'], 185 | shelter['longitude'], 186 | precision=GEOHASH_PRECISION, 187 | ) 188 | by_geohash.setdefault(geohash, []).append(shelter) 189 | dupe_groups = [ 190 | pair for pair in by_geohash.items() 191 | if ( 192 | # More than one shelter in this group 193 | len(pair[1]) > 1 194 | # Group is not invalid lat/lon 195 | and pair[0] != ('0' * GEOHASH_PRECISION) 196 | ) 197 | ] 198 | no_latlons = by_geohash.get('0' * GEOHASH_PRECISION) or [] 199 | return { 200 | 'dupe_groups': [{ 201 | 'geohash': dupe_group[0], 202 | 'shelters': [{ 203 | 'id': shelter['id'], 204 | 'name': shelter['shelter'], 205 | 
'address': shelter['address'], 206 | 'latitude': shelter['latitude'], 207 | 'longitude': shelter['longitude'], 208 | 'google_maps': 'https://www.google.com/maps/search/%(latitude)s,%(longitude)s' % shelter, 209 | 'view_url': 'https://irma-api.herokuapp.com/shelters/%s' % shelter['id'], 210 | } for shelter in dupe_group[1]], 211 | } for dupe_group in dupe_groups], 212 | 'no_latitude_longitude': [{ 213 | 'id': shelter['id'], 214 | 'name': shelter['shelter'], 215 | 'address': shelter['address'], 216 | 'view_url': 'https://irma-api.herokuapp.com/shelters/%s' % shelter['id'], 217 | } for shelter in no_latlons] 218 | } 219 | 220 | 221 | map_url_re = re.compile( 222 | r'http://maps.google.com/maps\?saddr=&daddr=-?\d+\.\d+,-?\d+\.\d+' 223 | ) 224 | 225 | 226 | class IrmaSheltersFloridaMissing(BaseScraper): 227 | filepath = 'florida-shelters-missing.json' 228 | our_url = 'https://raw.githubusercontent.com/simonw/disaster-data/master/irma-shelters.json' 229 | their_url = 'https://raw.githubusercontent.com/simonw/disaster-data/master/florida-shelters.json' 230 | issue_comments_url = 'https://api.github.com/repos/simonw/disaster-data/issues/2/comments' 231 | 232 | def create_message(self, new_data): 233 | return self.update_message([], new_data, 'Created') 234 | 235 | def update_message(self, old_data, new_data, verb='Updated'): 236 | previous_map_urls = [ 237 | d['map_url'] for d in old_data 238 | ] 239 | current_map_urls = [ 240 | d['map_url'] for d in new_data 241 | ] 242 | added_map_urls = [ 243 | map_url for map_url in current_map_urls 244 | if map_url not in previous_map_urls 245 | ] 246 | removed_map_urls = [ 247 | map_url for map_url in previous_map_urls 248 | if map_url not in current_map_urls 249 | ] 250 | 251 | message = [] 252 | 253 | if added_map_urls: 254 | message.append('New potentially missing shelters:') 255 | 256 | for map_url in added_map_urls: 257 | shelter = [s for s in new_data if s['map_url'] == map_url][0] 258 | message.append(' %s (%s County)' % (shelter['name'], shelter['county'])) 259 | message.append(' Type: ' + shelter['type']) 260 | message.append(' ' + shelter['address']) 261 | message.append(' ' + shelter['city']) 262 | message.append(' ' + shelter['map_url']) 263 | message.append('') 264 | 265 | if added_map_urls and removed_map_urls: 266 | message.append('') 267 | 268 | if removed_map_urls: 269 | message.append('Previous missing shelters now resolved:') 270 | 271 | for map_url in removed_map_urls: 272 | shelter = [s for s in old_data if s['map_url'] == map_url][0] 273 | message.append(' %s (%s County)' % (shelter['name'], shelter['county'])) 274 | 275 | body = '\n'.join(message) 276 | summary = [] 277 | if added_map_urls: 278 | summary.append('%d potentially missing shelter%s detected' % ( 279 | len(added_map_urls), '' if len(added_map_urls) == 1 else 's', 280 | )) 281 | if removed_map_urls: 282 | summary.append('%d shelter%s resolved' % ( 283 | len(removed_map_urls), '' if len(removed_map_urls) == 1 else 's', 284 | )) 285 | if current_map_urls: 286 | summary.append('%d total' % ( 287 | len(current_map_urls) 288 | )) 289 | if summary: 290 | summary_text = self.filepath + ': ' + (', '.join(summary)) 291 | else: 292 | summary_text = '%s %s' % (verb, self.filepath) 293 | return summary_text + '\n\n' + body 294 | 295 | def fetch_data(self): 296 | our_shelters = requests.get(self.our_url).json() 297 | their_shelters = requests.get(self.their_url).json() 298 | our_geohashes = set([ 299 | Geohash.encode(s['latitude'], s['longitude'], 6) 300 | for s in our_shelters 301 | ]) 
302 | for shelter in their_shelters: 303 | coords = shelter['map_url'].split('daddr=')[1] 304 | latitude, longitude = map(float, coords.split(',')) 305 | geohash = Geohash.encode(latitude, longitude, 6) 306 | shelter['geohash'] = geohash 307 | maybe_missing_shelters = [ 308 | s for s in their_shelters 309 | if s['geohash'] not in our_geohashes 310 | ] 311 | ignore_map_urls = [] 312 | for comment in all_comments(self.issue_comments_url, self.github_token): 313 | ignore_map_urls.extend(map_url_re.findall(comment['body'])) 314 | maybe_missing_shelters = [ 315 | s for s in maybe_missing_shelters 316 | if s['map_url'] not in ignore_map_urls 317 | ] 318 | return maybe_missing_shelters 319 | 320 | 321 | def all_comments(issue_comments_url, github_token): 322 | # Paginate through all comments on an issue 323 | while issue_comments_url: 324 | response = requests.get( 325 | issue_comments_url, 326 | headers={ 327 | 'Authorization': 'token %s' % github_token, 328 | }) 329 | try: 330 | issue_comments_url = response.links['next']['url'] 331 | except KeyError: 332 | issue_comments_url = None 333 | for item in response.json(): 334 | yield item 335 | -------------------------------------------------------------------------------- /irma.py: -------------------------------------------------------------------------------- 1 | from base_scraper import BaseScraper 2 | from irma_shelters import ( 3 | IrmaShelters, 4 | IrmaShelterDupes, 5 | IrmaSheltersFloridaMissing, 6 | ) 7 | from gis_scrapers import ( 8 | FemaOpenShelters, 9 | FemaNSS, 10 | GemaAnimalShelters, 11 | GemaActiveShelters, 12 | ) 13 | from nyc import ( 14 | NewYorkShelters, 15 | ) 16 | from north_bay import ( 17 | CaliforniaDOTRoadInfo, 18 | SantaRosaEmergencyInformation, 19 | SonomaRoadConditions, 20 | CaliforniaHighwayPatrolIncidents, 21 | PGEOutagesIndividual, 22 | ) 23 | from BeautifulSoup import BeautifulSoup as Soup 24 | import requests 25 | import os 26 | import sys 27 | import time 28 | import json 29 | import datetime 30 | import zipfile 31 | import StringIO 32 | from xml.etree import ElementTree 33 | 34 | 35 | class GoogleCrisisKmlScraper(BaseScraper): 36 | url = 'https://www.google.com/maps/d/u/1/kml?mid=1fJ4NZ21YW1Ru856hehpufId79CA&ll=22.47126398588183%2C-60.6005859375&z=5&cm.ttl=600' 37 | source_url = 'http://google.org/crisismap/2017-irma' 38 | filepath = 'google-crisis-irma-2017.json' 39 | 40 | def create_message(self, new_data): 41 | return self.update_message([], new_data, verb='Created') 42 | 43 | def update_message(self, old_data, new_data, verb='Updated'): 44 | def name(n): 45 | if 'Name' not in n: 46 | return None 47 | return ('%s (%s)' % ( 48 | n['Name'], n.get('City, State/Province') or '' 49 | )).replace(' ()', '') 50 | 51 | current_names = [name(n) for n in new_data if name(n)] 52 | previous_names = [name(n) for n in old_data if name(n)] 53 | message = update_message_from_names( 54 | current_names, 55 | previous_names, 56 | self.filepath, 57 | verb=verb 58 | ) 59 | message += '\nChange detected on %s' % self.source_url 60 | return message 61 | 62 | def fetch_data(self): 63 | zipped = requests.get(self.url).content 64 | zipdata = zipfile.ZipFile(StringIO.StringIO(zipped)) 65 | kml = zipdata.open('doc.kml').read() 66 | et = ElementTree.fromstring(kml) 67 | shelters = [] 68 | for placemark in et.findall('.//{http://www.opengis.net/kml/2.2}Placemark'): 69 | shelter = {} 70 | for data in placemark.findall('{http://www.opengis.net/kml/2.2}ExtendedData/{http://www.opengis.net/kml/2.2}Data'): 71 | key = data.attrib['name'] 72 | 
value = ''.join(s.strip() for s in data.itertext())
73 |                 shelter[key] = value
74 |             coords = placemark.find('.//{http://www.opengis.net/kml/2.2}coordinates').text.strip()
75 |             longitude, latitude, _ = coords.split(',')
76 |             shelter.update({
77 |                 'latitude': latitude,
78 |                 'longitude': longitude,
79 |             })
80 |             if 'Phone' in shelter:
81 |                 # They come through in scientific number format for some reason
82 |                 shelter['Phone'] = shelter['Phone'].replace('.', '').replace('E9', '')
83 |             shelters.append(shelter)
84 |         return shelters
85 | 
86 | 
87 | class SouthCarolinaShelters(BaseScraper):
88 |     url = 'http://scemd.org/ShelterStatus.html'
89 |     filepath = 'scemd-shelters.json'
90 | 
91 |     def create_message(self, new_data):
92 |         return self.update_message([], new_data, verb='Created')
93 | 
94 |     def update_message(self, old_data, new_data, verb='Updated'):
95 |         def name(n):
96 |             return '%s (%s County, SC)' % (
97 |                 n['Shelter Name'], n['County']
98 |             )
99 | 
100 |         current_names = [name(n) for n in new_data]
101 |         previous_names = [name(n) for n in old_data]
102 |         message = update_message_from_names(
103 |             current_names,
104 |             previous_names,
105 |             self.filepath,
106 |             verb=verb
107 |         )
108 |         message += '\nChange detected on %s' % self.url
109 |         return message
110 | 
111 |     def fetch_data(self):
112 |         s = Soup(requests.get(self.url).content)
113 |         table = s.find('table')
114 |         trs = table.findAll('tr')
115 |         headings = [
116 |             th.getText()
117 |             for th in trs[0].findAll('th')
118 |         ]
119 |         shelters = []
120 |         for tr in trs[1:]:
121 |             content = [td.getText() for td in tr.findAll('td')]
122 |             shelters.append(dict(zip(headings, content)))
123 |         return shelters
124 | 
125 | 
126 | class ZeemapsScraper(BaseScraper):
127 |     url = 'https://zeemaps.com/emarkers?g=2682928'
128 |     filepath = 'zeemaps-2682928.json'
129 |     slack_channel = None
130 | 
131 |     def fetch_data(self):
132 |         data = requests.get(self.url).json()
133 |         data.sort(key=lambda d: d['nm'])
134 |         return data
135 | 
136 | 
137 | class FplStormOutages(BaseScraper):
138 |     filepath = 'fpl-storm-outages.json'
139 |     url = 'https://www.fplmaps.com/data/storm-outages.js'
140 |     slack_channel = None
141 | 
142 |     def fetch_data(self):
143 |         content = requests.get(
144 |             self.url,
145 |             timeout=10,
146 |         ).content
147 |         # Strip the 'define(' and ');' wrapper
148 |         if content.startswith('define('):
149 |             content = content.split('define(')[1]
150 |         if content.endswith(');'):
151 |             content = content.rsplit(');', 1)[0]
152 |         return json.loads(content)
153 | 
154 | 
155 | class FplCountyOutages(BaseScraper):
156 |     filepath = 'fpl-county-outages.json'
157 |     url = 'https://www.fplmaps.com/customer/outage/CountyOutages.json'
158 |     slack_channel = None
159 | 
160 |     def fetch_data(self):
161 |         return requests.get(
162 |             self.url,
163 |             timeout=10,
164 |         ).json()
165 | 
166 | 
167 | class ScegOutages(BaseScraper):
168 |     filepath = 'sceg-outages.json'
169 |     url = 'https://www.sceg.com/scanapublicservice/outagemapdata/gismapdataonly.aspx?gisUrl=OUTAGE_EX/Outage_EX&gisMapLayer=6'
170 |     source_url = 'https://www.sceg.com/outages-emergencies/power-outages/outage-map'
171 |     slack_channel = None
172 | 
173 |     def fetch_data(self):
174 |         data = requests.get(self.url).json()
175 |         return [feature['attributes'] for feature in data['features']]
176 | 
177 | 
178 | class GeorgiaOutages(BaseScraper):
179 |     filepath = 'georgiapower-outages.json'
180 |     url = 'http://outagemap.georgiapower.com/external/data/interval_generation_data/2017_09_12_00_59_50/thematic/thematic_areas.js?timestamp='
181 |     slack_channel = None

    def fetch_data(self):
        # Append the current epoch time as a cache-buster
        url = self.url + str(int(time.time()))
        return requests.get(url).json()


class NorthGeorgiaOutages(BaseScraper):
    filepath = 'north-georgia-outages.json'
    url = 'http://www2.ngemc.com:81/api/weboutageviewer/get_live_data'
    slack_channel = None

    def fetch_data(self):
        return requests.get(self.url).json()


class TampaElectricOutages(BaseScraper):
    filepath = 'tampa-electric-outages.json'
    url = 'http://www.tampaelectric.com/residential/outages/outagemap/datafilereader/index.cfm'
    slack_channel = None

    def fetch_data(self):
        return requests.get(
            self.url,
            headers={
                'Referer': 'http://www.tampaelectric.com/residential/outages/outagemap/',
            }
        ).json()['markers']


class JemcOutages(BaseScraper):
    filepath = 'jemc-outages.json'
    url = 'https://jemc.maps.sienatech.com/data/outages.xml'
    slack_channel = None

    def fetch_data(self):
        et = ElementTree.fromstring(requests.get(self.url).content)
        reports = et.find('reports').findall('report')
        data = {}
        for report in reports:
            id = report.attrib['id']
            keys = [d.attrib['key'] for d in report.findall('dimension/dim')]
            rows = report.findall('dataset/t')
            results = [
                dict(zip(keys, [e.text for e in row]))
                for row in rows
            ]
            data[id] = results
        return data


class BaseDukeScraper(BaseScraper):
    slack_channel = None

    def fetch_data(self):
        metadata_url = 'https://s3.amazonaws.com/outagemap.duke-energy.com/data/%s/external/interval_generation_data/metadata.xml?timestamp=%d' % (
            self.state_code, int(time.time())
        )
        metadata = requests.get(metadata_url).content
        # metadata.xml is assumed to name the current data directory in a
        # <directory>...</directory> element; crudely split it out
        directory = metadata.split('<directory>')[1].split('</directory>')[0]
        data_url = 'https://s3.amazonaws.com/outagemap.duke-energy.com/data/%s/external/interval_generation_data/%s/thematic/thematic_areas.js?timestamp=%d' % (
            self.state_code, directory, int(time.time())
        )
        return requests.get(data_url).json()


class DukeFloridaOutages(BaseDukeScraper):
    filepath = 'duke-fl-outages.json'
    state_code = 'fl'


class DukeCarolinasOutages(BaseDukeScraper):
    filepath = 'duke-ncsc-outages.json'
    state_code = 'ncsc'


class PascoCounty(BaseScraper):
    # From http://www.pascocountyfl.net/index.aspx?NID=2816
    # in particular this iframe:
    # https://secure.pascocountyfl.net/sheltersdisplay
    filepath = 'pascocountyfl.json'
    url = 'https://secure.pascocountyfl.net/SheltersDisplay/Home/GetShelterInfo'

    def create_message(self, new_data):
        return self.update_message([], new_data, verb='Created')

    def update_message(self, old_data, new_data, verb='Updated'):
        def name(n):
            return '%s (Pasco County FL)' % n['Name']

        current_names = [name(n) for n in new_data]
        previous_names = [name(n) for n in old_data]
        message = update_message_from_names(
            current_names,
            previous_names,
            self.filepath,
            verb=verb
        )
        message += '\nChange detected on http://www.pascocountyfl.net/index.aspx?NID=2816'
        return message

    def fetch_data(self):
        data = requests.post(self.url).json()
        data.sort(key=lambda d: d['Name'])
        return data


class LedgerPolkCounty(BaseScraper):
    filepath = 'ledger-polk-county.json'
    url = 'http://www.ledgerdata.com/hurricane-guide/shelter/'

    def create_message(self, new_data):
        return self.update_message([], new_data, verb='Created')

    def update_message(self, old_data, new_data, verb='Updated'):
        current_names = [n['name'] for n in new_data]
        previous_names = [n['name'] for n in old_data]

        added_names = [name for name in current_names if name not in previous_names]
        removed_names = [name for name in previous_names if name not in current_names]

        message = []
        for name in added_names:
            shelter = [n for n in new_data if n['name'] == name][0]
            message.append('Added shelter: %s, %s' % (
                shelter['name'], shelter['city']
            ))
            message.append(' %s' % shelter['url'])
        if added_names and removed_names:
            message.append('')
        for name in removed_names:
            shelter = [n for n in old_data if n['name'] == name][0]
            message.append('Removed shelter: %s, %s' % (
                shelter['name'], shelter['city']
            ))
        body = '\n'.join(message)
        summary = []
        if added_names:
            summary.append('%d shelter%s added' % (
                len(added_names), '' if len(added_names) == 1 else 's',
            ))
        if removed_names:
            summary.append('%d shelter%s removed' % (
                len(removed_names), '' if len(removed_names) == 1 else 's',
            ))
        if summary:
            summary_text = '%s %s: %s' % (
                verb, self.filepath, (', '.join(summary))
            )
        else:
            summary_text = '%s %s' % (verb, self.filepath)
        return '%s\n\n%s\nChange detected on %s' % (
            summary_text, body, self.url
        )

    def fetch_data(self):
        s = Soup(requests.get(self.url).content)
        trs = s.find('table').findAll('tr')[1:]
        shelters = []
        for tr in trs:
            tds = tr.findAll('td')
            shelters.append({
                'name': tds[1].getText(),
                'url': 'http://www.ledgerdata.com/' + tds[1].find('a')['href'],
                'city': tds[2].getText(),
                'type': tds[3].getText(),
            })
        return shelters


class HernandoCountyShelters(BaseScraper):
    filepath = 'hernando-county.json'
    url = 'http://www.hernandocounty.us/em/shelter-information'

    def create_message(self, new_data):
        return self.update_message([], new_data, verb='Created')

    def update_message(self, old_data, new_data, verb='Updated'):
        current_names = [n['name'] for n in new_data]
        previous_names = [n['name'] for n in old_data]

        added_names = [name for name in current_names if name not in previous_names]
        removed_names = [name for name in previous_names if name not in current_names]

        message = []
        for name in added_names:
            shelter = [n for n in new_data if n['name'] == name][0]
            message.append('Added shelter: %s, Hernando County' % (
                shelter['name']
            ))
            message.append(' %s, %s' % (
                shelter['type'], shelter['status']
            ))
            message.append(' %s' % shelter['address'])
        if added_names and removed_names:
            message.append('')
        for name in removed_names:
            shelter = [n for n in old_data if n['name'] == name][0]
            message.append('Removed shelter: %s, Hernando County' % (
                shelter['name']
            ))
        body = '\n'.join(message)
        summary = []
        if added_names:
            summary.append('%d shelter%s added' % (
                len(added_names), '' if len(added_names) == 1 else 's',
            ))
        if removed_names:
            summary.append('%d shelter%s removed' % (
                len(removed_names), '' if len(removed_names) == 1 else 's',
            ))
        if summary:
            summary_text = '%s %s: %s' % (
                verb, self.filepath, (', '.join(summary))
            )
        else:
            summary_text = '%s %s' % (verb, self.filepath)
        return '%s\n\n%s\nChange detected on %s' % (
            summary_text, body, self.url
        )

    def fetch_data(self):
        s = Soup(requests.get(self.url).content)
        shelters = []
        for tr in s.find('table').findAll('tr'):
            tds = tr.findAll('td')
            img = tds[1].find('img')
            if img is not None:
                shelter_type = img['alt'].title()
            else:
                shelter_type = 'General'
            shelters.append({
                'name': tds[2].getText(),
                'type': shelter_type,
                'address': tds[3].getText(),
                'status': tds[4].getText(),
            })
        return shelters


def update_message_from_names(current_names, previous_names, filepath, verb='Updated'):
    added_names = [n for n in current_names if n not in previous_names]
    removed_names = [n for n in previous_names if n not in current_names]
    message = []
    for name in added_names:
        message.append('Added shelter: %s' % name)
    if added_names:
        message.append('')
    for name in removed_names:
        message.append('Removed shelter: %s' % name)
    body = '\n'.join(message)
    summary = []
    if added_names:
        summary.append('%d shelter%s added' % (
            len(added_names), '' if len(added_names) == 1 else 's',
        ))
    if removed_names:
        summary.append('%d shelter%s removed' % (
            len(removed_names), '' if len(removed_names) == 1 else 's',
        ))
    if summary:
        summary_text = filepath + ': ' + (', '.join(summary))
    else:
        summary_text = '%s %s' % (verb, filepath)
    return summary_text + '\n\n' + body
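

# A worked example of the helper above, with illustrative data (the names
# and filename are made up, not from a real scrape):
#
#   update_message_from_names(['A', 'B'], ['B', 'C'], 'x.json')
#
# returns the commit message:
#
#   x.json: 1 shelter added, 1 shelter removed
#
#   Added shelter: A
#
#   Removed shelter: C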


def is_heading(tr):
    return tr.findAll('td')[1].text == 'Shelter Name'


def is_shelter(tr):
    return len(tr.findAll('td')) == 4 and not is_heading(tr)


def is_county_heading(tr):
    # County rows span the full table and have a grey (#d4d4d4) background
    if tr.find('td').get('colspan') == '5' and (u'#d4d4d4' in tr.find('td').get('style', '')) and tr.text != ' ':
        return tr.text
    else:
        return None


class FloridaDisasterShelters(BaseScraper):
    filepath = 'florida-shelters.json'
    url = 'http://www.floridadisaster.org/shelters/summary.aspx'

    def update_message(self, old_data, new_data):
        def name(n):
            return '%s (%s County)' % (n['name'], n['county'])

        current_names = [name(n) for n in new_data]
        previous_names = [name(n) for n in old_data]
        message = update_message_from_names(current_names, previous_names, self.filepath)
        message += '\nChange detected on %s' % self.url
        return message

    def fetch_data(self):
        r = requests.get(self.url)
        if r.status_code != 200:
            print "Oh no - status code = %d" % r.status_code
            return None
        table = Soup(r.content).findAll('table')[9]
        current_county = None
        shelters = []
        for tr in table.findAll('tr'):
            heading = is_county_heading(tr)
            if heading:
                current_county = heading
            if is_shelter(tr):
                shelters.append({
                    'type': tr.findAll('td')[0].text,
                    'county': current_county.title(),
                    'name': tr.findAll('td')[1].text,
                    'address': tr.findAll('td')[2].text,
                    'map_url': tr.findAll('td')[2].find('a')['href'].split(' ')[0],
                    'city': tr.findAll('td')[3].text,
                })
        shelters.sort(key=lambda s: (s['county'], s['name']))
        return shelters


class CrowdSourceRescue(BaseScraper):
    filepath = 'crowdsourcerescue.json'
    owner = 'simonw'
    repo = 'private-irma-data'
    slack_channel = None
    url = 'https://crowdsourcerescue.com/rescuees/searchApi/'

    def fetch_data(self):
        # Bounding box roughly covering Florida
        return requests.post(self.url, {
            'needstring': '',
            'lat_min': '23.882475192722612',
            'lat_max': '29.761185051094046',
            'lng_min': '-86.76083325000002',
            'lng_max': '-77.97177075000002',
            'status': '0',
        }).json()


if __name__ == '__main__':
    test_mode = ('--test' in sys.argv)
    github_token = os.environ.get('GITHUB_API_TOKEN', '')
    slack_token = os.environ.get('SLACK_TOKEN', '')
    scrapers = [
        klass(github_token, slack_token)
        for klass in (
            SantaRosaEmergencyInformation,
            SonomaRoadConditions,
            GoogleCrisisKmlScraper,
            SouthCarolinaShelters,
            FemaOpenShelters,
            FemaNSS,
            IrmaShelters,
            IrmaShelterDupes,
            FloridaDisasterShelters,
            ZeemapsScraper,
            PascoCounty,
            CrowdSourceRescue,
            LedgerPolkCounty,
            HernandoCountyShelters,
            FplStormOutages,
            FplCountyOutages,
            GemaAnimalShelters,
            GemaActiveShelters,
            ScegOutages,
            IrmaSheltersFloridaMissing,
            GeorgiaOutages,
            DukeFloridaOutages,
            DukeCarolinasOutages,
            NorthGeorgiaOutages,
            TampaElectricOutages,
            JemcOutages,
            NewYorkShelters,
            CaliforniaDOTRoadInfo,
            CaliforniaHighwayPatrolIncidents,
            PGEOutagesIndividual,
        )
    ]
    while True:
        print datetime.datetime.now()
        for scraper in scrapers:
            if test_mode and not scraper.test_mode:
                continue
            try:
                scraper.scrape_and_store()
            except Exception, e:
                print "!!!! %s: %s !!!!!" % (
                    scraper.__class__.__name__, e
                )
                if test_mode:
                    import pdb; pdb.post_mortem()

        time.sleep(120)
--------------------------------------------------------------------------------