#!/usr/bin/env python
"""Scrape all datasets from datacatalogs.org, geocode them, tidy them up a
little, and output as a JSON list of dictionaries in a single file.

The output file is then uploaded to thedatahub with ckanclient
(https://github.com/okfn/ckanclient).

Configuration is read from ``process.cfg``, which lives next to this script::

    [geonames]
    username = YOUR_USERNAME

The downloaded catalogue, the geonames result cache and the geocoded output
are all kept in a ``cache/`` directory relative to the current working
directory (``cache/`` is listed in ``.gitignore``).

The script runs on Python 2.7 and Python 3; nothing touches the network or
the config file until ``main()`` (or ``geonames_lookup()``) is called.
"""
import contextlib
import datetime
import json
import os
import sys
import time

try:  # Python 3
    from configparser import ConfigParser
    from urllib.parse import quote
    from urllib.request import urlopen, urlretrieve
except ImportError:  # Python 2
    from ConfigParser import SafeConfigParser as ConfigParser
    from urllib import quote, urlopen, urlretrieve

# process.cfg sits in the same directory as this script, regardless of how
# (relative or absolute path) the script was invoked.
config_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                           'process.cfg')

CACHE_DIR = 'cache'
GEONAMES_CACHE = 'cache/geonames.json'
DATACATALOGS_CACHE = 'cache/datacatalogs.json'
GEOCODED_OUTPUT = 'cache/datacatalogs.geocoded.json'

DATACATALOGS_URL = ('http://datacatalogs.org/api/search/dataset'
                    '?q=&limit=500&all_fields=1')
DATASTORE_URL = ('http://datahub.io/api/data/'
                 '39317285-d0e8-4dad-9e5d-f064100132c9')
GEONAMES_URL_TEMPLATE = ('http://api.geonames.org/searchJSON'
                         '?maxRows=1&username=%s&q=')

# Minimum number of seconds between two requests to geonames.
GEONAMES_MIN_INTERVAL = 0.5

# Spatial texts that describe the whole planet and so cannot be geocoded.
GLOBAL_PLACE_NAMES = frozenset(
    ('global', 'earth', 'globe', 'world', 'worldwide'))

# Special-case some problem datasets (dataset name -> text to geocode).
# FIXME: These may not all be correct.
# TODO:
# bouche-rhone-visitprovence
SPECIAL_CASES = {
    'allerdale': 'Allerdale',
    'bordeaux_fr': 'Bordeaux',
    'dati-lombardia': 'Lombardia, Italy',
    'dnv_org': 'North Vancouver',
    'dublinked-datastore': 'Dublin',
    'gironde-aquitaine_fr': 'Gironde',
    'go-geo': 'United Kingdom',
    'montpellier_fr': 'Montpellier',
    'mosman-council-datastore': 'Mosman',
    'nantes_fr': 'Nantes',
    'new-orleans-louisiana': 'New-Orleans',
    'opendata-lv': 'Latvia',
    'openstreetmap': 'Earth',
    'portal-de-datos-abiertos-de-jccm': 'Castilla-La Mancha',
    'provincia-roma': 'Rome',
    'region-of-waterloo-ontario': 'Waterloo, Ontario',
    'rennes_fr': 'Rennes',
    'salford': 'Salford',
    'saone-et-loire_fr': 'Saone-et-Loire',
    'toulouse_fr': 'Toulouse',
    'us-department-of-labor-enforcement-data': 'USA',
    'victoria-australian-state-open-data-catalogue': 'Victoria, Australia',
}

# Both are filled in lazily so that importing this module has no side effects.
geonames_baseurl = None
time_of_last_geonames_request = None


def _ensure_cache_dir():
    """Create the local cache directory if it does not exist yet."""
    if not os.path.isdir(CACHE_DIR):
        os.makedirs(CACHE_DIR)


def _read_json(path):
    """Return the decoded contents of the UTF-8 JSON file at ``path``."""
    # Read bytes and decode explicitly so the result does not depend on the
    # locale's default encoding (Python 3) and is unicode on Python 2.
    with open(path, 'rb') as f:
        return json.loads(f.read().decode('utf-8'))


def _write_json(obj, path):
    """Serialise ``obj`` as JSON into the file at ``path``."""
    # json.dump emits pure ASCII by default, so plain text mode is safe on
    # both Python 2 and Python 3.
    with open(path, 'w') as f:
        json.dump(obj, f)


def _get_geonames_baseurl():
    """Return the geonames search URL prefix, reading process.cfg once."""
    global geonames_baseurl
    if geonames_baseurl is None:
        config = ConfigParser()
        config.read([config_file])
        username = config.get('geonames', 'username')
        geonames_baseurl = GEONAMES_URL_TEMPLATE % username
    return geonames_baseurl


def _throttle_geonames():
    """Sleep just long enough not to send requests to geonames too fast."""
    global time_of_last_geonames_request
    if time_of_last_geonames_request is not None:
        elapsed = (datetime.datetime.now() -
                   time_of_last_geonames_request).total_seconds()
        if elapsed < GEONAMES_MIN_INTERVAL:
            time.sleep(GEONAMES_MIN_INTERVAL - elapsed)
    time_of_last_geonames_request = datetime.datetime.now()


def geonames_lookup(spatial_text):
    '''
    Return the lat. and long. from a geonames search for the given text.

    Successful results from geonames are cached in a local file
    (cache/geonames.json); failed searches are not cached.

    :param spatial_text: the text to search for, e.g. "Albania". UTF-8
        encoded bytes are accepted as well and decoded first.
    :type spatial_text: string

    :rtype: a dictionary with keys 'lat' and 'lon', or None if the text is
        empty, names the whole planet (e.g. "global"), or the geonames
        search fails

    '''
    # Work on text throughout so that cache keys compare equal to the
    # (unicode) keys loaded back from the JSON cache file.
    if isinstance(spatial_text, bytes):
        spatial_text = spatial_text.decode('utf-8', 'ignore')

    if not spatial_text or spatial_text.lower() in GLOBAL_PLACE_NAMES:
        return None

    if os.path.exists(GEONAMES_CACHE):
        cache = _read_json(GEONAMES_CACHE)
    else:
        cache = {}
    if spatial_text in cache:
        return cache[spatial_text]

    _throttle_geonames()

    # Encode only at the network boundary; quote() needs bytes on Python 2
    # for non-ASCII input and accepts them on Python 3.
    url = _get_geonames_baseurl() + quote(spatial_text.encode('utf-8'))
    with contextlib.closing(urlopen(url)) as response:
        res = json.loads(response.read().decode('utf-8'))

    matches = res.get('geonames')
    if not matches:
        # NOTE(review): geonames appears to report errors (bad username,
        # quota exceeded) in a 'status' object instead of 'geonames' --
        # surface it rather than failing with a KeyError.
        status = res.get('status')
        if status:
            sys.stderr.write('geonames error for %r: %r\n'
                             % (spatial_text, status))
        return None

    result = {
        'lat': matches[0]['lat'],
        'lon': matches[0]['lng'],
    }
    cache[spatial_text] = result
    _ensure_cache_dir()
    _write_json(cache, GEONAMES_CACHE)
    return result


def _geocode_dataset(dataset):
    """Geocode and tidy one dataset dict in place.

    Adds a 'location' key, promotes the dataset's extras to top-level keys
    ('spatial' becomes 'spatial_code') and deletes any empty values.

    Return True if a location was found, False otherwise.
    """
    extras = dataset.get('extras') or {}

    name = dataset.get('name')
    if name in SPECIAL_CASES:
        extras['spatial_text'] = SPECIAL_CASES[name]

    location = geonames_lookup(extras.get('spatial_text'))
    dataset['location'] = location

    # Promote the dataset's extras to top-level keys.
    dataset['spatial_code'] = extras.pop('spatial', None)
    dataset.update(extras)
    dataset.pop('extras', None)

    # Delete any empty values. Collect the keys first: a dict must not be
    # resized while it is being iterated.
    for key in [k for k, v in dataset.items() if not v]:
        del dataset[key]

    return bool(location)


def geocode_datasets(datasets):
    """Geocode and tidy every dataset dict in ``datasets`` in place.

    :param datasets: the dataset dicts from the datacatalogs.org search API
    :type datasets: list of dictionaries

    :rtype: a tuple (number geocoded, number failed or not geo-locatable)
    """
    num_geocoded = 0
    num_failed = 0
    for dataset in datasets:
        name = dataset.get('name')
        if _geocode_dataset(dataset):
            print("Geocoded dataset {name}".format(name=name))
            num_geocoded += 1
        else:
            print("No geonames in result for dataset {name}".format(
                name=name))
            num_failed += 1
    return num_geocoded, num_failed


def upload_to_datahub(path):
    """Replace the thedatahub.org datastore contents with the file at path.

    Uses ckanclient (https://github.com/okfn/ckanclient), which is imported
    here so that the rest of the script works without it installed.
    """
    import ckanclient.datastore
    client = ckanclient.datastore.DataStoreClient(DATASTORE_URL)

    # Specify that the 'location' field is a geo_point.
    mapping = {
        'properties': {
            'location': {
                'type': 'geo_point'
            }
        }
    }

    client.delete()
    client.mapping_update(mapping)
    client.upload(path, refresh=True)


def main():
    """Download (once), geocode, save and upload the datacatalogs datasets."""
    _ensure_cache_dir()

    if not os.path.exists(DATACATALOGS_CACHE):
        urlretrieve(DATACATALOGS_URL, DATACATALOGS_CACHE)

    datasets = _read_json(DATACATALOGS_CACHE)
    results = datasets['results']
    num_geocoded, num_failed = geocode_datasets(results)

    _write_json(results, GEOCODED_OUTPUT)
    print("Geocoded {0} datasets, {1} failed or not geo-locatable (e.g. global datacatalogs)".format(num_geocoded, num_failed))

    upload_to_datahub(GEOCODED_OUTPUT)


if __name__ == '__main__':
    main()