├── .gitignore ├── .travis.yml ├── LICENSE.md ├── Makefile ├── README.md ├── setup.py ├── sheetsite ├── __init__.py ├── chain.py ├── cmdline.py ├── csv_spreadsheet.py ├── destination │ ├── __init__.py │ ├── csv_ss.py │ ├── drop.py │ ├── excel.py │ ├── ftp.py │ ├── git.py │ ├── install_local_soup.py │ ├── json_ss.py │ ├── sqlite_ss.py │ ├── stone_soup.py │ └── stone_soup_v2.py ├── expand.py ├── filtered_spreadsheet.py ├── geocache.py ├── google_spreadsheet.py ├── ids.py ├── json_spreadsheet.py ├── jsonify.py ├── merged_spreadsheet.py ├── names.py ├── sheet.py ├── sheetsend.py ├── sheetwatch.py ├── site.py ├── site_queue.py ├── source │ ├── __init__.py │ ├── csv.py │ ├── excel.py │ ├── google.py │ └── json.py ├── spreadsheet.py ├── tasks │ ├── __init__.py │ ├── detect_site.py │ ├── notify.py │ └── update_site.py ├── templates │ ├── update.html │ └── update.txt ├── tweaks │ ├── __init__.py │ ├── add_dccid.py │ ├── coalesce.py │ ├── custom.py │ ├── formula.py │ ├── list_to_rows.py │ ├── merge_tables.py │ ├── patch.py │ ├── prune_tables.py │ ├── rename_column.py │ ├── rename_table.py │ ├── replace_cell.py │ ├── required_field.py │ ├── sniff_inactive.py │ ├── split_addresses.py │ ├── split_addresses_v2.py │ ├── us_state.py │ └── us_zip.py └── xls_spreadsheet.py ├── sites └── available │ ├── commoners.yml │ ├── hack_spots.yml │ ├── local.yml │ ├── manitoba.yml │ ├── tap.yml │ └── test.yml └── tests ├── configs ├── fill.json ├── json_to_json.json ├── multirow.json └── things.json ├── test_chain.py ├── test_environment.py └── test_filter.py /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.pyc 3 | env.sh 4 | service.json 5 | build 6 | src 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.3" 5 | - "3.4" 6 | install: "pip install -e ." 7 | script: "nosetests -s -vv tests" 8 | 9 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Paul Fitzpatrick <paul.michael.fitzpatrick@gmail.com> 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | 'Software'), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 17 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 18 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 19 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 20 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | default: 2 | echo "Hello" 3 | 4 | q: 5 | bin/celery3 -A sheetsite.queue worker -l info 6 | 7 | sdist: 8 | rm -rf dist 9 | cp README.md README 10 | python3 setup.py sdist 11 | cd dist && mkdir tmp && cd tmp && tar xzvf ../sheet*.tar.gz && cd sheet*[0-9] && ./setup.py build 12 | python3 setup.py sdist upload 13 | rm -rf dist 14 | rm README MANIFEST 15 | 16 | test: 17 | which nosetests3 && nosetests3 -s -vv tests || echo "no nosetest3" 18 | which nosetests && nosetests -s -vv tests || echo "no nosetest" 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sheetsite: sheets for sites 2 | 3 | [![Build Status](https://travis-ci.org/paulfitz/sheetsite.svg?branch=master)](https://travis-ci.org/paulfitz/sheetsite) 4 | [![PyPI version](https://badge.fury.io/py/sheetsite.svg)](http://badge.fury.io/py/sheetsite) 5 | 6 | Keep a website or directory in sync with a google sheet. 7 | 8 | Features: 9 | 10 | * Copy a google spreadsheet locally, as json or excel format. 11 | * Can strip specified tabs, columns, or cells from the spreadsheet, 12 | in case not all of it should be copied along. 13 | * Can push a filtered json copy out to a git repository, handy for 14 | maintaining a website based on a private shared spreadsheet. 15 | * Can augment the sheet with geocoding, adding latitude and longitude based 16 | on address fields for example. 17 | * Can notify people by email with a summary of updates. 18 | 19 | 20 | ## Installation 21 | 22 | For the basics: 23 | 24 | ``` 25 | pip install sheetsite 26 | ``` 27 | 28 | For all bells and whistles, when automating a sheet-to-site workflow: 29 | 30 | ``` 31 | pip install sheetsite[queue] 32 | ``` 33 | 34 | ## Specifying the source and destination 35 | 36 | The `sheetsite` utility, when run without any arguments, will expect 37 | to find all necessary options in a `_sheetsite.yml` file. A simple 38 | example of such a file is: 39 | 40 | ```yaml 41 | source: 42 | name: google-sheets 43 | key: 15Vs_VGpupeGkljceEow7q1ig447FJIxqNS1Dd0dZpFc 44 | credential_file: service.json 45 | 46 | destination: 47 | file: sheet.xlsx 48 | ``` 49 | 50 | The file should have two stanzas, `source` specifying where to get 51 | data from, and `destination` specifying where to put it. This 52 | examples reads a private google spreadsheet and saves it as 53 | `sheet.xlsx`. The key comes from the url of the spreadsheet. 54 | The credentials file is something you [get from google](https://pygsheets.readthedocs.io/en/stable/authorizing.html). 
55 | 56 | Here's an example that outputs json: 57 | 58 | ```yaml 59 | source: 60 | name: google-sheets 61 | key: 15Vs_VGpupeGkljceEow7q1ig447FJIxqNS1Dd0dZpFc 62 | credential_file: service.json 63 | 64 | destination: 65 | file: _data/directory.json 66 | ``` 67 | 68 | You could now build a static website from that `.json`, see 69 | http://jekyllrb.com/docs/datafiles/ for how, or see an example 70 | at https://github.com/datacommons/commoners 71 | 72 | Here's an example that adds some geocoded fields and directly 73 | updates a git repository: 74 | 75 | ```yaml 76 | source: 77 | name: google-sheets 78 | key: 19UaXhqPQ0QHEfSWS_adDEtPwYstq8llK2YijpvFZcKA 79 | credential_file: service.json 80 | 81 | flags: 82 | add: 83 | directory: 84 | - LAT 85 | - LNG 86 | - COUNTRY 87 | - STREET 88 | - REGION 89 | - LOCALITY 90 | 91 | destination: 92 | name: git 93 | repo: git@github.com:datacommons/commoners 94 | file: _data/directory.json 95 | ``` 96 | 97 | ## Strip private sheets, columns, or cells 98 | 99 | By default, sheetsite will strip: 100 | 101 | * Any columns whose name is in parentheses, e.g. `(Private Notes)` 102 | * Any cells or text within cells surrounded by double parentheses, e.g. `((private@email.address))` 103 | * Any sheets whose name is in double parentheses, e.g. `((secret sheet))` 104 | 105 | ## Geocoding 106 | 107 | If you have a table with a column called `address`, sheetsite can geocode it for 108 | you and pass along the results. Just add the following in your yaml: 109 | 110 | ``` 111 | flags: 112 | add: 113 | table_name_goes_here: 114 | - latitude 115 | - longitude 116 | - country 117 | - state 118 | - city 119 | - street 120 | - zip 121 | ``` 122 | 123 | You can add just the columns you want. Geocoding results are cached in a `_cache` 124 | directory by default so they do not need to be repeated in future calls to sheetsite. 125 | 126 | The full list of columns (with synonyms) available is: 127 | * latitude / lat 128 | * longitude / lng 129 | * latlng 130 | * country 131 | * state / province / region 132 | * city / locality 133 | * street 134 | * zip / postal_code 135 | 136 | Normally you won't actually have a stand-alone `address` column. More usually, 137 | information will be spread over multiple columns, or some will be implicit (e.g. 138 | the state/province and country). You can tell sheetsite how to construct addresses 139 | for geocoding by listing columns and constants to build it from. For example: 140 | 141 | ``` 142 | flags: 143 | address: 144 | table_name_goes_here: 145 | - street_address1 146 | - street_address2 147 | - city 148 | - Manitoba 149 | - Canada 150 | add: 151 | table_name_goes_here: 152 | - postal_code 153 | ``` 154 | 155 | This tells sheetsite to produce addresses of the form: 156 | ``` 157 | Manitoba Canada 158 | ``` 159 | And add a `postal_code` column populated by geocoding. 160 | 161 | It is possible to request columns directly in the spreadsheet. Just 162 | wrap the column name in square brackets, like `[state]` or `[zip]`. 163 | Any blank cells in such columns will be filled using geocoding based 164 | on the address given in that row. If the address columns have not been 165 | configured in `flags` then the address must be present in a single column 166 | literally called `address`. 167 | 168 | ## Row uuids 169 | 170 | There's a random feature to add uuids to rows. 
Just add a column 171 | called `dccid` for some reason: 172 | 173 | ``` 174 | flags: 175 | add: 176 | table_name_goes_here: 177 | - dccid 178 | ``` 179 | 180 | A uuid will be added to each row. A good faith effort will be made 181 | to keep that uuid constant across updates, keeping it linked to the 182 | row where it first appeared. 183 | 184 | ## Grouping locations 185 | 186 | If there are several rows of a sheet that will give locations that should 187 | be thought of as a single unit (e.g. an organization with multiple locations), 188 | you can tell `sheetsite` about that. To do so, give it a `group` key. 189 | Every row for which the `group` is the same (and not blank) will be bound 190 | together. When geocaching, blank cells in address cells will be filled 191 | in with information from the first row in this group. For example, with this 192 | configuration: 193 | 194 | ``` 195 | flags: 196 | group: WEBSITE 197 | ``` 198 | 199 | Then for a table like the following: 200 | 201 | ``` 202 | STREET, CITY, STATE, WEBSITE 203 | ... 204 | 17 N St, Foo, Utopia, joe.ut 205 | 16 S St, , , joe.ut 206 | ... 207 | ``` 208 | 209 | During geocoding, `16 S St` would be assumed to be in `Foo, Utopia`. 210 | 211 | ## Renaming columns 212 | 213 | Columns can be renamed. This will occur before any other operation. 214 | 215 | ``` 216 | flags: 217 | rename: 218 | table_name: 219 | old_column_name1: new_column_name1 220 | old_column_name2: new_column_name2 221 | ``` 222 | 223 | ## Getting credentials 224 | 225 | [Obtain credentials for accessing sheets from the Google Developers Console](https://pygsheets.readthedocs.io/en/latest/authorizing.html). 226 | 227 | Make sure you share the sheet with the email address in the credentials file. Read-only permission is fine. 228 | 229 | ## Examples 230 | 231 | For example, the map at http://datacommons.coop/tap/ is a visualization 232 | of data pulled from a google spreadsheet, styled using 233 | https://github.com/datacommons/tap via github pages. 234 | 235 | ## sheetwatch 236 | 237 | It can be useful to automate and forget `sheetsite`, so that updates 238 | to a google spreadsheet propagate automatically to their final 239 | destination. The `sheetwatch` utility does this. It requires a queue 240 | server to operate. To install, do: 241 | 242 | ``` 243 | pip install sheetsite[queue] 244 | ``` 245 | 246 | Install any queue server supported by `celery`. For example, `redis`: 247 | 248 | ``` 249 | sudo apt-get install redis-server 250 | redis-server 251 | ``` 252 | 253 | We need to set some environment variables to let `sheetwatch` know 254 | where to find the queue server: 255 | 256 | ``` 257 | export SHEETSITE_BROKER_URL=redis://localhost 258 | export SHEETSITE_RESULT_BACKEND=redis://localhost 259 | ``` 260 | 261 | The `sheetwatch` program needs a cache directory for its operations. 
262 | 263 | ``` 264 | export SHEETSITE_CACHE=$HOME/cache/sites 265 | ``` 266 | 267 | Finally, it needs to know where there is a directory full of `yml` 268 | files describing any sheets to monitor and their corresponding sites: 269 | 270 | ``` 271 | export SHEETSITE_LAYOUT=$PWD/sites/enabled 272 | ``` 273 | 274 | We now start a worker: 275 | 276 | ``` 277 | sheetwatch worker 278 | ``` 279 | 280 | The last thing we need to do is check a mailbox from time to time 281 | for sheet change notifications from Google, and kick off site updates 282 | as needed: 283 | 284 | ``` 285 | export GMAIL_USERNAME=***** 286 | export GMAIL_PASSWORD=***** 287 | sheetwatch ping --delay 60 288 | ``` 289 | 290 | ## License 291 | 292 | sheetsite is distributed under the MIT License. 293 | 294 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | from distutils.core import setup 5 | from setuptools import find_packages 6 | import os.path 7 | 8 | 9 | def read(fname, fname2): 10 | if not(os.path.exists(fname)): 11 | fname = fname2 12 | with open(os.path.join(os.path.dirname(__file__), fname)) as f: 13 | return f.read() 14 | 15 | 16 | setup( 17 | name="sheetsite", 18 | version="0.2.2", 19 | author="Paul Fitzpatrick", 20 | author_email="paul.michael.fitzpatrick@gmail.com", 21 | description=("read google sheets, use them for sites"), 22 | license="MIT", 23 | keywords="google sheet xls json", 24 | url="https://github.com/paulfitz/sheetsite", 25 | packages=find_packages(), 26 | entry_points={ 27 | "console_scripts": [ 28 | "sheetsite=sheetsite.cmdline:cmd_sheetsite", 29 | "sheetwatch=sheetsite.sheetwatch:run" 30 | ] 31 | }, 32 | long_description=read('README', 'README.md'), 33 | classifiers=[ 34 | "Development Status :: 3 - Alpha", 35 | "Topic :: Utilities", 36 | "License :: OSI Approved :: MIT License" 37 | ], 38 | install_requires=[ 39 | "daff>=1.3.39", 40 | "dataset>=1.0.2", 41 | "oauth2client>=2.0.0", 42 | "openpyxl", 43 | "pygsheets", 44 | "pyyaml", 45 | "requests", 46 | "six", 47 | "tqdm" 48 | ], 49 | extras_require={ 50 | "queue": [ 51 | "celery", 52 | "jinja2", 53 | "premailer", 54 | "redis" 55 | ] 56 | } 57 | ) 58 | -------------------------------------------------------------------------------- /sheetsite/__init__.py: -------------------------------------------------------------------------------- 1 | from sheetsite.sheet import Sheets 2 | -------------------------------------------------------------------------------- /sheetsite/chain.py: -------------------------------------------------------------------------------- 1 | import daff 2 | import os 3 | from sheetsite.ids import process_ids 4 | from sheetsite.sheet import Sheets 5 | from sheetsite.site import Site 6 | from sheetsite.source import read_source 7 | from sheetsite.destination import write_destination 8 | import shutil 9 | 10 | 11 | def apply_chain(site, path): 12 | 13 | if not(os.path.exists(path)): 14 | os.makedirs(path) 15 | 16 | source = site['source'] 17 | destination = site['destination'] 18 | tweaks = site.get('tweaks') 19 | 20 | wb = None 21 | 22 | raw_file = os.path.join(path, 'raw.json') 23 | if 'cache' in source: 24 | wb = read_source({ 25 | 'filename': raw_file 26 | }) 27 | else: 28 | wb = read_source(source) 29 | 30 | ss = Site(wb, os.path.join(path, 'geocache.sqlite')) 31 | if 'flags' in site: 32 | ss.configure(site['flags']) 33 | output_file = os.path.join(path, 
'public.json') 34 | prev_raw_file = os.path.join(path, 'prev_raw.json') 35 | private_output_file = os.path.join(path, 'private.json') 36 | id_file = os.path.join(path, 'ids.json') 37 | prev_id_file = os.path.join(path, 'prev_ids.json') 38 | if os.path.exists(raw_file): 39 | shutil.copyfile(raw_file, prev_raw_file) 40 | if os.path.exists(id_file): 41 | shutil.copyfile(id_file, prev_id_file) 42 | 43 | ss.save_local(raw_file, enhance=False) 44 | 45 | ids = process_ids(prev_raw_file, raw_file, prev_id_file, id_file) 46 | ss.add_ids(ids) 47 | 48 | state = { 49 | 'path': path, 50 | 'output_file': output_file, 51 | 'id_file': id_file 52 | } 53 | 54 | if tweaks: 55 | import json 56 | wj = json.load(open(raw_file, 'r')) 57 | if hasattr(tweaks, 'items'): 58 | tweak_items = tweaks.items() 59 | else: 60 | tweak_items = [[params['tweak'], params] for params in tweaks] 61 | for tweak, params in tweak_items: 62 | print("Working on tweak", json.dumps(tweak)) 63 | if 'tweak' in params: 64 | tweak = params['tweak'] 65 | import importlib 66 | mod = importlib.import_module('sheetsite.tweaks.{}'.format(tweak)) 67 | ct = 2 68 | try: 69 | target = mod.apply3 70 | ct = 3 71 | except AttributeError: 72 | target = mod.apply 73 | if ct == 3: 74 | target(Sheets(wj), params, state) 75 | else: 76 | target(Sheets(wj), params) 77 | from sheetsite.json_spreadsheet import JsonSpreadsheet 78 | ss.workbook = JsonSpreadsheet(None, data=wj) 79 | 80 | ss.save_local(output_file) 81 | if not os.path.exists(prev_raw_file): 82 | # once daff can cope with blank tables correctly, switch to this 83 | # with open(prev_raw_file, 'w') as fout: 84 | # fout.write('{ "names": [], "tables": [] }') 85 | shutil.copyfile(raw_file, prev_raw_file) 86 | shutil.copyfile(id_file, prev_id_file) 87 | ss.save_local(private_output_file, private_sheets=True) 88 | 89 | state['workbook'] = ss.public_workbook() 90 | 91 | write_destination(destination, state) 92 | 93 | return { 94 | 'prev_raw_file': prev_raw_file, 95 | 'raw_file': raw_file 96 | } 97 | 98 | 99 | def compute_diff(files, format='html'): 100 | io = daff.TableIO() 101 | dapp = daff.Coopy(io) 102 | t1 = dapp.loadTable(files['prev_raw_file'], 'local') 103 | t2 = dapp.loadTable(files['raw_file'], 'remote') 104 | if format == 'both': 105 | r1 = daff.diffAsHtml(t1, t2) 106 | r2 = daff.diffAsAnsi(t1, t2) 107 | return (r1, r2) 108 | if format == 'html': 109 | return daff.diffAsHtml(t1, t2) 110 | return daff.diffAsAnsi(t1, t2) 111 | -------------------------------------------------------------------------------- /sheetsite/cmdline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import argparse 4 | import os 5 | from sheetsite.chain import apply_chain, compute_diff 6 | from sheetsite.expand import load_config 7 | import sys 8 | 9 | 10 | def run(argv): 11 | parser = argparse.ArgumentParser(description='Run a website from a spreadsheet. ' 12 | 'Take a spreadsheet (from google sheets or locally), and ' 13 | 'convert it to a .json file that a static website ' 14 | 'generator like jekyll can use. 
Optionally strip private ' 15 | 'information and add derived geographic fields like ' 16 | 'latitude and longitude.') 17 | 18 | parser.add_argument('--config', nargs='*', required=False, 19 | default=['_sheetsite.yml', '_sheetsite.json'], 20 | help='name of configuration file.') 21 | 22 | parser.add_argument('--cache-dir', nargs=1, required=False, default=['_cache'], 23 | help='name of default cache directory.') 24 | 25 | args = parser.parse_args(argv) 26 | 27 | config_file = None 28 | for config_candidate in args.config: 29 | if os.path.exists(config_candidate): 30 | config_file = config_candidate 31 | break 32 | if not config_file: 33 | print("Could not find config file", args.config) 34 | exit(1) 35 | params = load_config(config_file) 36 | files = apply_chain(params, args.cache_dir[0]) 37 | diff = compute_diff(files, 'ansi') 38 | print(diff) 39 | 40 | 41 | def cmd_sheetsite(): 42 | run(sys.argv[1:]) 43 | 44 | 45 | if __name__ == '__main__': 46 | cmd_sheetsite() 47 | 48 | -------------------------------------------------------------------------------- /sheetsite/csv_spreadsheet.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | 4 | class CsvSpreadsheet(object): 5 | 6 | def __init__(self, filename): 7 | with open(filename, 'r') as fin: 8 | reader = csv.reader(fin) 9 | self.data = [row for row in reader] 10 | 11 | def worksheets(self): 12 | return [self] 13 | 14 | def get_all_values(self): 15 | return self.data 16 | 17 | @property 18 | def title(self): 19 | return "sheet" 20 | -------------------------------------------------------------------------------- /sheetsite/destination/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | from sheetsite.destination.drop import write_destination_drop 4 | from sheetsite.destination.excel import write_destination_excel 5 | from sheetsite.destination.ftp import write_destination_ftp 6 | from sheetsite.destination.git import write_destination_git 7 | from sheetsite.destination.json_ss import write_destination_json 8 | from sheetsite.destination.stone_soup import write_destination_stone_soup 9 | from sheetsite.destination.sqlite_ss import write_destination_sqlite 10 | from sheetsite.destination.csv_ss import write_destination_csv 11 | 12 | def write_destination_chain(params, state): 13 | writers = params['chain'] 14 | for writer in writers: 15 | writer['parent'] = params 16 | write_destination(writer, state) 17 | 18 | def write_destination(params, state): 19 | 20 | if isinstance(params, list): 21 | params = { 22 | 'name': 'chain', 23 | 'chain': params 24 | } 25 | 26 | writers = { 27 | 'chain': write_destination_chain, 28 | 'drop': write_destination_drop, 29 | 'ftp': write_destination_ftp, 30 | 'git': write_destination_git, 31 | 'stone-soup': write_destination_stone_soup, 32 | '.sqlite': write_destination_sqlite, 33 | '.sqlite3': write_destination_sqlite, 34 | '.json': write_destination_json, 35 | '.xlsx': write_destination_excel, 36 | '.xls': write_destination_excel, 37 | '.csv': write_destination_csv, 38 | 'drop': write_destination_drop, 39 | 'chain': write_destination_chain 40 | } 41 | 42 | name = None 43 | if 'name' in params: 44 | name = params['name'] 45 | elif 'step' in params and params['step'] != 'save': 46 | name = params['step'] 47 | elif 'output_file' in params: 48 | _, ext = os.path.splitext(params['output_file']) 49 | name = ext 50 | elif 'file' in params: 51 | _, ext = os.path.splitext(params['file']) 52 
| name = ext 53 | params['output_file'] = params['file'] 54 | 55 | if name not in writers: 56 | import importlib 57 | return importlib.import_module('sheetsite.destination.{}'.format(name)).apply(params, 58 | state) 59 | 60 | return writers[name](params, state) 61 | -------------------------------------------------------------------------------- /sheetsite/destination/csv_ss.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | def write_destination_csv(params, state): 4 | workbook = state['workbook'] 5 | output_file = params['output_file'] 6 | for sheet in workbook.worksheets(): 7 | title = sheet.title 8 | rows = sheet.get_all_values() 9 | with open(output_file, 'w') as csvfile: 10 | writer = csv.writer(csvfile) 11 | writer.writerows(rows) 12 | return True 13 | -------------------------------------------------------------------------------- /sheetsite/destination/drop.py: -------------------------------------------------------------------------------- 1 | def write_destination_drop(params, state): 2 | pass 3 | -------------------------------------------------------------------------------- /sheetsite/destination/excel.py: -------------------------------------------------------------------------------- 1 | def write_destination_excel(params, state): 2 | workbook = state['workbook'] 3 | output_file = params['output_file'] 4 | from openpyxl import Workbook 5 | wb = Workbook() 6 | first = True 7 | for sheet in workbook.worksheets(): 8 | title = sheet.title 9 | if first: 10 | ws = wb.active 11 | first = False 12 | else: 13 | ws = wb.create_sheet() 14 | ws.title = title 15 | rows = sheet.get_all_values() 16 | for r, row in enumerate(rows): 17 | for c, cell in enumerate(row): 18 | ws.cell(row=r+1, column=c+1).value = cell 19 | wb.save(output_file) 20 | return True 21 | -------------------------------------------------------------------------------- /sheetsite/destination/ftp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | def write_destination_ftp(params, state): 5 | output_file = state['output_file'] 6 | url = params['url'] 7 | cmd = ['wput', '-v', '--binary', '-u', '-nc', output_file, url] 8 | print(' '.join(cmd)) 9 | out = subprocess.check_output(cmd) 10 | print("ftp: {}".format(out)) 11 | return True 12 | -------------------------------------------------------------------------------- /sheetsite/destination/git.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import subprocess 4 | 5 | 6 | def write_destination_git(destination, state): 7 | wd = os.getcwd() 8 | try: 9 | path = state['path'] 10 | output_file = state['output_file'] 11 | local_repo = os.path.join(path, destination.get('local', 'repo')) 12 | if not(os.path.exists(local_repo)): 13 | subprocess.check_output(['git', 'clone', destination['repo'], local_repo]) 14 | os.chdir(local_repo) 15 | subprocess.check_output(['git', 'pull']) 16 | os.chdir(wd) 17 | shutil.copyfile(output_file, os.path.join(local_repo, destination['file'])) 18 | os.chdir(local_repo) 19 | subprocess.check_output(['git', 'add', destination['file']]) 20 | try: 21 | subprocess.check_output(['git', 'commit', '-m', 'update from sheetsite']) 22 | subprocess.check_output(['git', 'push']) 23 | except subprocess.CalledProcessError: 24 | print("Commit/push skipped") 25 | finally: 26 | os.chdir(wd) 27 | 
-------------------------------------------------------------------------------- /sheetsite/destination/install_local_soup.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | # Hey this is embarrassing I'll remove it soon I promise. 4 | # I mean, maybe. Or I'll leave it malingering for years. 5 | 6 | 7 | def apply(params, state): 8 | subprocess.check_output(["cp", 9 | state['sqlite_file'], 10 | "/srv/git/datacommons_manitoba/production.sqlite3"]) 11 | ok = False 12 | for i in range(0, 4): 13 | try: 14 | subprocess.check_output(["/srv/git/datacommons_manitoba/rebuild.sh"]) 15 | ok = True 16 | break 17 | except subprocess.CalledProcessError: 18 | pass 19 | 20 | if not ok: 21 | raise subprocess.CalledProcessError("rebuild sadness") 22 | 23 | return True 24 | -------------------------------------------------------------------------------- /sheetsite/destination/json_ss.py: -------------------------------------------------------------------------------- 1 | import json 2 | from sheetsite.jsonify import dump, dumps 3 | from sheetsite.json_spreadsheet import JsonSpreadsheet 4 | 5 | 6 | def write_destination_json(params, state): 7 | workbook = state['workbook'] 8 | output_file = params['output_file'] 9 | result = JsonSpreadsheet.as_dict(workbook) 10 | if output_file is None: 11 | print(dumps(result, indent=2)) 12 | else: 13 | with open(output_file, 'w') as f: 14 | dump(result, f, indent=2) 15 | return True 16 | -------------------------------------------------------------------------------- /sheetsite/destination/sqlite_ss.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | def write_destination_sqlite(params, state): 5 | path = state['path'] 6 | output_file_prev = state['output_file'] 7 | output_file_next = params['output_file'] 8 | subprocess.check_output(['ssformat', 9 | 'dbi:jsonbook::file={}'.format(output_file_prev), 10 | output_file_next]) 11 | state['output_file'] = output_file_next 12 | return True 13 | -------------------------------------------------------------------------------- /sheetsite/destination/stone_soup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import re 4 | import csv 5 | import json 6 | import os 7 | import time 8 | import sys 9 | 10 | import sqlite3 as lite 11 | 12 | 13 | schema = ''' 14 | CREATE TABLE IF NOT EXISTS access_rules (id INTEGER PRIMARY KEY,access_type TEXT); 15 | CREATE TABLE IF NOT EXISTS data_sharing_orgs (id INTEGER PRIMARY KEY,name TEXT,created_at DATETIME,updated_at DATETIME,default_import_plugin_name TEXT); 16 | CREATE TABLE IF NOT EXISTS data_sharing_orgs_taggables (id INTEGER PRIMARY KEY,data_sharing_org_id INTEGER NOT NULL,taggable_id INTEGER NOT NULL,verified INTEGER NOT NULL,created_at DATETIME,updated_at DATETIME,foreign_key_id TEXT,taggable_type TEXT); 17 | CREATE TABLE IF NOT EXISTS data_sharing_orgs_users (data_sharing_org_id INTEGER NOT NULL,user_id INTEGER NOT NULL,created_at DATETIME,updated_at DATETIME); 18 | CREATE TABLE IF NOT EXISTS entries (id INTEGER PRIMARY KEY,name TEXT,physical_address1 TEXT,physical_address2 TEXT,physical_city TEXT,physical_state TEXT,physical_zip TEXT,physical_country TEXT,mailing_address1 TEXT,mailing_address2 TEXT,mailing_city TEXT,mailing_state TEXT,mailing_zip TEXT,mailing_country TEXT,phone1 TEXT,phone2 TEXT,fax TEXT,email TEXT,website TEXT,preferred_contact TEXT,description TEXT,created_at 
DATETIME,updated_at DATETIME,created_by_id INTEGER,updated_by_id INTEGER,latitude REAL,longitude REAL,distance REAL,member_id INTEGER,prod_serv1 TEXT,prod_serv2 TEXT,prod_serv3 TEXT,support_organization INTEGER,worker_coop INTEGER,producer_coop INTEGER,marketing_coop INTEGER,housing_coop INTEGER,consumer_coop INTEGER,community_land_trust INTEGER,conservation_ag_land_trust INTEGER,alternative_currency INTEGER,intentional_community INTEGER,collective INTEGER,artist_run_center INTEGER,community_center INTEGER,community_development_financial_institution INTEGER,cooperative_financial_institution INTEGER,mutual_aid_self_help_group INTEGER,activist_social_change_organization INTEGER,union_labor_organization INTEGER,government INTEGER,fair_trade_organization INTEGER,network_association INTEGER,non_profit_org INTEGER,esop INTEGER,majority_owned_esop INTEGER,percentage_owned INTEGER,other INTEGER,type_of_other TEXT,naics_code INTEGER,informal INTEGER,cooperative INTEGER,partnership INTEGER,llc INTEGER,s_corporation INTEGER,c_corporation INTEGER,non_profit_corporation_501c3 INTEGER,non_profit_corporation_501c4 INTEGER,non_profit_corporation_other INTEGER,other_type_of_incorp INTEGER,type_of_other_incorp TEXT,have_a_fiscal_sponsor INTEGER,year_founded DATETIME,democratic INTEGER,union_association INTEGER,which_union TEXT); 19 | CREATE TABLE IF NOT EXISTS legal_structures (id INTEGER PRIMARY KEY,name TEXT,created_at DATETIME,updated_at DATETIME); 20 | CREATE TABLE IF NOT EXISTS locations (id INTEGER PRIMARY KEY,taggable_id INTEGER NOT NULL,note TEXT,physical_address1 TEXT,physical_address2 TEXT,physical_city TEXT,physical_state TEXT,physical_zip TEXT,physical_country TEXT,mailing_address1 TEXT,mailing_address2 TEXT,mailing_city TEXT,mailing_state TEXT,mailing_zip TEXT,mailing_country TEXT,latitude REAL,longitude REAL,created_at DATETIME,updated_at DATETIME,mailing_county TEXT,physical_county TEXT,taggable_type TEXT); 21 | CREATE TABLE IF NOT EXISTS member_orgs (id INTEGER PRIMARY KEY,name TEXT,created_at DATETIME,updated_at DATETIME); 22 | CREATE TABLE IF NOT EXISTS member_orgs_organizations (member_org_id INTEGER NOT NULL,organization_id INTEGER NOT NULL); 23 | CREATE TABLE IF NOT EXISTS org_types (id INTEGER PRIMARY KEY,name TEXT,description TEXT,created_at DATETIME,updated_at DATETIME); 24 | CREATE TABLE IF NOT EXISTS org_types_organizations (org_type_id INTEGER NOT NULL,organization_id INTEGER NOT NULL); 25 | CREATE TABLE IF NOT EXISTS organizations (id INTEGER PRIMARY KEY,name TEXT NOT NULL,description TEXT,created_by_id INTEGER,updated_by_id INTEGER,phone TEXT,fax TEXT,email TEXT,website TEXT,year_founded DATETIME,democratic INTEGER,primary_location_id INTEGER,created_at DATETIME,updated_at DATETIME,legal_structure_id INTEGER,access_rule_id INTEGER NOT NULL,import_notice_sent_at DATETIME,email_response_token TEXT,responded_at DATETIME,response TEXT); 26 | CREATE TABLE IF NOT EXISTS organizations_people (id INTEGER PRIMARY KEY,organization_id INTEGER NOT NULL,person_id INTEGER NOT NULL,role_name TEXT,phone TEXT,email TEXT,created_at DATETIME,updated_at DATETIME); 27 | CREATE TABLE IF NOT EXISTS organizations_sectors (organization_id INTEGER NOT NULL,sector_id INTEGER NOT NULL); 28 | CREATE TABLE IF NOT EXISTS organizations_users (organization_id INTEGER NOT NULL,user_id INTEGER NOT NULL,created_at DATETIME,updated_at DATETIME); 29 | CREATE TABLE IF NOT EXISTS people (id INTEGER PRIMARY KEY,firstname TEXT,lastname TEXT,phone_mobile TEXT,phone_home TEXT,fax TEXT,email TEXT,phone_contact_preferred 
INTEGER,email_contact_preferred INTEGER,created_at DATETIME,updated_at DATETIME,access_rule_id INTEGER NOT NULL); 30 | CREATE TABLE IF NOT EXISTS product_services (id INTEGER PRIMARY KEY,name TEXT,organization_id INTEGER,created_at DATETIME,updated_at DATETIME); 31 | CREATE TABLE IF NOT EXISTS schema_migrations (version TEXT NOT NULL); 32 | CREATE TABLE IF NOT EXISTS sectors (id INTEGER PRIMARY KEY,name TEXT,created_at DATETIME,updated_at DATETIME); 33 | CREATE TABLE IF NOT EXISTS tag_contexts (id INTEGER PRIMARY KEY,name TEXT,friendly_name TEXT); 34 | CREATE TABLE IF NOT EXISTS tag_worlds (id INTEGER PRIMARY KEY,name TEXT); 35 | CREATE TABLE IF NOT EXISTS taggings (id INTEGER PRIMARY KEY,tag_id INTEGER,taggable_id INTEGER,taggable_type TEXT,created_at DATETIME); 36 | CREATE TABLE IF NOT EXISTS tags (id INTEGER PRIMARY KEY,name TEXT,root_id INTEGER,root_type TEXT,parent_id INTEGER,effective_id INTEGER,created_at DATETIME,updated_at DATETIME); 37 | CREATE TABLE IF NOT EXISTS users (id INTEGER PRIMARY KEY,login TEXT,password TEXT,is_admin INTEGER,created_at DATETIME,last_login DATETIME,person_id INTEGER,update_notifications_enabled INTEGER); 38 | ''' 39 | 40 | 41 | def get_prop(key,rows): 42 | val = None 43 | many_versions = False 44 | for row in rows: 45 | v = row[key] 46 | if v != None: 47 | if val == None: 48 | val = v 49 | if v != val: 50 | many_versions = True 51 | return val, many_versions 52 | 53 | def get_props(keys,rows,first): 54 | result = {} 55 | for key in keys: 56 | val, many_versions = get_prop(key,rows) 57 | if many_versions and not(first): 58 | val = None 59 | result[key] = val 60 | return result 61 | 62 | def get_common_props(rows): 63 | return get_props(rows[0].keys(),rows,False) 64 | 65 | def get_main_props(rows): 66 | return get_props(rows[0].keys(),rows,True) 67 | 68 | def fix_email(email): 69 | if email==None: 70 | return email 71 | email = str(email) 72 | email = re.sub(r'mailto:','',email) 73 | return email 74 | 75 | def make_org(props): 76 | organization = { 77 | 'name': props["NAME"], 78 | 'phone': props["PHONE"], 79 | 'email': fix_email(props["EMAIL"]), 80 | 'website': props["WEBSITE"], 81 | 'description': props["GOODS AND SERVICES"], 82 | 'access_rule_id': 1 83 | } 84 | return organization 85 | 86 | def safe_access(props,key): 87 | if not(key in props): 88 | return None 89 | x = props[key] 90 | if x == "": 91 | return None 92 | return x 93 | 94 | def make_loc(props,rid): 95 | location = { 96 | 'physical_address1': props["Physical Address"], 97 | 'physical_address2': None, 98 | 'physical_city': props["City"], 99 | 'physical_state': props["State"], 100 | 'physical_zip': safe_access(props,"Postal Code"), 101 | 'physical_country': props["Country"], 102 | 'latitude': safe_access(props,"Latitude"), 103 | 'longitude': safe_access(props,"Longitude"), 104 | 'taggable_id': rid, 105 | 'taggable_type': "Organization" 106 | } 107 | return location 108 | 109 | def insert_hash(cur,tbl,values): 110 | columns = ', '.join([('"'+v+'"') for v in values.keys()]) 111 | placeholders = ', '.join('?' 
* len(values)) 112 | sql = 'INSERT INTO {} ({}) VALUES ({})'.format(tbl,columns,placeholders) 113 | # print(sql) 114 | # print(values.values()) 115 | cur.execute(sql, list(values.values())) 116 | return cur.lastrowid 117 | 118 | def blanky(x): 119 | if x == "" or x == None: 120 | return None 121 | return x 122 | 123 | 124 | def write_destination_stone_soup(params, state): 125 | 126 | path = state['path'] 127 | output_file = state['output_file'] 128 | 129 | target = os.path.join(path, 'stonesoup.sqlite3') 130 | state['sqlite_file'] = target 131 | 132 | if os.path.exists(target): 133 | os.remove(target) 134 | con = lite.connect(target) 135 | cur = con.cursor() 136 | 137 | global schema 138 | cur.executescript(schema) 139 | 140 | ot = insert_hash(cur, "tag_contexts", { 141 | 'name': 'OrgType', 142 | 'friendly_name': 'Organization Type' 143 | }) 144 | ot = insert_hash(cur, "tags", { 145 | 'name': 'OrgType', 146 | 'root_id': ot, 147 | 'root_type': "TagContext" 148 | }) 149 | 150 | cur.execute('INSERT OR REPLACE INTO access_rules VALUES (1,"PUBLIC");') 151 | 152 | cur.execute('INSERT OR REPLACE INTO data_sharing_orgs (id,name) VALUES (1,?);', 153 | [params['organization']]) 154 | 155 | org_names = [] 156 | orgs = {} 157 | 158 | lol = json.load(open(output_file))["tables"]["directory"]["rows"] 159 | 160 | # collect all locations for each org 161 | for idx, row in enumerate(lol): 162 | name = row['NAME'] 163 | if not(name in orgs): 164 | orgs[name] = [] 165 | org_names.append(name) 166 | orgs[name].append(row) 167 | 168 | organizations = [] 169 | 170 | print("ORG COUNT " + str(len(org_names))) 171 | 172 | for idx, name in enumerate(org_names): 173 | rows = orgs[name] 174 | common = get_common_props(rows) 175 | main = get_main_props(rows) 176 | print(name + " : " + str(common) + " " + str(len(rows))) 177 | organization = make_org(common) 178 | rid = insert_hash(cur, "organizations", organization) 179 | fid = None 180 | for row in rows: 181 | loc = make_loc(row, rid) 182 | if loc['latitude'] == None: 183 | loc['latitude'] = blanky(row['Latitude']) 184 | if loc['longitude'] == None: 185 | loc['longitude'] = blanky(row['Longitude']) 186 | if loc['physical_zip'] == None: 187 | loc['physical_zip'] = blanky(row['Postal Code']) 188 | fid0 = insert_hash(cur,"locations",loc) 189 | if fid == None: 190 | fid = fid0 191 | cur.execute("UPDATE organizations SET primary_location_id = ? WHERE id = ?", 192 | [fid, rid]) 193 | insert_hash(cur,"data_sharing_orgs_taggables",{ 194 | "data_sharing_org_id": 1, 195 | "taggable_id": rid, 196 | "taggable_type": "Organization", 197 | "verified": 1, 198 | "foreign_key_id": 999 199 | }) 200 | typ = main["TYPE"] 201 | if typ: 202 | v = cur.execute('SELECT id FROM org_types WHERE name = ?',[typ]).fetchall() 203 | tid = None 204 | if len(v) == 0: 205 | tid = insert_hash(cur,"org_types",{ 206 | 'name': typ 207 | }) 208 | tid = insert_hash(cur,"tags",{ 209 | 'name': typ, 210 | 'root_id': tid, 211 | 'root_type': "OrgType", 212 | 'parent_id': ot 213 | }) 214 | else: 215 | tid = v[0][0] 216 | tid = cur.execute('SELECT id FROM tags WHERE root_id = ? 
AND root_type = "OrgType"',[tid]).fetchall()[0][0] 217 | insert_hash(cur,"taggings",{ 218 | "tag_id": tid, 219 | "taggable_id": rid, 220 | "taggable_type": "Organization" 221 | }) 222 | dex = main['Index'] 223 | if dex: 224 | for dex in [x.strip() for x in dex.lower().split(',')]: 225 | v = cur.execute('SELECT id FROM tags WHERE name = ?',[dex]).fetchall() 226 | tid = None 227 | if len(v) == 0: 228 | tid = insert_hash(cur,"tags",{ 229 | 'name': dex 230 | }) 231 | else: 232 | tid = v[0][0] 233 | insert_hash(cur,"taggings",{ 234 | "tag_id": tid, 235 | "taggable_id": rid, 236 | "taggable_type": "Organization" 237 | }) 238 | dex = main['IndexTilde'] 239 | if dex: 240 | for dex in [x.strip() for x in dex.lower().split(' ~ ')]: 241 | v = cur.execute('SELECT id FROM tags WHERE name = ?',[dex]).fetchall() 242 | tid = None 243 | if len(v) == 0: 244 | tid = insert_hash(cur,"tags",{ 245 | 'name': dex 246 | }) 247 | else: 248 | tid = v[0][0] 249 | insert_hash(cur,"taggings",{ 250 | "tag_id": tid, 251 | "taggable_id": rid, 252 | "taggable_type": "Organization" 253 | }) 254 | 255 | with open('junk.json', 'w') as outfile: 256 | json.dump(organizations, outfile) 257 | 258 | con.commit() 259 | con.close() 260 | 261 | -------------------------------------------------------------------------------- /sheetsite/destination/stone_soup_v2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from contextlib import contextmanager 4 | import dataset 5 | from datetime import date, datetime 6 | import json 7 | import os 8 | import re 9 | from tqdm import tqdm 10 | import uuid 11 | import dateutil.parser 12 | 13 | 14 | def get_prop(key, rows): 15 | val = None 16 | many_versions = False 17 | for row in rows: 18 | v = row[key] 19 | if v is not None: 20 | if val is None: 21 | val = v 22 | if v != val: 23 | many_versions = True 24 | return val, many_versions 25 | 26 | 27 | def get_props(keys, rows, first): 28 | result = {} 29 | for key in keys: 30 | val, many_versions = get_prop(key, rows) 31 | if many_versions and not(first): 32 | val = None 33 | result[key] = val 34 | return result 35 | 36 | 37 | def get_common_props(rows): 38 | return get_props(rows[0].keys(), rows, False) 39 | 40 | 41 | def get_main_props(rows): 42 | return get_props(rows[0].keys(), rows, True) 43 | 44 | 45 | def anykey(props, *keys): 46 | optional = (None in keys) 47 | keys = list(filter(None, keys)) 48 | prop_keys = dict((key.upper(), key) for key in props.keys()) 49 | for key in keys: 50 | key = key.upper() 51 | if key in prop_keys: 52 | return props[prop_keys[key]] 53 | # fail deliberately 54 | if optional: 55 | return None 56 | return props[keys[0]] 57 | 58 | 59 | def fix_email(email): 60 | if email is None: 61 | return email 62 | email = str(email) 63 | email = re.sub(r'mailto:', '', email) 64 | return email 65 | 66 | 67 | def as_year(when): 68 | if when is None: 69 | return when 70 | when = str(when) 71 | when = when.replace('.', ' ') 72 | when = when.replace('-', ' ') 73 | when = when.replace('/', ' ') 74 | parts = when.split(' ') 75 | for part in parts: 76 | if len(part) == 4 and re.match('^[0-9]{4}$', part): 77 | return date(int(part), 1, 1) 78 | return None 79 | 80 | 81 | def fix_website(x): 82 | if x is None: 83 | return x 84 | x = x.strip() 85 | x = x.split(' ') 86 | if len(x) == 0: 87 | return None 88 | return x[0] 89 | 90 | 91 | def make_org(props): 92 | organization = { 93 | 'name': anykey(props, "NAME", "CompanyName"), 94 | 'phone': anykey(props, "PHONE", "WorkPhone"), 95 | 
'email': fix_email(anykey(props, "EMAIL", "Email Address")), 96 | 'website': fix_website(anykey(props, "WEBSITE", "Web Address", None)), 97 | 'description': anykey(props, "GOODS AND SERVICES", "Description", None), 98 | 'year_founded': as_year(anykey(props, "year_founded", "year founded", None)), 99 | 'access_rule_id': 1, 100 | 'source_grouping': anykey(props, 'source_grouping', None), 101 | 'mode': anykey(props, 'mode', None) 102 | } 103 | if 'stamp' in props: 104 | if props['stamp'] is not None: 105 | organization['updated_at'] = date(int(props['stamp']), 1, 1) 106 | if 'updated_at' in props: 107 | if props['updated_at'] is not None: 108 | organization['updated_at'] = dateutil.parser.parse(props['updated_at']) 109 | return organization 110 | 111 | 112 | def safe_access(props, key): 113 | if not(key in props): 114 | return None 115 | x = props[key] 116 | if x == "": 117 | return None 118 | return x 119 | 120 | 121 | def make_loc(props, rid): 122 | location = { 123 | 'physical_address1': anykey(props, "Street Address", 124 | "Street", "Physical Address", "street1"), 125 | 'physical_address2': anykey(props, "street2", None), 126 | 'physical_city': anykey(props, "city"), 127 | 'physical_state': anykey(props, "state"), 128 | 'physical_zip': anykey(props, "zip", "postal code"), 129 | 'physical_country': anykey(props, "country"), 130 | 'mailing_address1': anykey(props, "mailing_address1", None), 131 | 'mailing_address2': anykey(props, "mailing_address2", None), 132 | 'mailing_city': anykey(props, "mailing_city", None), 133 | 'mailing_state': anykey(props, "mailing_state", None), 134 | 'mailing_zip': anykey(props, "mailing_zip", None), 135 | 'mailing_country': anykey(props, "mailing_country", None), 136 | 'latitude': anykey(props, "lat", "Latitude", "latitude", None), 137 | 'longitude': anykey(props, "lng", "Longitude", "longitude", None), 138 | 'taggable_id': rid, 139 | 'taggable_type': "Organization", 140 | 'dccid': anykey(props, 'dccid') 141 | } 142 | return location 143 | 144 | 145 | class DirectToDB(object): 146 | def __init__(self, cur): 147 | self.cur = cur 148 | 149 | def column(self, tbl, column, example): 150 | return self.cur[tbl].create_column_by_example(column, example) 151 | 152 | def index(self, tbl, columns): 153 | return self.cur[tbl].create_index(columns) 154 | 155 | def insert(self, tbl, values): 156 | return self.cur[tbl].insert(values) 157 | 158 | def delete(self, tbl, **conds): 159 | return self.cur[tbl].delete(**conds) 160 | 161 | def update(self, tbl, values, keys): 162 | self.cur[tbl].update(values, keys) 163 | 164 | def upsert(self, tbl, values, keys): 165 | result = self.cur[tbl].upsert(values, keys) 166 | if result is not True: 167 | return result 168 | vs = dict((k, values[k]) for k in keys) 169 | return self.cur[tbl].find_one(**vs)['id'] 170 | 171 | def find(self, tbl, **conds): 172 | return self.cur[tbl].find(**conds) 173 | 174 | def find_one(self, tbl, **conds): 175 | return self.cur[tbl].find_one(**conds) 176 | 177 | @contextmanager 178 | def transaction(self): 179 | with self.cur as x: 180 | yield DirectToDB(x) 181 | 182 | def is_blank(x): 183 | return x is None or x == "" 184 | 185 | def blanky(x): 186 | if x == "" or x is None: 187 | return None 188 | return x 189 | 190 | 191 | def floaty(x): 192 | if x is None or x == "": 193 | return None 194 | return float(x) 195 | 196 | 197 | class TargetDB(object): 198 | 199 | def __init__(self, target_db): 200 | cur = DirectToDB(target_db) 201 | cur.upsert("tag_contexts", { 202 | 'name': 'OrgType', 203 | 'friendly_name': 
'Organization Type' 204 | }, ['name']) 205 | cur.upsert("tag_contexts", { 206 | 'name': 'MemberOrg', 207 | 'friendly_name': 'Member Organization Affiliation' 208 | }, ['name']) 209 | cur.upsert("tag_contexts", { 210 | 'name': 'Sector', 211 | 'friendly_name': 'Business Sector' 212 | }, ['name']) 213 | cur.upsert("tag_contexts", { 214 | 'name': 'LegalStructure', 215 | 'friendly_name': 'Legal Structure' 216 | }, ['name']) 217 | dcc = cur.upsert("tags", { 218 | 'name': 'dcc', 219 | 'root_id': 1, 220 | 'root_type': "TagWorld" 221 | }, ['name']) 222 | for name in ['OrgType', 'Sector', 'MemberOrg', 'LegalStructure']: 223 | cur.upsert("tags", { 224 | 'name': name, 225 | 'root_id': cur.find_one('tag_contexts', name=name)['id'], 226 | 'root_type': "TagContext", 227 | 'parent_id': dcc 228 | }, ['name']) 229 | self.ot = cur.find_one("tags", name='OrgType')['id'] 230 | cur.upsert("tag_worlds", { 231 | 'name': 'dcc', 232 | }, ['name']) 233 | 234 | cur.column('users', 'login', 'x') 235 | cur.column('users', 'password', 'x') 236 | cur.column('users', 'is_admin', 1) 237 | cur.column('users', 'person_id', 1) 238 | cur.column('users', 'last_login', datetime.now()) 239 | cur.column('organizations', 'grouping', 'x') 240 | cur.column('locations', 'mailing_address1', 'x') 241 | cur.column('locations', 'mailing_address2', 'x') 242 | cur.column('locations', 'mailing_city', 'x') 243 | cur.column('locations', 'mailing_state', 'x') 244 | cur.column('locations', 'mailing_zip', 'x') 245 | cur.column('locations', 'mailing_country', 'x') 246 | cur.column('locations', 'mailing_county', 'x') 247 | cur.column('locations', 'physical_zip', 'x') 248 | cur.column('locations', 'physical_county', 'x') 249 | for tab in ['organizations', 'locations']: 250 | cur.column(tab, 'dccid', 'x') 251 | cur.column(tab, 'created_at', datetime.now()) 252 | cur.column(tab, 'updated_at', datetime.now()) 253 | cur.column('people', 'firstname', 'x') 254 | cur.column('people', 'lastname', 'x') 255 | cur.column('people', 'updated_at', datetime.now()) 256 | cur.column('organizations_people', 'person_id', 1) 257 | cur.column('organizations_people', 'organization_id', 1) 258 | cur.column('tags', 'effective_id', 1) 259 | cur.column('locations', 'note', 'x') 260 | cur.column('organizations', 'fax', 'x') 261 | cur.column('organizations', 'year_founded', datetime.now()) 262 | cur.column('organizations', 'source_grouping', 'x') 263 | cur.column('product_services', 'name', 'x') 264 | cur.column('product_services', 'organization_id', 1) 265 | cur.column('organizations_users', 'user_id', 1) 266 | cur.column('organizations_users', 'organization_id', 1) 267 | cur.column('users', 'login', 'x') 268 | 269 | cur.column('access_rules', 'access_type', 'PUBLIC') 270 | cur.upsert('access_rules', {'id': 1, 'access_type': 'PUBLIC'}, ['id']) 271 | 272 | cur.column('data_sharing_orgs', 'name', 'x') 273 | 274 | cur.column('data_sharing_orgs_users', 'user_id', 1) 275 | cur.column('data_sharing_orgs_users', 'data_sharing_org_id', 1) 276 | 277 | cur.column('member_orgs_organizations', 'member_org_id', 1) 278 | cur.column('member_orgs_organizations', 'organization_id', 1) 279 | 280 | cur.column('org_types_organizations', 'org_type_id', 1) 281 | cur.column('org_types_organizations', 'organization_id', 1) 282 | 283 | cur.column('organizations_sectors', 'sector_id', 1) 284 | cur.column('organizations_sectors', 'organization_id', 1) 285 | 286 | cur.column('member_orgs', 'name', 'x') 287 | 288 | cur.column('sectors', 'name', 'x') 289 | 290 | cur.column('taggings', 'tag_id', 1) 291 | 
cur.column('taggings', 'taggable_id', 1) 292 | cur.column('taggings', 'taggable_type', 'x') 293 | 294 | cur.column('data_sharing_orgs_taggables', 'data_sharing_org_id', 1) 295 | cur.column('data_sharing_orgs_taggables', 'taggable_id', 1) 296 | cur.column('data_sharing_orgs_taggables', 'taggable_type', 'x') 297 | cur.column('data_sharing_orgs_taggables', 'verified', 1) 298 | 299 | cur.index('locations', ['taggable_id', 'taggable_type']) 300 | cur.index('product_services', ['organization_id']) 301 | cur.index('organizations_sectors', ['organization_id']) 302 | cur.index('organizations_sectors', ['sector_id']) 303 | cur.index('organizations_people', ['organization_id']) 304 | cur.index('organizations_people', ['person_id']) 305 | cur.index('tags', ['name']) 306 | cur.index('tags', ['root_id', 'root_type']) 307 | cur.index('tags', ['parent_id']) 308 | cur.index('taggings', ['tag_id']) 309 | cur.index('taggings', ['taggable_id', 'taggable_type']) 310 | cur.index('tag_contexts', ['name']) 311 | cur.index('tag_worlds', ['name']) 312 | cur.index('data_sharing_orgs_taggables', ['data_sharing_org_id']) 313 | cur.index('data_sharing_orgs_taggables', ['taggable_type']) 314 | cur.index('data_sharing_orgs_taggables', ['taggable_id', 'taggable_type']) 315 | 316 | self.cur = cur 317 | 318 | def get_org_type(self): 319 | return self.ot 320 | 321 | def set_name(self, name): 322 | cur = self.cur 323 | dso = name 324 | dso_id = cur.upsert('data_sharing_orgs', 325 | {'name': name}, 326 | ['name']) 327 | self.dso = dso 328 | self.dso_id = dso_id 329 | tabs = ['locations', 'organizations', 'taggings', 330 | 'data_sharing_orgs_taggables', 331 | 'data_sharing_orgs'] 332 | 333 | def prep(tab): 334 | cur.column(tab, 'dso', 'x') 335 | cur.column(tab, 'dso_update', 'x') 336 | cur.update(tab, { 337 | 'dso': dso, 338 | 'dso_update': 'old' 339 | }, ['dso']) 340 | for tab in tabs: 341 | prep(tab) 342 | self.tabs = tabs 343 | 344 | def clear(self): 345 | for tab in self.tabs: 346 | self.cur.delete(tab, dso=self.dso, dso_update='old') 347 | 348 | 349 | def apply(params, state): 350 | 351 | path = merge_path = state['path'] 352 | output_file = state['output_file'] 353 | 354 | if 'merge_path' in params: 355 | merge_path = params['merge_path'] 356 | elif 'MERGE_PATH' in os.environ: 357 | merge_path = os.environ['MERGE_PATH'] 358 | 359 | target = os.path.abspath(os.path.join(merge_path, 360 | 'stonesoup.sqlite3')) 361 | target_perm = os.path.abspath(os.path.join(path, 362 | 'stonesoup.sqlite3')) 363 | state['sqlite_file'] = target_perm 364 | 365 | tdb = TargetDB(dataset.connect("sqlite:///" + target)) 366 | cur = tdb.cur 367 | tdb.set_name(params['organization']) 368 | dso = tdb.dso 369 | dso_id = tdb.dso_id 370 | ot = tdb.get_org_type() 371 | 372 | org_names = [] 373 | orgs = {} 374 | 375 | print("READING", output_file) 376 | tables = json.load(open(output_file)) 377 | selection = tables['names'][0] 378 | lol = tables['tables'][selection]["rows"] 379 | 380 | # collect all locations for each org 381 | for idx, row in tqdm(list(enumerate(lol))): 382 | name = anykey(row, 'row_group', 'NAME', 'CompanyName') 383 | if not(name in orgs): 384 | orgs[name] = [] 385 | org_names.append(name) 386 | orgs[name].append(row) 387 | 388 | print("ORG COUNT " + str(len(org_names))) 389 | 390 | for idx, name in tqdm(list(enumerate(org_names))): 391 | rows = orgs[name] 392 | print("Org {} / {} has {} rows".format(idx, name, len(rows))) 393 | lct = 0 394 | for row in rows: 395 | loc = make_loc(row, None) 396 | if not(is_blank(loc['physical_state']) 
and is_blank(loc['physical_country']) 397 | and is_blank(loc['physical_address1'])): 398 | lct += 1 399 | if lct == 0: 400 | continue 401 | common = get_common_props(rows) 402 | main = get_main_props(rows) 403 | # print(name + " : " + str(common) + " " + str(len(rows))) 404 | organization = make_org(common) 405 | # print(organization, rows) 406 | # get a dccid 407 | ids = set(filter(None, [row['dccid'] for row in rows])) - set(['']) 408 | oid = None 409 | for id in ids: 410 | y = list(cur.find('oids', dccid=id)) 411 | if len(y) > 0: 412 | oid = y[0]['oid'] 413 | break 414 | if oid is None: 415 | oid = str(uuid.uuid4()) 416 | with cur.transaction() as cur1: 417 | for id in ids: 418 | cur1.upsert('oids', {'oid': oid, 'dccid': id}, ['dccid']) 419 | organization['oid'] = oid 420 | organization['dso'] = dso 421 | organization['dso_update'] = 'fresh' 422 | rid = cur.upsert("organizations", organization, ['oid']) 423 | fid = None 424 | with cur.transaction() as cur1: 425 | for row in rows: 426 | loc = make_loc(row, rid) 427 | if loc['latitude'] is None or loc['latitude'] == "": 428 | loc['latitude'] = floaty(blanky(row['Latitude'])) 429 | if loc['longitude'] is None or loc['longitude'] == "": 430 | loc['longitude'] = floaty(blanky(row['Longitude'])) 431 | if loc['physical_zip'] is None: 432 | loc['physical_zip'] = blanky(row['Postal Code']) 433 | if loc['dccid'] is None: 434 | loc['dccid'] = blanky(row['dccid']) 435 | loc['dso'] = dso 436 | loc['dso_update'] = 'fresh' 437 | fid0 = cur1.upsert("locations", loc, ['dccid']) 438 | if fid is None: 439 | fid = fid0 440 | with cur.transaction() as cur1: 441 | cur1.update('organizations', 442 | {'id': rid, 'primary_location_id': fid}, 443 | ['id']) 444 | cur1.upsert("data_sharing_orgs_taggables", { 445 | "data_sharing_org_id": dso_id, 446 | "taggable_id": rid, 447 | "taggable_type": "Organization", 448 | "verified": 1, 449 | "foreign_key_id": 999, 450 | "dso": dso, 451 | "dso_update": "fresh" 452 | }, ['data_sharing_org_id', 'taggable_id', 'taggable_type']) 453 | typs = main["TYPE"] 454 | if typs is None: 455 | typs = "" 456 | typs = typs.split(',') 457 | if "dcc_status" in main: 458 | typ0 = main['dcc_status'] 459 | if typ0: 460 | typs.append(typ0) 461 | typs = [typ.strip() for typ in typs if typ.strip() != ""] 462 | for typ in typs: 463 | v = list(cur.find('org_types', name=typ)) 464 | tid = None 465 | if len(v) == 0: 466 | tid = cur.insert("org_types", { 467 | 'name': typ 468 | }) 469 | else: 470 | tid = v[0]['id'] 471 | nid = cur.find_one('tags', root_id=tid, root_type='OrgType') 472 | if nid is None: 473 | tid = cur.insert("tags", { 474 | 'name': typ, 475 | 'root_id': tid, 476 | 'root_type': "OrgType", 477 | 'parent_id': ot 478 | }) 479 | else: 480 | tid = nid['id'] 481 | cur.upsert("taggings", { 482 | "tag_id": tid, 483 | "taggable_id": rid, 484 | "taggable_type": "Organization", 485 | "dso": dso, 486 | "dso_update": "fresh" 487 | }, ['tag_id', 'taggable_id', 'taggable_type']) 488 | dex = main['Index'] 489 | if dex: 490 | for dex in [x.strip() for x in dex.lower().split(',')]: 491 | v = list(cur.find('tags', name=dex)) 492 | tid = None 493 | if len(v) == 0: 494 | tid = cur.insert("tags", { 495 | 'name': dex 496 | }) 497 | else: 498 | tid = v[0]['id'] 499 | cur.insert("taggings", { 500 | "tag_id": tid, 501 | "taggable_id": rid, 502 | "taggable_type": "Organization", 503 | "dso": dso, 504 | "dso_update": "fresh" 505 | }) 506 | if 'tags' in main: 507 | dex = main['tags'] 508 | if dex: 509 | try: 510 | lst = [x.strip() for x in dex.split(';;')] 511 | 
except: 512 | lst = dex 513 | for idex in lst: 514 | parts = idex.split('|') 515 | if len(parts) > 0: 516 | pass 517 | parent_id = None 518 | for part in parts: 519 | v = list(cur.find('tags', name=part, parent_id=parent_id)) 520 | tid = None 521 | if len(v) == 0: 522 | tid = cur.insert("tags", { 523 | 'name': part, 524 | 'parent_id': parent_id 525 | }) 526 | else: 527 | tid = v[0]['id'] 528 | parent_id = tid 529 | cur.insert("taggings", { 530 | "tag_id": parent_id, 531 | "taggable_id": rid, 532 | "taggable_type": "Organization", 533 | "dso": dso, 534 | "dso_update": "fresh" 535 | }) 536 | 537 | tdb.clear() 538 | 539 | from shutil import copyfile 540 | copyfile(target, target_perm) 541 | 542 | 543 | def apply_direct(target_db, name, source_db): 544 | tdb = TargetDB(target_db) 545 | tdb.set_name(name) 546 | 547 | oids = {} 548 | pids = {} 549 | 550 | types = {} 551 | 552 | caps = { 553 | 'OrgType': 'org_types', 554 | 'Sector': 'sectors', 555 | 'LegalStructure': 'legal_structures', 556 | 'MemberOrg': 'member_orgs', 557 | 'TagContext': 'tag_contexts' 558 | } 559 | 560 | dsos = {} 561 | 562 | # add dsos 563 | with tdb.cur.transaction() as cur: 564 | print('dsos') 565 | for rec in tqdm(list(source_db['data_sharing_orgs'].all())): 566 | fid = rec['id'] 567 | dccid = '{}_{}_{}'.format(name, 'DSO', fid) 568 | rec['dccid'] = dccid 569 | rec['dso'] = name 570 | rec['dso_update'] = 'fresh' 571 | rec.pop('id') 572 | oid = cur.upsert("data_sharing_orgs", rec, ['dccid']) 573 | dsos[fid] = oid 574 | 575 | # add types 576 | for k in ['org_types', 'sectors', 'legal_structures', 'member_orgs', 'tag_contexts']: 577 | print(k) 578 | ts = types[k] = {} 579 | with tdb.cur.transaction() as cur: 580 | for rec in tqdm(list(source_db[k].all())): 581 | fid = rec.pop('id') 582 | tid = cur.upsert(k, rec, ['name']) 583 | ts[fid] = tid 584 | 585 | # add organizations 586 | with tdb.cur.transaction() as cur: 587 | print('organizations') 588 | for org in tqdm(list(source_db['organizations'].all())): 589 | fid = org['id'] 590 | dccid = '{}_{}_{}'.format(name, 'Organization', fid) 591 | org['dccid'] = dccid 592 | org['dso'] = name 593 | org['dso_update'] = 'fresh' 594 | pids[org['primary_location_id']] = fid 595 | org.pop('id') 596 | org.pop('created_by_id') 597 | org.pop('updated_by_id') 598 | org.pop('primary_location_id') 599 | org.pop('legal_structure_id') 600 | org.pop('access_rule_id') 601 | org['access_rule_id'] = 1 602 | oid = cur.upsert("organizations", org, ['dccid']) 603 | oids[fid] = oid 604 | 605 | # add locations 606 | with tdb.cur.transaction() as cur: 607 | print('locations') 608 | for org in tqdm(list(source_db['locations'].all())): 609 | fid = org['id'] 610 | dccid = '{}_{}_{}'.format(name, 'Location', fid) 611 | org['dccid'] = dccid 612 | org['dso'] = name 613 | org['dso_update'] = 'fresh' 614 | org.pop('id') 615 | if org['taggable_type'] != 'Organization': 616 | continue 617 | org['taggable_id'] = oids[org['taggable_id']] 618 | oid = cur.upsert("locations", org, ['dccid']) 619 | pid = pids.get(fid) 620 | if pid is not None: 621 | cur.update("organizations", { 622 | 'primary_location_id': oid, 623 | 'id': oids[pid] 624 | }, ['id']) 625 | 626 | tids = {} 627 | 628 | # add tags 629 | with tdb.cur.transaction() as cur: 630 | print('tags') 631 | for rec in tqdm(list(source_db['tags'].all())): 632 | fid = rec.pop('id') 633 | rtype = rec['root_type'] 634 | rid = rec['root_id'] 635 | if rtype in caps: 636 | rtypes = types[caps[rtype]] 637 | rid = rtypes[rid] 638 | rec['root_id'] = rid 639 | else: 640 | 
rec.pop('root_id') 641 | rec.pop('root_type') 642 | pid = rec['parent_id'] 643 | rec.pop('parent_id') 644 | if pid is not None: 645 | if pid in tids: 646 | rec['parent_id'] = tids[pid] 647 | rec.pop('effective_id') 648 | tid = cur.upsert("tags", rec, ['name']) 649 | tids[fid] = tid 650 | 651 | # add taggings 652 | ct = 0 653 | goods = 0 654 | with tdb.cur.transaction() as cur: 655 | print('taggings') 656 | for rec in tqdm(list(source_db['taggings'].all())): 657 | if rec['taggable_type'] != 'Organization': 658 | continue 659 | ct += 1 660 | if rec['taggable_id'] is None: 661 | continue 662 | if rec['tag_id'] is None: 663 | continue 664 | fid = rec['id'] 665 | dccid = '{}_{}_{}'.format(name, 'Taggings', fid) 666 | rec['dccid'] = dccid 667 | rec.pop('id') 668 | tid = rec['tag_id'] 669 | if tid not in tids: 670 | continue 671 | rec['tag_id'] = tids[tid] 672 | oid = rec['taggable_id'] 673 | if oid not in oids: 674 | continue 675 | rec['taggable_id'] = oids[oid] 676 | rec['dso'] = name 677 | rec['dso_update'] = 'fresh' 678 | cur.upsert("taggings", rec, ['dccid']) 679 | goods += 1 680 | print("taggings {} of which {} good".format(ct, goods)) 681 | 682 | # add dso_taggables 683 | with tdb.cur.transaction() as cur: 684 | print('dso_taggables') 685 | for rec in tqdm(list(source_db['data_sharing_orgs_taggables'].all())): 686 | fid = rec['id'] 687 | dccid = '{}_{}_{}'.format(name, 'DSO_taggables', fid) 688 | rec['dccid'] = dccid 689 | rec['dso'] = name 690 | rec['dso_update'] = 'fresh' 691 | rec.pop('id') 692 | did = rec['data_sharing_org_id'] 693 | if did not in dsos: 694 | continue 695 | rec['data_sharing_org_id'] = dsos[did] 696 | if rec['taggable_type'] != 'Organization': 697 | continue 698 | tid = rec['taggable_id'] 699 | if tid not in oids: 700 | continue 701 | rec['taggable_id'] = oids[tid] 702 | oid = cur.upsert("data_sharing_orgs_taggables", rec, ['dccid']) 703 | dsos[fid] = oid 704 | 705 | 706 | tdb.clear() 707 | 708 | 709 | if __name__ == '__main__': 710 | import sys 711 | target = sys.argv[1] 712 | name = sys.argv[2] 713 | source = sys.argv[3] 714 | target_db = dataset.connect('sqlite:///' + target) 715 | source_db = dataset.connect('sqlite:///' + source) 716 | apply_direct(target_db, name, source_db) 717 | -------------------------------------------------------------------------------- /sheetsite/expand.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import json 3 | import os 4 | import six 5 | import yaml 6 | 7 | 8 | # borrowed code to load yaml dicts as ordered 9 | def ordered_load(stream, Loader=yaml.Loader, object_pairs_hook=OrderedDict): 10 | class OrderedLoader(Loader): 11 | pass 12 | 13 | def construct_mapping(loader, node): 14 | loader.flatten_mapping(node) 15 | return object_pairs_hook(loader.construct_pairs(node)) 16 | OrderedLoader.add_constructor( 17 | yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, 18 | construct_mapping) 19 | return yaml.load(stream, OrderedLoader) 20 | 21 | 22 | def expand(x): 23 | return os.path.expandvars(x) 24 | 25 | 26 | def expand_all(o): 27 | if type(o) == dict: 28 | return dict([[k, expand_all(v)] for k, v in o.items()]) 29 | if type(o) == list: 30 | return [expand_all(x) for x in o] 31 | if isinstance(o, six.string_types): 32 | return expand(o) 33 | return o 34 | 35 | 36 | def load_config(config_file): 37 | with open(config_file, 'r') as config: 38 | _, ext = os.path.splitext(config_file) 39 | ext = ext.lower() 40 | if ext == '.yml' or ext == '.yaml': 41 | import yaml 
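# ordered_load() keeps the YAML key order, so sheets and tweaks stay in the
# order they appear in the config file; expand_all() below then applies
# os.path.expandvars to every string value. For example (a sketch, with
# $DATA_DIR as a hypothetical environment variable):
#
#   source:
#     filename: $DATA_DIR/input.xlsx
#
# loads as {'source': {'filename': '/home/user/data/input.xlsx'}} when
# DATA_DIR=/home/user/data is set in the environment.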
42 | params = ordered_load(config, yaml.SafeLoader) 43 | else: 44 | params = json.load(config) 45 | params = expand_all(params) # should make this optional 46 | return params 47 | -------------------------------------------------------------------------------- /sheetsite/filtered_spreadsheet.py: -------------------------------------------------------------------------------- 1 | class FilteredSpreadsheet(object): 2 | def __init__(self, workbook, selector, processor): 3 | self.workbook = workbook 4 | titles = [(sheet, selector(sheet)) 5 | for sheet in self.workbook.worksheets()] 6 | self.sheets = [FilteredSheet(sheet, title, processor) 7 | for sheet, title in titles 8 | if title is not None] 9 | 10 | def worksheets(self): 11 | return self.sheets 12 | 13 | 14 | class FilteredSheet(object): 15 | def __init__(self, sheet, title, processor): 16 | self.sheet = sheet 17 | self.name = title 18 | self.processor = processor 19 | 20 | def get_all_values(self): 21 | return self.processor(self.sheet, self.title) 22 | 23 | @property 24 | def title(self): 25 | return self.name 26 | 27 | -------------------------------------------------------------------------------- /sheetsite/geocache.py: -------------------------------------------------------------------------------- 1 | import dataset 2 | import json 3 | import logging 4 | import os 5 | import requests 6 | import six 7 | import time 8 | 9 | GEOCODER = 'google' if 'GOOGLE_GEOCODER_KEY' in os.environ else None 10 | 11 | class GeoCache(object): 12 | def __init__(self, filename, geocoder=GEOCODER, group_key=None): 13 | logging.basicConfig() 14 | logging.getLogger("dataset.persistence.table").setLevel( 15 | logging.ERROR 16 | ) 17 | if '://' not in filename: 18 | filename = "sqlite:///{}".format(os.path.abspath(filename)) 19 | self.db = dataset.connect(filename) 20 | self.geocache = self.db['geocache'] 21 | self.update_schema() 22 | self.geocoder = geocoder 23 | self.group_key = group_key 24 | self.prev_row = None 25 | 26 | def update_schema(self): 27 | if 'geocache' not in self.db: 28 | self.db.create_table('geocache', 29 | primary_id='address', 30 | primary_type=self.db.types.string) 31 | 32 | def complete(self, result): 33 | if 'lat' in result and 'lng' in result: 34 | if result['lat'] is not None and result['lng'] is not None: 35 | if result['lat'] != '' and result['lng'] != '': 36 | result['latlng'] = "{},{}".format(result['lat'], 37 | result['lng']) 38 | return result 39 | 40 | def find(self, address): 41 | if address is None or address.lower() == 'n/a': 42 | return { 43 | 'status': "not applicable" 44 | } 45 | results = self.geocache.find(address=address) 46 | for row in results: 47 | return self.complete(dict(row)) 48 | result = self.find_without_cache(address) 49 | print("--- geocoded [{}]".format(result)) 50 | if result is None: 51 | result = { 52 | 'address': address, 53 | 'status': 'unknown' 54 | } 55 | self.geocache.insert(result) 56 | else: 57 | result['status'] = 'ok' 58 | self.geocache.insert(result) 59 | self.db.commit() 60 | return self.complete(result) 61 | 62 | def blank(self, val): 63 | return val is None or val == "" 64 | 65 | def find_all(self, rows, pattern, cols): 66 | for row in rows: 67 | parts = [] 68 | for p in pattern: 69 | if isinstance(p, int): 70 | if ((self.blank(row[p]) and self.prev_row and 71 | self.prev_row[self.group_key] == row[self.group_key] and 72 | not self.blank(self.group_key) and 73 | not self.blank(row[self.group_key]))): 74 | parts.append(self.prev_row[p]) 75 | else: 76 | parts.append(row[p]) 77 | else: 78 | 
parts.append(p) 79 | parts = [part for part in parts if not self.blank(part)] 80 | if six.PY2: 81 | address = " ".join(str((x or '').encode('utf-8')) for x in parts) 82 | else: 83 | address = " ".join(str(x or '') for x in parts) 84 | result = self.find(address) 85 | if result['status'] == 'ok': 86 | for col in cols: 87 | name = col[0].lower() 88 | idx = col[1] 89 | val = result[name] 90 | if idx >= len(row): 91 | row.append(None) 92 | if row[idx] is None or row[idx] == '': 93 | row[idx] = val 94 | if self.group_key: 95 | if self.prev_row: 96 | if self.prev_row[self.group_key] != row[self.group_key]: 97 | self.prev_row = row 98 | else: 99 | self.prev_row = row 100 | 101 | def find_without_cache(self, address): 102 | print("--- geocoding [{}]".format(address)) 103 | if self.geocoder == "datasciencetoolkit" or self.geocoder is None: 104 | return self.find_without_cache_dstk(address) 105 | if self.geocoder == "google": 106 | return self.find_without_cache_gmap(address) 107 | if self.geocoder == "dummy": 108 | return self.find_without_cache_dummy(address) 109 | raise ValueError('unknown geocoder {}'.format(self.geocoder)) 110 | 111 | def find_without_cache_dummy(self, address): 112 | return { 113 | "address": address, 114 | "lat": 10.0, 115 | "lng": 10.0, 116 | "street": "Street St", 117 | "locality": "Cityville", 118 | "region": "New State", 119 | "country": "Countryland", 120 | "postal_code": "PO-STAL", 121 | "administrative_area_level_2": "Glig County", 122 | "status": 'valid' 123 | } 124 | 125 | def find_without_cache_dstk(self, address): 126 | try: 127 | r = requests.post("http://www.datasciencetoolkit.org/street2coordinates/", address, 128 | timeout=15) 129 | v = json.loads(r.text) 130 | v = v[address] 131 | return { 132 | "address": address, 133 | "lat": v['latitude'], 134 | "lng": v['longitude'], 135 | "street": v['street_address'], 136 | "locality": v['locality'], 137 | "region": v['region'], 138 | "country": v['country_name'], 139 | "postal_code": None, 140 | "administrative_area_level_2": v['fips_county'], 141 | "status": 'valid' 142 | } 143 | except: 144 | return None 145 | 146 | def find_without_cache_gmap(self, address, fallback=None): 147 | try: 148 | def get_part(cmps, name, fallback=None): 149 | zips = [cmp["long_name"] for cmp in cmps if name in cmp["types"]] 150 | zip = zips[0] if len(zips)>0 else fallback 151 | return zip 152 | 153 | v = None 154 | xaddress = address 155 | key = os.environ['GOOGLE_GEOCODER_KEY'] 156 | for delay in [1, 2, 4, 8]: 157 | r = requests.get("https://maps.googleapis.com/maps/api/geocode/json", 158 | params={"sensor": "false", "address": xaddress, "key": key}) 159 | time.sleep(delay) 160 | v = json.loads(r.text) 161 | print("v", v) 162 | if 'status' in v: 163 | if v['status'] == 'ZERO_RESULTS': 164 | if ',' in xaddress: 165 | xaddress = xaddress.split(',', 1)[1] 166 | continue 167 | if v['status'] != 'OVER_QUERY_LIMIT': 168 | break 169 | coord = v["results"][0]["geometry"]["location"] 170 | lat = coord["lat"] 171 | lng = coord["lng"] 172 | cmp = v["results"][0]["address_components"] 173 | try: 174 | street = get_part(cmp, 'street_number', '') + ' ' + get_part(cmp, 'route') 175 | except: 176 | street = None 177 | return { 178 | "address": address, 179 | "lat": lat, 180 | "lng": lng, 181 | "street": street, 182 | "locality": get_part(cmp, 'locality'), 183 | "region": get_part(cmp, 'administrative_area_level_1'), 184 | "administrative_area_level_2": get_part(cmp, 'administrative_area_level_2'), 185 | "country": get_part(cmp, 'country'), 186 | 
"postal_code": get_part(cmp, 'postal_code') 187 | } 188 | except Exception as e: 189 | print("PROBLEM", e) 190 | return None 191 | 192 | 193 | if __name__ == '__main__': 194 | cache = GeoCache("cache.db") 195 | # print(cache.find("305 Memorial Dr, Cambridge, MA")) 196 | # print(cache.find("Chittenden, Franklin County, Connecticut, United States")) 197 | print(cache.find("Lamoille County, Connecticut, United States")) 198 | -------------------------------------------------------------------------------- /sheetsite/google_spreadsheet.py: -------------------------------------------------------------------------------- 1 | import pygsheets 2 | 3 | 4 | class GoogleSpreadsheet(object): 5 | 6 | def __init__(self): 7 | self.connection = None 8 | self.workbook = None 9 | 10 | def connect(self, credential_file): 11 | self.connection = pygsheets.authorize(service_file=credential_file) 12 | 13 | def load_remote(self, spreadsheet): 14 | self.workbook = self.connection.open_by_key(spreadsheet) 15 | 16 | def worksheets(self): 17 | return self.workbook.worksheets() 18 | -------------------------------------------------------------------------------- /sheetsite/ids.py: -------------------------------------------------------------------------------- 1 | import daff 2 | import json 3 | import os 4 | import uuid 5 | 6 | 7 | def process_ids(prev_file, curr_file, prev_id_file, id_file): 8 | io = daff.TableIO() 9 | dapp = daff.Coopy(io) 10 | if not os.path.exists(prev_file): 11 | prev_file = curr_file 12 | v1 = dapp.loadTable(prev_file, 'local') 13 | v2 = dapp.loadTable(curr_file, 'remote') 14 | flags = daff.CompareFlags() 15 | flags.allow_nested_cells = True 16 | alignment = daff.compareTables3(None, v1, v2, flags).align() 17 | daff.TableDiff(alignment, flags).hiliteSingle(daff.SimpleTable(0, 0)) 18 | if os.path.exists(prev_id_file): 19 | in_refs = json.load(open(prev_id_file)) 20 | else: 21 | in_refs = {} 22 | out_refs = {} 23 | for part in alignment.comp.child_order: 24 | comp = alignment.comp.children.h.get(part) 25 | nalignment = comp.alignment 26 | order = nalignment.toOrder().getList() 27 | v1 = comp.a 28 | v2 = comp.b 29 | ref = in_refs.get(part, {}) 30 | if part not in out_refs: 31 | out_ref = out_refs[part] = {} 32 | mints = 0 33 | copies = 0 34 | drops = 0 35 | for o in order: 36 | if o.r == 0: 37 | continue 38 | if o.r >= 0 and o.l >= 0: 39 | src = ref.get(str(o.l)) 40 | if src is None: 41 | out_ref[o.r] = str(uuid.uuid4()) 42 | mints += 1 43 | else: 44 | out_ref[o.r] = ref[str(o.l)] 45 | copies += 1 46 | if o.r < 0 and o.l >= 0: 47 | drops += 1 48 | if o.r >= 0 and o.l < 0: 49 | out_ref[o.r] = str(uuid.uuid4()) 50 | mints += 1 51 | json.dump(out_refs, open(id_file, 'w'), indent=2) 52 | return out_refs 53 | -------------------------------------------------------------------------------- /sheetsite/json_spreadsheet.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import json 3 | 4 | 5 | class JsonSpreadsheet(object): 6 | 7 | def __init__(self, filename, data=None): 8 | if data is not None: 9 | self.data = data 10 | else: 11 | self.data = json.load(open(filename)) 12 | if 'tables' in self.data: 13 | self.sheets = [JsonSheet(n, self.data['tables'][n]) 14 | for n in self.data['names']] 15 | else: 16 | self.sheets = [JsonSheet('sheet', self.data['data'])] 17 | 18 | def worksheets(self): 19 | return self.sheets 20 | 21 | @classmethod 22 | def as_dict(cls, workbook): 23 | result = OrderedDict() 24 | order = result['names'] = [] 
25 | sheets = result['tables'] = OrderedDict() 26 | for sheet in workbook.worksheets(): 27 | title = sheet.title 28 | order.append(title) 29 | ws = sheets[title] = OrderedDict() 30 | vals = sheet.get_all_values() 31 | if len(vals) > 0: 32 | columns = vals[0] 33 | rows = vals[1:] 34 | ws['columns'] = columns 35 | ws['rows'] = [OrderedDict(zip(columns, row)) for row in rows] 36 | else: 37 | ws['columns'] = [] 38 | ws['rows'] = [] 39 | return result 40 | 41 | 42 | class JsonSheet(object): 43 | 44 | def __init__(self, name, data): 45 | self.name = name 46 | self.data = data 47 | if isinstance(data, list): 48 | print("WORKING WITH", data[0].keys()) 49 | self.columns = data[0].keys() 50 | self.data = {"rows": data} 51 | else: 52 | self.columns = data['columns'] 53 | 54 | def get_all_values(self): 55 | cols = [c for c in self.columns if c is not None] 56 | results = [cols] 57 | for row in self.data['rows']: 58 | print("Working on", row) 59 | results.append([row.get(c) for c in cols]) 60 | return results 61 | 62 | @property 63 | def title(self): 64 | return self.name 65 | 66 | -------------------------------------------------------------------------------- /sheetsite/jsonify.py: -------------------------------------------------------------------------------- 1 | from datetime import date, datetime 2 | import json 3 | 4 | 5 | def json_serialize(obj): 6 | if isinstance(obj, (datetime, date)): 7 | return obj.isoformat() 8 | raise TypeError ("Cannot deserialize %s" % type(obj)) 9 | 10 | 11 | def dump(*args, **kwargs): 12 | kwargs['default'] = json_serialize 13 | json.dump(*args, **kwargs) 14 | 15 | def dumps(*args, **kwargs): 16 | kwargs['default'] = json_serialize 17 | return json.dumps(*args, **kwargs) 18 | -------------------------------------------------------------------------------- /sheetsite/merged_spreadsheet.py: -------------------------------------------------------------------------------- 1 | class MergedSpreadsheet(object): 2 | def __init__(self, workbook, merge_tables): 3 | self.workbook = workbook 4 | merged = set() 5 | for key, lst in merge_tables.items(): 6 | merged = merged | set(lst) 7 | original_sheets = self.workbook.worksheets() 8 | sheet_by_name = {} 9 | for sheet in original_sheets: 10 | sheet_by_name[sheet.title] = sheet 11 | sheets = [sheet for sheet in original_sheets if sheet.title not in merged and '*' not in merged] 12 | for key, lst in merge_tables.items(): 13 | if lst[0] == '*': 14 | sheets.append(MergedSheet(key, original_sheets)) 15 | else: 16 | sheets.append(MergedSheet(key, [sheet_by_name[name] for name in lst])) 17 | self.sheets = sheets 18 | 19 | def worksheets(self): 20 | return self.sheets 21 | 22 | class MergedSheet(object): 23 | def __init__(self, name, sheets): 24 | self.sheets = sheets 25 | self.name = name 26 | 27 | def get_all_values(self): 28 | rows = [] 29 | for sheet in self.sheets: 30 | rows += sheet.get_all_values() 31 | deduped_rows = [] 32 | keys = {} 33 | for row in rows: 34 | # I hate near dupes!!!!! 
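# The near-duplicate check below builds a key from the first and fourth
# columns only (row[0] and row[3]), collapses runs of whitespace in that
# key, and keeps only the first row seen for each key.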
35 | rowx = [row[0], row[3]] 36 | # I hate python 2.7 37 | key = ' // '.join(str((x or '').encode('utf-8')) for x in rowx) 38 | import re 39 | key = re.sub(r'[\n\r ]+', ' ', key) 40 | if key not in keys: 41 | deduped_rows.append(row) 42 | keys[key] = True 43 | return deduped_rows 44 | 45 | @property 46 | def title(self): 47 | return self.name 48 | 49 | -------------------------------------------------------------------------------- /sheetsite/names.py: -------------------------------------------------------------------------------- 1 | 2 | NAMES = { 3 | 'lat': 'lat', 4 | 'latitude': 'lat', 5 | 'lng': 'lng', 6 | 'lon': 'lng', 7 | 'longitude': 'lng', 8 | 'address': 'address', 9 | 'zip': 'postal_code', 10 | 'zipcode': 'postal_code', 11 | 'zip_code': 'postal_code', 12 | 'zip code': 'postal_code', 13 | 'postal_code': 'postal_code', 14 | 'postal code': 'postal_code', 15 | 'locality': 'locality', 16 | 'city': 'locality', 17 | 'country': 'country', 18 | 'street': 'street', 19 | 'region': 'region', 20 | 'state': 'region', 21 | 'province': 'region', 22 | 'county': 'administrative_area_level_2', 23 | 'geo_county': 'administrative_area_level_2', 24 | 'latlng': 'latlng' 25 | } 26 | 27 | 28 | def normalize_name(name): 29 | name = name.lower() 30 | return NAMES.get(name, name) 31 | -------------------------------------------------------------------------------- /sheetsite/sheet.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | class Sheets(object): 4 | 5 | def __init__(self, data): 6 | self.data = data 7 | 8 | @property 9 | def tables(self): 10 | return [self.table(name) for name in self.data['names']] 11 | 12 | def table(self, name): 13 | return Table(self.data['tables'][name], name) 14 | 15 | def tables_with_columns(self, *columns, **keys): 16 | lst = [self.table(name) for name in self.data['names'] 17 | if set(columns) <= set(self.data['tables'][name]['columns'])] 18 | if keys.get('require') and len(lst) == 0: 19 | raise Exception('no table found with column(s) {}'.format(columns)) 20 | return lst 21 | 22 | def __repr__(self): 23 | return json.dumps(self.data) 24 | 25 | def __getitem__(self, key): 26 | return self.data[key] 27 | 28 | def __setitem__(self, key, val): 29 | self.data[key] = val 30 | 31 | def __delitem__(self, key): 32 | del self.data[key] 33 | 34 | 35 | class Table(object): 36 | def __init__(self, data, name): 37 | self.data = data 38 | self.name = name 39 | 40 | @property 41 | def columns(self): 42 | return self.data['columns'] 43 | 44 | def has_column(self, name): 45 | return (name in self.columns) 46 | 47 | @property 48 | def rows(self): 49 | return [Row(row) for row in self.data['rows']] 50 | 51 | def add_column(self, name): 52 | if self.has_column(name): 53 | return 54 | self.data['columns'].append(name) 55 | for row in self.rows: 56 | row[name] = None 57 | 58 | def remove_column(self, name): 59 | if not self.has_column(name): 60 | return 61 | self.data['columns'] = [c for c in self.data['columns'] if c != name] 62 | for row in self.rows: 63 | del row[name] 64 | 65 | def __repr__(self): 66 | return json.dumps(self.data) 67 | 68 | class Row(object): 69 | def __init__(self, data): 70 | self.data = data 71 | 72 | def __getitem__(self, key): 73 | return self.data[key] 74 | 75 | def __setitem__(self, key, val): 76 | self.data[key] = val 77 | 78 | def __delitem__(self, key): 79 | del self.data[key] 80 | 81 | def __repr__(self): 82 | return json.dumps(self.data) 83 | 84 | def add_to_set(self, key, val): 85 | if self.data[key] is 
None: 86 | self.data[key] = [] 87 | if val not in self.data[key]: 88 | self.data[key].append(val) 89 | -------------------------------------------------------------------------------- /sheetsite/sheetsend.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import sheetsite 5 | import shutil 6 | import subprocess 7 | 8 | # States: 9 | # .pending -> will need to be processed 10 | # .processing -> working on it 11 | # .ack_pending -> will need to be acknowledged 12 | # .ack_processing -> working on acking 13 | 14 | def run(): 15 | parser = argparse.ArgumentParser(description='Update a website from a spreadsheet. ' 16 | 'Take a spreadsheet (from google sheets or locally), and ' 17 | 'convert it to a .json file that a static website ' 18 | 'generator like jekyll can use, then push it out.') 19 | parser.add_argument('layout_file', nargs="?", help='json file ' 20 | 'describing source, destination, and all settings') 21 | parser.add_argument('--cache', nargs=1, required=False, default='cache', 22 | help='cache directory where work is stored.') 23 | parser.add_argument('--spool', nargs=1, required=False, 24 | help='if supplied, work only on sheets mentioned in this directory.' 25 | '(see sheetmail)') 26 | args = parser.parse_args() 27 | if args.layout_file is None: 28 | print "Need a layout file, I should give you an example." 29 | print "See example_sites.json in github repository for sheetsite." 30 | print "Add -h for help." 31 | exit(1) 32 | 33 | layout = json.loads(open(args.layout_file).read()) 34 | root = args.cache[0] 35 | spool = args.spool[0] 36 | 37 | names = layout['names'] 38 | 39 | for name in names: 40 | 41 | site = layout['sites'][name] 42 | 43 | source = site['source'] 44 | if source['name'] != 'google-sheets': 45 | print "do not know how to read from", source['name'] 46 | exit(1) 47 | 48 | if spool is not None: 49 | key = source['key'] 50 | pending_file = os.path.join(spool, '{}.pending.json'.format(key)) 51 | processing_file = os.path.join(spool, '{}.processing.json'.format(key)) 52 | present = False 53 | if os.path.exists(pending_file): 54 | shutil.move(pending_file, processing_file) 55 | present = True 56 | if os.path.exists(processing_file): 57 | present = True 58 | if not present: 59 | continue 60 | 61 | path = os.path.join(root, name) 62 | if not(os.path.exists(path)): 63 | os.makedirs(path) 64 | 65 | from sheetsite.google_spreadsheet import GoogleSpreadsheet 66 | from sheetsite.site import Site 67 | wb = GoogleSpreadsheet() 68 | wb.connect(source['credential_file']) 69 | wb.load_remote(source['key']) 70 | 71 | ss = Site(wb, os.path.join(path, 'geocache.sqlite')) 72 | if 'flags' in site: 73 | ss.configure(site['flags']) 74 | output_file = os.path.join(path, 'public.json') 75 | private_output_file = os.path.join(path, 'private.json') 76 | ss.save_local(output_file) 77 | ss.save_local(private_output_file, private_sheets=True) 78 | 79 | destination = site['destination'] 80 | if destination['name'] != 'git': 81 | print "do not know how to write to", destination['name'] 82 | exit(1) 83 | 84 | local_repo = os.path.join(path, 'repo') 85 | if not(os.path.exists(local_repo)): 86 | subprocess.check_output(['git', 'clone', destination['repo'], local_repo]) 87 | wd = os.getcwd() 88 | os.chdir(local_repo) 89 | subprocess.check_output(['git', 'pull']) 90 | os.chdir(wd) 91 | shutil.copyfile(output_file, os.path.join(local_repo, destination['file'])) 92 | os.chdir(local_repo) 93 | 
subprocess.check_output(['git', 'add', destination['file']]) 94 | try: 95 | subprocess.check_output(['git', 'commit', '-m', 'update from sheetsite']) 96 | subprocess.check_output(['git', 'push']) 97 | except subprocess.CalledProcessError: 98 | print "Commit/push skipped" 99 | os.chdir(wd) 100 | 101 | if spool is not None: 102 | key = source['key'] 103 | processing_file = os.path.join(spool, '{}.processing.json'.format(key)) 104 | ack_pending_file = os.path.join(spool, '{}.ack_pending.json'.format(key)) 105 | if os.path.exists(processing_file): 106 | shutil.move(processing_file, ack_pending_file) 107 | -------------------------------------------------------------------------------- /sheetsite/sheetwatch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import dataset 3 | import datetime 4 | import imaplib 5 | import json 6 | import os 7 | import re 8 | import six 9 | import sys 10 | import time 11 | 12 | try: 13 | from sheetsite.tasks.detect_site import detect_site 14 | except ImportError as e: 15 | print(e) 16 | print("*** Did you pip install sheetsite[queue]?") 17 | exit(1) 18 | 19 | 20 | def find_sheet(msg): 21 | key = None 22 | title = None 23 | who = None 24 | m = re.search(r'docs.google.com/spreadsheets/d/([^/]*)/', msg.body) 25 | if m: 26 | key = m.group(1) 27 | m = re.search(r'\"([^\"]*)', msg.subject) 28 | if m: 29 | title = m.group(1) 30 | title = re.sub(r'[\r\n]', '', title) 31 | m = re.search(r'[\r\n]\* (.*) made changes', msg.body) 32 | if m: 33 | who = m.group(1) 34 | if key is not None: 35 | print("Found %s: %s (%s)" % (key, title, who)) 36 | return { 37 | "key": key, 38 | "title": title, 39 | "who": who 40 | } 41 | return None 42 | 43 | 44 | def store_work(job): 45 | if 'key' not in job: 46 | return 47 | detect_site.delay(job) 48 | 49 | 50 | class TestMail(object): 51 | def __init__(self, subject=None, body=None, labels=None): 52 | self.subject = subject 53 | self.body = body 54 | self.labels = labels 55 | 56 | def fetch(self): 57 | pass 58 | 59 | def has_label(self, label): 60 | return label in self.labels 61 | 62 | def add_label(self, label): 63 | if not self.has_label(label): 64 | self.labels.append(label) 65 | 66 | 67 | class TestMailbox(object): 68 | def __init__(self, fname): 69 | self.fname = fname 70 | self.data = json.load(open(fname)) 71 | 72 | def inbox(self): 73 | return self 74 | 75 | def mail(self, **_): 76 | return [TestMail(**x) for x in self.data] 77 | 78 | def logout(self): 79 | pass 80 | 81 | 82 | class ImapMail(object): 83 | def __init__(self, parent, uid): 84 | self.parent = parent 85 | self.uid = uid 86 | self.subject = "" 87 | self.body = "" 88 | 89 | def plain(self, part): 90 | if isinstance(part, six.string_types): 91 | return part.encode('utf8', 'xmlcharrefreplace').strip() 92 | return part.as_string() 93 | 94 | def parse_header(self, part): 95 | if isinstance(part, six.string_types): 96 | return self.plain(part) 97 | elif isinstance(part, list): 98 | return " ".join([self.parse_header(p) for p in part]) 99 | elif isinstance(part, tuple): 100 | return part[0] 101 | return part 102 | 103 | def parse_body(self, message): 104 | payload = message.get_payload(decode=True) or message.get_payload() 105 | if isinstance(payload, six.string_types): 106 | return self.plain(payload) 107 | elif isinstance(payload, list): 108 | for part in payload: 109 | if part.get_content_type() == 'text/plain': 110 | return self.plain(part) 111 | return self.plain(payload[0]) 112 | return message.as_string() 113 | 114 
| def fetch(self): 115 | result, data = self.parent.mailer.uid('fetch', self.uid, '(RFC822)') 116 | raw_email = data[0][1] 117 | import email 118 | from email.header import decode_header 119 | email_message = email.message_from_string(raw_email.decode('utf-8')) 120 | 121 | def extract(key): 122 | return self.parse_header(decode_header(email_message[key])) 123 | self.subject = extract('Subject') 124 | self.body = self.parse_body(email_message) 125 | 126 | def has_label(self, label): 127 | return False 128 | 129 | def add_label(self, label): 130 | self.parent.set_processed(self.uid) 131 | 132 | 133 | class ImapMailbox(object): 134 | def __init__(self, username, pword): 135 | self.mailer = imaplib.IMAP4_SSL('imap.gmail.com') 136 | self.db_name = os.path.join(os.environ['SHEETSITE_CACHE'], 137 | "emails.sqlite3") 138 | self.db_uri = "sqlite:///{}".format(self.db_name) 139 | print(self.db_uri) 140 | self.db = dataset.connect(self.db_uri) 141 | self.record = self.db['emails'] 142 | import sqlalchemy.types 143 | if self.record.count() == 0: 144 | self.record.create_column('uid', sqlalchemy.types.Text) 145 | self.record.create_index(['uid']) 146 | self.login(username, pword) 147 | 148 | def login(self, username, pword): 149 | self.mailer.login(username, pword) 150 | 151 | def inbox(self): 152 | self.mailer.select('inbox') 153 | return self 154 | 155 | def set_processed(self, uid): 156 | self.record.insert({'uid': uid}) 157 | 158 | def mail(self, **_): 159 | import datetime 160 | date = (datetime.date.today() - datetime.timedelta(10)).strftime("%d-%b-%Y") 161 | result, data = self.mailer.uid( 162 | 'search', 163 | None, 164 | '(SENTSINCE {date} FROM "notify@google.com")'.format( 165 | date=date) 166 | ) 167 | email_uids = data[0].split() 168 | mails = [] 169 | for uid in email_uids: 170 | print("Checking", uid) 171 | if len(list(self.record.find(uid=uid))) == 0: 172 | print("Not processed yet!") 173 | mails.append(ImapMail(self, uid)) 174 | return mails 175 | 176 | def logout(self): 177 | self.mailer.logout() 178 | 179 | 180 | def worker(): 181 | from celery.__main__ import main 182 | while len(sys.argv) > 0: 183 | sys.argv.pop() 184 | for arg in ['celery', '-A', 'sheetsite.site_queue', 'worker', '-l', 'info']: 185 | sys.argv.append(arg) 186 | sys.exit(main()) 187 | 188 | 189 | def run(): 190 | 191 | parser = argparse.ArgumentParser(description='Check email for sheet change notifications.' 
192 | 'For when webhooks are not an option.') 193 | 194 | subparsers = parser.add_subparsers(dest='cmd') 195 | 196 | ping = subparsers.add_parser('ping') 197 | 198 | ping.add_argument('--clear', action='store_true', 199 | help="do not take action on initial emails, just absorb them") 200 | 201 | ping.add_argument('--no-notify', action='store_true', 202 | help="do not send notification emails") 203 | 204 | ping.add_argument('--delay', type=int, default=0, 205 | help="delay in seconds between pings" 206 | " (if not set, just one ping is made") 207 | 208 | subparsers.add_parser('worker') 209 | 210 | args = parser.parse_args() 211 | 212 | if args.cmd == 'worker': 213 | worker() 214 | return 215 | 216 | ignore = args.clear 217 | while True: 218 | # log in to gmail 219 | if 'GMAIL_PASSWORD' in os.environ: 220 | if os.environ['GMAIL_USERNAME'] == 'test': 221 | g = TestMailbox(os.environ['GMAIL_PASSWORD']) 222 | else: 223 | g = ImapMailbox(os.environ['GMAIL_USERNAME'], 224 | os.environ['GMAIL_PASSWORD']) 225 | else: 226 | print("Need GMAIL_USERNAME/GMAIL_PASSWORD to be set in environment.") 227 | print("They should be set to whatever account receives change notications of sheet.") 228 | exit(1) 229 | 230 | # look for recent emails from google notify 231 | window = datetime.datetime.now() - datetime.timedelta(days=10) 232 | mail = g.inbox().mail(sender='notify@google.com', after=window) 233 | 234 | # check emails for action items 235 | keys = {} 236 | for msg in mail: 237 | msg.fetch() 238 | print(msg.subject) 239 | # msg.remove_label('sheetmailed') 240 | if msg.has_label('sheetmailed'): 241 | continue 242 | sheet = find_sheet(msg) 243 | if sheet is not None: 244 | if sheet['key'] in keys: 245 | sheet = None 246 | else: 247 | keys[sheet['key']] = True 248 | if sheet is not None: 249 | if not ignore: 250 | sheet['no_notify'] = args.no_notify 251 | store_work(sheet) 252 | else: 253 | print(" * ignoring this email as directed") 254 | msg.add_label('sheetmailed') 255 | 256 | # leave 257 | g.logout() 258 | 259 | if args.delay == 0: 260 | break 261 | 262 | ignore = False 263 | time.sleep(args.delay) 264 | 265 | 266 | if __name__ == '__main__': 267 | run() 268 | -------------------------------------------------------------------------------- /sheetsite/site.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from sheetsite.names import normalize_name 4 | from sheetsite.filtered_spreadsheet import FilteredSpreadsheet 5 | from sheetsite.merged_spreadsheet import MergedSpreadsheet 6 | 7 | 8 | class Site(object): 9 | 10 | def __init__(self, spreadsheet, geocache_filename=None, censor=True): 11 | self.workbook = spreadsheet 12 | self.geocache_filename = geocache_filename 13 | self.censor = censor 14 | self.include = None 15 | self.exclude = None 16 | self.fill_columns = None 17 | self.add_columns = {} 18 | self.const_columns = {} 19 | self.rename_columns = {} 20 | self.address_columns = {} 21 | self.constant_columns = {} 22 | self.merge_tables = None 23 | self.modify = True 24 | self.geocoder = None 25 | self.group_key = None 26 | self.ids = None 27 | 28 | def add_sheet_filter(self, include, exclude): 29 | self.include = include 30 | self.exclude = exclude 31 | 32 | def add_column_fills(self, fill_columns): 33 | if fill_columns is None: 34 | self.fill_columns = None 35 | return 36 | self.fill_columns = [normalize_name(n) for n in fill_columns] 37 | 38 | def save_local(self, output_file, private_sheets=False, enhance=True): 39 | self.modify = 
enhance 40 | ext = '-' 41 | if output_file is not None: 42 | _, ext = os.path.splitext(output_file) 43 | ext = ext.lower() 44 | 45 | return self.save(output_file, private_sheets) 46 | 47 | def add_ids(self, ids): 48 | self.ids = ids 49 | 50 | def process_cells(self, rows, name): 51 | if not(self.modify): 52 | return rows 53 | rows = self.clean_cells(rows, name) 54 | rows = self.add_location(rows, name) 55 | return rows 56 | 57 | def filter(self, sheet, private_sheets): 58 | title = sheet.title 59 | core_title = re.sub(r'\(\((.*)\)\)', r'\1', title) 60 | if self.exclude is not None: 61 | if core_title in self.exclude: 62 | return None 63 | if self.include is not None: 64 | if core_title in self.include: 65 | return core_title 66 | return None 67 | if (core_title == title) == private_sheets: 68 | return None 69 | return core_title 70 | 71 | def private_workbook(self): 72 | return self.filtered_workbook(True) 73 | 74 | def public_workbook(self): 75 | return self.filtered_workbook(False) 76 | 77 | def merge(self, wb, merge_tables): 78 | if merge_tables is None: 79 | return wb 80 | return MergedSpreadsheet(wb, merge_tables) 81 | 82 | def filtered_workbook(self, selector_flags): 83 | workbook = self.merge(self.workbook, self.merge_tables) 84 | selector = lambda sheet: self.filter(sheet, selector_flags) 85 | processor = lambda sheet, title: self.process_cells(sheet.get_all_values(), title) 86 | fs = FilteredSpreadsheet(workbook, selector=selector, processor=processor) 87 | return fs 88 | 89 | def save(self, output_file, selector_flags): 90 | from sheetsite.destination import write_destination 91 | params = { 'output_file': output_file } 92 | state = { 'workbook': self.filtered_workbook(selector_flags) } 93 | write_destination(params, state) 94 | return True 95 | 96 | def sanity_stick(self, locs): 97 | result = [] 98 | if len(locs) <= 1: 99 | return locs 100 | import re 101 | if len(re.sub(r'[^,]', '', locs[0])) < 3: 102 | return [' '.join(locs)] 103 | return locs 104 | 105 | def clean_cells(self, vals, name): 106 | if len(vals) == 0: 107 | return vals 108 | 109 | hide_column = {} 110 | split_column = {} 111 | for idx, cell in enumerate(vals[0]): 112 | if cell is None or len(cell) == 0 or cell[0] == '(': 113 | hide_column[idx] = True 114 | if cell == "Other Addresses (deprecated)": 115 | split_column[idx] = '\n' 116 | 117 | results = [] 118 | 119 | existing = {} 120 | for ridx, row in enumerate(vals): 121 | result = [] 122 | for idx, cell in enumerate(row): 123 | if idx in hide_column: 124 | continue 125 | if cell is not None: 126 | try: 127 | cell = re.sub(r'\(\(.*\)\)','', cell) 128 | cell = re.sub(r'[\n\r]+$','', cell) 129 | cell = re.sub(r'^[\t \n\r]+$','', cell) 130 | except TypeError: 131 | pass 132 | if ridx > 0: 133 | if idx in split_column: 134 | if cell is not None: 135 | splits = cell.split(split_column[idx]) 136 | splits = self.sanity_stick(splits) 137 | cell = [['address']] + [[x] for x in splits] 138 | print(">>>", cell) 139 | cell = self.clean_cells(cell, "other") 140 | cell = self.add_location(cell, "other") 141 | cell = { 142 | 'columns': cell[0], 143 | 'rows': [dict(zip(cell[0], row)) for row in cell[1:]] 144 | } 145 | print("<<<", cell) 146 | result.append(cell) 147 | if ridx == 0: 148 | existing[cell] = 1 149 | if name in self.add_columns: 150 | for col in self.add_columns[name]: 151 | if col not in existing: 152 | if ridx == 0: 153 | result.append(col) 154 | else: 155 | result.append(None) 156 | if name in self.constant_columns: 157 | for col, val in 
self.constant_columns[name].items(): 158 | if col not in existing: 159 | if ridx == 0: 160 | result.append(col) 161 | else: 162 | result.append(val) 163 | results.append(result) 164 | 165 | return results 166 | 167 | def add_location(self, vals, name): 168 | if len(vals) == 0: 169 | return vals 170 | 171 | have_address = False 172 | have_fill_in = False 173 | pattern = [0] 174 | fill_in = [] 175 | group_index = None 176 | offset = 0 177 | for idx, cell in enumerate(vals[0]): 178 | if name in self.rename_columns: 179 | renames = self.rename_columns[name] 180 | if cell in renames: 181 | cell = renames[cell] 182 | vals[0][idx] = cell 183 | if cell == self.group_key and self.group_key is not None: 184 | group_index = idx 185 | nn = normalize_name(cell) 186 | if nn == 'address': 187 | pattern = [idx] 188 | have_address = True 189 | if cell is not None and len(cell) > 0 and cell[0] == '[': 190 | have_fill_in = True 191 | vals[0][idx] = cell[1:-1] 192 | fill_in.append([normalize_name(vals[0][idx]), idx]) 193 | if self.fill_columns is not None: 194 | if nn in self.fill_columns: 195 | have_fill_in = True 196 | fill_in.append([nn, idx]) 197 | if self.add_columns is not None: 198 | if name in self.add_columns: 199 | if cell in self.add_columns[name]: 200 | offset -= 1 201 | fill_in.append([normalize_name(nn), idx]) 202 | have_fill_in = True 203 | if self.address_columns is not None: 204 | if name in self.address_columns: 205 | have_address = True 206 | pattern = self.address_columns[name] 207 | for idx, col in enumerate(pattern): 208 | try: 209 | pattern[idx] = vals[0].index(col) 210 | except ValueError: 211 | pass 212 | if have_fill_in: 213 | dccid = None 214 | for at, (cname, cidx) in enumerate(fill_in): 215 | if cname == 'dccid' and self.ids is not None: 216 | dccid = at 217 | if name in self.ids: 218 | ref = self.ids[name] 219 | for idx, row in enumerate(vals): 220 | if idx == 0: 221 | continue 222 | key = ref.get(idx) 223 | row[cidx] = key 224 | if dccid is not None: 225 | del fill_in[dccid] 226 | if len(fill_in) == 0: 227 | have_fill_in = False 228 | if not(have_fill_in) or not(have_address): 229 | return vals 230 | from sheetsite.geocache import GeoCache 231 | cache = GeoCache(self.geocache_filename, geocoder=self.geocoder, 232 | group_key=group_index) 233 | cache.find_all(vals[1:], pattern, fill_in) 234 | return vals 235 | 236 | def configure(self, flags): 237 | self.geocoder = flags.get('geocoder') 238 | for key, val in flags.items(): 239 | if key == 'rename': 240 | self.rename_columns = val 241 | if key == 'add': 242 | self.add_columns = val 243 | if key == 'constant': 244 | self.constant_columns = val 245 | if key == 'address': 246 | self.address_columns = val 247 | if key == 'merge': 248 | self.merge_tables = val 249 | if key == 'group': 250 | self.group_key = val 251 | 252 | 253 | -------------------------------------------------------------------------------- /sheetsite/site_queue.py: -------------------------------------------------------------------------------- 1 | from celery import Celery 2 | import os 3 | 4 | app = Celery('sheetsite', 5 | broker=os.environ.get('SHEETSITE_BROKER_URL', None), 6 | backend=os.environ.get('SHEETSITE_RESULT_BACKEND', None), 7 | include=['sheetsite.tasks']) 8 | 9 | if __name__ == '__main__': 10 | app.start() 11 | -------------------------------------------------------------------------------- /sheetsite/source/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from sheetsite.source.csv import 
read_source_csv 3 | from sheetsite.source.google import read_source_google 4 | from sheetsite.source.excel import read_source_excel 5 | from sheetsite.source.json import read_source_json 6 | 7 | 8 | def read_source(params): 9 | 10 | readers = { 11 | '.csv': read_source_csv, 12 | 'google-sheets': read_source_google, 13 | '.json': read_source_json, 14 | '.xls': read_source_excel, 15 | '.xlsx': read_source_excel 16 | } 17 | 18 | name = None 19 | if 'name' in params: 20 | name = params['name'] 21 | elif 'filename' in params: 22 | _, ext = os.path.splitext(params['filename']) 23 | name = ext 24 | 25 | if name is None: 26 | raise IOError('source not specified') 27 | 28 | if name not in readers: 29 | raise IOError('source not recognized: {}'.format(name)) 30 | 31 | return readers[name](params) 32 | 33 | 34 | -------------------------------------------------------------------------------- /sheetsite/source/csv.py: -------------------------------------------------------------------------------- 1 | def read_source_csv(source): 2 | from sheetsite.csv_spreadsheet import CsvSpreadsheet 3 | wb = CsvSpreadsheet(source['filename']) 4 | return wb 5 | -------------------------------------------------------------------------------- /sheetsite/source/excel.py: -------------------------------------------------------------------------------- 1 | def read_source_excel(source): 2 | from sheetsite.xls_spreadsheet import XlsSpreadsheet 3 | wb = XlsSpreadsheet(source['filename']) 4 | return wb 5 | 6 | -------------------------------------------------------------------------------- /sheetsite/source/google.py: -------------------------------------------------------------------------------- 1 | def read_source_google(source): 2 | from sheetsite.google_spreadsheet import GoogleSpreadsheet 3 | wb = GoogleSpreadsheet() 4 | wb.connect(source['credential_file']) 5 | wb.load_remote(source['key']) 6 | return wb 7 | 8 | -------------------------------------------------------------------------------- /sheetsite/source/json.py: -------------------------------------------------------------------------------- 1 | def read_source_json(source): 2 | from sheetsite.json_spreadsheet import JsonSpreadsheet 3 | wb = JsonSpreadsheet(source['filename']) 4 | return wb 5 | -------------------------------------------------------------------------------- /sheetsite/spreadsheet.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import gspread 3 | import json 4 | import os 5 | from oauth2client.client import SignedJwtAssertionCredentials 6 | import re 7 | from sheetsite.jsonify import dump 8 | 9 | 10 | class Spreadsheet(object): 11 | 12 | def __init__(self, censor=True): 13 | self.connection = None 14 | self.workbook = None 15 | self.censor = censor 16 | 17 | def connect(self, credential_file): 18 | json_key = json.load(open(credential_file)) 19 | scope = ['https://spreadsheets.google.com/feeds'] 20 | credentials = SignedJwtAssertionCredentials(json_key['client_email'], 21 | json_key['private_key'], scope) 22 | self.connection = gspread.authorize(credentials) 23 | 24 | def load_remote(self, spreadsheet_key): 25 | self.workbook = self.connection.open_by_key(spreadsheet_key) 26 | 27 | def save_local(self, output_file): 28 | _, ext = os.path.splitext(output_file) 29 | 30 | if ext == ".xls": 31 | return self.save_to_excel(output_file) 32 | elif ext == ".json": 33 | return self.save_to_json(output_file) 34 | 35 | print("Unknown extension", ext) 36 | return False 37 | 38 | 
def save_to_excel(self, output_file): 39 | import xlwt 40 | wb = xlwt.Workbook() 41 | for sheet in self.workbook.worksheets(): 42 | ws = wb.add_sheet(sheet.title) 43 | rows = self.clean_cells(sheet.get_all_values()) 44 | for r, row in enumerate(rows): 45 | for c, cell in enumerate(row): 46 | ws.write(r, c, cell) 47 | wb.save(output_file) 48 | return True 49 | 50 | def save_to_json(self, output_file): 51 | result = OrderedDict() 52 | order = result['names'] = [] 53 | sheets = result['tables'] = OrderedDict() 54 | for sheet in self.workbook.worksheets(): 55 | order.append(sheet.title) 56 | ws = sheets[sheet.title] = OrderedDict() 57 | vals = self.clean_cells(sheet.get_all_values()) 58 | columns = vals[0] 59 | rows = vals[1:] 60 | ws['columns'] = columns 61 | ws['rows'] = [OrderedDict(zip(columns, row)) for row in rows] 62 | with open(output_file, 'w') as f: 63 | dump(result, f, indent=2) 64 | return True 65 | 66 | def clean_cells(self, vals): 67 | hide_column = {} 68 | 69 | for idx, cell in enumerate(vals[0]): 70 | if len(cell) == 0 or cell[0] == '(': 71 | hide_column[idx] = True 72 | 73 | results = [] 74 | 75 | for ridx, row in enumerate(vals): 76 | result = [] 77 | for idx, cell in enumerate(row): 78 | if idx in hide_column: 79 | continue 80 | cell = re.sub(r'\(\(.*\)\)', '', cell) 81 | cell = re.sub(r'[\n\r]+$', '', cell) 82 | cell = re.sub(r'^[\t \n\r]+$', '', cell) 83 | result.append(cell) 84 | results.append(result) 85 | 86 | return results 87 | 88 | -------------------------------------------------------------------------------- /sheetsite/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | from sheetsite.site_queue import app 2 | from sheetsite.site import Site 3 | import sheetsite.tasks.notify 4 | import sheetsite.tasks.update_site 5 | import sheetsite.tasks.detect_site 6 | 7 | 8 | @app.task 9 | def add(x, y): 10 | return x + y 11 | 12 | 13 | -------------------------------------------------------------------------------- /sheetsite/tasks/detect_site.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from sheetsite.expand import load_config 4 | from sheetsite.site_queue import app 5 | from sheetsite.tasks.update_site import update_site 6 | 7 | 8 | @app.task 9 | def detect_site(params): 10 | key = params['key'] 11 | print("PROCESS_spreadsheet", key, params) 12 | 13 | if os.path.isdir(os.environ['SHEETSITE_LAYOUT']): 14 | from glob import glob 15 | files = glob(os.path.join(os.environ['SHEETSITE_LAYOUT'], '*.yml')) 16 | files += glob(os.path.join(os.environ['SHEETSITE_LAYOUT'], '*.json')) 17 | layout = { 18 | 'names': [], 19 | 'sites': {} 20 | } 21 | for fname in files: 22 | name = os.path.splitext(os.path.split(fname)[1])[0] 23 | layout['names'].append(name) 24 | layout['sites'][name] = load_config(fname) 25 | else: 26 | # old big json file 27 | layout = json.loads(open(os.environ['SHEETSITE_LAYOUT']).read()) 28 | 29 | root = os.environ['SHEETSITE_CACHE'] 30 | 31 | names = layout['names'] 32 | 33 | for name in names: 34 | 35 | site = layout['sites'][name] 36 | 37 | if key != site['source']['key']: 38 | continue 39 | 40 | path = os.path.join(root, name) 41 | if not(os.path.exists(path)): 42 | os.makedirs(path) 43 | 44 | update_site.delay(params, path, site, name) 45 | 46 | return False 47 | 48 | -------------------------------------------------------------------------------- /sheetsite/tasks/notify.py: 
-------------------------------------------------------------------------------- 1 | from email.mime.multipart import MIMEMultipart 2 | from email.mime.text import MIMEText 3 | import json 4 | import os 5 | from sheetsite.site_queue import app 6 | import smtplib 7 | 8 | 9 | @app.task 10 | def notify_one(email, subject, page, text): 11 | 12 | print("send [%s] / %s / %s" % (email, subject, page)) 13 | 14 | server_ssl = smtplib.SMTP_SSL("smtp.gmail.com", 465) 15 | server_ssl.ehlo() # optional, called by login() 16 | me = os.environ['GMAIL_USERNAME'] 17 | server_ssl.login(me, os.environ['GMAIL_PASSWORD']) 18 | 19 | msg = MIMEMultipart('alternative') 20 | msg['Subject'] = subject 21 | msg['From'] = me 22 | msg['To'] = email 23 | 24 | # Record the MIME types of both parts - text/plain and text/html. 25 | part1 = MIMEText(text, 'plain') 26 | part2 = MIMEText(page, 'html') 27 | 28 | msg.attach(part1) 29 | msg.attach(part2) 30 | 31 | server_ssl.sendmail(me, email, msg.as_string()) 32 | server_ssl.close() 33 | 34 | return True 35 | 36 | 37 | @app.task 38 | def notify_all(name, site_params, diff_html, diff_text): 39 | print("NOTIFY_spreadsheet", site_params, name) 40 | 41 | import daff 42 | import jinja2 43 | import premailer 44 | 45 | root = os.environ['SHEETSITE_CACHE'] 46 | path = os.path.join(root, name) 47 | print("Should look in", path) 48 | notifications = None 49 | for fname in ['private.json', 'public.json']: 50 | full_fname = os.path.join(path, fname) 51 | print("Look in", full_fname) 52 | book = json.loads(open(full_fname).read()) 53 | if 'notifications' in book['tables']: 54 | notifications = book['tables']['notifications'] 55 | break 56 | if notifications is None: 57 | print("No notifications requested") 58 | return True 59 | print("Notifications", notifications) 60 | 61 | # make a html report 62 | css = daff.DiffRender().sampleCss() 63 | site_params = dict(site_params) 64 | site_params['css'] = css 65 | site_params['diff'] = diff_html 66 | env = jinja2.Environment(loader=jinja2.PackageLoader('sheetsite', 'templates')) 67 | template = env.get_template('update.html') 68 | page = template.render(site_params) 69 | page = premailer.transform(page) 70 | site_params['diff'] = diff_text 71 | template = env.get_template('update.txt') 72 | page_text = template.render(site_params) 73 | 74 | for target in notifications['rows']: 75 | email = target.get('EMAIL', None) 76 | if email is None: 77 | email = target.get('email', None) 78 | if email is not None: 79 | if site_params['no_notify']: 80 | print("skip email to {}".format(email)) 81 | else: 82 | notify_one.delay(email=email, 83 | subject="update to {}".format(site_params.get('name', 84 | 'directory')), 85 | page=page, 86 | text=page_text) 87 | 88 | return True 89 | -------------------------------------------------------------------------------- /sheetsite/tasks/update_site.py: -------------------------------------------------------------------------------- 1 | from sheetsite.chain import apply_chain, compute_diff 2 | from sheetsite.site_queue import app 3 | 4 | 5 | @app.task 6 | def update_site(params, path, site, name): 7 | 8 | source = site['source'] 9 | destination = site['destination'] 10 | 11 | site_params = { 12 | 'name': params.get('title', None), 13 | 'who': params.get('who', None), 14 | 'sheet_link': source.get('link', None), 15 | 'site_link': destination.get('link', None), 16 | 'no_notify': params['no_notify'] 17 | } 18 | 19 | files = apply_chain(site, path) 20 | diff_html, diff_text = compute_diff(files, format='both') 21 | 22 | from 
sheetsite.tasks.notify import notify_all 23 | notify_all.delay(name=name, 24 | site_params=site_params, 25 | diff_html=diff_html, 26 | diff_text=diff_text) 27 | return True 28 | 29 | 30 | -------------------------------------------------------------------------------- /sheetsite/templates/update.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {{ name }} 6 | 9 | 10 | 11 | 12 |

There's been an update in the “{{ name }}” site.
13 | 
14 | 
15 | {% if who %}
16 |   • Edit made by: {{ who }}
17 | {% endif %}
18 | {% if site_link %}
19 |   • See site at: {{ site_link }}
20 | {% endif %}
21 | {% if sheet_link %}
22 |   • Edit at: {{ sheet_link }}
23 | {% endif %}
24 |   • Unsubscribe by removing your address from the notifications sheet.
25 | 
26 | 
27 | 
28 | {{ diff }}
29 | 
30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /sheetsite/templates/update.txt: -------------------------------------------------------------------------------- 1 | There's been an update in the "{{ name }}" site. 2 | 3 | {% if who %} 4 | * Edit made by: {{ who }} 5 | {% endif %}{% if site_link %} 6 | * See site at: {{ site_link }} 7 | {% endif %}{% if sheet_link %} 8 | * Edit at: {{ sheet_link }} 9 | {% endif %} 10 | * Unsubscribe by removing your address from the notifications sheet. 11 | 12 | {{ diff }} 13 | -------------------------------------------------------------------------------- /sheetsite/tweaks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paulfitz/sheetsite/0556e5713f01d7d8950365501bedecb5cdfabe6a/sheetsite/tweaks/__init__.py -------------------------------------------------------------------------------- /sheetsite/tweaks/add_dccid.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Early add id - this stuff needs to get reworked 3 | 4 | (stone soup only) 5 | ''' 6 | 7 | import json 8 | 9 | def apply3(wb, params, state): 10 | column = params['column'] 11 | id_file = state['id_file'] 12 | ids = json.load(open(id_file, 'r')) 13 | for name, t in wb['tables'].items(): 14 | if name in ids: 15 | ids0 = ids[name] 16 | if column not in t['columns']: 17 | t['columns'].append(column) 18 | for i, row in enumerate(t['rows']): 19 | idx = str(i + 1) 20 | row[column] = ids0[idx] 21 | -------------------------------------------------------------------------------- /sheetsite/tweaks/coalesce.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Replaces all blank values in a column with the first non-blank value in a series 3 | of columns, falling back on a default_value if all are blank. 4 | ``` 5 | tweaks: 6 | coalesce: 7 | # the first of the following list of columns is the one that is modified 8 | columns: first_priority_column second_priority_column third_priority_column 9 | default_value: N/A 10 | table: sheet1 # optional 11 | ``` 12 | ''' 13 | 14 | def apply(wb, params): 15 | columns = params['columns'] 16 | default_value = params['default'] 17 | table = params.get('table') 18 | active = False 19 | for name, t in wb['tables'].items(): 20 | if name == table or table is None: 21 | if len(set(columns) - set(t['columns'])) > 0: 22 | continue 23 | active = True 24 | for row in t['rows']: 25 | v = None 26 | for column in columns: 27 | if v is not None and v != "": 28 | break 29 | v = row[column] 30 | if v is None or v == '': 31 | v = default_value 32 | row[columns[0]] = v 33 | if not active: 34 | raise KeyError(column) 35 | -------------------------------------------------------------------------------- /sheetsite/tweaks/custom.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Apply a custom tweak. Expects my_script_name.py in same dir as .yml, with a the_method 3 | method that will receive wb, params. 4 | 5 | tweaks: 6 | custom: 7 | script: my_script_name 8 | method: the_method 9 | arg1: val1 10 | ... 
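A minimal my_script_name.py might look like this (a sketch: the
transformation shown is just an example, and wb is the workbook dict with
'names' and 'tables' keys):

    def the_method(wb, params):
        # params is the whole tweak config, e.g. params.get('arg1') == 'val1'
        for name in wb['names']:
            table = wb['tables'][name]
            for row in table['rows']:
                # example transformation: strip whitespace from string cells
                for col in table['columns']:
                    if isinstance(row.get(col), str):
                        row[col] = row[col].strip()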
11 | 12 | ''' 13 | 14 | import importlib 15 | import os 16 | import sys 17 | 18 | sys.path.append(os.getcwd()) 19 | 20 | def apply(wb, params): 21 | script = params['script'] 22 | method = params['method'] 23 | module = importlib.import_module(script) 24 | method_definition = getattr(module, method) 25 | return method_definition(wb, params) 26 | 27 | -------------------------------------------------------------------------------- /sheetsite/tweaks/formula.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Apply a python formatting string to a column. 3 | 4 | tweaks: 5 | formula: 6 | formula: "%05d" 7 | column: zip 8 | table: addresses # optional 9 | ''' 10 | 11 | def apply(wb, params): 12 | formula = params['formula'] 13 | column = params['column'] 14 | table = params.get('table') 15 | for name, t in wb['tables'].items(): 16 | if name == table or table is None: 17 | if column not in t['columns']: 18 | t['columns'].append(column) 19 | for row in t['rows']: 20 | row[column] = formula.format(**row) 21 | -------------------------------------------------------------------------------- /sheetsite/tweaks/list_to_rows.py: -------------------------------------------------------------------------------- 1 | 2 | ''' 3 | 4 | Take a list and make extra rows from it 5 | 6 | tweaks: 7 | list_to_rows: 8 | column: "Other Addresses" 9 | target: "address" # optional 10 | 11 | ''' 12 | 13 | 14 | import copy 15 | import re 16 | import six 17 | 18 | 19 | def apply(wb, params): 20 | column = params['column'] 21 | target = params.get('target', column) 22 | table = params.get('table') 23 | active = False 24 | for name, t in wb['tables'].items(): 25 | if name == table or table is None: 26 | if column not in t['columns']: 27 | continue 28 | if target not in t['columns']: 29 | continue 30 | active = True 31 | orows = [] 32 | for row in t['rows']: 33 | cell = row[column] 34 | print(">>>>", cell) 35 | orows.append(row) 36 | if cell is not None: 37 | if not isinstance(cell, six.string_types): 38 | for part in cell: 39 | nrow = copy.deepcopy(row) 40 | nrow[column] = None 41 | nrow[target] = part 42 | orows.append(nrow) 43 | t['rows'] = orows 44 | if not active: 45 | raise KeyError(column + " / " + target) 46 | 47 | 48 | -------------------------------------------------------------------------------- /sheetsite/tweaks/merge_tables.py: -------------------------------------------------------------------------------- 1 | 2 | ''' 3 | 4 | Smush all the tables together. 
5 | 6 | tweaks: 7 | merge_tables: 8 | table: directory # name of the single created table 9 | column: thing # sheet names are placed here 10 | 11 | ''' 12 | 13 | def apply(wb, params): 14 | table = params['table'] 15 | column = params['column'] 16 | input_names = wb['names'] 17 | input_tables = wb['tables'] 18 | wb['names'] = [table] 19 | target_table = {} 20 | wb['tables'] = { 21 | table: target_table 22 | } 23 | order_cols = [] 24 | seen_cols = set() 25 | tables = [(name, input_tables[name]) for name in input_names] 26 | for name, t in tables: 27 | cols = t['columns'] 28 | for col in cols: 29 | if col not in seen_cols: 30 | order_cols.append(col) 31 | seen_cols.add(col) 32 | if column not in seen_cols: 33 | order_cols.append(column) 34 | seen_cols.add(column) 35 | 36 | target_table['columns'] = order_cols 37 | rows = target_table['rows'] = [] 38 | for name, t in tables: 39 | extra = dict((c, None) for c in seen_cols - set(t['columns'])) 40 | for row in t['rows']: 41 | row.update(extra) 42 | row[column] = name 43 | rows.append(row) 44 | 45 | -------------------------------------------------------------------------------- /sheetsite/tweaks/patch.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 3 | Patch a cell 4 | 5 | tweaks: 6 | patch: 7 | where: 8 | col1: val1 9 | col2: val2 10 | update: 11 | col3: val3 12 | ''' 13 | 14 | def apply(wb, params): 15 | where = params['where'] 16 | update = params['update'] 17 | for name, t in wb['tables'].items(): 18 | for row in t['rows']: 19 | ok = True 20 | active = True 21 | for key, val in where.items(): 22 | if key not in row: 23 | ok = False 24 | break 25 | if row[key] != val: 26 | active = False 27 | break 28 | if not ok: 29 | break 30 | if not active: 31 | continue 32 | for key, val in update.items(): 33 | row[key] = val 34 | -------------------------------------------------------------------------------- /sheetsite/tweaks/prune_tables.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 3 | Little Bobby Drop Tables 4 | 5 | tweaks: 6 | prune_tables: 7 | - Table1 # list of all tables in desired order 8 | - Table2 9 | 10 | ''' 11 | 12 | def apply(wb, params): 13 | old_names = wb['names'] 14 | old_tables = wb['tables'] 15 | names = wb['names'] = list(params) 16 | tables = wb['tables'] = {} 17 | for name in names: 18 | tables[name] = old_tables[name] 19 | -------------------------------------------------------------------------------- /sheetsite/tweaks/rename_column.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 3 | Rename a column 4 | 5 | tweaks: 6 | rename_column: 7 | table: Table1 # optional 8 | from: OldColumnName 9 | to: NewColumnName # blank to delete 10 | 11 | ''' 12 | 13 | def apply(wb, params): 14 | table = params.get('table') 15 | from_name = params['from'] 16 | to_name = params.get('to') 17 | active = False 18 | for name, t in wb['tables'].items(): 19 | if name == table or table is None: 20 | if from_name not in t['columns']: 21 | continue 22 | active = True 23 | t['columns'] = [to_name if name == from_name else name 24 | for name in t['columns'] 25 | if name != from_name or to_name] 26 | for row in t['rows']: 27 | tmp = row[from_name] 28 | if to_name: 29 | row[to_name] = tmp 30 | if not active: 31 | raise KeyError(from_name) 32 | 33 | -------------------------------------------------------------------------------- /sheetsite/tweaks/rename_table.py: 
-------------------------------------------------------------------------------- 1 | ''' 2 | 3 | Rename a table 4 | 5 | tweaks: 6 | rename_table: 7 | from: OldName 8 | to: NewName 9 | 10 | ''' 11 | 12 | def apply(wb, params): 13 | from_name = params['from'] 14 | to_name = params['to'] 15 | old_names = wb['names'] 16 | if from_name in old_names: 17 | wb['names'] = [to_name if name == from_name else name for name in old_names] 18 | wb['tables'][to_name] = wb['tables'].pop(from_name) 19 | -------------------------------------------------------------------------------- /sheetsite/tweaks/replace_cell.py: -------------------------------------------------------------------------------- 1 | 2 | def apply(wb, params): 3 | column = params['column'] 4 | table = params.get('table') 5 | mapping = params['map'] 6 | active = False 7 | for name, t in wb['tables'].items(): 8 | if name == table or table is None: 9 | if column not in t['columns']: 10 | continue 11 | active = True 12 | for row in t['rows']: 13 | code = str(row[column]) 14 | if "," in code: 15 | cactive = False 16 | codes = [x.strip() for x in code.split(',')] 17 | for idx, code in enumerate(codes): 18 | if code in mapping: 19 | codes[idx] = mapping[code] 20 | cactive = True 21 | if cactive: 22 | code = ', '.join(codes) 23 | row[column] = code 24 | elif code in mapping: 25 | row[column] = mapping[code] 26 | 27 | if not active: 28 | raise KeyError(column) 29 | 30 | -------------------------------------------------------------------------------- /sheetsite/tweaks/required_field.py: -------------------------------------------------------------------------------- 1 | 2 | def apply(wb, params): 3 | column = params['column'] 4 | table = params.get('table') 5 | value = params.get('value') 6 | not_value = params.get('not-value') 7 | active = False 8 | for name, t in wb['tables'].items(): 9 | if name == table or table is None: 10 | if column not in t['columns']: 11 | continue 12 | active = True 13 | orows = [] 14 | for row in t['rows']: 15 | v = row[column] 16 | if value is not None: 17 | if str(v) == str(value): 18 | orows.append(row) 19 | elif not_value is not None: 20 | if str(v) != str(not_value): 21 | orows.append(row) 22 | elif v is not None and v != '': 23 | orows.append(row) 24 | t['rows'] = orows 25 | if not active: 26 | raise KeyError(column) 27 | 28 | -------------------------------------------------------------------------------- /sheetsite/tweaks/sniff_inactive.py: -------------------------------------------------------------------------------- 1 | 2 | def apply(wb, params): 3 | table = params.get('table') 4 | for name, t in wb['tables'].items(): 5 | if name == table or table is None: 6 | if 'dcc_status' not in t['columns']: 7 | t['columns'].append('dcc_status') 8 | if 'dcc_stamp' not in t['columns']: 9 | t['columns'].append('dcc_stamp') 10 | for row in t['rows']: 11 | status = None 12 | stamp = None 13 | if 'NOTES' in t['columns']: 14 | code = str(row['NOTES'] or '') 15 | if 'DELETE' in code: 16 | status = 'Inactive' 17 | if 'Active' in t['columns']: 18 | code = str(row['Active'] or '') 19 | if code == 'no': 20 | status = 'Inactive' 21 | elif len(code) > 0 and code[0] >= '0' and code[0] <= '9': 22 | stamp = int(code) 23 | if 'Member' in t['columns']: 24 | code = str(row['Member'] or '') 25 | if code.lower() == 'closed': 26 | status = 'Inactive' 27 | row['dcc_status'] = status 28 | row['dcc_stamp'] = stamp 29 | -------------------------------------------------------------------------------- /sheetsite/tweaks/split_addresses.py: 
-------------------------------------------------------------------------------- 1 | 2 | ''' 3 | 4 | Addresses separated by newlines in a single cell get parsed 5 | 6 | tweaks: 7 | split_addresses: 8 | column: "Other Addresses" 9 | 10 | ''' 11 | 12 | 13 | import re 14 | 15 | 16 | def sanity_stick(locs): 17 | if len(locs) <= 1: 18 | return locs 19 | if len(re.sub(r'[^,]', '', locs[0])) < 3: 20 | return [' '.join(locs)] 21 | return locs 22 | 23 | 24 | def apply(wb, params): 25 | column = params['column'] 26 | table = params.get('table') 27 | active = False 28 | for name, t in wb['tables'].items(): 29 | if name == table or table is None: 30 | if column not in t['columns']: 31 | continue 32 | active = True 33 | for row in t['rows']: 34 | cell = row[column] 35 | if cell is not None: 36 | print("[{}]".format(cell)) 37 | cell = re.sub(r'^[ \n\r\t]*', '', cell) 38 | cell = re.sub(r'[ \n\r\t]*$', '', cell) 39 | cell = re.sub(r'^n/a$', '', cell, flags=re.IGNORECASE) 40 | print("[{}]".format(cell)) 41 | if cell == '': 42 | splits = None 43 | else: 44 | splits = cell.split('\n') 45 | splits = sanity_stick(splits) 46 | row[column] = splits 47 | if not active: 48 | raise KeyError(column) 49 | 50 | 51 | -------------------------------------------------------------------------------- /sheetsite/tweaks/split_addresses_v2.py: -------------------------------------------------------------------------------- 1 | 2 | ''' 3 | 4 | Addresses separated by *double* newlines in a single cell get parsed 5 | 6 | tweaks: 7 | split_addresses_v2: 8 | column: "Other Addresses" 9 | 10 | ''' 11 | 12 | 13 | import json 14 | import re 15 | 16 | 17 | def sanity_stick(locs): 18 | if len(locs) <= 1: 19 | return locs 20 | if len(re.sub(r'[^,]', '', locs[0])) < 1: 21 | return [' '.join(locs)] 22 | return locs 23 | 24 | 25 | def apply(wb, params): 26 | column = params['column'] 27 | table = params.get('table') 28 | active = False 29 | for name, t in wb['tables'].items(): 30 | if name == table or table is None: 31 | if column not in t['columns']: 32 | continue 33 | active = True 34 | for row in t['rows']: 35 | cell = row[column] 36 | if cell is not None and 'See:' in cell: 37 | cell = None 38 | if cell is not None: 39 | print(">>> {}".format(cell)) 40 | cell = re.sub(r'^[ \n\r\t]*', '', cell) 41 | cell = re.sub(r'[ \n\r\t]*$', '', cell) 42 | cell = re.sub(r'^n/a$', '', cell, flags=re.IGNORECASE) 43 | if cell == '': 44 | splits = None 45 | else: 46 | splits = re.split('[\n\r][\n\r]', cell) 47 | splits = sanity_stick(splits) 48 | print(json.dumps(splits)) 49 | row[column] = splits 50 | if not active: 51 | raise KeyError(column) 52 | 53 | 54 | -------------------------------------------------------------------------------- /sheetsite/tweaks/us_state.py: -------------------------------------------------------------------------------- 1 | 2 | def apply(wb, params): 3 | column = params['column'] 4 | table = params.get('table') 5 | active = False 6 | for name, t in wb['tables'].items(): 7 | if name == table or table is None: 8 | if column not in t['columns']: 9 | continue 10 | active = True 11 | for row in t['rows']: 12 | code = str(row[column]) 13 | if code == "CT": 14 | row[column] = "Connecticut" 15 | # important to replace this or geocoder will sporadically 16 | # interpret it as Court or Crescent or the like 17 | if code == "RI": 18 | row[column] = "Rhode Island" 19 | if code == "MA": 20 | row[column] = "Massachusetts" 21 | 22 | if not active: 23 | raise KeyError(column) 24 | 25 | 
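
All of the tweak modules above follow the same contract: each exposes an `apply(wb, params)` function (a couple use variants such as `apply3`) that mutates a workbook dict of the form `{"names": [...], "tables": {name: {"columns": [...], "rows": [...]}}}`, and each is selected by name from a `tweaks:` stanza like the ones quoted in the docstrings. The sketch below shows one plausible way such a stanza could be dispatched; it is an illustration only, since the real wiring lives elsewhere in sheetsite and is not part of this listing, and the function name `apply_tweaks` is invented for the example.

```
# Hypothetical dispatcher sketch, not sheetsite's actual implementation.
# Assumes each key under "tweaks:" names a module in sheetsite.tweaks
# that defines apply(wb, params), as most of the modules above do.
import importlib


def apply_tweaks(wb, tweaks):
    # tweaks: mapping of tweak name -> params, e.g. {"us_zip": {"column": "zip"}}
    for name, params in tweaks.items():
        module = importlib.import_module('sheetsite.tweaks.' + name)
        module.apply(wb, params)
    return wb
```
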
-------------------------------------------------------------------------------- /sheetsite/tweaks/us_zip.py: -------------------------------------------------------------------------------- 1 | 2 | def apply(wb, params): 3 | column = params['column'] 4 | table = params.get('table') 5 | active = False 6 | for name, t in wb['tables'].items(): 7 | if name == table or table is None: 8 | if column not in t['columns']: 9 | continue 10 | active = True 11 | for row in t['rows']: 12 | code = str(row[column]) 13 | if len(code) < 5 and len(code) > 0: 14 | try: 15 | code = "%05d" % int(code) 16 | row[column] = code 17 | except ValueError: 18 | pass # let odd values through 19 | if not active: 20 | raise KeyError(column) 21 | 22 | -------------------------------------------------------------------------------- /sheetsite/xls_spreadsheet.py: -------------------------------------------------------------------------------- 1 | from openpyxl import load_workbook 2 | 3 | 4 | class XlsSpreadsheet(object): 5 | 6 | def __init__(self, filename): 7 | self.book = book = load_workbook(filename=filename) 8 | self.sheets = [XlsSheet(n, book.get_sheet_by_name(n)) for n in book.get_sheet_names()] 9 | 10 | def worksheets(self): 11 | return self.sheets 12 | 13 | 14 | class XlsSheet(object): 15 | 16 | def __init__(self, name, data): 17 | self.name = name 18 | self.data = data 19 | 20 | def get_all_values(self): 21 | input = self.data.rows 22 | output = [] 23 | for i, row in enumerate(input): 24 | output_row = [] 25 | output.append(output_row) 26 | for j, cell in enumerate(row): 27 | try: 28 | output_row.append(cell.value) 29 | except: 30 | output_row.append(None) 31 | return output 32 | 33 | @property 34 | def title(self): 35 | return self.name 36 | 37 | -------------------------------------------------------------------------------- /sites/available/commoners.yml: -------------------------------------------------------------------------------- 1 | source: 2 | name: google-sheets 3 | key: 19UaXhqPQ0QHEfSWS_adDEtPwYstq8llK2YijpvFZcKA 4 | credential_file: service.json 5 | link: https://docs.google.com/spreadsheets/d/19UaXhqPQ0QHEfSWS_adDEtPwYstq8llK2YijpvFZcKA/edit 6 | 7 | flags: 8 | add: 9 | directory: 10 | - LAT 11 | - LNG 12 | - COUNTRY 13 | - STREET 14 | - REGION 15 | - LOCALITY 16 | 17 | destination: 18 | name: chain 19 | chain: 20 | - name: git 21 | repo: git@github.com:datacommons/commoners 22 | local: commoners 23 | file: _data/directory.json 24 | - name: git 25 | repo: git@github.com:datacommons/datacommons.github.io 26 | file: _data/directory.json 27 | local: website 28 | link: http://datacommons.coop/members 29 | -------------------------------------------------------------------------------- /sites/available/hack_spots.yml: -------------------------------------------------------------------------------- 1 | source: 2 | name: google-sheets 3 | key: 1hnfQcggYcBYimuO_UOMvwoOi_I9vUvFpkMt4wjrrpLE 4 | credential_file: service.json 5 | 6 | destination: 7 | file: hackspots.xlsx 8 | -------------------------------------------------------------------------------- /sites/available/local.yml: -------------------------------------------------------------------------------- 1 | source: 2 | name: .json 3 | filename: test.json 4 | 5 | flags: 6 | add: 7 | zig: 8 | - dccid 9 | zag: 10 | - dccid 11 | 12 | destination: 13 | name: .json 14 | output_file: foo.json 15 | -------------------------------------------------------------------------------- /sites/available/manitoba.yml: 
-------------------------------------------------------------------------------- 1 | source: 2 | name: google-sheets 3 | key: 1LvBgFeYsI9GeN2PTw5klcwBFFFeROlbwvTVF2qAIuBk 4 | credential_file: service.json 5 | link: https://docs.google.com/spreadsheets/d/1LvBgFeYsI9GeN2PTw5klcwBFFFeROlbwvTVF2qAIuBk/edit 6 | 7 | flags: 8 | add: 9 | directory: 10 | - Latitude 11 | - Longitude 12 | - Postal Code 13 | - State 14 | - Country 15 | address: 16 | directory: 17 | - Physical Address 18 | - City 19 | - Manitoba 20 | - Canada 21 | 22 | destination: 23 | name: chain 24 | chain: 25 | - name: stone-soup 26 | organization: Manitoba Cooperative Association 27 | - name: install_local_soup 28 | link: http://find.manitoba.coop 29 | -------------------------------------------------------------------------------- /sites/available/tap.yml: -------------------------------------------------------------------------------- 1 | source: 2 | name: google-sheets 3 | key: 1mBqfuAWYkRO5M7dd-bw0jKbd0fJGI-4UUv4BDmsyJy4 4 | credential_file: service.json 5 | link: https://docs.google.com/spreadsheets/d/1mBqfuAWYkRO5M7dd-bw0jKbd0fJGI-4UUv4BDmsyJy4/edit 6 | 7 | flags: 8 | add: 9 | directory: 10 | - LAT 11 | - LNG 12 | - COUNTRY 13 | - STREET 14 | - REGION 15 | - LOCALITY 16 | 17 | destination: 18 | name: git 19 | repo: git@github.com:datacommons/tap 20 | file: _data/directory.json 21 | link: http://datacommons.coop/tap 22 | -------------------------------------------------------------------------------- /sites/available/test.yml: -------------------------------------------------------------------------------- 1 | source: 2 | name: google-sheets 3 | key: 15Vs_VGpupeGkljceEow7q1ig447FJIxqNS1Dd0dZpFc 4 | credential_file: service.json 5 | link: https://docs.google.com/spreadsheets/d/15Vs_VGpupeGkljceEow7q1ig447FJIxqNS1Dd0dZpFc/edit 6 | 7 | flags: 8 | add: 9 | directory: 10 | - LAT 11 | - LNG 12 | - COUNTRY 13 | - STREET 14 | - REGION 15 | - LOCALITY 16 | 17 | destination: 18 | name: git 19 | repo: git@github.com:paulfitz/scrapyard 20 | file: directory.json 21 | link: https://github.com/paulfitz/scrapyard/blob/master/directory.json 22 | -------------------------------------------------------------------------------- /tests/configs/fill.json: -------------------------------------------------------------------------------- 1 | { 2 | "names": ["countries"], 3 | "tables": { 4 | "countries": { 5 | "columns": ["country", "[zip]", "code", "(opinion)"], 6 | "rows": [ 7 | { 8 | "country": "United Kingdom", 9 | "code": "uk", 10 | "(opinion)": "dubious", 11 | "[zip]": "" 12 | }, 13 | { 14 | "country": "United States", 15 | "code": "((usa))", 16 | "(opinion)": "dubious", 17 | "[zip]": "" 18 | } 19 | ] 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /tests/configs/json_to_json.json: -------------------------------------------------------------------------------- 1 | { 2 | "source": { 3 | "filename": "tests/configs/things.json" 4 | }, 5 | "destination": { 6 | "output_file": "${TEST_DIR}/out.json" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /tests/configs/multirow.json: -------------------------------------------------------------------------------- 1 | { 2 | "names": ["places"], 3 | "tables": { 4 | "places": { 5 | "columns": ["street", "city", "state", "country", "web"], 6 | "rows": [ 7 | { 8 | "street": "Test1", 9 | "city": "Test2", 10 | "state": "", 11 | "country": "", 12 | "web": "" 13 | }, 14 | { 15 | "street": "Test1", 16 | "city": 
"", 17 | "state": "", 18 | "country": "", 19 | "web": "" 20 | }, 21 | { 22 | "street": "305 Memorial Dr", 23 | "city": "Cambridge", 24 | "state": "Massachusetts", 25 | "country": "United States", 26 | "web": "web1" 27 | }, 28 | { 29 | "street": "306 Memorial Dr", 30 | "city": null, 31 | "state": "", 32 | "country": "", 33 | "web": "web1" 34 | } 35 | ] 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /tests/configs/things.json: -------------------------------------------------------------------------------- 1 | { 2 | "names": ["countries", "((secret))"], 3 | "tables": { 4 | "countries": { 5 | "columns": ["country", "code", "(opinion)"], 6 | "rows": [ 7 | { 8 | "country": "United Kingdom", 9 | "code": "uk", 10 | "(opinion)": "dubious" 11 | }, 12 | { 13 | "country": "United States", 14 | "code": "((usa))", 15 | "(opinion)": "dubious" 16 | } 17 | ] 18 | }, 19 | "((secret))": { 20 | "columns": ["a", "b"], 21 | "rows": {} 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /tests/test_chain.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import unittest 4 | from sheetsite.chain import apply_chain 5 | from sheetsite.cmdline import run 6 | 7 | ######################################################## 8 | # python2 doesn't have TemporaryDirectory 9 | # replacement begins 10 | 11 | import contextlib 12 | import shutil 13 | import tempfile 14 | 15 | 16 | @contextlib.contextmanager 17 | def TemporaryDirectory(): 18 | dirpath = tempfile.mkdtemp() 19 | try: 20 | yield dirpath 21 | finally: 22 | shutil.rmtree(dirpath) 23 | 24 | 25 | # replacement ends 26 | # python2 doesn't have TemporaryDirectory 27 | ######################################################## 28 | 29 | 30 | class TestChain(unittest.TestCase): 31 | 32 | def test_json_to_json_cmdline(self): 33 | with TemporaryDirectory() as temp_dir: 34 | os.environ['TEST_DIR'] = temp_dir 35 | run(['--config', 'tests/configs/json_to_json.json', '--cache-dir', temp_dir]) 36 | 37 | def test_json_to_json(self): 38 | with TemporaryDirectory() as temp_dir: 39 | target = "{}/out.json".format(temp_dir) 40 | params = { 41 | "source": {"filename": "tests/configs/things.json"}, 42 | "destination": {"output_file": target} 43 | } 44 | apply_chain(params, temp_dir) 45 | with open(target, 'r') as f: 46 | data = json.load(f) 47 | assert len(data["tables"]["countries"]["columns"]) == 2 48 | assert data["tables"]["countries"]["rows"][1]["code"] == "" 49 | 50 | def test_fill(self): 51 | with TemporaryDirectory() as temp_dir: 52 | target = "{}/out.json".format(temp_dir) 53 | params = { 54 | "source": {"filename": "tests/configs/fill.json"}, 55 | "flags": { 56 | "geocoder": "dummy", 57 | "address": {"countries": ["country"]} 58 | }, 59 | "destination": {"output_file": target} 60 | } 61 | apply_chain(params, temp_dir) 62 | with open(target, 'r') as f: 63 | data = json.load(f) 64 | assert data["tables"]["countries"]["rows"][0]["zip"] == "PO-STAL" 65 | 66 | def test_single_to_multiple_add(self): 67 | with TemporaryDirectory() as temp_dir: 68 | target = "{}/out.json".format(temp_dir) 69 | params = { 70 | "source": {"filename": "tests/configs/things.json"}, 71 | "flags": { 72 | "geocoder": "dummy", 73 | "address": {"countries": ["country"]}, 74 | "add": {"countries": ["city", "address"]} 75 | }, 76 | "destination": {"output_file": target} 77 | } 78 | apply_chain(params, temp_dir) 79 | with open(target, 
'r') as f: 80 | data = json.load(f) 81 | assert data["tables"]["countries"]["rows"][0]["city"] == "Cityville" 82 | assert data["tables"]["countries"]["rows"][0]["address"] == "United Kingdom" 83 | assert data["tables"]["countries"]["rows"][1]["address"] == "United States" 84 | 85 | def test_multiple_to_multiple_add(self): 86 | with TemporaryDirectory() as temp_dir: 87 | target = "{}/out.json".format(temp_dir) 88 | params = { 89 | "source": {"filename": "tests/configs/things.json"}, 90 | "flags": { 91 | "geocoder": "dummy", 92 | "address": {"countries": ["code", "country", "Earth"]}, 93 | "add": {"countries": ["city", "address"]} 94 | }, 95 | "destination": {"output_file": target} 96 | } 97 | apply_chain(params, temp_dir) 98 | with open(target, 'r') as f: 99 | data = json.load(f) 100 | assert data["tables"]["countries"]["rows"][0]["city"] == "Cityville" 101 | assert data["tables"]["countries"]["rows"][0]["address"] == "uk United Kingdom Earth" 102 | 103 | def test_multirow(self): 104 | with TemporaryDirectory() as temp_dir: 105 | target = "{}/out.json".format(temp_dir) 106 | params = { 107 | "source": {"filename": "tests/configs/multirow.json"}, 108 | "flags": { 109 | "geocoder": "dummy", 110 | "group": "web", 111 | "address": {"places": ["street", "city", "state", "country"]}, 112 | "add": {"places": ["lat", "lon", "address"]} 113 | }, 114 | "destination": {"output_file": target} 115 | } 116 | apply_chain(params, temp_dir) 117 | with open(target, 'r') as f: 118 | data = json.load(f) 119 | places = data["tables"]["places"]["rows"] 120 | self.assertEqual(places[0]["address"], "Test1 Test2") 121 | self.assertEqual(places[1]["address"], "Test1") 122 | self.assertEqual(places[2]["address"], 123 | "305 Memorial Dr Cambridge Massachusetts United States") 124 | self.assertEqual(places[3]["address"], 125 | "306 Memorial Dr Cambridge Massachusetts United States") 126 | 127 | def test_rename(self): 128 | with TemporaryDirectory() as temp_dir: 129 | target = "{}/out.json".format(temp_dir) 130 | params = { 131 | "source": {"filename": "tests/configs/multirow.json"}, 132 | "flags": { 133 | "geocoder": "dummy", 134 | "rename": {"places": {"web": "website"}}, 135 | "address": {"places": ["street", "city", "state", "country"]}, 136 | "add": {"places": ["lat", "lon", "address"]} 137 | }, 138 | "destination": {"output_file": target} 139 | } 140 | apply_chain(params, temp_dir) 141 | with open(target, 'r') as f: 142 | data = json.load(f) 143 | places = data["tables"]["places"]["rows"] 144 | self.assertIn('website', places[0]) 145 | self.assertNotIn('web', places[0]) 146 | -------------------------------------------------------------------------------- /tests/test_environment.py: -------------------------------------------------------------------------------- 1 | def test_general_import(): 2 | import sheetsite 3 | 4 | def test_specific_import(): 5 | import sheetsite.cmdline 6 | -------------------------------------------------------------------------------- /tests/test_filter.py: -------------------------------------------------------------------------------- 1 | import json 2 | from sheetsite.json_spreadsheet import JsonSpreadsheet 3 | from sheetsite.site import Site 4 | 5 | def test_filter(): 6 | wb = JsonSpreadsheet('tests/configs/things.json') 7 | site = Site(wb) 8 | 9 | filtered_wb = site.public_workbook() 10 | result = wb.as_dict(filtered_wb) 11 | columns = result["tables"]["countries"]["columns"] 12 | assert "country" in columns 13 | assert not "opinion" in columns 14 | assert not "secret" in 
result["tables"] 15 | 16 | filtered_wb = site.private_workbook() 17 | result = wb.as_dict(filtered_wb) 18 | assert "secret" in result["tables"] 19 | --------------------------------------------------------------------------------