├── .gitignore
├── .travis.yml
├── LICENSE.md
├── Makefile
├── README.md
├── setup.py
├── sheetsite
    ├── __init__.py
    ├── chain.py
    ├── cmdline.py
    ├── csv_spreadsheet.py
    ├── destination
    │   ├── __init__.py
    │   ├── csv_ss.py
    │   ├── drop.py
    │   ├── excel.py
    │   ├── ftp.py
    │   ├── git.py
    │   ├── install_local_soup.py
    │   ├── json_ss.py
    │   ├── sqlite_ss.py
    │   ├── stone_soup.py
    │   └── stone_soup_v2.py
    ├── expand.py
    ├── filtered_spreadsheet.py
    ├── geocache.py
    ├── google_spreadsheet.py
    ├── ids.py
    ├── json_spreadsheet.py
    ├── jsonify.py
    ├── merged_spreadsheet.py
    ├── names.py
    ├── sheet.py
    ├── sheetsend.py
    ├── sheetwatch.py
    ├── site.py
    ├── site_queue.py
    ├── source
    │   ├── __init__.py
    │   ├── csv.py
    │   ├── excel.py
    │   ├── google.py
    │   └── json.py
    ├── spreadsheet.py
    ├── tasks
    │   ├── __init__.py
    │   ├── detect_site.py
    │   ├── notify.py
    │   └── update_site.py
    ├── templates
    │   ├── update.html
    │   └── update.txt
    ├── tweaks
    │   ├── __init__.py
    │   ├── add_dccid.py
    │   ├── coalesce.py
    │   ├── custom.py
    │   ├── formula.py
    │   ├── list_to_rows.py
    │   ├── merge_tables.py
    │   ├── patch.py
    │   ├── prune_tables.py
    │   ├── rename_column.py
    │   ├── rename_table.py
    │   ├── replace_cell.py
    │   ├── required_field.py
    │   ├── sniff_inactive.py
    │   ├── split_addresses.py
    │   ├── split_addresses_v2.py
    │   ├── us_state.py
    │   └── us_zip.py
    └── xls_spreadsheet.py
├── sites
    └── available
    │   ├── commoners.yml
    │   ├── hack_spots.yml
    │   ├── local.yml
    │   ├── manitoba.yml
    │   ├── tap.yml
    │   └── test.yml
└── tests
    ├── configs
        ├── fill.json
        ├── json_to_json.json
        ├── multirow.json
        └── things.json
    ├── test_chain.py
    ├── test_environment.py
    └── test_filter.py


/.gitignore:
--------------------------------------------------------------------------------
1 | *~
2 | *.pyc
3 | env.sh
4 | service.json
5 | build
6 | src
7 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 |   - "2.7"
4 |   - "3.3"
5 |   - "3.4"
6 | install: "pip install -e ."
7 | script: "nosetests -s -vv tests"
8 | 
9 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2015 Paul Fitzpatrick &lt;paul.michael.fitzpatrick@gmail.com&gt;
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining
 4 | a copy of this software and associated documentation files (the
 5 | 'Software'), to deal in the Software without restriction, including
 6 | without limitation the rights to use, copy, modify, merge, publish,
 7 | distribute, sublicense, and/or sell copies of the Software, and to
 8 | permit persons to whom the Software is furnished to do so, subject to
 9 | the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be
12 | included in all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 
22 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | default:
 2 | 	echo "Hello"
 3 | 
 4 | q:
 5 | 	bin/celery3 -A sheetsite.queue worker  -l info
 6 | 
 7 | sdist:
 8 | 	rm -rf dist
 9 | 	cp README.md README
10 | 	python3 setup.py sdist
11 | 	cd dist && mkdir tmp && cd tmp && tar xzvf ../sheet*.tar.gz && cd sheet*[0-9] && ./setup.py build
12 | 	python3 setup.py sdist upload
13 | 	rm -rf dist
14 | 	rm README MANIFEST
15 | 
16 | test:
17 | 	which nosetests3 && nosetests3 -s -vv tests || echo "no nosetest3"
18 | 	which nosetests && nosetests -s -vv tests || echo "no nosetest"
19 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # sheetsite: sheets for sites
  2 | 
  3 | [![Build Status](https://travis-ci.org/paulfitz/sheetsite.svg?branch=master)](https://travis-ci.org/paulfitz/sheetsite)
  4 | [![PyPI version](https://badge.fury.io/py/sheetsite.svg)](http://badge.fury.io/py/sheetsite)
  5 | 
  6 | Keep a website or directory in sync with a google sheet.
  7 | 
  8 | Features:
  9 | 
 10 | * Copy a google spreadsheet locally, as json or excel format.
 11 | * Can strip specified tabs, columns, or cells from the spreadsheet,
 12 |   in case not all of it should be copied along.
 13 | * Can push a filtered json copy out to a git repository, handy for
 14 |   maintaining a website based on a private shared spreadsheet.
 15 | * Can augment the sheet with geocoding, adding latitude and longitude based
 16 |   on address fields for example.
 17 | * Can notify people by email with a summary of updates.
 18 | 
 19 | 
 20 | ## Installation
 21 | 
 22 | For the basics:
 23 | 
 24 | ```
 25 | pip install sheetsite
 26 | ```
 27 | 
 28 | For all bells and whistles, when automating a sheet-to-site workflow:
 29 | 
 30 | ```
 31 | pip install sheetsite[queue]
 32 | ```
 33 | 
 34 | ## Specifying the source and destination
 35 | 
 36 | The `sheetsite` utility, when run without any arguments, will expect
 37 | to find all necessary options in a `_sheetsite.yml` file.  A simple
 38 | example of such a file is:
 39 | 
 40 | ```yaml
 41 | source:
 42 |   name: google-sheets
 43 |   key: 15Vs_VGpupeGkljceEow7q1ig447FJIxqNS1Dd0dZpFc
 44 |   credential_file: service.json
 45 | 
 46 | destination:
 47 |   file: sheet.xlsx
 48 | ```
 49 | 
 50 | The file should have two stanzas, `source` specifying where to get
 51 | data from, and `destination` specifying where to put it.  This
 52 | examples reads a private google spreadsheet and saves it as
 53 | `sheet.xlsx`.  The key comes from the url of the spreadsheet.
 54 | The credentials file is something you [get from google](https://pygsheets.readthedocs.io/en/stable/authorizing.html).
 55 | 
 56 | Here's an example that outputs json:
 57 | 
 58 | ```yaml
 59 | source:
 60 |   name: google-sheets
 61 |   key: 15Vs_VGpupeGkljceEow7q1ig447FJIxqNS1Dd0dZpFc
 62 |   credential_file: service.json
 63 | 
 64 | destination:
 65 |   file: _data/directory.json
 66 | ```
 67 | 
 68 | You could now build a static website from that `.json`, see
 69 | http://jekyllrb.com/docs/datafiles/ for how, or see an example
 70 | at https://github.com/datacommons/commoners
 71 | 
 72 | Here's an example that adds some geocoded fields and directly
 73 | updates a git repository:
 74 | 
 75 | ```yaml
 76 | source:
 77 |   name: google-sheets
 78 |   key: 19UaXhqPQ0QHEfSWS_adDEtPwYstq8llK2YijpvFZcKA
 79 |   credential_file: service.json
 80 | 
 81 | flags:
 82 |   add:
 83 |     directory:
 84 |       - LAT
 85 |       - LNG
 86 |       - COUNTRY
 87 |       - STREET
 88 |       - REGION
 89 |       - LOCALITY
 90 | 
 91 | destination:
 92 |   name: git
 93 |   repo: git@github.com:datacommons/commoners
 94 |   file: _data/directory.json
 95 | ```
 96 | 
 97 | ## Strip private sheets, columns, or cells
 98 | 
 99 | By default, sheetsite will strip:
100 | 
101 | * Any columns whose name is in parentheses, e.g. `(Private Notes)`
102 | * Any cells or text within cells surrounded by double parentheses, e.g. `((private@email.address))`
103 | * Any sheets whose name is in double parentheses, e.g. `((secret sheet))`
104 | 
105 | ## Geocoding
106 | 
107 | If you have a table with a column called `address`, sheetsite can geocode it for
108 | you and pass along the results.  Just add the following in your yaml:
109 | 
110 | ```
111 | flags:
112 |   add:
113 |     table_name_goes_here:
114 |       - latitude
115 |       - longitude
116 |       - country
117 |       - state
118 |       - city
119 |       - street
120 |       - zip
121 | ```
122 | 
123 | You can add just the columns you want.  Geocoding results are cached in a `_cache`
124 | directory by default so they do not need to be repeated in future calls to sheetsite.
125 | 
126 | The full list of columns (with synonyms) available is:
127 |   * latitude / lat
128 |   * longitude / lng
129 |   * latlng
130 |   * country
131 |   * state / province / region
132 |   * city / locality
133 |   * street
134 |   * zip / postal_code
135 | 
136 | Normally you won't actually have a stand-alone `address` column.  More usually,
137 | information will be spread over multiple columns, or some will be implicit (e.g.
138 | the state/province and country).  You can tell sheetsite how to construct addresses
139 | for geocoding by listing columns and constants to build it from.  For example:
140 | 
141 | ```
142 | flags:
143 |   address:
144 |     table_name_goes_here:
145 |       - street_address1
146 |       - street_address2
147 |       - city
148 |       - Manitoba
149 |       - Canada
150 |   add:
151 |     table_name_goes_here:
152 |       - postal_code
153 | ```
154 | 
155 | This tells sheetsite to produce addresses of the form:
156 | ```
157 | <street_address1> <street_address2> <city> Manitoba Canada
158 | ```
159 | And add a `postal_code` column populated by geocoding.
160 | 
161 | It is possible to request columns directly in the spreadsheet.  Just
162 | wrap the column name in square brackets, like `[state]` or `[zip]`.
163 | Any blank cells in such columns will be filled using geocoding based
164 | on the address given in that row.  If the address columns have not been
165 | configured in `flags` then the address must be present in a single column
166 | literally called `address`.
167 | 
168 | ## Row uuids
169 | 
170 | There's a random feature to add uuids to rows.   Just add a column
171 | called `dccid` for some reason:
172 | 
173 | ```
174 | flags:
175 |   add:
176 |     table_name_goes_here:
177 |       - dccid
178 | ```
179 | 
180 | A uuid will be added to each row.  A good faith effort will be made
181 | to keep that uuid constant across updates, keeping it linked to the
182 | row where it first appeared.
183 | 
184 | ## Grouping locations
185 | 
186 | If there are several rows of a sheet that will give locations that should
187 | be thought of as a single unit (e.g. an organization with multiple locations),
188 | you can tell `sheetsite` about that.  To do so, give it a `group` key.
189 | Every row for which the `group` is the same (and not blank) will be bound
190 | together.  When geocaching, blank cells in address cells  will be filled
191 | in with information from the first row in this group.  For example, with this
192 | configuration:
193 | 
194 | ```
195 | flags:
196 |   group: WEBSITE
197 | ```
198 | 
199 | Then for a table like the following:
200 | 
201 | ```
202 | STREET,   CITY,   STATE,    WEBSITE
203 | ...
204 | 17 N St,  Foo,    Utopia,   joe.ut
205 | 16 S St,  ,       ,         joe.ut
206 | ...
207 | ```
208 | 
209 | During geocoding, `16 S St` would be assumed to be in `Foo, Utopia`.
210 | 
211 | ## Renaming columns
212 | 
213 | Columns can be renamed.  This will occur before any other operation.
214 | 
215 | ```
216 | flags:
217 |   rename:
218 |     table_name:
219 |       old_column_name1: new_column_name1
220 |       old_column_name2: new_column_name2
221 | ```
222 | 
223 | ## Getting credentials
224 | 
225 | [Obtain credentials for accessing sheets from the Google Developers Console](https://pygsheets.readthedocs.io/en/latest/authorizing.html).
226 | 
227 | Make sure you share the sheet with the email address in the credentials file.  Read-only permission is fine.
228 | 
229 | ## Examples
230 | 
231 | For example, the map at http://datacommons.coop/tap/ is a visualization
232 | of data pulled from a google spreadsheet, styled using
233 | https://github.com/datacommons/tap via github pages.
234 | 
235 | ## sheetwatch
236 | 
237 | It can be useful to automate and forget `sheetsite`, so that updates
238 | to a google spreadsheet propagate automatically to their final
239 | destination.  The `sheetwatch` utility does this.  It requires a queue
240 | server to operate.  To install, do:
241 | 
242 | ```
243 | pip install sheetsite[queue]
244 | ```
245 | 
246 | Install any queue server supported by `celery`.  For example, `redis`:
247 | 
248 | ```
249 | sudo apt-get install redis-server
250 | redis-server
251 | ```
252 | 
253 | We need to set some environment variables to let `sheetwatch` know
254 | where to find the queue server:
255 | 
256 | ```
257 | export SHEETSITE_BROKER_URL=redis://localhost
258 | export SHEETSITE_RESULT_BACKEND=redis://localhost
259 | ```
260 | 
261 | The `sheetwatch` program needs a cache directory for its operations.
262 | 
263 | ```
264 | export SHEETSITE_CACHE=$HOME/cache/sites
265 | ```
266 | 
267 | Finally, it needs to know where there is a directory full of `yml`
268 | files describing any sheets to monitor and their corresponding sites:
269 | 
270 | ```
271 | export SHEETSITE_LAYOUT=$PWD/sites/enabled
272 | ```
273 | 
274 | We now start a worker:
275 | 
276 | ```
277 | sheetwatch worker
278 | ```
279 | 
280 | The last thing we need to do is check a mailbox from time to time
281 | for sheet change notifications from Google, and kick off site updates
282 | as needed:
283 | 
284 | ```
285 |   export GMAIL_USERNAME=*****
286 |   export GMAIL_PASSWORD=*****
287 | sheetwatch ping --delay 60
288 | ```
289 | 
290 | ## License
291 | 
292 | sheetsite is distributed under the MIT License.
293 | 
294 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import os
 4 | from distutils.core import setup
 5 | from setuptools import find_packages
 6 | import os.path
 7 | 
 8 | 
 9 | def read(fname, fname2):
10 |     if not(os.path.exists(fname)):
11 |         fname = fname2
12 |     with open(os.path.join(os.path.dirname(__file__), fname)) as f:
13 |         return f.read()
14 | 
15 | 
16 | setup(
17 |     name="sheetsite",
18 |     version="0.2.2",
19 |     author="Paul Fitzpatrick",
20 |     author_email="paul.michael.fitzpatrick@gmail.com",
21 |     description=("read google sheets, use them for sites"),
22 |     license="MIT",
23 |     keywords="google sheet xls json",
24 |     url="https://github.com/paulfitz/sheetsite",
25 |     packages=find_packages(),
26 |     entry_points={
27 |         "console_scripts": [
28 |             "sheetsite=sheetsite.cmdline:cmd_sheetsite",
29 |             "sheetwatch=sheetsite.sheetwatch:run"
30 |         ]
31 |     },
32 |     long_description=read('README', 'README.md'),
33 |     classifiers=[
34 |         "Development Status :: 3 - Alpha",
35 |         "Topic :: Utilities",
36 |         "License :: OSI Approved :: MIT License"
37 |     ],
38 |     install_requires=[
39 |         "daff>=1.3.39",
40 |         "dataset>=1.0.2",
41 |         "oauth2client>=2.0.0",
42 |         "openpyxl",
43 |         "pygsheets",
44 |         "pyyaml",
45 |         "requests",
46 |         "six",
47 |         "tqdm"
48 |     ],
49 |     extras_require={
50 |         "queue": [
51 |             "celery",
52 |             "jinja2",
53 |             "premailer",
54 |             "redis"
55 |         ]
56 |     }
57 | )
58 | 


--------------------------------------------------------------------------------
/sheetsite/__init__.py:
--------------------------------------------------------------------------------
1 | from sheetsite.sheet import Sheets
2 | 


--------------------------------------------------------------------------------
/sheetsite/chain.py:
--------------------------------------------------------------------------------
  1 | import daff
  2 | import os
  3 | from sheetsite.ids import process_ids
  4 | from sheetsite.sheet import Sheets
  5 | from sheetsite.site import Site
  6 | from sheetsite.source import read_source
  7 | from sheetsite.destination import write_destination
  8 | import shutil
  9 | 
 10 | 
 11 | def apply_chain(site, path):
 12 | 
 13 |     if not(os.path.exists(path)):
 14 |         os.makedirs(path)
 15 | 
 16 |     source = site['source']
 17 |     destination = site['destination']
 18 |     tweaks = site.get('tweaks')
 19 | 
 20 |     wb = None
 21 | 
 22 |     raw_file = os.path.join(path, 'raw.json')
 23 |     if 'cache' in source:
 24 |         wb = read_source({
 25 |             'filename': raw_file
 26 |         })
 27 |     else:
 28 |         wb = read_source(source)
 29 | 
 30 |     ss = Site(wb, os.path.join(path, 'geocache.sqlite'))
 31 |     if 'flags' in site:
 32 |         ss.configure(site['flags'])
 33 |     output_file = os.path.join(path, 'public.json')
 34 |     prev_raw_file = os.path.join(path, 'prev_raw.json')
 35 |     private_output_file = os.path.join(path, 'private.json')
 36 |     id_file = os.path.join(path, 'ids.json')
 37 |     prev_id_file = os.path.join(path, 'prev_ids.json')
 38 |     if os.path.exists(raw_file):
 39 |         shutil.copyfile(raw_file, prev_raw_file)
 40 |         if os.path.exists(id_file):
 41 |             shutil.copyfile(id_file, prev_id_file)
 42 | 
 43 |     ss.save_local(raw_file, enhance=False)
 44 | 
 45 |     ids = process_ids(prev_raw_file, raw_file, prev_id_file, id_file)
 46 |     ss.add_ids(ids)
 47 | 
 48 |     state = {
 49 |         'path': path,
 50 |         'output_file': output_file,
 51 |         'id_file': id_file
 52 |     }
 53 | 
 54 |     if tweaks:
 55 |         import json
 56 |         wj = json.load(open(raw_file, 'r'))
 57 |         if hasattr(tweaks, 'items'):
 58 |             tweak_items = tweaks.items()
 59 |         else:
 60 |             tweak_items = [[params['tweak'], params] for params in tweaks]
 61 |         for tweak, params in tweak_items:
 62 |             print("Working on tweak", json.dumps(tweak))
 63 |             if 'tweak' in params:
 64 |                 tweak = params['tweak']
 65 |             import importlib
 66 |             mod = importlib.import_module('sheetsite.tweaks.{}'.format(tweak))
 67 |             ct = 2
 68 |             try:
 69 |                 target = mod.apply3
 70 |                 ct = 3
 71 |             except AttributeError:
 72 |                 target = mod.apply
 73 |             if ct == 3:
 74 |                 target(Sheets(wj), params, state)
 75 |             else:
 76 |                 target(Sheets(wj), params)
 77 |         from sheetsite.json_spreadsheet import JsonSpreadsheet
 78 |         ss.workbook = JsonSpreadsheet(None, data=wj)
 79 | 
 80 |     ss.save_local(output_file)
 81 |     if not os.path.exists(prev_raw_file):
 82 |         # once daff can cope with blank tables correctly, switch to this
 83 |         # with open(prev_raw_file, 'w') as fout:
 84 |         #    fout.write('{ "names": [], "tables": [] }')
 85 |         shutil.copyfile(raw_file, prev_raw_file)
 86 |         shutil.copyfile(id_file, prev_id_file)
 87 |     ss.save_local(private_output_file, private_sheets=True)
 88 | 
 89 |     state['workbook'] = ss.public_workbook()
 90 | 
 91 |     write_destination(destination, state)
 92 | 
 93 |     return {
 94 |         'prev_raw_file': prev_raw_file,
 95 |         'raw_file': raw_file
 96 |     }
 97 | 
 98 | 
 99 | def compute_diff(files, format='html'):
100 |     io = daff.TableIO()
101 |     dapp = daff.Coopy(io)
102 |     t1 = dapp.loadTable(files['prev_raw_file'], 'local')
103 |     t2 = dapp.loadTable(files['raw_file'], 'remote')
104 |     if format == 'both':
105 |         r1 = daff.diffAsHtml(t1, t2)
106 |         r2 = daff.diffAsAnsi(t1, t2)
107 |         return (r1, r2)
108 |     if format == 'html':
109 |         return daff.diffAsHtml(t1, t2)
110 |     return daff.diffAsAnsi(t1, t2)
111 | 


--------------------------------------------------------------------------------
/sheetsite/cmdline.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import argparse
 4 | import os
 5 | from sheetsite.chain import apply_chain, compute_diff
 6 | from sheetsite.expand import load_config
 7 | import sys
 8 | 
 9 | 
10 | def run(argv):
11 |     parser = argparse.ArgumentParser(description='Run a website from a spreadsheet. '
12 |                                      'Take a spreadsheet (from google sheets or locally), and '
13 |                                      'convert it to a .json file that a static website '
14 |                                      'generator like jekyll can use.  Optionally strip private '
15 |                                      'information and add derived geographic fields like '
16 |                                      'latitude and longitude.')
17 | 
18 |     parser.add_argument('--config', nargs='*', required=False,
19 |                         default=['_sheetsite.yml', '_sheetsite.json'],
20 |                         help='name of configuration file.')
21 | 
22 |     parser.add_argument('--cache-dir', nargs=1, required=False, default=['_cache'],
23 |                         help='name of default cache directory.')
24 | 
25 |     args = parser.parse_args(argv)
26 | 
27 |     config_file = None
28 |     for config_candidate in args.config:
29 |         if os.path.exists(config_candidate):
30 |             config_file = config_candidate
31 |             break
32 |     if not config_file:
33 |         print("Could not find config file", args.config)
34 |         exit(1)
35 |     params = load_config(config_file)
36 |     files = apply_chain(params, args.cache_dir[0])
37 |     diff = compute_diff(files, 'ansi')
38 |     print(diff)
39 | 
40 | 
41 | def cmd_sheetsite():
42 |     run(sys.argv[1:])
43 | 
44 | 
45 | if __name__ == '__main__':
46 |     cmd_sheetsite()
47 | 
48 | 


--------------------------------------------------------------------------------
/sheetsite/csv_spreadsheet.py:
--------------------------------------------------------------------------------
 1 | import csv
 2 | 
 3 | 
 4 | class CsvSpreadsheet(object):
 5 | 
 6 |     def __init__(self, filename):
 7 |         with open(filename, 'r') as fin:
 8 |             reader = csv.reader(fin)
 9 |             self.data = [row for row in reader]
10 | 
11 |     def worksheets(self):
12 |         return [self]
13 | 
14 |     def get_all_values(self):
15 |         return self.data
16 | 
17 |     @property
18 |     def title(self):
19 |         return "sheet"
20 | 


--------------------------------------------------------------------------------
/sheetsite/destination/__init__.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import subprocess
 3 | from sheetsite.destination.drop import write_destination_drop
 4 | from sheetsite.destination.excel import write_destination_excel
 5 | from sheetsite.destination.ftp import write_destination_ftp
 6 | from sheetsite.destination.git import write_destination_git
 7 | from sheetsite.destination.json_ss import write_destination_json
 8 | from sheetsite.destination.stone_soup import write_destination_stone_soup
 9 | from sheetsite.destination.sqlite_ss import write_destination_sqlite
10 | from sheetsite.destination.csv_ss import write_destination_csv
11 | 
12 | def write_destination_chain(params, state):
13 |     writers = params['chain']
14 |     for writer in writers:
15 |         writer['parent'] = params
16 |         write_destination(writer, state)
17 | 
18 | def write_destination(params, state):
19 | 
20 |     if isinstance(params, list):
21 |         params = {
22 |             'name': 'chain',
23 |             'chain': params
24 |         }
25 | 
26 |     writers = {
27 |         'chain': write_destination_chain,
28 |         'drop': write_destination_drop,
29 |         'ftp': write_destination_ftp,
30 |         'git': write_destination_git,
31 |         'stone-soup': write_destination_stone_soup,
32 |         '.sqlite': write_destination_sqlite,
33 |         '.sqlite3': write_destination_sqlite,
34 |         '.json': write_destination_json,
35 |         '.xlsx': write_destination_excel,
36 |         '.xls': write_destination_excel,
37 |         '.csv': write_destination_csv,
38 |         'drop': write_destination_drop,
39 |         'chain': write_destination_chain
40 |     }
41 | 
42 |     name = None
43 |     if 'name' in params:
44 |         name = params['name']
45 |     elif 'step' in params and params['step'] != 'save':
46 |         name = params['step']
47 |     elif 'output_file' in params:
48 |         _, ext = os.path.splitext(params['output_file'])
49 |         name = ext
50 |     elif 'file' in params:
51 |         _, ext = os.path.splitext(params['file'])
52 |         name = ext
53 |         params['output_file'] = params['file']
54 | 
55 |     if name not in writers:
56 |         import importlib
57 |         return importlib.import_module('sheetsite.destination.{}'.format(name)).apply(params,
58 |                                                                                       state)
59 | 
60 |     return writers[name](params, state)
61 | 


--------------------------------------------------------------------------------
/sheetsite/destination/csv_ss.py:
--------------------------------------------------------------------------------
 1 | import csv
 2 | 
 3 | def write_destination_csv(params, state):
 4 |     workbook = state['workbook']
 5 |     output_file = params['output_file']
 6 |     for sheet in workbook.worksheets():
 7 |         title = sheet.title
 8 |         rows = sheet.get_all_values()
 9 |         with open(output_file, 'w') as csvfile:
10 |             writer = csv.writer(csvfile)
11 |             writer.writerows(rows)
12 |     return True
13 | 


--------------------------------------------------------------------------------
/sheetsite/destination/drop.py:
--------------------------------------------------------------------------------
1 | def write_destination_drop(params, state):
2 |     pass
3 | 


--------------------------------------------------------------------------------
/sheetsite/destination/excel.py:
--------------------------------------------------------------------------------
 1 | def write_destination_excel(params, state):
 2 |     workbook = state['workbook']
 3 |     output_file = params['output_file']
 4 |     from openpyxl import Workbook
 5 |     wb = Workbook()
 6 |     first = True
 7 |     for sheet in workbook.worksheets():
 8 |         title = sheet.title
 9 |         if first:
10 |             ws = wb.active
11 |             first = False
12 |         else:
13 |             ws = wb.create_sheet()
14 |         ws.title = title
15 |         rows = sheet.get_all_values()
16 |         for r, row in enumerate(rows):
17 |             for c, cell in enumerate(row):
18 |                 ws.cell(row=r+1, column=c+1).value = cell
19 |     wb.save(output_file)
20 |     return True
21 | 


--------------------------------------------------------------------------------
/sheetsite/destination/ftp.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import subprocess
 3 | 
 4 | def write_destination_ftp(params, state):
 5 |     output_file = state['output_file']
 6 |     url = params['url']
 7 |     cmd = ['wput', '-v', '--binary', '-u', '-nc', output_file, url]
 8 |     print(' '.join(cmd))
 9 |     out = subprocess.check_output(cmd)
10 |     print("ftp: {}".format(out))
11 |     return True
12 | 


--------------------------------------------------------------------------------
/sheetsite/destination/git.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import shutil
 3 | import subprocess
 4 | 
 5 | 
 6 | def write_destination_git(destination, state):
 7 |     wd = os.getcwd()
 8 |     try:
 9 |         path = state['path']
10 |         output_file = state['output_file']
11 |         local_repo = os.path.join(path, destination.get('local', 'repo'))
12 |         if not(os.path.exists(local_repo)):
13 |             subprocess.check_output(['git', 'clone', destination['repo'], local_repo])
14 |         os.chdir(local_repo)
15 |         subprocess.check_output(['git', 'pull'])
16 |         os.chdir(wd)
17 |         shutil.copyfile(output_file, os.path.join(local_repo, destination['file']))
18 |         os.chdir(local_repo)
19 |         subprocess.check_output(['git', 'add', destination['file']])
20 |         try:
21 |             subprocess.check_output(['git', 'commit', '-m', 'update from sheetsite'])
22 |             subprocess.check_output(['git', 'push'])
23 |         except subprocess.CalledProcessError:
24 |             print("Commit/push skipped")
25 |     finally:
26 |         os.chdir(wd)
27 | 


--------------------------------------------------------------------------------
/sheetsite/destination/install_local_soup.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | 
 3 | # Hey this is embarrassing I'll remove it soon I promise.
 4 | # I mean, maybe.  Or I'll leave it malingering for years.
 5 | 
 6 | 
 7 | def apply(params, state):
 8 |     subprocess.check_output(["cp",
 9 |                              state['sqlite_file'],
10 |                              "/srv/git/datacommons_manitoba/production.sqlite3"])
11 |     ok = False
12 |     for i in range(0, 4):
13 |         try:
14 |             subprocess.check_output(["/srv/git/datacommons_manitoba/rebuild.sh"])
15 |             ok = True
16 |             break
17 |         except subprocess.CalledProcessError:
18 |             pass
19 | 
20 |     if not ok:
21 |         raise subprocess.CalledProcessError("rebuild sadness")
22 | 
23 |     return True
24 | 


--------------------------------------------------------------------------------
/sheetsite/destination/json_ss.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from sheetsite.jsonify import dump, dumps
 3 | from sheetsite.json_spreadsheet import JsonSpreadsheet
 4 | 
 5 | 
 6 | def write_destination_json(params, state):
 7 |     workbook = state['workbook']
 8 |     output_file = params['output_file']
 9 |     result = JsonSpreadsheet.as_dict(workbook)
10 |     if output_file is None:
11 |         print(dumps(result, indent=2))
12 |     else:
13 |         with open(output_file, 'w') as f:
14 |             dump(result, f, indent=2)
15 |     return True
16 | 


--------------------------------------------------------------------------------
/sheetsite/destination/sqlite_ss.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import subprocess
 3 | 
 4 | def write_destination_sqlite(params, state):
 5 |     path = state['path']
 6 |     output_file_prev = state['output_file']
 7 |     output_file_next = params['output_file']
 8 |     subprocess.check_output(['ssformat',
 9 |                              'dbi:jsonbook::file={}'.format(output_file_prev),
10 |                              output_file_next])
11 |     state['output_file'] = output_file_next
12 |     return True
13 | 


--------------------------------------------------------------------------------
/sheetsite/destination/stone_soup.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import re
  4 | import csv
  5 | import json
  6 | import os
  7 | import time
  8 | import sys
  9 | 
 10 | import sqlite3 as lite
 11 | 
 12 | 
 13 | schema = '''
 14 | CREATE TABLE IF NOT EXISTS access_rules (id INTEGER PRIMARY KEY,access_type TEXT);
 15 | CREATE TABLE IF NOT EXISTS data_sharing_orgs (id INTEGER PRIMARY KEY,name TEXT,created_at DATETIME,updated_at DATETIME,default_import_plugin_name TEXT);
 16 | CREATE TABLE IF NOT EXISTS data_sharing_orgs_taggables (id INTEGER PRIMARY KEY,data_sharing_org_id INTEGER NOT NULL,taggable_id INTEGER NOT NULL,verified INTEGER NOT NULL,created_at DATETIME,updated_at DATETIME,foreign_key_id TEXT,taggable_type TEXT);
 17 | CREATE TABLE IF NOT EXISTS data_sharing_orgs_users (data_sharing_org_id INTEGER NOT NULL,user_id INTEGER NOT NULL,created_at DATETIME,updated_at DATETIME);
 18 | CREATE TABLE IF NOT EXISTS entries (id INTEGER PRIMARY KEY,name TEXT,physical_address1 TEXT,physical_address2 TEXT,physical_city TEXT,physical_state TEXT,physical_zip TEXT,physical_country TEXT,mailing_address1 TEXT,mailing_address2 TEXT,mailing_city TEXT,mailing_state TEXT,mailing_zip TEXT,mailing_country TEXT,phone1 TEXT,phone2 TEXT,fax TEXT,email TEXT,website TEXT,preferred_contact TEXT,description TEXT,created_at DATETIME,updated_at DATETIME,created_by_id INTEGER,updated_by_id INTEGER,latitude REAL,longitude REAL,distance REAL,member_id INTEGER,prod_serv1 TEXT,prod_serv2 TEXT,prod_serv3 TEXT,support_organization INTEGER,worker_coop INTEGER,producer_coop INTEGER,marketing_coop INTEGER,housing_coop INTEGER,consumer_coop INTEGER,community_land_trust INTEGER,conservation_ag_land_trust INTEGER,alternative_currency INTEGER,intentional_community INTEGER,collective INTEGER,artist_run_center INTEGER,community_center INTEGER,community_development_financial_institution INTEGER,cooperative_financial_institution INTEGER,mutual_aid_self_help_group INTEGER,activist_social_change_organization INTEGER,union_labor_organization INTEGER,government INTEGER,fair_trade_organization INTEGER,network_association INTEGER,non_profit_org INTEGER,esop INTEGER,majority_owned_esop INTEGER,percentage_owned INTEGER,other INTEGER,type_of_other TEXT,naics_code INTEGER,informal INTEGER,cooperative INTEGER,partnership INTEGER,llc INTEGER,s_corporation INTEGER,c_corporation INTEGER,non_profit_corporation_501c3 INTEGER,non_profit_corporation_501c4 INTEGER,non_profit_corporation_other INTEGER,other_type_of_incorp INTEGER,type_of_other_incorp TEXT,have_a_fiscal_sponsor INTEGER,year_founded DATETIME,democratic INTEGER,union_association INTEGER,which_union TEXT);
 19 | CREATE TABLE IF NOT EXISTS legal_structures (id INTEGER PRIMARY KEY,name TEXT,created_at DATETIME,updated_at DATETIME);
 20 | CREATE TABLE IF NOT EXISTS locations (id INTEGER PRIMARY KEY,taggable_id INTEGER NOT NULL,note TEXT,physical_address1 TEXT,physical_address2 TEXT,physical_city TEXT,physical_state TEXT,physical_zip TEXT,physical_country TEXT,mailing_address1 TEXT,mailing_address2 TEXT,mailing_city TEXT,mailing_state TEXT,mailing_zip TEXT,mailing_country TEXT,latitude REAL,longitude REAL,created_at DATETIME,updated_at DATETIME,mailing_county TEXT,physical_county TEXT,taggable_type TEXT);
 21 | CREATE TABLE IF NOT EXISTS member_orgs (id INTEGER PRIMARY KEY,name TEXT,created_at DATETIME,updated_at DATETIME);
 22 | CREATE TABLE IF NOT EXISTS member_orgs_organizations (member_org_id INTEGER NOT NULL,organization_id INTEGER NOT NULL);
 23 | CREATE TABLE IF NOT EXISTS org_types (id INTEGER PRIMARY KEY,name TEXT,description TEXT,created_at DATETIME,updated_at DATETIME);
 24 | CREATE TABLE IF NOT EXISTS org_types_organizations (org_type_id INTEGER NOT NULL,organization_id INTEGER NOT NULL);
 25 | CREATE TABLE IF NOT EXISTS organizations (id INTEGER PRIMARY KEY,name TEXT NOT NULL,description TEXT,created_by_id INTEGER,updated_by_id INTEGER,phone TEXT,fax TEXT,email TEXT,website TEXT,year_founded DATETIME,democratic INTEGER,primary_location_id INTEGER,created_at DATETIME,updated_at DATETIME,legal_structure_id INTEGER,access_rule_id INTEGER NOT NULL,import_notice_sent_at DATETIME,email_response_token TEXT,responded_at DATETIME,response TEXT);
 26 | CREATE TABLE IF NOT EXISTS organizations_people (id INTEGER PRIMARY KEY,organization_id INTEGER NOT NULL,person_id INTEGER NOT NULL,role_name TEXT,phone TEXT,email TEXT,created_at DATETIME,updated_at DATETIME);
 27 | CREATE TABLE IF NOT EXISTS organizations_sectors (organization_id INTEGER NOT NULL,sector_id INTEGER NOT NULL);
 28 | CREATE TABLE IF NOT EXISTS organizations_users (organization_id INTEGER NOT NULL,user_id INTEGER NOT NULL,created_at DATETIME,updated_at DATETIME);
 29 | CREATE TABLE IF NOT EXISTS people (id INTEGER PRIMARY KEY,firstname TEXT,lastname TEXT,phone_mobile TEXT,phone_home TEXT,fax TEXT,email TEXT,phone_contact_preferred INTEGER,email_contact_preferred INTEGER,created_at DATETIME,updated_at DATETIME,access_rule_id INTEGER NOT NULL);
 30 | CREATE TABLE IF NOT EXISTS product_services (id INTEGER PRIMARY KEY,name TEXT,organization_id INTEGER,created_at DATETIME,updated_at DATETIME);
 31 | CREATE TABLE IF NOT EXISTS schema_migrations (version TEXT NOT NULL);
 32 | CREATE TABLE IF NOT EXISTS sectors (id INTEGER PRIMARY KEY,name TEXT,created_at DATETIME,updated_at DATETIME);
 33 | CREATE TABLE IF NOT EXISTS tag_contexts (id INTEGER PRIMARY KEY,name TEXT,friendly_name TEXT);
 34 | CREATE TABLE IF NOT EXISTS tag_worlds (id INTEGER PRIMARY KEY,name TEXT);
 35 | CREATE TABLE IF NOT EXISTS taggings (id INTEGER PRIMARY KEY,tag_id INTEGER,taggable_id INTEGER,taggable_type TEXT,created_at DATETIME);
 36 | CREATE TABLE IF NOT EXISTS tags (id INTEGER PRIMARY KEY,name TEXT,root_id INTEGER,root_type TEXT,parent_id INTEGER,effective_id INTEGER,created_at DATETIME,updated_at DATETIME);
 37 | CREATE TABLE IF NOT EXISTS users (id INTEGER PRIMARY KEY,login TEXT,password TEXT,is_admin INTEGER,created_at DATETIME,last_login DATETIME,person_id INTEGER,update_notifications_enabled INTEGER);
 38 | '''
 39 | 
 40 | 
 41 | def get_prop(key,rows):
 42 |     val = None
 43 |     many_versions = False
 44 |     for row in rows:
 45 |         v = row[key]
 46 |         if v != None:
 47 |             if val == None:
 48 |                 val = v
 49 |             if v != val:
 50 |                 many_versions = True
 51 |     return val, many_versions
 52 | 
 53 | def get_props(keys,rows,first):
 54 |     result = {}
 55 |     for key in keys:
 56 |         val, many_versions = get_prop(key,rows)
 57 |         if many_versions and not(first):
 58 |             val = None
 59 |         result[key] = val
 60 |     return result
 61 | 
 62 | def get_common_props(rows):
 63 |     return get_props(rows[0].keys(),rows,False)
 64 | 
 65 | def get_main_props(rows):
 66 |     return get_props(rows[0].keys(),rows,True)
 67 | 
 68 | def fix_email(email):
 69 |     if email==None:
 70 |         return email
 71 |     email = str(email)
 72 |     email = re.sub(r'mailto:','',email)
 73 |     return email
 74 | 
 75 | def make_org(props):
 76 |     organization = {
 77 |         'name': props["NAME"],
 78 |         'phone': props["PHONE"],
 79 |         'email': fix_email(props["EMAIL"]),
 80 |         'website': props["WEBSITE"],
 81 |         'description': props["GOODS AND SERVICES"],
 82 |         'access_rule_id': 1
 83 |         }
 84 |     return organization
 85 | 
 86 | def safe_access(props,key):
 87 |     if not(key in props):
 88 |         return None
 89 |     x = props[key]
 90 |     if x == "":
 91 |         return None
 92 |     return x
 93 | 
 94 | def make_loc(props,rid):
 95 |     location = {
 96 |         'physical_address1': props["Physical Address"],
 97 |         'physical_address2': None,
 98 |         'physical_city': props["City"],
 99 |         'physical_state': props["State"],
100 |         'physical_zip': safe_access(props,"Postal Code"),
101 |         'physical_country': props["Country"],
102 |         'latitude': safe_access(props,"Latitude"),
103 |         'longitude': safe_access(props,"Longitude"),
104 |         'taggable_id': rid,
105 |         'taggable_type': "Organization"
106 |         }
107 |     return location
108 | 
109 | def insert_hash(cur,tbl,values):
110 |     columns = ', '.join([('"'+v+'"') for v in values.keys()])
111 |     placeholders = ', '.join('?' * len(values))
112 |     sql = 'INSERT INTO {} ({}) VALUES ({})'.format(tbl,columns,placeholders)
113 |     # print(sql)
114 |     # print(values.values())
115 |     cur.execute(sql, list(values.values()))
116 |     return cur.lastrowid
117 | 
118 | def blanky(x):
119 |     if x == "" or x == None:
120 |         return None
121 |     return x
122 | 
123 | 
124 | def write_destination_stone_soup(params, state):
125 | 
126 |     path = state['path']
127 |     output_file = state['output_file']
128 | 
129 |     target = os.path.join(path, 'stonesoup.sqlite3')
130 |     state['sqlite_file'] = target
131 | 
132 |     if os.path.exists(target):
133 |         os.remove(target)
134 |     con = lite.connect(target)
135 |     cur = con.cursor()
136 | 
137 |     global schema
138 |     cur.executescript(schema)
139 | 
140 |     ot = insert_hash(cur, "tag_contexts", {
141 |             'name': 'OrgType',
142 |             'friendly_name': 'Organization Type'
143 |             })
144 |     ot = insert_hash(cur, "tags", {
145 |             'name': 'OrgType',
146 |             'root_id': ot,
147 |             'root_type': "TagContext"
148 |             })
149 | 
150 |     cur.execute('INSERT OR REPLACE INTO access_rules VALUES (1,"PUBLIC");')
151 | 
152 |     cur.execute('INSERT OR REPLACE INTO data_sharing_orgs (id,name) VALUES (1,?);',
153 |                 [params['organization']])
154 | 
155 |     org_names = []
156 |     orgs = {}
157 | 
158 |     lol = json.load(open(output_file))["tables"]["directory"]["rows"]
159 | 
160 |     # collect all locations for each org
161 |     for idx, row in enumerate(lol):
162 |         name = row['NAME']
163 |         if not(name in orgs):
164 |             orgs[name] = []
165 |             org_names.append(name)
166 |         orgs[name].append(row)
167 | 
168 |     organizations = []
169 | 
170 |     print("ORG COUNT " + str(len(org_names)))
171 | 
172 |     for idx, name in enumerate(org_names):
173 |         rows = orgs[name]
174 |         common = get_common_props(rows)
175 |         main = get_main_props(rows)
176 |         print(name + " : " + str(common) + " " + str(len(rows)))
177 |         organization = make_org(common)
178 |         rid = insert_hash(cur, "organizations", organization)
179 |         fid = None
180 |         for row in rows:
181 |             loc = make_loc(row, rid)
182 |             if loc['latitude'] == None:
183 |                 loc['latitude'] = blanky(row['Latitude'])
184 |             if loc['longitude'] == None:
185 |                 loc['longitude'] = blanky(row['Longitude'])
186 |             if loc['physical_zip'] == None:
187 |                 loc['physical_zip'] = blanky(row['Postal Code'])
188 |             fid0 = insert_hash(cur,"locations",loc)
189 |             if fid == None:
190 |                 fid = fid0
191 |         cur.execute("UPDATE organizations SET primary_location_id = ? WHERE id = ?",
192 |                     [fid, rid])
193 |         insert_hash(cur,"data_sharing_orgs_taggables",{
194 |                 "data_sharing_org_id": 1,
195 |                 "taggable_id": rid,
196 |                 "taggable_type": "Organization",
197 |                 "verified": 1,
198 |                 "foreign_key_id": 999
199 |                 })
200 |         typ = main["TYPE"]
201 |         if typ:
202 |             v = cur.execute('SELECT id FROM org_types WHERE name = ?',[typ]).fetchall()
203 |             tid = None
204 |             if len(v) == 0:
205 |                 tid = insert_hash(cur,"org_types",{
206 |                         'name': typ
207 |                         })
208 |                 tid = insert_hash(cur,"tags",{
209 |                         'name': typ,
210 |                         'root_id': tid,
211 |                         'root_type': "OrgType",
212 |                         'parent_id': ot
213 |                         })
214 |             else:
215 |                 tid = v[0][0]
216 |                 tid = cur.execute('SELECT id FROM tags WHERE root_id = ? AND root_type = "OrgType"',[tid]).fetchall()[0][0]
217 |             insert_hash(cur,"taggings",{
218 |                     "tag_id": tid,
219 |                     "taggable_id": rid,
220 |                     "taggable_type": "Organization"
221 |                     })
222 |         dex = main['Index']
223 |         if dex:
224 |             for dex in [x.strip() for x in dex.lower().split(',')]:
225 |                 v = cur.execute('SELECT id FROM tags WHERE name = ?',[dex]).fetchall()
226 |                 tid = None
227 |                 if len(v) == 0:
228 |                     tid = insert_hash(cur,"tags",{
229 |                             'name': dex
230 |                             })
231 |                 else:
232 |                     tid = v[0][0]
233 |                 insert_hash(cur,"taggings",{
234 |                         "tag_id": tid,
235 |                         "taggable_id": rid,
236 |                         "taggable_type": "Organization"
237 |                         })
238 |         dex = main['IndexTilde']
239 |         if dex:
240 |             for dex in [x.strip() for x in dex.lower().split(' ~ ')]:
241 |                 v = cur.execute('SELECT id FROM tags WHERE name = ?',[dex]).fetchall()
242 |                 tid = None
243 |                 if len(v) == 0:
244 |                     tid = insert_hash(cur,"tags",{
245 |                             'name': dex
246 |                             })
247 |                 else:
248 |                     tid = v[0][0]
249 |                 insert_hash(cur,"taggings",{
250 |                         "tag_id": tid,
251 |                         "taggable_id": rid,
252 |                         "taggable_type": "Organization"
253 |                         })
254 | 
255 |     with open('junk.json', 'w') as outfile:
256 |       json.dump(organizations, outfile)
257 | 
258 |     con.commit()
259 |     con.close()
260 | 
261 | 


--------------------------------------------------------------------------------
/sheetsite/destination/stone_soup_v2.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | from contextlib import contextmanager
  4 | import dataset
  5 | from datetime import date, datetime
  6 | import json
  7 | import os
  8 | import re
  9 | from tqdm import tqdm
 10 | import uuid
 11 | import dateutil.parser
 12 | 
 13 | 
 14 | def get_prop(key, rows):
 15 |     val = None
 16 |     many_versions = False
 17 |     for row in rows:
 18 |         v = row[key]
 19 |         if v is not None:
 20 |             if val is None:
 21 |                 val = v
 22 |             if v != val:
 23 |                 many_versions = True
 24 |     return val, many_versions
 25 | 
 26 | 
 27 | def get_props(keys, rows, first):
 28 |     result = {}
 29 |     for key in keys:
 30 |         val, many_versions = get_prop(key, rows)
 31 |         if many_versions and not(first):
 32 |             val = None
 33 |         result[key] = val
 34 |     return result
 35 | 
 36 | 
 37 | def get_common_props(rows):
 38 |     return get_props(rows[0].keys(), rows, False)
 39 | 
 40 | 
 41 | def get_main_props(rows):
 42 |     return get_props(rows[0].keys(), rows, True)
 43 | 
 44 | 
 45 | def anykey(props, *keys):
 46 |     optional = (None in keys)
 47 |     keys = list(filter(None, keys))
 48 |     prop_keys = dict((key.upper(), key) for key in props.keys())
 49 |     for key in keys:
 50 |         key = key.upper()
 51 |         if key in prop_keys:
 52 |             return props[prop_keys[key]]
 53 |     # fail deliberately
 54 |     if optional:
 55 |         return None
 56 |     return props[keys[0]]
 57 | 
 58 | 
 59 | def fix_email(email):
 60 |     if email is None:
 61 |         return email
 62 |     email = str(email)
 63 |     email = re.sub(r'mailto:', '', email)
 64 |     return email
 65 | 
 66 | 
 67 | def as_year(when):
 68 |     if when is None:
 69 |         return when
 70 |     when = str(when)
 71 |     when = when.replace('.', ' ')
 72 |     when = when.replace('-', ' ')
 73 |     when = when.replace('/', ' ')
 74 |     parts = when.split(' ')
 75 |     for part in parts:
 76 |         if len(part) == 4 and re.match('^[0-9]{4}$', part):
 77 |             return date(int(part), 1, 1)
 78 |     return None
 79 | 
 80 | 
 81 | def fix_website(x):
 82 |     if x is None:
 83 |         return x
 84 |     x = x.strip()
 85 |     x = x.split(' ')
 86 |     if len(x) == 0:
 87 |         return None
 88 |     return x[0]
 89 | 
 90 | 
 91 | def make_org(props):
 92 |     organization = {
 93 |         'name': anykey(props, "NAME", "CompanyName"),
 94 |         'phone': anykey(props, "PHONE", "WorkPhone"),
 95 |         'email': fix_email(anykey(props, "EMAIL", "Email Address")),
 96 |         'website': fix_website(anykey(props, "WEBSITE", "Web Address", None)),
 97 |         'description': anykey(props, "GOODS AND SERVICES", "Description", None),
 98 |         'year_founded': as_year(anykey(props, "year_founded", "year founded", None)),
 99 |         'access_rule_id': 1,
100 |         'source_grouping': anykey(props, 'source_grouping', None),
101 |         'mode': anykey(props, 'mode', None)
102 |         }
103 |     if 'stamp' in props:
104 |         if props['stamp'] is not None:
105 |             organization['updated_at'] = date(int(props['stamp']), 1, 1)
106 |     if 'updated_at' in props:
107 |         if props['updated_at'] is not None:
108 |             organization['updated_at'] = dateutil.parser.parse(props['updated_at'])
109 |     return organization
110 | 
111 | 
112 | def safe_access(props, key):
113 |     if not(key in props):
114 |         return None
115 |     x = props[key]
116 |     if x == "":
117 |         return None
118 |     return x
119 | 
120 | 
121 | def make_loc(props, rid):
122 |     location = {
123 |         'physical_address1': anykey(props, "Street Address",
124 |                                     "Street", "Physical Address", "street1"),
125 |         'physical_address2': anykey(props, "street2", None),
126 |         'physical_city': anykey(props, "city"),
127 |         'physical_state': anykey(props, "state"),
128 |         'physical_zip': anykey(props, "zip", "postal code"),
129 |         'physical_country': anykey(props, "country"),
130 |         'mailing_address1': anykey(props, "mailing_address1", None),
131 |         'mailing_address2': anykey(props, "mailing_address2", None),
132 |         'mailing_city': anykey(props, "mailing_city", None),
133 |         'mailing_state': anykey(props, "mailing_state", None),
134 |         'mailing_zip': anykey(props, "mailing_zip", None),
135 |         'mailing_country': anykey(props, "mailing_country", None),
136 |         'latitude': anykey(props, "lat", "Latitude", "latitude", None),
137 |         'longitude': anykey(props, "lng", "Longitude", "longitude", None),
138 |         'taggable_id': rid,
139 |         'taggable_type': "Organization",
140 |         'dccid': anykey(props, 'dccid')
141 |         }
142 |     return location
143 | 
144 | 
145 | class DirectToDB(object):
146 |     def __init__(self, cur):
147 |         self.cur = cur
148 | 
149 |     def column(self, tbl, column, example):
150 |         return self.cur[tbl].create_column_by_example(column, example)
151 | 
152 |     def index(self, tbl, columns):
153 |         return self.cur[tbl].create_index(columns)
154 | 
155 |     def insert(self, tbl, values):
156 |         return self.cur[tbl].insert(values)
157 | 
158 |     def delete(self, tbl, **conds):
159 |         return self.cur[tbl].delete(**conds)
160 | 
161 |     def update(self, tbl, values, keys):
162 |         self.cur[tbl].update(values, keys)
163 | 
164 |     def upsert(self, tbl, values, keys):
165 |         result = self.cur[tbl].upsert(values, keys)
166 |         if result is not True:
167 |             return result
168 |         vs = dict((k, values[k]) for k in keys)
169 |         return self.cur[tbl].find_one(**vs)['id']
170 | 
171 |     def find(self, tbl, **conds):
172 |         return self.cur[tbl].find(**conds)
173 | 
174 |     def find_one(self, tbl, **conds):
175 |         return self.cur[tbl].find_one(**conds)
176 | 
177 |     @contextmanager
178 |     def transaction(self):
179 |         with self.cur as x:
180 |             yield DirectToDB(x)
181 | 
182 | def is_blank(x):
183 |     return x is None or x == ""
184 | 
185 | def blanky(x):
186 |     if x == "" or x is None:
187 |         return None
188 |     return x
189 | 
190 | 
191 | def floaty(x):
192 |     if x is None or x == "":
193 |         return None
194 |     return float(x)
195 | 
196 | 
197 | class TargetDB(object):
198 | 
199 |     def __init__(self, target_db):
200 |         cur = DirectToDB(target_db)
201 |         cur.upsert("tag_contexts", {
202 |             'name': 'OrgType',
203 |             'friendly_name': 'Organization Type'
204 |         }, ['name'])
205 |         cur.upsert("tag_contexts", {
206 |             'name': 'MemberOrg',
207 |             'friendly_name': 'Member Organization Affiliation'
208 |         }, ['name'])
209 |         cur.upsert("tag_contexts", {
210 |             'name': 'Sector',
211 |             'friendly_name': 'Business Sector'
212 |         }, ['name'])
213 |         cur.upsert("tag_contexts", {
214 |             'name': 'LegalStructure',
215 |             'friendly_name': 'Legal Structure'
216 |         }, ['name'])
217 |         dcc = cur.upsert("tags", {
218 |             'name': 'dcc',
219 |             'root_id': 1,
220 |             'root_type': "TagWorld"
221 |         }, ['name'])
222 |         for name in ['OrgType', 'Sector', 'MemberOrg', 'LegalStructure']:
223 |             cur.upsert("tags", {
224 |                 'name': name,
225 |                 'root_id': cur.find_one('tag_contexts', name=name)['id'],
226 |                 'root_type': "TagContext",
227 |                 'parent_id': dcc
228 |             }, ['name'])
229 |         self.ot = cur.find_one("tags", name='OrgType')['id']
230 |         cur.upsert("tag_worlds", {
231 |             'name': 'dcc',
232 |         }, ['name'])
233 | 
234 |         cur.column('users', 'login', 'x')
235 |         cur.column('users', 'password', 'x')
236 |         cur.column('users', 'is_admin', 1)
237 |         cur.column('users', 'person_id', 1)
238 |         cur.column('users', 'last_login', datetime.now())
239 |         cur.column('organizations', 'grouping', 'x')
240 |         cur.column('locations', 'mailing_address1', 'x')
241 |         cur.column('locations', 'mailing_address2', 'x')
242 |         cur.column('locations', 'mailing_city', 'x')
243 |         cur.column('locations', 'mailing_state', 'x')
244 |         cur.column('locations', 'mailing_zip', 'x')
245 |         cur.column('locations', 'mailing_country', 'x')
246 |         cur.column('locations', 'mailing_county', 'x')
247 |         cur.column('locations', 'physical_zip', 'x')
248 |         cur.column('locations', 'physical_county', 'x')
249 |         for tab in ['organizations', 'locations']:
250 |             cur.column(tab, 'dccid', 'x')
251 |             cur.column(tab, 'created_at', datetime.now())
252 |             cur.column(tab, 'updated_at', datetime.now())
253 |         cur.column('people', 'firstname', 'x')
254 |         cur.column('people', 'lastname', 'x')
255 |         cur.column('people', 'updated_at', datetime.now())
256 |         cur.column('organizations_people', 'person_id', 1)
257 |         cur.column('organizations_people', 'organization_id', 1)
258 |         cur.column('tags', 'effective_id', 1)
259 |         cur.column('locations', 'note', 'x')
260 |         cur.column('organizations', 'fax', 'x')
261 |         cur.column('organizations', 'year_founded', datetime.now())
262 |         cur.column('organizations', 'source_grouping', 'x')
263 |         cur.column('product_services', 'name', 'x')
264 |         cur.column('product_services', 'organization_id', 1)
265 |         cur.column('organizations_users', 'user_id', 1)
266 |         cur.column('organizations_users', 'organization_id', 1)
267 |         cur.column('users', 'login', 'x')
268 | 
269 |         cur.column('access_rules', 'access_type', 'PUBLIC')
270 |         cur.upsert('access_rules', {'id': 1, 'access_type': 'PUBLIC'}, ['id'])
271 | 
272 |         cur.column('data_sharing_orgs', 'name', 'x')
273 | 
274 |         cur.column('data_sharing_orgs_users', 'user_id', 1)
275 |         cur.column('data_sharing_orgs_users', 'data_sharing_org_id', 1)
276 | 
277 |         cur.column('member_orgs_organizations', 'member_org_id', 1)
278 |         cur.column('member_orgs_organizations', 'organization_id', 1)
279 | 
280 |         cur.column('org_types_organizations', 'org_type_id', 1)
281 |         cur.column('org_types_organizations', 'organization_id', 1)
282 | 
283 |         cur.column('organizations_sectors', 'sector_id', 1)
284 |         cur.column('organizations_sectors', 'organization_id', 1)
285 | 
286 |         cur.column('member_orgs', 'name', 'x')
287 | 
288 |         cur.column('sectors', 'name', 'x')
289 | 
290 |         cur.column('taggings', 'tag_id', 1)
291 |         cur.column('taggings', 'taggable_id', 1)
292 |         cur.column('taggings', 'taggable_type', 'x')
293 | 
294 |         cur.column('data_sharing_orgs_taggables', 'data_sharing_org_id', 1)
295 |         cur.column('data_sharing_orgs_taggables', 'taggable_id', 1)
296 |         cur.column('data_sharing_orgs_taggables', 'taggable_type', 'x')
297 |         cur.column('data_sharing_orgs_taggables', 'verified', 1)
298 | 
299 |         cur.index('locations', ['taggable_id', 'taggable_type'])
300 |         cur.index('product_services', ['organization_id'])
301 |         cur.index('organizations_sectors', ['organization_id'])
302 |         cur.index('organizations_sectors', ['sector_id'])
303 |         cur.index('organizations_people', ['organization_id'])
304 |         cur.index('organizations_people', ['person_id'])
305 |         cur.index('tags', ['name'])
306 |         cur.index('tags', ['root_id', 'root_type'])
307 |         cur.index('tags', ['parent_id'])
308 |         cur.index('taggings', ['tag_id'])
309 |         cur.index('taggings', ['taggable_id', 'taggable_type'])
310 |         cur.index('tag_contexts', ['name'])
311 |         cur.index('tag_worlds', ['name'])
312 |         cur.index('data_sharing_orgs_taggables', ['data_sharing_org_id'])
313 |         cur.index('data_sharing_orgs_taggables', ['taggable_type'])
314 |         cur.index('data_sharing_orgs_taggables', ['taggable_id', 'taggable_type'])
315 | 
316 |         self.cur = cur
317 | 
318 |     def get_org_type(self):
319 |         return self.ot
320 | 
321 |     def set_name(self, name):
322 |         cur = self.cur
323 |         dso = name
324 |         dso_id = cur.upsert('data_sharing_orgs',
325 |                             {'name': name},
326 |                             ['name'])
327 |         self.dso = dso
328 |         self.dso_id = dso_id
329 |         tabs = ['locations', 'organizations', 'taggings',
330 |                 'data_sharing_orgs_taggables',
331 |                 'data_sharing_orgs']
332 | 
333 |         def prep(tab):
334 |             cur.column(tab, 'dso', 'x')
335 |             cur.column(tab, 'dso_update', 'x')
336 |             cur.update(tab, {
337 |                 'dso': dso,
338 |                 'dso_update': 'old'
339 |             }, ['dso'])
340 |         for tab in tabs:
341 |             prep(tab)
342 |         self.tabs = tabs
343 | 
344 |     def clear(self):
345 |         for tab in self.tabs:
346 |             self.cur.delete(tab, dso=self.dso, dso_update='old')
347 | 
348 | 
349 | def apply(params, state):
350 | 
351 |     path = merge_path = state['path']
352 |     output_file = state['output_file']
353 | 
354 |     if 'merge_path' in params:
355 |         merge_path = params['merge_path']
356 |     elif 'MERGE_PATH' in os.environ:
357 |         merge_path = os.environ['MERGE_PATH']
358 | 
359 |     target = os.path.abspath(os.path.join(merge_path,
360 |                                           'stonesoup.sqlite3'))
361 |     target_perm = os.path.abspath(os.path.join(path,
362 |                                                'stonesoup.sqlite3'))
363 |     state['sqlite_file'] = target_perm
364 | 
365 |     tdb = TargetDB(dataset.connect("sqlite:///" + target))
366 |     cur = tdb.cur
367 |     tdb.set_name(params['organization'])
368 |     dso = tdb.dso
369 |     dso_id = tdb.dso_id
370 |     ot = tdb.get_org_type()
371 | 
372 |     org_names = []
373 |     orgs = {}
374 | 
375 |     print("READING", output_file)
376 |     tables = json.load(open(output_file))
377 |     selection = tables['names'][0]
378 |     lol = tables['tables'][selection]["rows"]
379 | 
380 |     # collect all locations for each org
381 |     for idx, row in tqdm(list(enumerate(lol))):
382 |         name = anykey(row, 'row_group', 'NAME', 'CompanyName')
383 |         if not(name in orgs):
384 |             orgs[name] = []
385 |             org_names.append(name)
386 |         orgs[name].append(row)
387 | 
388 |     print("ORG COUNT " + str(len(org_names)))
389 | 
390 |     for idx, name in tqdm(list(enumerate(org_names))):
391 |         rows = orgs[name]
392 |         print("Org {} / {} has {} rows".format(idx, name, len(rows)))
393 |         lct = 0
394 |         for row in rows:
395 |             loc = make_loc(row, None)
396 |             if not(is_blank(loc['physical_state']) and is_blank(loc['physical_country'])
397 |                     and is_blank(loc['physical_address1'])):
398 |                 lct += 1
399 |         if lct == 0:
400 |             continue
401 |         common = get_common_props(rows)
402 |         main = get_main_props(rows)
403 |         # print(name + " : " + str(common) + " " + str(len(rows)))
404 |         organization = make_org(common)
405 |         # print(organization, rows)
406 |         # get a dccid
407 |         ids = set(filter(None, [row['dccid'] for row in rows])) - set([''])
408 |         oid = None
409 |         for id in ids:
410 |             y = list(cur.find('oids', dccid=id))
411 |             if len(y) > 0:
412 |                 oid = y[0]['oid']
413 |                 break
414 |         if oid is None:
415 |             oid = str(uuid.uuid4())
416 |         with cur.transaction() as cur1:
417 |             for id in ids:
418 |                 cur1.upsert('oids', {'oid': oid, 'dccid': id}, ['dccid'])
419 |         organization['oid'] = oid
420 |         organization['dso'] = dso
421 |         organization['dso_update'] = 'fresh'
422 |         rid = cur.upsert("organizations", organization, ['oid'])
423 |         fid = None
424 |         with cur.transaction() as cur1:
425 |             for row in rows:
426 |                 loc = make_loc(row, rid)
427 |                 if loc['latitude'] is None or loc['latitude'] == "":
428 |                     loc['latitude'] = floaty(blanky(row['Latitude']))
429 |                 if loc['longitude'] is None or loc['longitude'] == "":
430 |                     loc['longitude'] = floaty(blanky(row['Longitude']))
431 |                 if loc['physical_zip'] is None:
432 |                     loc['physical_zip'] = blanky(row['Postal Code'])
433 |                 if loc['dccid'] is None:
434 |                     loc['dccid'] = blanky(row['dccid'])
435 |                 loc['dso'] = dso
436 |                 loc['dso_update'] = 'fresh'
437 |                 fid0 = cur1.upsert("locations", loc, ['dccid'])
438 |                 if fid is None:
439 |                     fid = fid0
440 |         with cur.transaction() as cur1:
441 |             cur1.update('organizations',
442 |                         {'id': rid, 'primary_location_id': fid},
443 |                         ['id'])
444 |             cur1.upsert("data_sharing_orgs_taggables", {
445 |                 "data_sharing_org_id": dso_id,
446 |                 "taggable_id": rid,
447 |                 "taggable_type": "Organization",
448 |                 "verified": 1,
449 |                 "foreign_key_id": 999,
450 |                 "dso": dso,
451 |                 "dso_update": "fresh"
452 |             }, ['data_sharing_org_id', 'taggable_id', 'taggable_type'])
453 |         typs = main["TYPE"]
454 |         if typs is None:
455 |             typs = ""
456 |         typs = typs.split(',')
457 |         if "dcc_status" in main:
458 |             typ0 = main['dcc_status']
459 |             if typ0:
460 |                 typs.append(typ0)
461 |         typs = [typ.strip() for typ in typs if typ.strip() != ""]
462 |         for typ in typs:
463 |             v = list(cur.find('org_types', name=typ))
464 |             tid = None
465 |             if len(v) == 0:
466 |                 tid = cur.insert("org_types", {
467 |                         'name': typ
468 |                         })
469 |             else:
470 |                 tid = v[0]['id']
471 |             nid = cur.find_one('tags', root_id=tid, root_type='OrgType')
472 |             if nid is None:
473 |                 tid = cur.insert("tags", {
474 |                         'name': typ,
475 |                         'root_id': tid,
476 |                         'root_type': "OrgType",
477 |                         'parent_id': ot
478 |                         })
479 |             else:
480 |                 tid = nid['id']
481 |             cur.upsert("taggings", {
482 |                 "tag_id": tid,
483 |                 "taggable_id": rid,
484 |                 "taggable_type": "Organization",
485 |                 "dso": dso,
486 |                 "dso_update": "fresh"
487 |             }, ['tag_id', 'taggable_id', 'taggable_type'])
488 |         dex = main['Index']
489 |         if dex:
490 |             for dex in [x.strip() for x in dex.lower().split(',')]:
491 |                 v = list(cur.find('tags', name=dex))
492 |                 tid = None
493 |                 if len(v) == 0:
494 |                     tid = cur.insert("tags", {
495 |                             'name': dex
496 |                             })
497 |                 else:
498 |                     tid = v[0]['id']
499 |                 cur.insert("taggings", {
500 |                     "tag_id": tid,
501 |                     "taggable_id": rid,
502 |                     "taggable_type": "Organization",
503 |                     "dso": dso,
504 |                     "dso_update": "fresh"
505 |                 })
506 |         if 'tags' in main:
507 |             dex = main['tags']
508 |             if dex:
509 |                 try:
510 |                     lst = [x.strip() for x in dex.split(';;')]
511 |                 except:
512 |                     lst = dex
513 |                 for idex in lst:
514 |                     parts = idex.split('|')
515 |                     if len(parts) > 0:
516 |                         pass
517 |                     parent_id = None
518 |                     for part in parts:
519 |                         v = list(cur.find('tags', name=part, parent_id=parent_id))
520 |                         tid = None
521 |                         if len(v) == 0:
522 |                             tid = cur.insert("tags", {
523 |                                     'name': part,
524 |                                     'parent_id': parent_id
525 |                                     })
526 |                         else:
527 |                             tid = v[0]['id']
528 |                         parent_id = tid
529 |                     cur.insert("taggings", {
530 |                         "tag_id": parent_id,
531 |                         "taggable_id": rid,
532 |                         "taggable_type": "Organization",
533 |                         "dso": dso,
534 |                         "dso_update": "fresh"
535 |                     })
536 | 
537 |     tdb.clear()
538 | 
539 |     from shutil import copyfile
540 |     copyfile(target, target_perm)
541 | 
542 | 
543 | def apply_direct(target_db, name, source_db):
544 |     tdb = TargetDB(target_db)
545 |     tdb.set_name(name)
546 | 
547 |     oids = {}
548 |     pids = {}
549 | 
550 |     types = {}
551 | 
552 |     caps = {
553 |         'OrgType': 'org_types',
554 |         'Sector': 'sectors',
555 |         'LegalStructure': 'legal_structures',
556 |         'MemberOrg': 'member_orgs',
557 |         'TagContext': 'tag_contexts'
558 |     }
559 | 
560 |     dsos = {}
561 | 
562 |     # add dsos
563 |     with tdb.cur.transaction() as cur:
564 |         print('dsos')
565 |         for rec in tqdm(list(source_db['data_sharing_orgs'].all())):
566 |             fid = rec['id']
567 |             dccid = '{}_{}_{}'.format(name, 'DSO', fid)
568 |             rec['dccid'] = dccid
569 |             rec['dso'] = name
570 |             rec['dso_update'] = 'fresh'
571 |             rec.pop('id')
572 |             oid = cur.upsert("data_sharing_orgs", rec, ['dccid'])
573 |             dsos[fid] = oid
574 | 
575 |     # add types
576 |     for k in ['org_types', 'sectors', 'legal_structures', 'member_orgs', 'tag_contexts']:
577 |         print(k)
578 |         ts = types[k] = {}
579 |         with tdb.cur.transaction() as cur:
580 |             for rec in tqdm(list(source_db[k].all())):
581 |                 fid = rec.pop('id')
582 |                 tid = cur.upsert(k, rec, ['name'])
583 |                 ts[fid] = tid
584 | 
585 |     # add organizations
586 |     with tdb.cur.transaction() as cur:
587 |         print('organizations')
588 |         for org in tqdm(list(source_db['organizations'].all())):
589 |             fid = org['id']
590 |             dccid = '{}_{}_{}'.format(name, 'Organization', fid)
591 |             org['dccid'] = dccid
592 |             org['dso'] = name
593 |             org['dso_update'] = 'fresh'
594 |             pids[org['primary_location_id']] = fid
595 |             org.pop('id')
596 |             org.pop('created_by_id')
597 |             org.pop('updated_by_id')
598 |             org.pop('primary_location_id')
599 |             org.pop('legal_structure_id')
600 |             org.pop('access_rule_id')
601 |             org['access_rule_id'] = 1
602 |             oid = cur.upsert("organizations", org, ['dccid'])
603 |             oids[fid] = oid
604 | 
605 |     # add locations
606 |     with tdb.cur.transaction() as cur:
607 |         print('locations')
608 |         for org in tqdm(list(source_db['locations'].all())):
609 |             fid = org['id']
610 |             dccid = '{}_{}_{}'.format(name, 'Location', fid)
611 |             org['dccid'] = dccid
612 |             org['dso'] = name
613 |             org['dso_update'] = 'fresh'
614 |             org.pop('id')
615 |             if org['taggable_type'] != 'Organization':
616 |                 continue
617 |             org['taggable_id'] = oids[org['taggable_id']]
618 |             oid = cur.upsert("locations", org, ['dccid'])
619 |             pid = pids.get(fid)
620 |             if pid is not None:
621 |                 cur.update("organizations", {
622 |                     'primary_location_id': oid,
623 |                     'id': oids[pid]
624 |                 }, ['id'])
625 | 
626 |     tids = {}
627 | 
628 |     # add tags
629 |     with tdb.cur.transaction() as cur:
630 |         print('tags')
631 |         for rec in tqdm(list(source_db['tags'].all())):
632 |             fid = rec.pop('id')
633 |             rtype = rec['root_type']
634 |             rid = rec['root_id']
635 |             if rtype in caps:
636 |                 rtypes = types[caps[rtype]]
637 |                 rid = rtypes[rid]
638 |                 rec['root_id'] = rid
639 |             else:
640 |                 rec.pop('root_id')
641 |                 rec.pop('root_type')
642 |             pid = rec['parent_id']
643 |             rec.pop('parent_id')
644 |             if pid is not None:
645 |                 if pid in tids:
646 |                     rec['parent_id'] = tids[pid]
647 |             rec.pop('effective_id')
648 |             tid = cur.upsert("tags", rec, ['name'])
649 |             tids[fid] = tid
650 | 
651 |     # add taggings
652 |     ct = 0
653 |     goods = 0
654 |     with tdb.cur.transaction() as cur:
655 |         print('taggings')
656 |         for rec in tqdm(list(source_db['taggings'].all())):
657 |             if rec['taggable_type'] != 'Organization':
658 |                 continue
659 |             ct += 1
660 |             if rec['taggable_id'] is None:
661 |                 continue
662 |             if rec['tag_id'] is None:
663 |                 continue
664 |             fid = rec['id']
665 |             dccid = '{}_{}_{}'.format(name, 'Taggings', fid)
666 |             rec['dccid'] = dccid
667 |             rec.pop('id')
668 |             tid = rec['tag_id']
669 |             if tid not in tids:
670 |                 continue
671 |             rec['tag_id'] = tids[tid]
672 |             oid = rec['taggable_id']
673 |             if oid not in oids:
674 |                 continue
675 |             rec['taggable_id'] = oids[oid]
676 |             rec['dso'] = name
677 |             rec['dso_update'] = 'fresh'
678 |             cur.upsert("taggings", rec, ['dccid'])
679 |             goods += 1
680 |     print("taggings {} of which {} good".format(ct, goods))
681 | 
682 |     # add dso_taggables
683 |     with tdb.cur.transaction() as cur:
684 |         print('dso_taggables')
685 |         for rec in tqdm(list(source_db['data_sharing_orgs_taggables'].all())):
686 |             fid = rec['id']
687 |             dccid = '{}_{}_{}'.format(name, 'DSO_taggables', fid)
688 |             rec['dccid'] = dccid
689 |             rec['dso'] = name
690 |             rec['dso_update'] = 'fresh'
691 |             rec.pop('id')
692 |             did = rec['data_sharing_org_id']
693 |             if did not in dsos:
694 |                 continue
695 |             rec['data_sharing_org_id'] = dsos[did]
696 |             if rec['taggable_type'] != 'Organization':
697 |                 continue
698 |             tid = rec['taggable_id']
699 |             if tid not in oids:
700 |                 continue
701 |             rec['taggable_id'] = oids[tid]
702 |             oid = cur.upsert("data_sharing_orgs_taggables", rec, ['dccid'])
703 |             dsos[fid] = oid
704 | 
705 | 
706 |     tdb.clear()
707 | 
708 | 
709 | if __name__ == '__main__':
710 |     import sys
711 |     target = sys.argv[1]
712 |     name = sys.argv[2]
713 |     source = sys.argv[3]
714 |     target_db = dataset.connect('sqlite:///' + target)
715 |     source_db = dataset.connect('sqlite:///' + source)
716 |     apply_direct(target_db, name, source_db)
717 | 


--------------------------------------------------------------------------------
/sheetsite/expand.py:
--------------------------------------------------------------------------------
 1 | from collections import OrderedDict
 2 | import json
 3 | import os
 4 | import six
 5 | import yaml
 6 | 
 7 | 
 8 | # borrowed code to load yaml dicts as ordered
 9 | def ordered_load(stream, Loader=yaml.Loader, object_pairs_hook=OrderedDict):
10 |     class OrderedLoader(Loader):
11 |         pass
12 | 
13 |     def construct_mapping(loader, node):
14 |         loader.flatten_mapping(node)
15 |         return object_pairs_hook(loader.construct_pairs(node))
16 |     OrderedLoader.add_constructor(
17 |         yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG,
18 |         construct_mapping)
19 |     return yaml.load(stream, OrderedLoader)
20 | 
21 | 
22 | def expand(x):
23 |     return os.path.expandvars(x)
24 | 
25 | 
26 | def expand_all(o):
27 |     if type(o) == dict:
28 |         return dict([[k, expand_all(v)] for k, v in o.items()])
29 |     if type(o) == list:
30 |         return [expand_all(x) for x in o]
31 |     if isinstance(o, six.string_types):
32 |         return expand(o)
33 |     return o
34 | 
35 | 
36 | def load_config(config_file):
37 |     with open(config_file, 'r') as config:
38 |         _, ext = os.path.splitext(config_file)
39 |         ext = ext.lower()
40 |         if ext == '.yml' or ext == '.yaml':
41 |             import yaml
42 |             params = ordered_load(config, yaml.SafeLoader)
43 |         else:
44 |             params = json.load(config)
45 |         params = expand_all(params)  # should make this optional
46 |     return params
47 | 


--------------------------------------------------------------------------------
/sheetsite/filtered_spreadsheet.py:
--------------------------------------------------------------------------------
 1 | class FilteredSpreadsheet(object):
 2 |     def __init__(self, workbook, selector, processor):
 3 |         self.workbook = workbook
 4 |         titles = [(sheet, selector(sheet))
 5 |                   for sheet in self.workbook.worksheets()]
 6 |         self.sheets = [FilteredSheet(sheet, title, processor)
 7 |                        for sheet, title in titles
 8 |                        if title is not None]
 9 | 
10 |     def worksheets(self):
11 |         return self.sheets
12 | 
13 |     
14 | class FilteredSheet(object):
15 |     def __init__(self, sheet, title, processor):
16 |         self.sheet = sheet
17 |         self.name = title
18 |         self.processor = processor
19 | 
20 |     def get_all_values(self):
21 |         return self.processor(self.sheet, self.title)
22 | 
23 |     @property
24 |     def title(self):
25 |         return self.name
26 | 
27 | 


--------------------------------------------------------------------------------
/sheetsite/geocache.py:
--------------------------------------------------------------------------------
  1 | import dataset
  2 | import json
  3 | import logging
  4 | import os
  5 | import requests
  6 | import six
  7 | import time
  8 | 
  9 | GEOCODER = 'google' if 'GOOGLE_GEOCODER_KEY' in os.environ else None
 10 | 
 11 | class GeoCache(object):
 12 |     def __init__(self, filename, geocoder=GEOCODER, group_key=None):
 13 |         logging.basicConfig()
 14 |         logging.getLogger("dataset.persistence.table").setLevel(
 15 |             logging.ERROR
 16 |         )
 17 |         if '://' not in filename:
 18 |             filename = "sqlite:///{}".format(os.path.abspath(filename))
 19 |         self.db = dataset.connect(filename)
 20 |         self.geocache = self.db['geocache']
 21 |         self.update_schema()
 22 |         self.geocoder = geocoder
 23 |         self.group_key = group_key
 24 |         self.prev_row = None
 25 | 
 26 |     def update_schema(self):
 27 |         if 'geocache' not in self.db:
 28 |             self.db.create_table('geocache',
 29 |                                  primary_id='address',
 30 |                                  primary_type=self.db.types.string)
 31 | 
 32 |     def complete(self, result):
 33 |         if 'lat' in result and 'lng' in result:
 34 |             if result['lat'] is not None and result['lng'] is not None:
 35 |                 if result['lat'] != '' and result['lng'] != '':
 36 |                     result['latlng'] = "{},{}".format(result['lat'],
 37 |                                                       result['lng'])
 38 |         return result
 39 | 
 40 |     def find(self, address):
 41 |         if address is None or address.lower() == 'n/a':
 42 |             return {
 43 |                 'status': "not applicable"
 44 |             }
 45 |         results = self.geocache.find(address=address)
 46 |         for row in results:
 47 |             return self.complete(dict(row))
 48 |         result = self.find_without_cache(address)
 49 |         print("--- geocoded [{}]".format(result))
 50 |         if result is None:
 51 |             result = {
 52 |                 'address': address,
 53 |                 'status': 'unknown'
 54 |             }
 55 |             self.geocache.insert(result)
 56 |         else:
 57 |             result['status'] = 'ok'
 58 |             self.geocache.insert(result)
 59 |             self.db.commit()
 60 |         return self.complete(result)
 61 | 
 62 |     def blank(self, val):
 63 |         return val is None or val == ""
 64 | 
 65 |     def find_all(self, rows, pattern, cols):
 66 |         for row in rows:
 67 |             parts = []
 68 |             for p in pattern:
 69 |                 if isinstance(p, int):
 70 |                     if ((self.blank(row[p]) and self.prev_row and
 71 |                          self.prev_row[self.group_key] == row[self.group_key] and
 72 |                          not self.blank(self.group_key) and
 73 |                          not self.blank(row[self.group_key]))):
 74 |                         parts.append(self.prev_row[p])
 75 |                     else:
 76 |                         parts.append(row[p])
 77 |                 else:
 78 |                     parts.append(p)
 79 |             parts = [part for part in parts if not self.blank(part)]
 80 |             if six.PY2:
 81 |                 address = " ".join(str((x or '').encode('utf-8')) for x in parts)
 82 |             else:
 83 |                 address = " ".join(str(x or '') for x in parts)
 84 |             result = self.find(address)
 85 |             if result['status'] == 'ok':
 86 |                 for col in cols:
 87 |                     name = col[0].lower()
 88 |                     idx = col[1]
 89 |                     val = result[name]
 90 |                     if idx >= len(row):
 91 |                         row.append(None)
 92 |                     if row[idx] is None or row[idx] == '':
 93 |                         row[idx] = val
 94 |             if self.group_key:
 95 |                 if self.prev_row:
 96 |                     if self.prev_row[self.group_key] != row[self.group_key]:
 97 |                         self.prev_row = row
 98 |                 else:
 99 |                     self.prev_row = row
100 | 
101 |     def find_without_cache(self, address):
102 |         print("--- geocoding [{}]".format(address))
103 |         if self.geocoder == "datasciencetoolkit" or self.geocoder is None:
104 |             return self.find_without_cache_dstk(address)
105 |         if self.geocoder == "google":
106 |             return self.find_without_cache_gmap(address)
107 |         if self.geocoder == "dummy":
108 |             return self.find_without_cache_dummy(address)
109 |         raise ValueError('unknown geocoder {}'.format(self.geocoder))
110 | 
111 |     def find_without_cache_dummy(self, address):
112 |         return {
113 |             "address": address,
114 |             "lat": 10.0,
115 |             "lng": 10.0,
116 |             "street": "Street St",
117 |             "locality": "Cityville",
118 |             "region": "New State",
119 |             "country": "Countryland",
120 |             "postal_code": "PO-STAL",
121 |             "administrative_area_level_2": "Glig County",
122 |             "status": 'valid'
123 |         }
124 | 
125 |     def find_without_cache_dstk(self, address):
126 |         try:
127 |             r = requests.post("http://www.datasciencetoolkit.org/street2coordinates/", address,
128 |                               timeout=15)
129 |             v = json.loads(r.text)
130 |             v = v[address]
131 |             return {
132 |                 "address": address,
133 |                 "lat": v['latitude'],
134 |                 "lng": v['longitude'],
135 |                 "street": v['street_address'],
136 |                 "locality": v['locality'],
137 |                 "region": v['region'],
138 |                 "country": v['country_name'],
139 |                 "postal_code": None,
140 |                 "administrative_area_level_2": v['fips_county'],
141 |                 "status": 'valid'
142 |             }
143 |         except:
144 |             return None
145 | 
146 |     def find_without_cache_gmap(self, address, fallback=None):
147 |         try:
148 |             def get_part(cmps, name, fallback=None):
149 |                 zips = [cmp["long_name"] for cmp in cmps if name in cmp["types"]]
150 |                 zip = zips[0] if len(zips)>0 else fallback
151 |                 return zip
152 | 
153 |             v = None
154 |             xaddress = address
155 |             key = os.environ['GOOGLE_GEOCODER_KEY']
156 |             for delay in [1, 2, 4, 8]:
157 |                 r = requests.get("https://maps.googleapis.com/maps/api/geocode/json",
158 |                                  params={"sensor": "false", "address": xaddress, "key": key})
159 |                 time.sleep(delay)
160 |                 v = json.loads(r.text)
161 |                 print("v", v)
162 |                 if 'status' in v:
163 |                     if v['status'] == 'ZERO_RESULTS':
164 |                         if ',' in xaddress:
165 |                             xaddress = xaddress.split(',', 1)[1]
166 |                             continue
167 |                     if v['status'] != 'OVER_QUERY_LIMIT':
168 |                         break
169 |             coord = v["results"][0]["geometry"]["location"]
170 |             lat = coord["lat"]
171 |             lng = coord["lng"]
172 |             cmp = v["results"][0]["address_components"]
173 |             try:
174 |                 street = get_part(cmp, 'street_number', '') + ' ' + get_part(cmp, 'route')
175 |             except:
176 |                 street = None
177 |             return {
178 |                 "address": address,
179 |                 "lat": lat,
180 |                 "lng": lng,
181 |                 "street": street,
182 |                 "locality": get_part(cmp, 'locality'),
183 |                 "region": get_part(cmp, 'administrative_area_level_1'),
184 |                 "administrative_area_level_2": get_part(cmp, 'administrative_area_level_2'),
185 |                 "country": get_part(cmp, 'country'),
186 |                 "postal_code": get_part(cmp, 'postal_code')
187 |                 }
188 |         except Exception as e:
189 |             print("PROBLEM", e)
190 |             return None
191 | 
192 | 
193 | if __name__ == '__main__':
194 |     cache = GeoCache("cache.db")
195 |     # print(cache.find("305 Memorial Dr, Cambridge, MA"))
196 |     # print(cache.find("Chittenden, Franklin County, Connecticut, United States"))
197 |     print(cache.find("Lamoille County, Connecticut, United States"))
198 | 


--------------------------------------------------------------------------------
/sheetsite/google_spreadsheet.py:
--------------------------------------------------------------------------------
 1 | import pygsheets
 2 | 
 3 | 
 4 | class GoogleSpreadsheet(object):
 5 | 
 6 |     def __init__(self):
 7 |         self.connection = None
 8 |         self.workbook = None
 9 | 
10 |     def connect(self, credential_file):
11 |         self.connection = pygsheets.authorize(service_file=credential_file)
12 | 
13 |     def load_remote(self, spreadsheet):
14 |         self.workbook = self.connection.open_by_key(spreadsheet)
15 | 
16 |     def worksheets(self):
17 |         return self.workbook.worksheets()
18 | 


--------------------------------------------------------------------------------
/sheetsite/ids.py:
--------------------------------------------------------------------------------
 1 | import daff
 2 | import json
 3 | import os
 4 | import uuid
 5 | 
 6 | 
 7 | def process_ids(prev_file, curr_file, prev_id_file, id_file):
 8 |     io = daff.TableIO()
 9 |     dapp = daff.Coopy(io)
10 |     if not os.path.exists(prev_file):
11 |         prev_file = curr_file
12 |     v1 = dapp.loadTable(prev_file, 'local')
13 |     v2 = dapp.loadTable(curr_file, 'remote')
14 |     flags = daff.CompareFlags()
15 |     flags.allow_nested_cells = True
16 |     alignment = daff.compareTables3(None, v1, v2, flags).align()
17 |     daff.TableDiff(alignment, flags).hiliteSingle(daff.SimpleTable(0, 0))
18 |     if os.path.exists(prev_id_file):
19 |         in_refs = json.load(open(prev_id_file))
20 |     else:
21 |         in_refs = {}
22 |     out_refs = {}
23 |     for part in alignment.comp.child_order:
24 |         comp = alignment.comp.children.h.get(part)
25 |         nalignment = comp.alignment
26 |         order = nalignment.toOrder().getList()
27 |         v1 = comp.a
28 |         v2 = comp.b
29 |         ref = in_refs.get(part, {})
30 |         if part not in out_refs:
31 |             out_ref = out_refs[part] = {}
32 |         mints = 0
33 |         copies = 0
34 |         drops = 0
35 |         for o in order:
36 |             if o.r == 0:
37 |                 continue
38 |             if o.r >= 0 and o.l >= 0:
39 |                 src = ref.get(str(o.l))
40 |                 if src is None:
41 |                     out_ref[o.r] = str(uuid.uuid4())
42 |                     mints += 1
43 |                 else:
44 |                     out_ref[o.r] = ref[str(o.l)]
45 |                     copies += 1
46 |             if o.r < 0 and o.l >= 0:
47 |                 drops += 1
48 |             if o.r >= 0 and o.l < 0:
49 |                 out_ref[o.r] = str(uuid.uuid4())
50 |                 mints += 1
51 |     json.dump(out_refs, open(id_file, 'w'), indent=2)
52 |     return out_refs
53 | 


--------------------------------------------------------------------------------
/sheetsite/json_spreadsheet.py:
--------------------------------------------------------------------------------
 1 | from collections import OrderedDict
 2 | import json
 3 | 
 4 | 
 5 | class JsonSpreadsheet(object):
 6 | 
 7 |     def __init__(self, filename, data=None):
 8 |         if data is not None:
 9 |             self.data = data
10 |         else:
11 |             self.data = json.load(open(filename))
12 |         if 'tables' in self.data:
13 |             self.sheets = [JsonSheet(n, self.data['tables'][n])
14 |                            for n in self.data['names']]
15 |         else:
16 |             self.sheets = [JsonSheet('sheet', self.data['data'])]
17 | 
18 |     def worksheets(self):
19 |         return self.sheets
20 | 
21 |     @classmethod
22 |     def as_dict(cls, workbook):
23 |         result = OrderedDict()
24 |         order = result['names'] = []
25 |         sheets = result['tables'] = OrderedDict()
26 |         for sheet in workbook.worksheets():
27 |             title = sheet.title
28 |             order.append(title)
29 |             ws = sheets[title] = OrderedDict()
30 |             vals = sheet.get_all_values()
31 |             if len(vals) > 0:
32 |                 columns = vals[0]
33 |                 rows = vals[1:]
34 |                 ws['columns'] = columns
35 |                 ws['rows'] = [OrderedDict(zip(columns, row)) for row in rows]
36 |             else:
37 |                 ws['columns'] = []
38 |                 ws['rows'] = []
39 |         return result
40 | 
41 | 
42 | class JsonSheet(object):
43 | 
44 |     def __init__(self, name, data):
45 |         self.name = name
46 |         self.data = data
47 |         if isinstance(data, list):
48 |             print("WORKING WITH", data[0].keys())
49 |             self.columns = data[0].keys()
50 |             self.data = {"rows": data}
51 |         else:
52 |             self.columns = data['columns']
53 | 
54 |     def get_all_values(self):
55 |         cols = [c for c in self.columns if c is not None]
56 |         results = [cols]
57 |         for row in self.data['rows']:
58 |             print("Working on", row)
59 |             results.append([row.get(c) for c in cols])
60 |         return results
61 | 
62 |     @property
63 |     def title(self):
64 |         return self.name
65 | 
66 | 


--------------------------------------------------------------------------------
/sheetsite/jsonify.py:
--------------------------------------------------------------------------------
 1 | from datetime import date, datetime
 2 | import json
 3 | 
 4 | 
 5 | def json_serialize(obj):
 6 |     if isinstance(obj, (datetime, date)):
 7 |         return obj.isoformat()
 8 |     raise TypeError ("Cannot deserialize %s" % type(obj))
 9 | 
10 | 
11 | def dump(*args, **kwargs):
12 |     kwargs['default'] = json_serialize
13 |     json.dump(*args, **kwargs)
14 | 
15 | def dumps(*args, **kwargs):
16 |     kwargs['default'] = json_serialize
17 |     return json.dumps(*args, **kwargs)
18 | 


--------------------------------------------------------------------------------
/sheetsite/merged_spreadsheet.py:
--------------------------------------------------------------------------------
 1 | class MergedSpreadsheet(object):
 2 |     def __init__(self, workbook, merge_tables):
 3 |         self.workbook = workbook
 4 |         merged = set()
 5 |         for key, lst in merge_tables.items():
 6 |             merged = merged | set(lst)
 7 |         original_sheets = self.workbook.worksheets()
 8 |         sheet_by_name = {}
 9 |         for sheet in original_sheets:
10 |             sheet_by_name[sheet.title] = sheet
11 |         sheets = [sheet for sheet in original_sheets if sheet.title not in merged and '*' not in merged]
12 |         for key, lst in merge_tables.items():
13 |             if lst[0] == '*':
14 |                 sheets.append(MergedSheet(key, original_sheets))
15 |             else:
16 |                 sheets.append(MergedSheet(key, [sheet_by_name[name] for name in lst]))
17 |         self.sheets = sheets
18 | 
19 |     def worksheets(self):
20 |         return self.sheets
21 | 
22 | class MergedSheet(object):
23 |     def __init__(self, name, sheets):
24 |         self.sheets = sheets
25 |         self.name = name
26 | 
27 |     def get_all_values(self):
28 |         rows = []
29 |         for sheet in self.sheets:
30 |             rows += sheet.get_all_values()
31 |         deduped_rows = []
32 |         keys = {}
33 |         for row in rows:
34 |             # I hate near dupes!!!!!
35 |             rowx = [row[0], row[3]]
36 |             # I hate python 2.7
37 |             key = ' // '.join(str((x or '').encode('utf-8')) for x in rowx)
38 |             import re
39 |             key = re.sub(r'[\n\r ]+', ' ', key)
40 |             if key not in keys:
41 |                 deduped_rows.append(row)
42 |                 keys[key] = True
43 |         return deduped_rows
44 | 
45 |     @property
46 |     def title(self):
47 |         return self.name
48 | 
49 | 


--------------------------------------------------------------------------------
/sheetsite/names.py:
--------------------------------------------------------------------------------
 1 | 
 2 | NAMES = {
 3 |     'lat': 'lat',
 4 |     'latitude': 'lat',
 5 |     'lng': 'lng',
 6 |     'lon': 'lng',
 7 |     'longitude': 'lng',
 8 |     'address': 'address',
 9 |     'zip': 'postal_code',
10 |     'zipcode': 'postal_code',
11 |     'zip_code': 'postal_code',
12 |     'zip code': 'postal_code',
13 |     'postal_code': 'postal_code',
14 |     'postal code': 'postal_code',
15 |     'locality': 'locality',
16 |     'city': 'locality',
17 |     'country': 'country',
18 |     'street': 'street',
19 |     'region': 'region',
20 |     'state': 'region',
21 |     'province': 'region',
22 |     'county': 'administrative_area_level_2',
23 |     'geo_county': 'administrative_area_level_2',
24 |     'latlng': 'latlng'
25 | }
26 | 
27 | 
28 | def normalize_name(name):
29 |     name = name.lower()
30 |     return NAMES.get(name, name)
31 | 


--------------------------------------------------------------------------------
/sheetsite/sheet.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | class Sheets(object):
 4 | 
 5 |     def __init__(self, data):
 6 |         self.data = data
 7 | 
 8 |     @property
 9 |     def tables(self):
10 |         return [self.table(name) for name in self.data['names']]
11 | 
12 |     def table(self, name):
13 |         return Table(self.data['tables'][name], name)
14 | 
15 |     def tables_with_columns(self, *columns, **keys):
16 |         lst = [self.table(name) for name in self.data['names']
17 |                 if set(columns) <= set(self.data['tables'][name]['columns'])]
18 |         if keys.get('require') and len(lst) == 0:
19 |             raise Exception('no table found with column(s) {}'.format(columns))
20 |         return lst
21 | 
22 |     def __repr__(self):
23 |         return json.dumps(self.data)
24 | 
25 |     def __getitem__(self, key):
26 |         return self.data[key]
27 | 
28 |     def __setitem__(self, key, val):
29 |         self.data[key] = val
30 | 
31 |     def __delitem__(self, key):
32 |         del self.data[key]
33 | 
34 | 
35 | class Table(object):
36 |     def __init__(self, data, name):
37 |         self.data = data
38 |         self.name = name
39 | 
40 |     @property
41 |     def columns(self):
42 |         return self.data['columns']
43 | 
44 |     def has_column(self, name):
45 |         return (name in self.columns)
46 | 
47 |     @property
48 |     def rows(self):
49 |         return [Row(row) for row in self.data['rows']]
50 | 
51 |     def add_column(self, name):
52 |         if self.has_column(name):
53 |             return
54 |         self.data['columns'].append(name)
55 |         for row in self.rows:
56 |             row[name] = None
57 | 
58 |     def remove_column(self, name):
59 |         if not self.has_column(name):
60 |             return
61 |         self.data['columns'] = [c for c in self.data['columns'] if c != name]
62 |         for row in self.rows:
63 |             del row[name]
64 | 
65 |     def __repr__(self):
66 |         return json.dumps(self.data)
67 | 
68 | class Row(object):
69 |     def __init__(self, data):
70 |         self.data = data
71 | 
72 |     def __getitem__(self, key):
73 |         return self.data[key]
74 | 
75 |     def __setitem__(self, key, val):
76 |         self.data[key] = val
77 | 
78 |     def __delitem__(self, key):
79 |         del self.data[key]
80 | 
81 |     def __repr__(self):
82 |         return json.dumps(self.data)
83 | 
84 |     def add_to_set(self, key, val):
85 |         if self.data[key] is None:
86 |             self.data[key] = []
87 |         if val not in self.data[key]:
88 |             self.data[key].append(val)
89 | 


--------------------------------------------------------------------------------
/sheetsite/sheetsend.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import json
  3 | import os
  4 | import sheetsite
  5 | import shutil
  6 | import subprocess
  7 | 
  8 | # States:
  9 | #   .pending -> will need to be processed
 10 | #   .processing -> working on it
 11 | #   .ack_pending -> will need to be acknowledged
 12 | #   .ack_processing -> working on acking
 13 | 
 14 | def run():
 15 |     parser = argparse.ArgumentParser(description='Update a website from a spreadsheet. '
 16 |                                      'Take a spreadsheet (from google sheets or locally), and '
 17 |                                      'convert it to a .json file that a static website '
 18 |                                      'generator like jekyll can use, then push it out.')
 19 |     parser.add_argument('layout_file', nargs="?", help='json file '
 20 |                         'describing source, destination, and all settings')
 21 |     parser.add_argument('--cache', nargs=1, required=False, default='cache',
 22 |                         help='cache directory where work is stored.')
 23 |     parser.add_argument('--spool', nargs=1, required=False,
 24 |                         help='if supplied, work only on sheets mentioned in this directory.'
 25 |                         '(see sheetmail)')
 26 |     args = parser.parse_args()
 27 |     if args.layout_file is None:
 28 |         print "Need a layout file, I should give you an example."
 29 |         print "See example_sites.json in github repository for sheetsite."
 30 |         print "Add -h for help."
 31 |         exit(1)
 32 | 
 33 |     layout = json.loads(open(args.layout_file).read())
 34 |     root = args.cache[0]
 35 |     spool = args.spool[0]
 36 | 
 37 |     names = layout['names']
 38 | 
 39 |     for name in names:
 40 | 
 41 |         site = layout['sites'][name]
 42 | 
 43 |         source = site['source']
 44 |         if source['name'] != 'google-sheets':
 45 |             print "do not know how to read from", source['name']
 46 |             exit(1)
 47 | 
 48 |         if spool is not None:
 49 |             key = source['key']
 50 |             pending_file = os.path.join(spool, '{}.pending.json'.format(key))
 51 |             processing_file = os.path.join(spool, '{}.processing.json'.format(key))
 52 |             present = False
 53 |             if os.path.exists(pending_file):
 54 |                 shutil.move(pending_file, processing_file)
 55 |                 present = True
 56 |             if os.path.exists(processing_file):
 57 |                 present = True
 58 |             if not present:
 59 |                 continue
 60 | 
 61 |         path = os.path.join(root, name)
 62 |         if not(os.path.exists(path)):
 63 |             os.makedirs(path)
 64 | 
 65 |         from sheetsite.google_spreadsheet import GoogleSpreadsheet
 66 |         from sheetsite.site import Site
 67 |         wb = GoogleSpreadsheet()
 68 |         wb.connect(source['credential_file'])
 69 |         wb.load_remote(source['key'])
 70 | 
 71 |         ss = Site(wb, os.path.join(path, 'geocache.sqlite'))
 72 |         if 'flags' in site:
 73 |             ss.configure(site['flags'])
 74 |         output_file = os.path.join(path, 'public.json')
 75 |         private_output_file = os.path.join(path, 'private.json')
 76 |         ss.save_local(output_file)
 77 |         ss.save_local(private_output_file, private_sheets=True)
 78 | 
 79 |         destination = site['destination']
 80 |         if destination['name'] != 'git':
 81 |             print "do not know how to write to", destination['name']
 82 |             exit(1)
 83 | 
 84 |         local_repo = os.path.join(path, 'repo')
 85 |         if not(os.path.exists(local_repo)):
 86 |             subprocess.check_output(['git', 'clone', destination['repo'], local_repo])
 87 |         wd = os.getcwd()
 88 |         os.chdir(local_repo)
 89 |         subprocess.check_output(['git', 'pull'])
 90 |         os.chdir(wd)
 91 |         shutil.copyfile(output_file, os.path.join(local_repo, destination['file']))
 92 |         os.chdir(local_repo)
 93 |         subprocess.check_output(['git', 'add', destination['file']])
 94 |         try:
 95 |             subprocess.check_output(['git', 'commit', '-m', 'update from sheetsite'])
 96 |             subprocess.check_output(['git', 'push'])
 97 |         except subprocess.CalledProcessError:
 98 |             print "Commit/push skipped"
 99 |         os.chdir(wd)
100 | 
101 |         if spool is not None:
102 |             key = source['key']
103 |             processing_file = os.path.join(spool, '{}.processing.json'.format(key))
104 |             ack_pending_file = os.path.join(spool, '{}.ack_pending.json'.format(key))
105 |             if os.path.exists(processing_file):
106 |                 shutil.move(processing_file, ack_pending_file)
107 | 


--------------------------------------------------------------------------------
/sheetsite/sheetwatch.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import dataset
  3 | import datetime
  4 | import imaplib
  5 | import json
  6 | import os
  7 | import re
  8 | import six
  9 | import sys
 10 | import time
 11 | 
 12 | try:
 13 |     from sheetsite.tasks.detect_site import detect_site
 14 | except ImportError as e:
 15 |     print(e)
 16 |     print("*** Did you pip install sheetsite[queue]?")
 17 |     exit(1)
 18 | 
 19 | 
 20 | def find_sheet(msg):
 21 |     key = None
 22 |     title = None
 23 |     who = None
 24 |     m = re.search(r'docs.google.com/spreadsheets/d/([^/]*)/', msg.body)
 25 |     if m:
 26 |         key = m.group(1)
 27 |     m = re.search(r'\"([^\"]*)', msg.subject)
 28 |     if m:
 29 |         title = m.group(1)
 30 |         title = re.sub(r'[\r\n]', '', title)
 31 |     m = re.search(r'[\r\n]\* (.*) made changes', msg.body)
 32 |     if m:
 33 |         who = m.group(1)
 34 |     if key is not None:
 35 |         print("Found %s: %s (%s)" % (key, title, who))
 36 |         return {
 37 |             "key": key,
 38 |             "title": title,
 39 |             "who": who
 40 |         }
 41 |     return None
 42 | 
 43 | 
 44 | def store_work(job):
 45 |     if 'key' not in job:
 46 |         return
 47 |     detect_site.delay(job)
 48 | 
 49 | 
 50 | class TestMail(object):
 51 |     def __init__(self, subject=None, body=None, labels=None):
 52 |         self.subject = subject
 53 |         self.body = body
 54 |         self.labels = labels
 55 | 
 56 |     def fetch(self):
 57 |         pass
 58 | 
 59 |     def has_label(self, label):
 60 |         return label in self.labels
 61 | 
 62 |     def add_label(self, label):
 63 |         if not self.has_label(label):
 64 |             self.labels.append(label)
 65 | 
 66 | 
 67 | class TestMailbox(object):
 68 |     def __init__(self, fname):
 69 |         self.fname = fname
 70 |         self.data = json.load(open(fname))
 71 | 
 72 |     def inbox(self):
 73 |         return self
 74 | 
 75 |     def mail(self, **_):
 76 |         return [TestMail(**x) for x in self.data]
 77 | 
 78 |     def logout(self):
 79 |         pass
 80 | 
 81 | 
 82 | class ImapMail(object):
 83 |     def __init__(self, parent, uid):
 84 |         self.parent = parent
 85 |         self.uid = uid
 86 |         self.subject = ""
 87 |         self.body = ""
 88 | 
 89 |     def plain(self, part):
 90 |         if isinstance(part, six.string_types):
 91 |             return part.encode('utf8', 'xmlcharrefreplace').strip()
 92 |         return part.as_string()
 93 | 
 94 |     def parse_header(self, part):
 95 |         if isinstance(part, six.string_types):
 96 |             return self.plain(part)
 97 |         elif isinstance(part, list):
 98 |             return " ".join([self.parse_header(p) for p in part])
 99 |         elif isinstance(part, tuple):
100 |             return part[0]
101 |         return part
102 | 
103 |     def parse_body(self, message):
104 |         payload = message.get_payload(decode=True) or message.get_payload()
105 |         if isinstance(payload, six.string_types):
106 |             return self.plain(payload)
107 |         elif isinstance(payload, list):
108 |             for part in payload:
109 |                 if part.get_content_type() == 'text/plain':
110 |                     return self.plain(part)
111 |             return self.plain(payload[0])
112 |         return message.as_string()
113 |             
114 |     def fetch(self):
115 |         result, data = self.parent.mailer.uid('fetch', self.uid, '(RFC822)')
116 |         raw_email = data[0][1]
117 |         import email
118 |         from email.header import decode_header
119 |         email_message = email.message_from_string(raw_email.decode('utf-8'))
120 | 
121 |         def extract(key):
122 |             return self.parse_header(decode_header(email_message[key]))
123 |         self.subject = extract('Subject')
124 |         self.body = self.parse_body(email_message)
125 | 
126 |     def has_label(self, label):
127 |         return False
128 | 
129 |     def add_label(self, label):
130 |         self.parent.set_processed(self.uid)
131 | 
132 | 
133 | class ImapMailbox(object):
134 |     def __init__(self, username, pword):
135 |         self.mailer = imaplib.IMAP4_SSL('imap.gmail.com')
136 |         self.db_name = os.path.join(os.environ['SHEETSITE_CACHE'],
137 |                                     "emails.sqlite3")
138 |         self.db_uri = "sqlite:///{}".format(self.db_name)
139 |         print(self.db_uri)
140 |         self.db = dataset.connect(self.db_uri)
141 |         self.record = self.db['emails']
142 |         import sqlalchemy.types
143 |         if self.record.count() == 0:
144 |             self.record.create_column('uid', sqlalchemy.types.Text)
145 |             self.record.create_index(['uid'])
146 |         self.login(username, pword)
147 | 
148 |     def login(self, username, pword):
149 |         self.mailer.login(username, pword)
150 | 
151 |     def inbox(self):
152 |         self.mailer.select('inbox')
153 |         return self
154 |     
155 |     def set_processed(self, uid):
156 |         self.record.insert({'uid': uid})
157 |         
158 |     def mail(self, **_):
159 |         import datetime
160 |         date = (datetime.date.today() - datetime.timedelta(10)).strftime("%d-%b-%Y")
161 |         result, data = self.mailer.uid(
162 |             'search', 
163 |             None, 
164 |             '(SENTSINCE {date} FROM "notify@google.com")'.format(
165 |                 date=date)
166 |         )
167 |         email_uids = data[0].split()
168 |         mails = []
169 |         for uid in email_uids:
170 |             print("Checking", uid)
171 |             if len(list(self.record.find(uid=uid))) == 0:
172 |                 print("Not processed yet!")
173 |                 mails.append(ImapMail(self, uid))
174 |         return mails
175 | 
176 |     def logout(self):
177 |         self.mailer.logout()
178 | 
179 | 
180 | def worker():
181 |     from celery.__main__ import main
182 |     while len(sys.argv) > 0:
183 |         sys.argv.pop()
184 |     for arg in ['celery', '-A', 'sheetsite.site_queue', 'worker', '-l', 'info']:
185 |         sys.argv.append(arg)
186 |     sys.exit(main())
187 | 
188 | 
189 | def run():
190 | 
191 |     parser = argparse.ArgumentParser(description='Check email for sheet change notifications.'
192 |                                      'For when webhooks are not an option.')
193 | 
194 |     subparsers = parser.add_subparsers(dest='cmd')
195 | 
196 |     ping = subparsers.add_parser('ping')
197 | 
198 |     ping.add_argument('--clear', action='store_true',
199 |                       help="do not take action on initial emails, just absorb them")
200 | 
201 |     ping.add_argument('--no-notify', action='store_true',
202 |                       help="do not send notification emails")
203 | 
204 |     ping.add_argument('--delay', type=int, default=0,
205 |                       help="delay in seconds between pings"
206 |                       " (if not set, just one ping is made")
207 | 
208 |     subparsers.add_parser('worker')
209 | 
210 |     args = parser.parse_args()
211 | 
212 |     if args.cmd == 'worker':
213 |         worker()
214 |         return
215 | 
216 |     ignore = args.clear
217 |     while True:
218 |         # log in to gmail
219 |         if 'GMAIL_PASSWORD' in os.environ:
220 |             if os.environ['GMAIL_USERNAME'] == 'test':
221 |                 g = TestMailbox(os.environ['GMAIL_PASSWORD'])
222 |             else:
223 |                 g = ImapMailbox(os.environ['GMAIL_USERNAME'],
224 |                                 os.environ['GMAIL_PASSWORD'])
225 |         else:
226 |             print("Need GMAIL_USERNAME/GMAIL_PASSWORD to be set in environment.")
227 |             print("They should be set to whatever account receives change notications of sheet.")
228 |             exit(1)
229 | 
230 |         # look for recent emails from google notify
231 |         window = datetime.datetime.now() - datetime.timedelta(days=10)
232 |         mail = g.inbox().mail(sender='notify@google.com', after=window)
233 | 
234 |         # check emails for action items
235 |         keys = {}
236 |         for msg in mail:
237 |             msg.fetch()
238 |             print(msg.subject)
239 |             # msg.remove_label('sheetmailed')
240 |             if msg.has_label('sheetmailed'):
241 |                 continue
242 |             sheet = find_sheet(msg)
243 |             if sheet is not None:
244 |                 if sheet['key'] in keys:
245 |                     sheet = None
246 |                 else:
247 |                     keys[sheet['key']] = True
248 |             if sheet is not None:
249 |                 if not ignore:
250 |                     sheet['no_notify'] = args.no_notify
251 |                     store_work(sheet)
252 |                 else:
253 |                     print("  * ignoring this email as directed")
254 |             msg.add_label('sheetmailed')
255 | 
256 |         # leave
257 |         g.logout()
258 | 
259 |         if args.delay == 0:
260 |             break
261 | 
262 |         ignore = False
263 |         time.sleep(args.delay)
264 | 
265 | 
266 | if __name__ == '__main__':
267 |     run()
268 | 


--------------------------------------------------------------------------------
/sheetsite/site.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import re
  3 | from sheetsite.names import normalize_name
  4 | from sheetsite.filtered_spreadsheet import FilteredSpreadsheet
  5 | from sheetsite.merged_spreadsheet import MergedSpreadsheet
  6 | 
  7 | 
  8 | class Site(object):
  9 | 
 10 |     def __init__(self, spreadsheet, geocache_filename=None, censor=True):
 11 |         self.workbook = spreadsheet
 12 |         self.geocache_filename = geocache_filename
 13 |         self.censor = censor
 14 |         self.include = None
 15 |         self.exclude = None
 16 |         self.fill_columns = None
 17 |         self.add_columns = {}
 18 |         self.const_columns = {}
 19 |         self.rename_columns = {}
 20 |         self.address_columns = {}
 21 |         self.constant_columns = {}
 22 |         self.merge_tables = None
 23 |         self.modify = True
 24 |         self.geocoder = None
 25 |         self.group_key = None
 26 |         self.ids = None
 27 | 
 28 |     def add_sheet_filter(self, include, exclude):
 29 |         self.include = include
 30 |         self.exclude = exclude
 31 | 
 32 |     def add_column_fills(self, fill_columns):
 33 |         if fill_columns is None:
 34 |             self.fill_columns = None
 35 |             return
 36 |         self.fill_columns = [normalize_name(n) for n in fill_columns]
 37 | 
 38 |     def save_local(self, output_file, private_sheets=False, enhance=True):
 39 |         self.modify = enhance
 40 |         ext = '-'
 41 |         if output_file is not None:
 42 |             _, ext = os.path.splitext(output_file)
 43 |             ext = ext.lower()
 44 | 
 45 |         return self.save(output_file, private_sheets)
 46 | 
 47 |     def add_ids(self, ids):
 48 |         self.ids = ids
 49 | 
 50 |     def process_cells(self, rows, name):
 51 |         if not(self.modify):
 52 |             return rows
 53 |         rows = self.clean_cells(rows, name)
 54 |         rows = self.add_location(rows, name)
 55 |         return rows
 56 | 
 57 |     def filter(self, sheet, private_sheets):
 58 |         title = sheet.title
 59 |         core_title = re.sub(r'\(\((.*)\)\)', r'\1', title)
 60 |         if self.exclude is not None:
 61 |             if core_title in self.exclude:
 62 |                 return None
 63 |         if self.include is not None:
 64 |             if core_title in self.include:
 65 |                 return core_title
 66 |             return None
 67 |         if (core_title == title) == private_sheets:
 68 |             return None
 69 |         return core_title
 70 | 
 71 |     def private_workbook(self):
 72 |         return self.filtered_workbook(True)
 73 | 
 74 |     def public_workbook(self):
 75 |         return self.filtered_workbook(False)
 76 | 
 77 |     def merge(self, wb, merge_tables):
 78 |         if merge_tables is None:
 79 |             return wb
 80 |         return MergedSpreadsheet(wb, merge_tables)
 81 | 
 82 |     def filtered_workbook(self, selector_flags):
 83 |         workbook = self.merge(self.workbook, self.merge_tables)
 84 |         selector = lambda sheet: self.filter(sheet, selector_flags)
 85 |         processor = lambda sheet, title: self.process_cells(sheet.get_all_values(), title)
 86 |         fs = FilteredSpreadsheet(workbook, selector=selector, processor=processor)
 87 |         return fs
 88 | 
 89 |     def save(self, output_file, selector_flags):
 90 |         from sheetsite.destination import write_destination
 91 |         params = { 'output_file': output_file }
 92 |         state = { 'workbook': self.filtered_workbook(selector_flags) }
 93 |         write_destination(params, state)
 94 |         return True
 95 | 
 96 |     def sanity_stick(self, locs):
 97 |         result = []
 98 |         if len(locs) <= 1:
 99 |             return locs
100 |         import re
101 |         if len(re.sub(r'[^,]', '', locs[0])) < 3:
102 |             return [' '.join(locs)]
103 |         return locs
104 | 
105 |     def clean_cells(self, vals, name):
106 |         if len(vals) == 0:
107 |             return vals
108 | 
109 |         hide_column = {}
110 |         split_column = {}
111 |         for idx, cell in enumerate(vals[0]):
112 |             if cell is None or len(cell) == 0 or cell[0] == '(':
113 |                 hide_column[idx] = True
114 |             if cell == "Other Addresses (deprecated)":
115 |                 split_column[idx] = '\n'
116 | 
117 |         results = []
118 | 
119 |         existing = {}
120 |         for ridx, row in enumerate(vals):
121 |             result = []
122 |             for idx, cell in enumerate(row):
123 |                 if idx in hide_column:
124 |                     continue
125 |                 if cell is not None:
126 |                     try:
127 |                         cell = re.sub(r'\(\(.*\)\)','', cell)
128 |                         cell = re.sub(r'[\n\r]+$','', cell)
129 |                         cell = re.sub(r'^[\t \n\r]+$','', cell)
130 |                     except TypeError:
131 |                         pass
132 |                 if ridx > 0:
133 |                     if idx in split_column:
134 |                         if cell is not None:
135 |                             splits = cell.split(split_column[idx])
136 |                             splits = self.sanity_stick(splits)
137 |                             cell = [['address']] + [[x] for x in splits]
138 |                             print(">>>", cell)
139 |                             cell = self.clean_cells(cell, "other")
140 |                             cell = self.add_location(cell, "other")
141 |                             cell = {
142 |                                 'columns': cell[0],
143 |                                 'rows': [dict(zip(cell[0], row)) for row in cell[1:]]
144 |                             }
145 |                             print("<<<", cell)
146 |                 result.append(cell)
147 |                 if ridx == 0:
148 |                     existing[cell] = 1
149 |             if name in self.add_columns:
150 |                 for col in self.add_columns[name]:
151 |                     if col not in existing:
152 |                         if ridx == 0:
153 |                             result.append(col)
154 |                         else:
155 |                             result.append(None)
156 |             if name in self.constant_columns:
157 |                 for col, val in self.constant_columns[name].items():
158 |                     if col not in existing:
159 |                         if ridx == 0:
160 |                             result.append(col)
161 |                         else:
162 |                             result.append(val)
163 |             results.append(result)
164 | 
165 |         return results
166 | 
167 |     def add_location(self, vals, name):
168 |         if len(vals) == 0:
169 |             return vals
170 | 
171 |         have_address = False
172 |         have_fill_in = False
173 |         pattern = [0]
174 |         fill_in = []
175 |         group_index = None
176 |         offset = 0
177 |         for idx, cell in enumerate(vals[0]):
178 |             if name in self.rename_columns:
179 |                 renames = self.rename_columns[name]
180 |                 if cell in renames:
181 |                     cell = renames[cell]
182 |                     vals[0][idx] = cell
183 |             if cell == self.group_key and self.group_key is not None:
184 |                 group_index = idx
185 |             nn = normalize_name(cell)
186 |             if nn == 'address':
187 |                 pattern = [idx]
188 |                 have_address = True
189 |             if cell is not None and len(cell) > 0 and cell[0] == '[':
190 |                 have_fill_in = True
191 |                 vals[0][idx] = cell[1:-1]
192 |                 fill_in.append([normalize_name(vals[0][idx]), idx])
193 |             if self.fill_columns is not None:
194 |                 if nn in self.fill_columns:
195 |                     have_fill_in = True
196 |                     fill_in.append([nn, idx])
197 |             if self.add_columns is not None:
198 |                 if name in self.add_columns:
199 |                     if cell in self.add_columns[name]:
200 |                         offset -= 1
201 |                         fill_in.append([normalize_name(nn), idx])
202 |                         have_fill_in = True
203 |         if self.address_columns is not None:
204 |             if name in self.address_columns:
205 |                 have_address = True
206 |                 pattern = self.address_columns[name]
207 |                 for idx, col in enumerate(pattern):
208 |                     try:
209 |                         pattern[idx] = vals[0].index(col)
210 |                     except ValueError:
211 |                         pass
212 |         if have_fill_in:
213 |             dccid = None
214 |             for at, (cname, cidx) in enumerate(fill_in):
215 |                 if cname == 'dccid' and self.ids is not None:
216 |                     dccid = at
217 |                     if name in self.ids:
218 |                         ref = self.ids[name]
219 |                         for idx, row in enumerate(vals):
220 |                             if idx == 0:
221 |                                 continue
222 |                             key = ref.get(idx)
223 |                             row[cidx] = key
224 |             if dccid is not None:
225 |                 del fill_in[dccid]
226 |                 if len(fill_in) == 0:
227 |                     have_fill_in = False
228 |         if not(have_fill_in) or not(have_address):
229 |             return vals
230 |         from sheetsite.geocache import GeoCache
231 |         cache = GeoCache(self.geocache_filename, geocoder=self.geocoder,
232 |                          group_key=group_index)
233 |         cache.find_all(vals[1:], pattern, fill_in)
234 |         return vals
235 | 
236 |     def configure(self, flags):
237 |         self.geocoder = flags.get('geocoder')
238 |         for key, val in flags.items():
239 |             if key == 'rename':
240 |                 self.rename_columns = val
241 |             if key == 'add':
242 |                 self.add_columns = val
243 |             if key == 'constant':
244 |                 self.constant_columns = val
245 |             if key == 'address':
246 |                 self.address_columns = val
247 |             if key == 'merge':
248 |                 self.merge_tables = val
249 |             if key == 'group':
250 |                 self.group_key = val
251 | 
252 | 
253 | 


--------------------------------------------------------------------------------
/sheetsite/site_queue.py:
--------------------------------------------------------------------------------
 1 | from celery import Celery
 2 | import os
 3 | 
 4 | app = Celery('sheetsite',
 5 |              broker=os.environ.get('SHEETSITE_BROKER_URL', None),
 6 |              backend=os.environ.get('SHEETSITE_RESULT_BACKEND', None),
 7 |              include=['sheetsite.tasks'])
 8 | 
 9 | if __name__ == '__main__':
10 |     app.start()
11 | 


--------------------------------------------------------------------------------
/sheetsite/source/__init__.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from sheetsite.source.csv import read_source_csv
 3 | from sheetsite.source.google import read_source_google
 4 | from sheetsite.source.excel import read_source_excel
 5 | from sheetsite.source.json import read_source_json
 6 | 
 7 | 
 8 | def read_source(params):
 9 | 
10 |     readers = {
11 |         '.csv': read_source_csv,
12 |         'google-sheets': read_source_google,
13 |         '.json': read_source_json,
14 |         '.xls': read_source_excel,
15 |         '.xlsx': read_source_excel
16 |     }
17 | 
18 |     name = None
19 |     if 'name' in params:
20 |         name = params['name']
21 |     elif 'filename' in params:
22 |         _, ext = os.path.splitext(params['filename'])
23 |         name = ext
24 | 
25 |     if name is None:
26 |         raise IOError('source not specified')
27 | 
28 |     if name not in readers:
29 |         raise IOError('source not recognized: {}'.format(name))
30 | 
31 |     return readers[name](params)
32 | 
33 | 
34 | 


--------------------------------------------------------------------------------
/sheetsite/source/csv.py:
--------------------------------------------------------------------------------
1 | def read_source_csv(source):
2 |     from sheetsite.csv_spreadsheet import CsvSpreadsheet
3 |     wb = CsvSpreadsheet(source['filename'])
4 |     return wb
5 | 


--------------------------------------------------------------------------------
/sheetsite/source/excel.py:
--------------------------------------------------------------------------------
1 | def read_source_excel(source):
2 |     from sheetsite.xls_spreadsheet import XlsSpreadsheet
3 |     wb = XlsSpreadsheet(source['filename'])
4 |     return wb
5 | 
6 | 


--------------------------------------------------------------------------------
/sheetsite/source/google.py:
--------------------------------------------------------------------------------
1 | def read_source_google(source):
2 |     from sheetsite.google_spreadsheet import GoogleSpreadsheet
3 |     wb = GoogleSpreadsheet()
4 |     wb.connect(source['credential_file'])
5 |     wb.load_remote(source['key'])
6 |     return wb
7 | 
8 | 


--------------------------------------------------------------------------------
/sheetsite/source/json.py:
--------------------------------------------------------------------------------
1 | def read_source_json(source):
2 |     from sheetsite.json_spreadsheet import JsonSpreadsheet
3 |     wb = JsonSpreadsheet(source['filename'])
4 |     return wb
5 | 


--------------------------------------------------------------------------------
/sheetsite/spreadsheet.py:
--------------------------------------------------------------------------------
 1 | from collections import OrderedDict
 2 | import gspread
 3 | import json
 4 | import os
 5 | from oauth2client.client import SignedJwtAssertionCredentials
 6 | import re
 7 | from sheetsite.jsonify import dump
 8 | 
 9 | 
10 | class Spreadsheet(object):
11 | 
12 |     def __init__(self, censor=True):
13 |         self.connection = None
14 |         self.workbook = None
15 |         self.censor = censor
16 | 
17 |     def connect(self, credential_file):
18 |         json_key = json.load(open(credential_file))
19 |         scope = ['https://spreadsheets.google.com/feeds']
20 |         credentials = SignedJwtAssertionCredentials(json_key['client_email'],
21 |                                                     json_key['private_key'], scope)
22 |         self.connection = gspread.authorize(credentials)
23 | 
24 |     def load_remote(self, spreadsheet_key):
25 |         self.workbook = self.connection.open_by_key(spreadsheet_key)
26 | 
27 |     def save_local(self, output_file):
28 |         _, ext = os.path.splitext(output_file)
29 | 
30 |         if ext == ".xls":
31 |             return self.save_to_excel(output_file)
32 |         elif ext == ".json":
33 |             return self.save_to_json(output_file)
34 | 
35 |         print("Unknown extension", ext)
36 |         return False
37 | 
38 |     def save_to_excel(self, output_file):
39 |         import xlwt
40 |         wb = xlwt.Workbook()
41 |         for sheet in self.workbook.worksheets():
42 |             ws = wb.add_sheet(sheet.title)
43 |             rows = self.clean_cells(sheet.get_all_values())
44 |             for r, row in enumerate(rows):
45 |                 for c, cell in enumerate(row):
46 |                     ws.write(r, c, cell)
47 |         wb.save(output_file)
48 |         return True
49 | 
50 |     def save_to_json(self, output_file):
51 |         result = OrderedDict()
52 |         order = result['names'] = []
53 |         sheets = result['tables'] = OrderedDict()
54 |         for sheet in self.workbook.worksheets():
55 |             order.append(sheet.title)
56 |             ws = sheets[sheet.title] = OrderedDict()
57 |             vals = self.clean_cells(sheet.get_all_values())
58 |             columns = vals[0]
59 |             rows = vals[1:]
60 |             ws['columns'] = columns
61 |             ws['rows'] = [OrderedDict(zip(columns, row)) for row in rows]
62 |         with open(output_file, 'w') as f:
63 |             dump(result, f, indent=2)
64 |         return True
65 | 
66 |     def clean_cells(self, vals):
67 |         hide_column = {}
68 | 
69 |         for idx, cell in enumerate(vals[0]):
70 |             if len(cell) == 0 or cell[0] == '(':
71 |                 hide_column[idx] = True
72 | 
73 |         results = []
74 | 
75 |         for ridx, row in enumerate(vals):
76 |             result = []
77 |             for idx, cell in enumerate(row):
78 |                 if idx in hide_column:
79 |                     continue
80 |                 cell = re.sub(r'\(\(.*\)\)', '', cell)
81 |                 cell = re.sub(r'[\n\r]+$', '', cell)
82 |                 cell = re.sub(r'^[\t \n\r]+$', '', cell)
83 |                 result.append(cell)
84 |             results.append(result)
85 | 
86 |         return results
87 | 
88 | 


--------------------------------------------------------------------------------
/sheetsite/tasks/__init__.py:
--------------------------------------------------------------------------------
 1 | from sheetsite.site_queue import app
 2 | from sheetsite.site import Site
 3 | import sheetsite.tasks.notify
 4 | import sheetsite.tasks.update_site
 5 | import sheetsite.tasks.detect_site
 6 | 
 7 | 
 8 | @app.task
 9 | def add(x, y):
10 |     return x + y
11 | 
12 | 
13 | 


--------------------------------------------------------------------------------
/sheetsite/tasks/detect_site.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | from sheetsite.expand import load_config
 4 | from sheetsite.site_queue import app
 5 | from sheetsite.tasks.update_site import update_site
 6 | 
 7 | 
 8 | @app.task
 9 | def detect_site(params):
10 |     key = params['key']
11 |     print("PROCESS_spreadsheet", key, params)
12 | 
13 |     if os.path.isdir(os.environ['SHEETSITE_LAYOUT']):
14 |         from glob import glob
15 |         files = glob(os.path.join(os.environ['SHEETSITE_LAYOUT'], '*.yml'))
16 |         files += glob(os.path.join(os.environ['SHEETSITE_LAYOUT'], '*.json'))
17 |         layout = {
18 |             'names': [],
19 |             'sites': {}
20 |         }
21 |         for fname in files:
22 |             name = os.path.splitext(os.path.split(fname)[1])[0]
23 |             layout['names'].append(name)
24 |             layout['sites'][name] = load_config(fname)
25 |     else:
26 |         # old big json file
27 |         layout = json.loads(open(os.environ['SHEETSITE_LAYOUT']).read())
28 | 
29 |     root = os.environ['SHEETSITE_CACHE']
30 | 
31 |     names = layout['names']
32 | 
33 |     for name in names:
34 | 
35 |         site = layout['sites'][name]
36 | 
37 |         if key != site['source']['key']:
38 |             continue
39 | 
40 |         path = os.path.join(root, name)
41 |         if not(os.path.exists(path)):
42 |             os.makedirs(path)
43 | 
44 |         update_site.delay(params, path, site, name)
45 | 
46 |     return False
47 | 
48 | 


--------------------------------------------------------------------------------
/sheetsite/tasks/notify.py:
--------------------------------------------------------------------------------
 1 | from email.mime.multipart import MIMEMultipart
 2 | from email.mime.text import MIMEText
 3 | import json
 4 | import os
 5 | from sheetsite.site_queue import app
 6 | import smtplib
 7 | 
 8 | 
 9 | @app.task
10 | def notify_one(email, subject, page, text):
11 | 
12 |     print("send [%s] / %s / %s" % (email, subject, page))
13 | 
14 |     server_ssl = smtplib.SMTP_SSL("smtp.gmail.com", 465)
15 |     server_ssl.ehlo()  # optional, called by login()
16 |     me = os.environ['GMAIL_USERNAME']
17 |     server_ssl.login(me, os.environ['GMAIL_PASSWORD'])
18 | 
19 |     msg = MIMEMultipart('alternative')
20 |     msg['Subject'] = subject
21 |     msg['From'] = me
22 |     msg['To'] = email
23 | 
24 |     # Record the MIME types of both parts - text/plain and text/html.
25 |     part1 = MIMEText(text, 'plain')
26 |     part2 = MIMEText(page, 'html')
27 | 
28 |     msg.attach(part1)
29 |     msg.attach(part2)
30 | 
31 |     server_ssl.sendmail(me, email, msg.as_string())
32 |     server_ssl.close()
33 | 
34 |     return True
35 | 
36 | 
37 | @app.task
38 | def notify_all(name, site_params, diff_html, diff_text):
39 |     print("NOTIFY_spreadsheet", site_params, name)
40 | 
41 |     import daff
42 |     import jinja2
43 |     import premailer
44 | 
45 |     root = os.environ['SHEETSITE_CACHE']
46 |     path = os.path.join(root, name)
47 |     print("Should look in", path)
48 |     notifications = None
49 |     for fname in ['private.json', 'public.json']:
50 |         full_fname = os.path.join(path, fname)
51 |         print("Look in", full_fname)
52 |         book = json.loads(open(full_fname).read())
53 |         if 'notifications' in book['tables']:
54 |             notifications = book['tables']['notifications']
55 |             break
56 |     if notifications is None:
57 |         print("No notifications requested")
58 |         return True
59 |     print("Notifications", notifications)
60 | 
61 |     # make a html report
62 |     css = daff.DiffRender().sampleCss()
63 |     site_params = dict(site_params)
64 |     site_params['css'] = css
65 |     site_params['diff'] = diff_html
66 |     env = jinja2.Environment(loader=jinja2.PackageLoader('sheetsite', 'templates'))
67 |     template = env.get_template('update.html')
68 |     page = template.render(site_params)
69 |     page = premailer.transform(page)
70 |     site_params['diff'] = diff_text
71 |     template = env.get_template('update.txt')
72 |     page_text = template.render(site_params)
73 | 
74 |     for target in notifications['rows']:
75 |         email = target.get('EMAIL', None)
76 |         if email is None:
77 |             email = target.get('email', None)
78 |         if email is not None:
79 |             if site_params['no_notify']:
80 |                 print("skip email to {}".format(email))
81 |             else:
82 |                 notify_one.delay(email=email,
83 |                                  subject="update to {}".format(site_params.get('name',
84 |                                                                                'directory')),
85 |                                  page=page,
86 |                                  text=page_text)
87 | 
88 |     return True
89 | 


--------------------------------------------------------------------------------
/sheetsite/tasks/update_site.py:
--------------------------------------------------------------------------------
 1 | from sheetsite.chain import apply_chain, compute_diff
 2 | from sheetsite.site_queue import app
 3 | 
 4 | 
 5 | @app.task
 6 | def update_site(params, path, site, name):
 7 | 
 8 |     source = site['source']
 9 |     destination = site['destination']
10 | 
11 |     site_params = {
12 |         'name': params.get('title', None),
13 |         'who': params.get('who', None),
14 |         'sheet_link': source.get('link', None),
15 |         'site_link': destination.get('link', None),
16 |         'no_notify': params['no_notify']
17 |     }
18 | 
19 |     files = apply_chain(site, path)
20 |     diff_html, diff_text = compute_diff(files, format='both')
21 | 
22 |     from sheetsite.tasks.notify import notify_all
23 |     notify_all.delay(name=name,
24 |                      site_params=site_params,
25 |                      diff_html=diff_html,
26 |                      diff_text=diff_text)
27 |     return True
28 | 
29 | 
30 | 


--------------------------------------------------------------------------------
/sheetsite/templates/update.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |     <head>
 4 |         <meta charset='utf-8'>
 5 |         <title>{{ name }}</title>
 6 |         <style TYPE='text/css'>
 7 |          {{ css }}
 8 |         </style>
 9 |     </head>
10 |     <body>
11 | 
12 |         <p>There's been an update in the &ldquo;{{ name }}&rdquo; site.</p>
13 | 
14 |         <ul>
15 |             {% if who %}
16 |             <li>Edit made by: {{ who }}</li>
17 |             {% endif %}
18 |             {% if site_link %}
19 |             <li>See site at: <a href='{{ site_link }}'>{{ site_link }}</a></li>
20 |             {% endif %}
21 |             {% if sheet_link %}
22 |             <li>Edit at: <a href='{{ sheet_link }}'>{{ sheet_link }}</a></li>
23 |             {% endif %}
24 |             <li>Unsubscribe by removing your address from the notifications sheet.</li>
25 |         </ul>
26 |         
27 |         <div class='highlighter'>
28 |             {{ diff }}
29 |         </div>
30 | 
31 |     </body>
32 | </html>
33 | 


--------------------------------------------------------------------------------
/sheetsite/templates/update.txt:
--------------------------------------------------------------------------------
 1 | There's been an update in the "{{ name }}" site.
 2 | 
 3 | {% if who %}
 4 |  * Edit made by: {{ who }}
 5 | {% endif %}{% if site_link %}
 6 |  * See site at: {{ site_link }}
 7 | {% endif %}{% if sheet_link %}
 8 |  * Edit at: {{ sheet_link }}
 9 | {% endif %}
10 |  * Unsubscribe by removing your address from the notifications sheet.
11 | 
12 | {{ diff }}
13 | 


--------------------------------------------------------------------------------
/sheetsite/tweaks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paulfitz/sheetsite/0556e5713f01d7d8950365501bedecb5cdfabe6a/sheetsite/tweaks/__init__.py


--------------------------------------------------------------------------------
/sheetsite/tweaks/add_dccid.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Early add id - this stuff needs to get reworked
 3 | 
 4 | (stone soup only)
 5 | '''
 6 | 
 7 | import json
 8 | 
 9 | def apply3(wb, params, state):
10 |     column = params['column']
11 |     id_file = state['id_file']
12 |     ids = json.load(open(id_file, 'r'))
13 |     for name, t in wb['tables'].items():
14 |         if name in ids:
15 |             ids0 = ids[name]
16 |             if column not in t['columns']:
17 |                 t['columns'].append(column)
18 |             for i, row in enumerate(t['rows']):
19 |                 idx = str(i + 1)
20 |                 row[column] = ids0[idx]
21 | 


--------------------------------------------------------------------------------
/sheetsite/tweaks/coalesce.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Replaces all blank values in a column with the first non-blank value in a series
 3 | of columns, falling back on a default_value if all are blank.
 4 | ```
 5 | tweaks:
 6 |   coalesce:
 7 |     # the first of the following list of columns is the one that is modified
 8 |     columns: first_priority_column second_priority_column third_priority_column
 9 |     default_value: N/A
10 |     table: sheet1    # optional
11 | ```
12 | '''
13 | 
14 | def apply(wb, params):
15 |     columns = params['columns']
16 |     default_value = params['default']
17 |     table = params.get('table')
18 |     active = False
19 |     for name, t in wb['tables'].items():
20 |         if name == table or table is None:
21 |             if len(set(columns) - set(t['columns'])) > 0:
22 |                 continue
23 |             active = True
24 |             for row in t['rows']:
25 |                 v = None
26 |                 for column in columns:
27 |                     if v is not None and v != "":
28 |                         break
29 |                     v = row[column]
30 |                 if v is None or v == '':
31 |                     v = default_value
32 |                 row[columns[0]] = v
33 |     if not active:
34 |         raise KeyError(column)
35 | 


--------------------------------------------------------------------------------
/sheetsite/tweaks/custom.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Apply a custom tweak.  Expects my_script_name.py in same dir as .yml, with a the_method
 3 | method that will receive wb, params.
 4 | 
 5 | tweaks:
 6 |   custom:
 7 |     script: my_script_name
 8 |     method: the_method
 9 |     arg1: val1
10 |     ...
11 | 
12 | '''
13 | 
14 | import importlib
15 | import os
16 | import sys
17 | 
18 | sys.path.append(os.getcwd())
19 | 
20 | def apply(wb, params):
21 |     script = params['script']
22 |     method = params['method']
23 |     module = importlib.import_module(script)
24 |     method_definition = getattr(module, method)
25 |     return method_definition(wb, params)
26 | 
27 | 


--------------------------------------------------------------------------------
/sheetsite/tweaks/formula.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Apply a python formatting string to a column.
 3 | 
 4 | tweaks:
 5 |   formula:
 6 |     formula: "%05d"
 7 |     column: zip
 8 |     table: addresses  # optional
 9 | '''
10 | 
11 | def apply(wb, params):
12 |     formula = params['formula']
13 |     column = params['column']
14 |     table = params.get('table')
15 |     for name, t in wb['tables'].items():
16 |         if name == table or table is None:
17 |             if column not in t['columns']:
18 |                 t['columns'].append(column)
19 |             for row in t['rows']:
20 |                 row[column] = formula.format(**row)
21 | 


--------------------------------------------------------------------------------
/sheetsite/tweaks/list_to_rows.py:
--------------------------------------------------------------------------------
 1 | 
 2 | '''
 3 | 
 4 | Take a list and make extra rows from it
 5 | 
 6 | tweaks:
 7 |   list_to_rows:
 8 |     column: "Other Addresses"
 9 |     target: "address"  # optional
10 | 
11 | '''
12 | 
13 | 
14 | import copy
15 | import re
16 | import six
17 | 
18 | 
19 | def apply(wb, params):
20 |     column = params['column']
21 |     target = params.get('target', column)
22 |     table = params.get('table')
23 |     active = False
24 |     for name, t in wb['tables'].items():
25 |         if name == table or table is None:
26 |             if column not in t['columns']:
27 |                 continue
28 |             if target not in t['columns']:
29 |                 continue
30 |             active = True
31 |             orows = []
32 |             for row in t['rows']:
33 |                 cell = row[column]
34 |                 print(">>>>", cell)
35 |                 orows.append(row)
36 |                 if cell is not None:
37 |                     if not isinstance(cell, six.string_types):
38 |                         for part in cell:
39 |                             nrow = copy.deepcopy(row)
40 |                             nrow[column] = None
41 |                             nrow[target] = part
42 |                             orows.append(nrow)
43 |             t['rows'] = orows
44 |     if not active:
45 |         raise KeyError(column + " / " + target)
46 |         
47 |     
48 | 


--------------------------------------------------------------------------------
/sheetsite/tweaks/merge_tables.py:
--------------------------------------------------------------------------------
 1 | 
 2 | '''
 3 | 
 4 | Smush all the tables together.
 5 | 
 6 | tweaks:
 7 |   merge_tables:
 8 |     table: directory   # name of the single created table
 9 |     column: thing      # sheet names are placed here
10 | 
11 | '''
12 | 
13 | def apply(wb, params):
14 |     table = params['table']
15 |     column = params['column']
16 |     input_names = wb['names']
17 |     input_tables = wb['tables']
18 |     wb['names'] = [table]
19 |     target_table = {}
20 |     wb['tables'] = {
21 |         table: target_table
22 |     }
23 |     order_cols = []
24 |     seen_cols = set()
25 |     tables = [(name, input_tables[name]) for name in input_names]
26 |     for name, t in tables:
27 |         cols = t['columns']
28 |         for col in cols:
29 |             if col not in seen_cols:
30 |                 order_cols.append(col)
31 |                 seen_cols.add(col)
32 |     if column not in seen_cols:
33 |         order_cols.append(column)
34 |         seen_cols.add(column)
35 |         
36 |     target_table['columns'] = order_cols
37 |     rows = target_table['rows'] = []
38 |     for name, t in tables:
39 |         extra = dict((c, None) for c in seen_cols - set(t['columns']))
40 |         for row in t['rows']:
41 |             row.update(extra)
42 |             row[column] = name
43 |             rows.append(row)
44 |     
45 | 


--------------------------------------------------------------------------------
/sheetsite/tweaks/patch.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | 
 3 | Patch a cell
 4 | 
 5 | tweaks:
 6 |   patch:
 7 |     where:
 8 |       col1: val1
 9 |       col2: val2
10 |     update:
11 |       col3: val3
12 | '''
13 | 
14 | def apply(wb, params):
15 |     where = params['where']
16 |     update = params['update']
17 |     for name, t in wb['tables'].items():
18 |         for row in t['rows']:
19 |             ok = True
20 |             active = True
21 |             for key, val in where.items():
22 |                 if key not in row:
23 |                     ok = False
24 |                     break
25 |                 if row[key] != val:
26 |                     active = False
27 |                     break
28 |             if not ok:
29 |                 break
30 |             if not active:
31 |                 continue
32 |             for key, val in update.items():
33 |                 row[key] = val
34 | 


--------------------------------------------------------------------------------
/sheetsite/tweaks/prune_tables.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | 
 3 | Little Bobby Drop Tables
 4 | 
 5 | tweaks:
 6 |   prune_tables:
 7 |     - Table1  # list of all tables in desired order
 8 |     - Table2
 9 | 
10 | '''
11 | 
12 | def apply(wb, params):
13 |     old_names = wb['names']
14 |     old_tables = wb['tables']
15 |     names = wb['names'] = list(params)
16 |     tables = wb['tables'] = {}
17 |     for name in names:
18 |         tables[name] = old_tables[name]
19 | 


--------------------------------------------------------------------------------
/sheetsite/tweaks/rename_column.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | 
 3 | Rename a column
 4 | 
 5 | tweaks:
 6 |   rename_column:
 7 |     table: Table1  # optional
 8 |     from: OldColumnName
 9 |     to: NewColumnName # blank to delete
10 | 
11 | '''
12 | 
13 | def apply(wb, params):
14 |     table = params.get('table')
15 |     from_name = params['from']
16 |     to_name = params.get('to')
17 |     active = False
18 |     for name, t in wb['tables'].items():
19 |         if name == table or table is None:
20 |             if from_name not in t['columns']:
21 |                 continue
22 |             active = True
23 |             t['columns'] = [to_name if name == from_name else name
24 |                             for name in t['columns']
25 |                             if name != from_name or to_name]
26 |             for row in t['rows']:
27 |                 tmp = row[from_name]
28 |                 if to_name:
29 |                     row[to_name] = tmp
30 |     if not active:
31 |         raise KeyError(from_name)
32 | 
33 | 


--------------------------------------------------------------------------------
/sheetsite/tweaks/rename_table.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | 
 3 | Rename a table
 4 | 
 5 | tweaks:
 6 |   rename_table:
 7 |     from: OldName
 8 |     to: NewName
 9 | 
10 | '''
11 | 
12 | def apply(wb, params):
13 |     from_name = params['from']
14 |     to_name = params['to']
15 |     old_names = wb['names']
16 |     if from_name in old_names:
17 |         wb['names'] = [to_name if name == from_name else name for name in old_names]
18 |         wb['tables'][to_name] = wb['tables'].pop(from_name)
19 | 


--------------------------------------------------------------------------------
/sheetsite/tweaks/replace_cell.py:
--------------------------------------------------------------------------------
 1 | 
 2 | def apply(wb, params):
 3 |     column = params['column']
 4 |     table = params.get('table')
 5 |     mapping = params['map']
 6 |     active = False
 7 |     for name, t in wb['tables'].items():
 8 |         if name == table or table is None:
 9 |             if column not in t['columns']:
10 |                 continue
11 |             active = True
12 |             for row in t['rows']:
13 |                 code = str(row[column])
14 |                 if "," in code:
15 |                     cactive = False
16 |                     codes = [x.strip() for x in code.split(',')]
17 |                     for idx, code in enumerate(codes):
18 |                         if code in mapping:
19 |                             codes[idx] = mapping[code]
20 |                             cactive = True
21 |                     if cactive:
22 |                         code = ', '.join(codes)
23 |                         row[column] = code
24 |                 elif code in mapping:
25 |                     row[column] = mapping[code]
26 | 
27 |     if not active:
28 |         raise KeyError(column)
29 |         
30 | 


--------------------------------------------------------------------------------
/sheetsite/tweaks/required_field.py:
--------------------------------------------------------------------------------
 1 | 
 2 | def apply(wb, params):
 3 |     column = params['column']
 4 |     table = params.get('table')
 5 |     value = params.get('value')
 6 |     not_value = params.get('not-value')
 7 |     active = False
 8 |     for name, t in wb['tables'].items():
 9 |         if name == table or table is None:
10 |             if column not in t['columns']:
11 |                 continue
12 |             active = True
13 |             orows = []
14 |             for row in t['rows']:
15 |                 v = row[column]
16 |                 if value is not None:
17 |                     if str(v) == str(value):
18 |                         orows.append(row)
19 |                 elif not_value is not None:
20 |                     if str(v) != str(not_value):
21 |                         orows.append(row)
22 |                 elif v is not None and v != '':
23 |                     orows.append(row)
24 |             t['rows'] = orows
25 |     if not active:
26 |         raise KeyError(column)
27 |         
28 | 


--------------------------------------------------------------------------------
/sheetsite/tweaks/sniff_inactive.py:
--------------------------------------------------------------------------------
 1 | 
 2 | def apply(wb, params):
 3 |     table = params.get('table')
 4 |     for name, t in wb['tables'].items():
 5 |         if name == table or table is None:
 6 |             if 'dcc_status' not in t['columns']:
 7 |                 t['columns'].append('dcc_status')
 8 |             if 'dcc_stamp' not in t['columns']:
 9 |                 t['columns'].append('dcc_stamp')
10 |             for row in t['rows']:
11 |                 status = None
12 |                 stamp = None
13 |                 if 'NOTES' in t['columns']:
14 |                     code = str(row['NOTES'] or '')
15 |                     if 'DELETE' in code:
16 |                         status = 'Inactive'
17 |                 if 'Active' in t['columns']:
18 |                     code = str(row['Active'] or '')
19 |                     if code == 'no':
20 |                         status = 'Inactive'
21 |                     elif len(code) > 0 and code[0] >= '0' and code[0] <= '9':
22 |                         stamp = int(code)
23 |                 if 'Member' in t['columns']:
24 |                     code = str(row['Member'] or '')
25 |                     if code.lower() == 'closed':
26 |                         status = 'Inactive'
27 |                 row['dcc_status'] = status
28 |                 row['dcc_stamp'] = stamp
29 | 


--------------------------------------------------------------------------------
/sheetsite/tweaks/split_addresses.py:
--------------------------------------------------------------------------------
 1 | 
 2 | '''
 3 | 
 4 | Addresses separated by newlines in a single cell get parsed
 5 | 
 6 | tweaks:
 7 |   split_addresses:
 8 |     column: "Other Addresses"
 9 | 
10 | '''
11 | 
12 | 
13 | import re
14 | 
15 | 
16 | def sanity_stick(locs):
17 |     if len(locs) <= 1:
18 |         return locs
19 |     if len(re.sub(r'[^,]', '', locs[0])) < 3:
20 |         return [' '.join(locs)]
21 |     return locs
22 | 
23 | 
24 | def apply(wb, params):
25 |     column = params['column']
26 |     table = params.get('table')
27 |     active = False
28 |     for name, t in wb['tables'].items():
29 |         if name == table or table is None:
30 |             if column not in t['columns']:
31 |                 continue
32 |             active = True
33 |             for row in t['rows']:
34 |                 cell = row[column]
35 |                 if cell is not None:
36 |                     print("[{}]".format(cell))
37 |                     cell = re.sub(r'^[ \n\r\t]*', '', cell)
38 |                     cell = re.sub(r'[ \n\r\t]*$', '', cell)
39 |                     cell = re.sub(r'^n/a$', '', cell, flags=re.IGNORECASE)
40 |                     print("[{}]".format(cell))
41 |                     if cell == '':
42 |                         splits = None
43 |                     else:
44 |                         splits = cell.split('\n')
45 |                         splits = sanity_stick(splits)
46 |                     row[column] = splits
47 |     if not active:
48 |         raise KeyError(column)
49 |         
50 |     
51 | 


--------------------------------------------------------------------------------
/sheetsite/tweaks/split_addresses_v2.py:
--------------------------------------------------------------------------------
 1 | 
 2 | '''
 3 | 
 4 | Addresses separated by *double* newlines in a single cell get parsed
 5 | 
 6 | tweaks:
 7 |   split_addresses_v2:
 8 |     column: "Other Addresses"
 9 | 
10 | '''
11 | 
12 | 
13 | import json
14 | import re
15 | 
16 | 
17 | def sanity_stick(locs):
18 |     if len(locs) <= 1:
19 |         return locs
20 |     if len(re.sub(r'[^,]', '', locs[0])) < 1:
21 |         return [' '.join(locs)]
22 |     return locs
23 | 
24 | 
25 | def apply(wb, params):
26 |     column = params['column']
27 |     table = params.get('table')
28 |     active = False
29 |     for name, t in wb['tables'].items():
30 |         if name == table or table is None:
31 |             if column not in t['columns']:
32 |                 continue
33 |             active = True
34 |             for row in t['rows']:
35 |                 cell = row[column]
36 |                 if cell is not None and 'See:' in cell:
37 |                     cell = None
38 |                 if cell is not None:
39 |                     print(">>> {}".format(cell))
40 |                     cell = re.sub(r'^[ \n\r\t]*', '', cell)
41 |                     cell = re.sub(r'[ \n\r\t]*$', '', cell)
42 |                     cell = re.sub(r'^n/a$', '', cell, flags=re.IGNORECASE)
43 |                     if cell == '':
44 |                         splits = None
45 |                     else:
46 |                         splits = re.split('[\n\r][\n\r]', cell)
47 |                         splits = sanity_stick(splits)
48 |                     print(json.dumps(splits))
49 |                     row[column] = splits
50 |     if not active:
51 |         raise KeyError(column)
52 |         
53 |     
54 | 


--------------------------------------------------------------------------------
/sheetsite/tweaks/us_state.py:
--------------------------------------------------------------------------------
 1 | 
 2 | def apply(wb, params):
 3 |     column = params['column']
 4 |     table = params.get('table')
 5 |     active = False
 6 |     for name, t in wb['tables'].items():
 7 |         if name == table or table is None:
 8 |             if column not in t['columns']:
 9 |                 continue
10 |             active = True
11 |             for row in t['rows']:
12 |                 code = str(row[column])
13 |                 if code == "CT":
14 |                     row[column] = "Connecticut"
15 |                     # important to replace this or geocoder will sporadically
16 |                     # interpret it as Court or Crescent or the like
17 |                 if code == "RI":
18 |                     row[column] = "Rhode Island"
19 |                 if code == "MA":
20 |                     row[column] = "Massachusetts"
21 | 
22 |     if not active:
23 |         raise KeyError(column)
24 |         
25 | 


--------------------------------------------------------------------------------
/sheetsite/tweaks/us_zip.py:
--------------------------------------------------------------------------------
 1 | 
 2 | def apply(wb, params):
 3 |     column = params['column']
 4 |     table = params.get('table')
 5 |     active = False
 6 |     for name, t in wb['tables'].items():
 7 |         if name == table or table is None:
 8 |             if column not in t['columns']:
 9 |                 continue
10 |             active = True
11 |             for row in t['rows']:
12 |                 code = str(row[column])
13 |                 if len(code) < 5 and len(code) > 0:
14 |                     try:
15 |                         code = "%05d" % int(code)
16 |                         row[column] = code
17 |                     except ValueError:
18 |                         pass  # let odd values through
19 |     if not active:
20 |         raise KeyError(column)
21 |         
22 | 


--------------------------------------------------------------------------------
/sheetsite/xls_spreadsheet.py:
--------------------------------------------------------------------------------
 1 | from openpyxl import load_workbook
 2 | 
 3 | 
 4 | class XlsSpreadsheet(object):
 5 | 
 6 |     def __init__(self, filename):
 7 |         self.book = book = load_workbook(filename=filename)
 8 |         self.sheets = [XlsSheet(n, book.get_sheet_by_name(n)) for n in book.get_sheet_names()]
 9 | 
10 |     def worksheets(self):
11 |         return self.sheets
12 | 
13 | 
14 | class XlsSheet(object):
15 | 
16 |     def __init__(self, name, data):
17 |         self.name = name
18 |         self.data = data
19 | 
20 |     def get_all_values(self):
21 |         input = self.data.rows
22 |         output = []
23 |         for i, row in enumerate(input):
24 |             output_row = []
25 |             output.append(output_row)
26 |             for j, cell in enumerate(row):
27 |                 try:
28 |                     output_row.append(cell.value)
29 |                 except:
30 |                     output_row.append(None)
31 |         return output
32 | 
33 |     @property
34 |     def title(self):
35 |         return self.name
36 | 
37 | 


--------------------------------------------------------------------------------
/sites/available/commoners.yml:
--------------------------------------------------------------------------------
 1 | source:
 2 |   name: google-sheets
 3 |   key: 19UaXhqPQ0QHEfSWS_adDEtPwYstq8llK2YijpvFZcKA
 4 |   credential_file: service.json
 5 |   link: https://docs.google.com/spreadsheets/d/19UaXhqPQ0QHEfSWS_adDEtPwYstq8llK2YijpvFZcKA/edit
 6 | 
 7 | flags:
 8 |   add:
 9 |     directory:
10 |       - LAT
11 |       - LNG
12 |       - COUNTRY
13 |       - STREET
14 |       - REGION
15 |       - LOCALITY
16 | 
17 | destination:
18 |   name: chain
19 |   chain:
20 |     - name: git
21 |       repo: git@github.com:datacommons/commoners
22 |       local: commoners
23 |       file: _data/directory.json
24 |     - name: git
25 |       repo: git@github.com:datacommons/datacommons.github.io
26 |       file: _data/directory.json
27 |       local: website
28 |   link: http://datacommons.coop/members
29 | 


--------------------------------------------------------------------------------
/sites/available/hack_spots.yml:
--------------------------------------------------------------------------------
1 | source:
2 |   name: google-sheets
3 |   key: 1hnfQcggYcBYimuO_UOMvwoOi_I9vUvFpkMt4wjrrpLE
4 |   credential_file: service.json
5 | 
6 | destination:
7 |   file: hackspots.xlsx
8 | 


--------------------------------------------------------------------------------
/sites/available/local.yml:
--------------------------------------------------------------------------------
 1 | source:
 2 |   name: .json
 3 |   filename: test.json
 4 | 
 5 | flags:
 6 |   add:
 7 |     zig:
 8 |       - dccid
 9 |     zag:
10 |       - dccid
11 | 
12 | destination:
13 |   name: .json
14 |   output_file: foo.json
15 |   


--------------------------------------------------------------------------------
/sites/available/manitoba.yml:
--------------------------------------------------------------------------------
 1 | source:
 2 |   name: google-sheets
 3 |   key: 1LvBgFeYsI9GeN2PTw5klcwBFFFeROlbwvTVF2qAIuBk
 4 |   credential_file: service.json
 5 |   link: https://docs.google.com/spreadsheets/d/1LvBgFeYsI9GeN2PTw5klcwBFFFeROlbwvTVF2qAIuBk/edit
 6 | 
 7 | flags:
 8 |   add:
 9 |     directory:
10 |       - Latitude
11 |       - Longitude
12 |       - Postal Code
13 |       - State
14 |       - Country
15 |   address:
16 |     directory:
17 |       - Physical Address
18 |       - City
19 |       - Manitoba
20 |       - Canada
21 | 
22 | destination:
23 |   name: chain
24 |   chain:
25 |     - name: stone-soup
26 |       organization: Manitoba Cooperative Association
27 |     - name: install_local_soup
28 |   link: http://find.manitoba.coop
29 | 


--------------------------------------------------------------------------------
/sites/available/tap.yml:
--------------------------------------------------------------------------------
 1 | source:
 2 |   name: google-sheets
 3 |   key: 1mBqfuAWYkRO5M7dd-bw0jKbd0fJGI-4UUv4BDmsyJy4
 4 |   credential_file: service.json
 5 |   link: https://docs.google.com/spreadsheets/d/1mBqfuAWYkRO5M7dd-bw0jKbd0fJGI-4UUv4BDmsyJy4/edit
 6 | 
 7 | flags:
 8 |   add:
 9 |     directory:
10 |       - LAT
11 |       - LNG
12 |       - COUNTRY
13 |       - STREET
14 |       - REGION
15 |       - LOCALITY
16 | 
17 | destination:
18 |   name: git
19 |   repo: git@github.com:datacommons/tap
20 |   file: _data/directory.json
21 |   link: http://datacommons.coop/tap
22 | 


--------------------------------------------------------------------------------
/sites/available/test.yml:
--------------------------------------------------------------------------------
 1 | source:
 2 |   name: google-sheets
 3 |   key: 15Vs_VGpupeGkljceEow7q1ig447FJIxqNS1Dd0dZpFc
 4 |   credential_file: service.json
 5 |   link: https://docs.google.com/spreadsheets/d/15Vs_VGpupeGkljceEow7q1ig447FJIxqNS1Dd0dZpFc/edit
 6 | 
 7 | flags:
 8 |   add:
 9 |     directory:
10 |       - LAT
11 |       - LNG
12 |       - COUNTRY
13 |       - STREET
14 |       - REGION
15 |       - LOCALITY
16 | 
17 | destination:
18 |   name: git
19 |   repo: git@github.com:paulfitz/scrapyard
20 |   file: directory.json
21 |   link: https://github.com/paulfitz/scrapyard/blob/master/directory.json
22 | 


--------------------------------------------------------------------------------
/tests/configs/fill.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "names": ["countries"],
 3 |     "tables": {
 4 |         "countries": {
 5 |             "columns": ["country", "[zip]", "code", "(opinion)"],
 6 |             "rows": [
 7 |                 {
 8 |                     "country": "United Kingdom",
 9 |                     "code": "uk",
10 |                     "(opinion)": "dubious",
11 |                     "[zip]": ""
12 |                 },
13 |                 {
14 |                     "country": "United States",
15 |                     "code": "((usa))",
16 |                     "(opinion)": "dubious",
17 |                     "[zip]": ""
18 |                 }
19 |             ]
20 |         }
21 |     }
22 | }
23 | 


--------------------------------------------------------------------------------
/tests/configs/json_to_json.json:
--------------------------------------------------------------------------------
1 | {
2 |     "source": {
3 |         "filename": "tests/configs/things.json"
4 |     },
5 |     "destination": {
6 |         "output_file": "${TEST_DIR}/out.json"
7 |     }
8 | }
9 | 


--------------------------------------------------------------------------------
/tests/configs/multirow.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "names": ["places"],
 3 |     "tables": {
 4 |         "places": {
 5 |             "columns": ["street", "city", "state", "country", "web"],
 6 |             "rows": [
 7 |                 {
 8 |                     "street": "Test1",
 9 |                     "city": "Test2",
10 |                     "state": "",
11 |                     "country": "",
12 |                     "web": ""
13 |                 },
14 |                 {
15 |                     "street": "Test1",
16 |                     "city": "",
17 |                     "state": "",
18 |                     "country": "",
19 |                     "web": ""
20 |                 },
21 |                 {
22 |                     "street": "305 Memorial Dr",
23 |                     "city": "Cambridge",
24 |                     "state": "Massachusetts",
25 |                     "country": "United States",
26 |                     "web": "web1"
27 |                 },
28 |                 {
29 |                     "street": "306 Memorial Dr",
30 |                     "city": null,
31 |                     "state": "",
32 |                     "country": "",
33 |                     "web": "web1"
34 |                 }
35 |             ]
36 |         }
37 |     }
38 | }
39 | 


--------------------------------------------------------------------------------
/tests/configs/things.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "names": ["countries", "((secret))"],
 3 |     "tables": {
 4 |         "countries": {
 5 |             "columns": ["country", "code", "(opinion)"],
 6 |             "rows": [
 7 |                 {
 8 |                     "country": "United Kingdom",
 9 |                     "code": "uk",
10 |                     "(opinion)": "dubious"
11 |                 },
12 |                 {
13 |                     "country": "United States",
14 |                     "code": "((usa))",
15 |                     "(opinion)": "dubious"
16 |                 }
17 |             ]
18 |         },
19 |         "((secret))": {
20 |             "columns": ["a", "b"],
21 |             "rows": {}
22 |         }
23 |     }
24 | }
25 | 


--------------------------------------------------------------------------------
/tests/test_chain.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | import unittest
  4 | from sheetsite.chain import apply_chain
  5 | from sheetsite.cmdline import run
  6 | 
  7 | ########################################################
  8 | # python2 doesn't have TemporaryDirectory
  9 | # replacement begins
 10 | 
 11 | import contextlib
 12 | import shutil
 13 | import tempfile
 14 | 
 15 | 
 16 | @contextlib.contextmanager
 17 | def TemporaryDirectory():
 18 |     dirpath = tempfile.mkdtemp()
 19 |     try:
 20 |         yield dirpath
 21 |     finally:
 22 |         shutil.rmtree(dirpath)
 23 | 
 24 | 
 25 | # replacement ends
 26 | # python2 doesn't have TemporaryDirectory
 27 | ########################################################
 28 | 
 29 | 
 30 | class TestChain(unittest.TestCase):
 31 | 
 32 |     def test_json_to_json_cmdline(self):
 33 |         with TemporaryDirectory() as temp_dir:
 34 |             os.environ['TEST_DIR'] = temp_dir
 35 |             run(['--config', 'tests/configs/json_to_json.json', '--cache-dir', temp_dir])
 36 | 
 37 |     def test_json_to_json(self):
 38 |         with TemporaryDirectory() as temp_dir:
 39 |             target = "{}/out.json".format(temp_dir)
 40 |             params = {
 41 |                 "source": {"filename": "tests/configs/things.json"},
 42 |                 "destination": {"output_file": target}
 43 |             }
 44 |             apply_chain(params, temp_dir)
 45 |             with open(target, 'r') as f:
 46 |                 data = json.load(f)
 47 |             assert len(data["tables"]["countries"]["columns"]) == 2
 48 |             assert data["tables"]["countries"]["rows"][1]["code"] == ""
 49 | 
 50 |     def test_fill(self):
 51 |         with TemporaryDirectory() as temp_dir:
 52 |             target = "{}/out.json".format(temp_dir)
 53 |             params = {
 54 |                 "source": {"filename": "tests/configs/fill.json"},
 55 |                 "flags": {
 56 |                     "geocoder": "dummy",
 57 |                     "address": {"countries": ["country"]}
 58 |                 },
 59 |                 "destination": {"output_file": target}
 60 |             }
 61 |             apply_chain(params, temp_dir)
 62 |             with open(target, 'r') as f:
 63 |                 data = json.load(f)
 64 |             assert data["tables"]["countries"]["rows"][0]["zip"] == "PO-STAL"
 65 | 
 66 |     def test_single_to_multiple_add(self):
 67 |         with TemporaryDirectory() as temp_dir:
 68 |             target = "{}/out.json".format(temp_dir)
 69 |             params = {
 70 |                 "source": {"filename": "tests/configs/things.json"},
 71 |                 "flags": {
 72 |                     "geocoder": "dummy",
 73 |                     "address": {"countries": ["country"]},
 74 |                     "add": {"countries": ["city", "address"]}
 75 |                 },
 76 |                 "destination": {"output_file": target}
 77 |             }
 78 |             apply_chain(params, temp_dir)
 79 |             with open(target, 'r') as f:
 80 |                 data = json.load(f)
 81 |             assert data["tables"]["countries"]["rows"][0]["city"] == "Cityville"
 82 |             assert data["tables"]["countries"]["rows"][0]["address"] == "United Kingdom"
 83 |             assert data["tables"]["countries"]["rows"][1]["address"] == "United States"
 84 | 
 85 |     def test_multiple_to_multiple_add(self):
 86 |         with TemporaryDirectory() as temp_dir:
 87 |             target = "{}/out.json".format(temp_dir)
 88 |             params = {
 89 |                 "source": {"filename": "tests/configs/things.json"},
 90 |                 "flags": {
 91 |                     "geocoder": "dummy",
 92 |                     "address": {"countries": ["code", "country", "Earth"]},
 93 |                     "add": {"countries": ["city", "address"]}
 94 |                 },
 95 |                 "destination": {"output_file": target}
 96 |             }
 97 |             apply_chain(params, temp_dir)
 98 |             with open(target, 'r') as f:
 99 |                 data = json.load(f)
100 |             assert data["tables"]["countries"]["rows"][0]["city"] == "Cityville"
101 |             assert data["tables"]["countries"]["rows"][0]["address"] == "uk United Kingdom Earth"
102 | 
103 |     def test_multirow(self):
104 |         with TemporaryDirectory() as temp_dir:
105 |             target = "{}/out.json".format(temp_dir)
106 |             params = {
107 |                 "source": {"filename": "tests/configs/multirow.json"},
108 |                 "flags": {
109 |                     "geocoder": "dummy",
110 |                     "group": "web",
111 |                     "address": {"places": ["street", "city", "state", "country"]},
112 |                     "add": {"places": ["lat", "lon", "address"]}
113 |                 },
114 |                 "destination": {"output_file": target}
115 |             }
116 |             apply_chain(params, temp_dir)
117 |             with open(target, 'r') as f:
118 |                 data = json.load(f)
119 |             places = data["tables"]["places"]["rows"]
120 |             self.assertEqual(places[0]["address"], "Test1 Test2")
121 |             self.assertEqual(places[1]["address"], "Test1")
122 |             self.assertEqual(places[2]["address"],
123 |                              "305 Memorial Dr Cambridge Massachusetts United States")
124 |             self.assertEqual(places[3]["address"],
125 |                              "306 Memorial Dr Cambridge Massachusetts United States")
126 | 
127 |     def test_rename(self):
128 |         with TemporaryDirectory() as temp_dir:
129 |             target = "{}/out.json".format(temp_dir)
130 |             params = {
131 |                 "source": {"filename": "tests/configs/multirow.json"},
132 |                 "flags": {
133 |                     "geocoder": "dummy",
134 |                     "rename": {"places": {"web": "website"}},
135 |                     "address": {"places": ["street", "city", "state", "country"]},
136 |                     "add": {"places": ["lat", "lon", "address"]}
137 |                 },
138 |                 "destination": {"output_file": target}
139 |             }
140 |             apply_chain(params, temp_dir)
141 |             with open(target, 'r') as f:
142 |                 data = json.load(f)
143 |             places = data["tables"]["places"]["rows"]
144 |             self.assertIn('website', places[0])
145 |             self.assertNotIn('web', places[0])
146 | 


--------------------------------------------------------------------------------
/tests/test_environment.py:
--------------------------------------------------------------------------------
1 | def test_general_import():
2 |     import sheetsite
3 | 
4 | def test_specific_import():
5 |     import sheetsite.cmdline
6 | 


--------------------------------------------------------------------------------
/tests/test_filter.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from sheetsite.json_spreadsheet import JsonSpreadsheet
 3 | from sheetsite.site import Site
 4 | 
 5 | def test_filter():
 6 |     wb = JsonSpreadsheet('tests/configs/things.json')
 7 |     site = Site(wb)
 8 | 
 9 |     filtered_wb = site.public_workbook()
10 |     result = wb.as_dict(filtered_wb)
11 |     columns = result["tables"]["countries"]["columns"]
12 |     assert "country" in columns
13 |     assert not "opinion" in columns
14 |     assert not "secret" in result["tables"]
15 | 
16 |     filtered_wb = site.private_workbook()
17 |     result = wb.as_dict(filtered_wb)
18 |     assert "secret" in result["tables"]
19 | 


--------------------------------------------------------------------------------