├── .circleci └── config.yml ├── .gitignore ├── CODEOWNERS ├── LICENSE ├── README.md ├── setup.py ├── tests.py └── tidy_json_to_csv.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | workflows: 3 | version: 2 4 | test: 5 | jobs: 6 | - test-3.8.2 7 | - test-3.8.1 8 | - test-3.8.0 9 | - test-3.7.7 10 | - test-3.7.4 11 | - test-3.7.3 12 | - test-3.7.2 13 | - test-3.7.1 14 | - test-3.7.0 15 | - test-3.6.10 16 | - test-3.6.9 17 | - test-3.6.8 18 | - test-3.6.7 19 | - test-3.6.6 20 | - test-3.6.5 21 | - test-3.6.4 22 | - test-3.6.3 23 | jobs: 24 | test-3.8.2: &template 25 | docker: 26 | - image: python:3.8.2 27 | steps: 28 | - checkout 29 | - run: 30 | name: Run tests 31 | command: | 32 | python3 setup.py test 33 | test-3.8.1: 34 | <<: *template 35 | docker: 36 | - image: python:3.8.1 37 | test-3.8.0: 38 | <<: *template 39 | docker: 40 | - image: python:3.8.0 41 | test-3.7.7: 42 | <<: *template 43 | docker: 44 | - image: python:3.7.7 45 | test-3.7.6: 46 | <<: *template 47 | docker: 48 | - image: python:3.7.6 49 | test-3.7.5: 50 | <<: *template 51 | docker: 52 | - image: python:3.7.5 53 | test-3.7.4: 54 | <<: *template 55 | docker: 56 | - image: python:3.7.4 57 | test-3.7.3: 58 | <<: *template 59 | docker: 60 | - image: python:3.7.3 61 | test-3.7.2: 62 | <<: *template 63 | docker: 64 | - image: python:3.7.2 65 | test-3.7.1: 66 | <<: *template 67 | docker: 68 | - image: python:3.7.1 69 | test-3.7.0: 70 | <<: *template 71 | docker: 72 | - image: python:3.7.0 73 | test-3.6.10: 74 | <<: *template 75 | docker: 76 | - image: python:3.6.10 77 | test-3.6.9: 78 | <<: *template 79 | docker: 80 | - image: python:3.6.9 81 | test-3.6.8: 82 | <<: *template 83 | docker: 84 | - image: python:3.6.8 85 | test-3.6.7: 86 | <<: *template 87 | docker: 88 | - image: python:3.6.7 89 | test-3.6.6: 90 | <<: *template 91 | docker: 92 | - image: python:3.6.6 93 | test-3.6.5: 94 | <<: *template 95 | docker: 96 | - image: python:3.6.5 97 | test-3.6.4: 98 | <<: *template 99 | docker: 100 | - image: python:3.6.4 101 | test-3.6.3: 102 | <<: *template 103 | docker: 104 | - image: python:3.6.3 105 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @uktrade/data-infrastructure 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Department for International Trade 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tidy-json-to-csv [![CircleCI](https://circleci.com/gh/uktrade/tidy-json-to-csv.svg?style=svg)](https://circleci.com/gh/uktrade/tidy-json-to-csv) 2 | 3 | Converts a subset of JSON to a set of tidy CSVs. Supports streaming of both the input JSON and the output CSV, and so is suitable for large files in memory-constrained environments. 4 | 5 | 6 | ## What problem does this solve? 7 | 8 | Most JSON to CSV converters do not result in data suitable for immediate analysis. They usually output a single CSV, and to do this, result in some combination of: 9 | 10 | - JSON inside CSV fields; 11 | - values in lists presented as columns; 12 | - data duplicated in multiple rows, or a row's position in the CSV determining its context. 13 | 14 | Often these require subsequent manual, and so error-prone, data manipulation. This library aims to do all the conversion up-front, so you end up with a set of [tidy](https://vita.had.co.nz/papers/tidy-data.pdf) tables, which is often a great place from which to start analysis. 15 | 16 | 17 | ## Example input and output 18 | 19 | The JSON 20 | 21 | ```json 22 | { 23 | "songs": [ 24 | { 25 | "id": "1", 26 | "title": "Walk through the fire", 27 | "categories": [ 28 | {"id": "1", "name": "musicals"}, 29 | {"id": "2", "name": "television-shows"} 30 | ], 31 | "comments": [ 32 | {"content": "I love it"}, 33 | {"content": "I've heard better"} 34 | ], 35 | "artist": { 36 | "name": "Slayer" 37 | } 38 | }, 39 | { 40 | "id": "2", 41 | "title": "I could have danced all night", 42 | "categories": [ 43 | {"id": "1", "name": "musicals"}, 44 | {"id": "3", "name": "films"} 45 | ], 46 | "comments": [ 47 | {"content": "I also could have danced all night"} 48 | ], 49 | "artist": { 50 | "name": "Doolitle" 51 | } 52 | } 53 | ] 54 | } 55 | ``` 56 | 57 | maps to four files: 58 | 59 | ### `songs.csv` 60 | 61 | ```csv 62 | "id","title","artist__name" 63 | "1","Walk through the fire","Slayer" 64 | "2","I could have danced all night","Doolitle" 65 | ``` 66 | 67 | ### `songs__categories__id.csv` 68 | 69 | ```csv 70 | "songs__id","categories__id" 71 | "1","1" 72 | "1","2" 73 | "2","1" 74 | "2","3" 75 | ``` 76 | 77 | ### `songs__comments.csv` 78 | 79 | ```csv 80 | "songs__id","content" 81 | "1","I love it" 82 | "1","I've heard better" 83 | "2","I also could have danced all night" 84 | ``` 85 | 86 | ### `categories.csv` 87 | 88 | ```csv 89 | "id","name" 90 | "1","musicals" 91 | "2","television-shows" 92 | "3","films" 93 | ``` 94 | 95 | 96 | ## Installation 97 | 98 | ```bash 99 | pip install tidy-json-to-csv 100 | ``` 101 | 102 | 103 | ## Usage: Convert JSON to multiple CSV files (Command line) 104 | 105 | ```bash 106 | cat songs.json | tidy_json_to_csv 107 | ``` 108 | 109 | 110 | ## Usage: Convert JSON to multiple CSV files (Python) 111 | 112 | ```python 113 | from tidy_json_to_csv import to_csvs 114 | 115 | # A save function, called by to_csvs for each CSV file to be generated.
116 | # Will be run in a separate thread, started by to_csvs 117 | def save_csv_bytes(path, chunks): 118 | with open(f'{path}.csv', 'wb') as f: 119 | for chunk in chunks: 120 | f.write(chunk) 121 | 122 | def json_bytes(): 123 | # Stream the file in 64 KB chunks until EOF 124 | with open('file.json', 'rb') as f: 125 | for chunk in iter(lambda: f.read(65536), b''): 126 | yield chunk 127 | 128 | to_csvs(json_bytes(), save_csv_bytes, null='#NA', output_chunk_size=65536) 129 | ``` 130 | 131 | 132 | ## Usage: Convert JSON to multiple Pandas data frames (Python) 133 | 134 | ```python 135 | import io 136 | import queue 137 | 138 | import pandas as pd 139 | from tidy_json_to_csv import to_csvs 140 | 141 | def json_to_pandas(json_filename): 142 | q = queue.Queue() 143 | 144 | class StreamedIterable(io.RawIOBase): 145 | def __init__(self, iterable): 146 | self.iterable = iterable 147 | self.remainder = b'' 148 | def readable(self): 149 | return True 150 | def readinto(self, b): 151 | buffer_size = len(b) 152 | 153 | while len(self.remainder) < buffer_size: 154 | try: 155 | self.remainder = self.remainder + next(self.iterable) 156 | except StopIteration: 157 | if self.remainder: 158 | break 159 | return 0 160 | 161 | chunk, self.remainder = self.remainder[:buffer_size], self.remainder[buffer_size:] 162 | b[:len(chunk)] = chunk 163 | return len(chunk) 164 | 165 | def save_csv_bytes(path, chunks): 166 | q.put((path, pd.read_csv(io.BufferedReader(StreamedIterable(chunks), buffer_size=65536), na_values=['#NA']))) 167 | 168 | def json_bytes(): 169 | # Stream the file in 64 KB chunks until EOF 170 | with open(json_filename, 'rb') as f: 171 | for chunk in iter(lambda: f.read(65536), b''): 172 | yield chunk 173 | 174 | to_csvs(json_bytes(), save_csv_bytes, null='#NA') 175 | 176 | dfs = {} 177 | while not q.empty(): 178 | path, df = q.get() 179 | dfs[path] = df 180 | 181 | return dfs 182 | 183 | dfs = json_to_pandas('songs.json') 184 | for path, df in dfs.items(): 185 | print(path) 186 | print(df) 187 | ``` 188 | 189 | 190 | ## Constraints 191 | 192 | Denormalised input JSON is assumed, and the output is normalised. If a nested object has an `id` field, it is assumed to be the primary key of a top-level table. All objects that have a nested object or array _must_ have an `id` field that serves as its primary key in the final output. If present, `id` must be the _first_ key in a map. All arrays must be arrays of objects rather than primitives. 193 | 194 | Although _mostly_ streaming, to support denormalised input JSON and to avoid repeating the same rows in normalised CSVs, an internal record of output IDs is maintained during processing.
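
Where the source JSON does not already meet these constraints, a small pre-processing step can sometimes get it into shape before piping it in. The sketch below is illustrative only and not part of the library: it assumes the whole document fits in memory (so it gives up the streaming benefit), moves any `id` key to the front of each object, and wraps arrays of primitives into arrays of objects under a hypothetical `value` key. It does not add missing `id` fields, and the `songs-conformed.json` filename is just an example.

```python
import json

def conform(node):
    # Recursively rewrite parsed JSON so that 'id' is the first key in every
    # object, and arrays of primitives become arrays of objects under a
    # (hypothetical) 'value' key
    if isinstance(node, dict):
        ordered = {'id': node['id']} if 'id' in node else {}
        for key, value in node.items():
            if key != 'id':
                ordered[key] = conform(value)
        return ordered
    if isinstance(node, list):
        return [
            conform(item) if isinstance(item, dict) else {'value': item}
            for item in node
        ]
    return node

with open('songs.json', 'r') as f:
    data = json.load(f)

with open('songs-conformed.json', 'w') as f:
    # json.dump preserves dict insertion order, so 'id' stays first
    json.dump(conform(data), f)
```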
195 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | 4 | def long_description(): 5 | with open('README.md', 'r') as file: 6 | return file.read() 7 | 8 | 9 | setuptools.setup( 10 | name='tidy-json-to-csv', 11 | version='0.0.13', 12 | author='Department for International Trade', 13 | author_email='webops@digital.trade.gov.uk', 14 | description='Convert JSON to a set of tidy CSV files', 15 | long_description=long_description(), 16 | long_description_content_type='text/markdown', 17 | url='https://github.com/uktrade/tidy-json-to-csv', 18 | py_modules=[ 19 | 'tidy_json_to_csv', 20 | ], 21 | python_requires='>=3.6.3', 22 | install_requires=[ 23 | 'ijson>=3.0.4,<4', 24 | ], 25 | entry_points={ 26 | 'console_scripts': [ 27 | 'tidy_json_to_csv=tidy_json_to_csv:main' 28 | ], 29 | }, 30 | test_suite='tests', 31 | classifiers=[ 32 | 'Programming Language :: Python :: 3', 33 | 'License :: OSI Approved :: MIT License', 34 | 'Operating System :: OS Independent', 35 | ] 36 | ) 37 | -------------------------------------------------------------------------------- /tests.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from tidy_json_to_csv import to_csvs 4 | 5 | 6 | class TestIntegration(unittest.TestCase): 7 | 8 | def test_basic(self): 9 | total_received = {} 10 | 11 | def save_csv(path, chunks): 12 | total_received[path] = [] 13 | for chunk in chunks: 14 | total_received[path].append(chunk) 15 | 16 | for output_chunk_size in range(1, 200): 17 | to_csvs(json_bytes(50), save_csv, output_chunk_size=output_chunk_size) 18 | files = { 19 | path: b''.join(contents) 20 | for path, contents in total_received.items() 21 | } 22 | self.assertEqual(files, json_bytes_songs_parsed) 23 | 24 | for input_chunk_size in range(1, 200): 25 | to_csvs(json_bytes(input_chunk_size), save_csv, output_chunk_size=50) 26 | files = { 27 | path: b''.join(contents) 28 | for path, contents in total_received.items() 29 | } 30 | self.assertEqual(files, json_bytes_songs_parsed) 31 | 32 | def test_exception_during_input_propagates(self): 33 | total_received = {} 34 | 35 | class MyException(Exception): 36 | pass 37 | 38 | def json_bytes_with_exception(): 39 | raise MyException() 40 | 41 | def save_csv(path, chunks): 42 | pass 43 | 44 | with self.assertRaises(MyException): 45 | to_csvs(json_bytes_with_exception(), save_csv) 46 | 47 | 48 | def test_exception_during_output_propagates(self): 49 | total_received = {} 50 | 51 | class MyException(Exception): 52 | pass 53 | 54 | def save_csv(path, chunks): 55 | raise MyException() 56 | 57 | with self.assertRaises(MyException): 58 | to_csvs(json_bytes(50), save_csv) 59 | 60 | 61 | def json_bytes(chunk_size): 62 | remaining = json_bytes_songs 63 | while remaining: 64 | yield remaining[:chunk_size] 65 | remaining = remaining[chunk_size:] 66 | 67 | 68 | json_bytes_songs = b'''{ 69 | "songs": [ 70 | { 71 | "id": "1", 72 | "title": "Walk through the fire", 73 | "categories": [ 74 | {"id": 1, "name": "musicals"}, 75 | {"id": 2, "name": "television-shows"} 76 | ], 77 | "comments": [ 78 | {"content": "I love it"}, 79 | {"content": "I've heard better"} 80 | ], 81 | "artist": { 82 | "name": "Slayer" 83 | } 84 | }, 85 | { 86 | "id": "2", 87 | "title": "I could have danced all night", 88 | "categories": [ 89 | {"id": 1, "name": "musicals"}, 90 | {"id": 3, "name": "films"} 91 | ], 92 | "comments": [ 
93 | {"content": "I also could have danced all night"} 94 | ], 95 | "artist": { 96 | "name": "Dolittle" 97 | } 98 | } 99 | ] 100 | }''' 101 | 102 | json_bytes_songs_parsed = { 103 | 'songs__categories__id': b'"songs__id","categories__id"\r\n"1",1\r\n"1",2\r\n"2",1\r\n"2",3\r\n', 104 | 'songs__comments': b'"songs__id","content"\r\n"1","I love it"\r\n"1","I\'ve heard better"\r\n"2","I also could have danced all night"\r\n', 105 | 'songs': b'"id","title","artist__name"\r\n"1","Walk through the fire","Slayer"\r\n"2","I could have danced all night","Dolittle"\r\n', 106 | 'categories': b'"id","name"\r\n1,"musicals"\r\n2,"television-shows"\r\n3,"films"\r\n', 107 | } 108 | -------------------------------------------------------------------------------- /tidy_json_to_csv.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import codecs 3 | import concurrent.futures 4 | import csv 5 | import re 6 | import ijson 7 | import queue 8 | import sys 9 | 10 | 11 | def to_csvs(json_bytes, save_csv_bytes, null='#NA', output_chunk_size=65536, save_chunk_timeout=5, max_files=1024): 12 | STOP_SENTINAL = object() 13 | top_level_saved = defaultdict(set) 14 | open_maps = {} 15 | parent_ids = [] 16 | open_csv_qs = {} 17 | 18 | class PseudoBuffer: 19 | def write(self, value): 20 | return value 21 | 22 | csv_writer = csv.writer(PseudoBuffer(), quoting=csv.QUOTE_NONNUMERIC) 23 | 24 | class QueuedIterable(): 25 | def __init__(self, q): 26 | self.q = q 27 | 28 | def __iter__(self): 29 | return self 30 | 31 | def __next__(self): 32 | item = self.q.get() 33 | self.q.task_done() 34 | if item is STOP_SENTINAL: 35 | raise StopIteration() 36 | return item 37 | 38 | def buffer(chunks): 39 | queue = [] 40 | queue_length = 0 41 | 42 | for chunk in chunks: 43 | queue.append(chunk) 44 | queue_length += len(chunk) 45 | 46 | while queue_length >= output_chunk_size: 47 | to_send_later = b''.join(queue) 48 | chunk, to_send_later = \ 49 | to_send_later[:output_chunk_size], to_send_later[output_chunk_size:] 50 | 51 | queue = \ 52 | [to_send_later] if to_send_later else \ 53 | [] 54 | queue_length = len(to_send_later) 55 | 56 | yield chunk 57 | 58 | if queue_length: 59 | yield b''.join(queue) 60 | 61 | def save(executor, path, dict_data): 62 | try: 63 | f, q = open_csv_qs[path] 64 | except KeyError: 65 | if len(open_csv_qs) >= max_files: 66 | raise Exception('Too many open files') 67 | 68 | q = queue.Queue(maxsize=1) 69 | f = executor.submit(save_csv_bytes, path, buffer(QueuedIterable(q))) 70 | open_csv_qs[path] = (f, q) 71 | q.put(csv_writer.writerow(dict_data.keys()).encode('utf-8'), timeout=save_chunk_timeout) 72 | 73 | q.put(csv_writer.writerow(dict_data.values()).encode('utf-8'), timeout=save_chunk_timeout) 74 | 75 | def to_path(prefix): 76 | return re.sub(r'([^.]+)\.item\.?', r'\1__', prefix).rstrip('_') 77 | 78 | def handle_start_map(executor, prefix, value): 79 | open_maps[prefix] = {} 80 | 81 | def handle_end_map(executor, prefix, value): 82 | key = prefix.rpartition('.item')[0].rpartition('.')[2] 83 | is_top_level = 'id' in open_maps[prefix] 84 | is_sub_object = not prefix.endswith('.item') 85 | 86 | # If a plain object, append to parent 87 | if is_sub_object: 88 | parent_prefix = prefix[:prefix.rfind('.')] 89 | sub_object_key = prefix[prefix.rfind('.') + 1:] 90 | parent = open_maps[parent_prefix] 91 | for sub_value_key, value in open_maps[prefix].items(): 92 | parent[sub_object_key + '__' + sub_value_key] = value 93 | 94 | # IDs of parents so the user 
can do JOINs 95 | parent_id_dict = { 96 | f'{parent_key}__id': parent_id 97 | for (parent_key, parent_id) in parent_ids 98 | } 99 | 100 | # ... and only save these for nested top level 101 | if not is_sub_object and is_top_level and len(parent_ids) > 1: 102 | save(executor, to_path(prefix) + '__id', parent_id_dict) 103 | 104 | # ... but if _not_ top level (i.e. no ID), save the IDs and other data 105 | if not is_sub_object and not is_top_level and len(parent_ids): 106 | save(executor, to_path(prefix), {**parent_id_dict, **open_maps[prefix]}) 107 | 108 | # ... and if top level, but not yet saved it, save it 109 | if not is_sub_object and is_top_level and open_maps[prefix]['id'] not in top_level_saved[key]: 110 | save(executor, f'{key}', open_maps[prefix]) 111 | top_level_saved[key].add(open_maps[prefix]['id']) 112 | 113 | # We're going to be moving up a level, so no need for last ID 114 | if is_top_level: 115 | parent_ids.pop() 116 | 117 | del open_maps[prefix] 118 | 119 | def handle_map_key(executor, prefix, value): 120 | pass 121 | 122 | def handle_start_array(executor, prefix, value): 123 | pass 124 | 125 | def handle_end_array(executor, prefix, value): 126 | pass 127 | 128 | def handle_null(executor, prefix, _): 129 | parent, _, key = prefix.rpartition('.') 130 | open_maps[parent][key] = null 131 | 132 | def handle_boolean(executor, prefix, value): 133 | parent, _, key = prefix.rpartition('.') 134 | open_maps[parent][key] = value 135 | 136 | def handle_number(executor, prefix, value): 137 | parent, _, key = prefix.rpartition('.') 138 | open_maps[parent][key] = value 139 | 140 | if key == 'id': 141 | parent_key = prefix.rpartition('.item.')[0].rpartition('.item.')[2] 142 | parent_ids.append((parent_key, value)) 143 | 144 | def handle_string(executor, prefix, value): 145 | parent, _, key = prefix.rpartition('.') 146 | open_maps[parent][key] = value 147 | 148 | if key == 'id': 149 | parent_key = prefix.rpartition('.item.')[0].rpartition('.item.')[2] 150 | parent_ids.append((parent_key, value)) 151 | 152 | handlers = locals() 153 | 154 | def process(executor, events): 155 | for prefix, event, value in events: 156 | handlers[f'handle_{event}'](executor, prefix, value) 157 | 158 | with concurrent.futures.ThreadPoolExecutor(max_workers=max_files) as executor: 159 | try: 160 | events = ijson.sendable_list() 161 | coro = ijson.parse_coro(events) 162 | for chunk in json_bytes: 163 | coro.send(chunk) 164 | process(executor, events) 165 | del events[:] 166 | 167 | coro.close() 168 | process(executor, events) 169 | 170 | # Close all open CSVs 171 | finally: 172 | for _, q in open_csv_qs.values(): 173 | try: 174 | q.put(STOP_SENTINAL, timeout=save_chunk_timeout) 175 | except: 176 | pass 177 | 178 | for f, _ in open_csv_qs.values(): 179 | exception = f.exception(timeout=save_chunk_timeout) 180 | if exception: 181 | raise exception 182 | 183 | def main(): 184 | def json_bytes_from_stdin(): 185 | while True: 186 | chunk = sys.stdin.buffer.read(65536) 187 | if not chunk: 188 | break 189 | yield chunk 190 | 191 | def save_csv_bytes(path, chunks): 192 | with open(f'{path}.csv', 'wb') as f: 193 | for chunk in chunks: 194 | f.write(chunk) 195 | 196 | to_csvs(json_bytes_from_stdin(), save_csv_bytes) 197 | 198 | 199 | if __name__ == '__main__': 200 | main() 201 | --------------------------------------------------------------------------------
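
A note on the parsing pattern used in tidy_json_to_csv.py above: to_csvs drives ijson's push-based coroutine API, sending raw byte chunks into ijson.parse_coro and draining the (prefix, event, value) tuples that accumulate in an ijson.sendable_list between sends. The following is a minimal standalone sketch of that pattern, using an arbitrary two-chunk document purely for illustration:

import ijson

events = ijson.sendable_list()
coro = ijson.parse_coro(events)

# Push the document into the parser a chunk at a time, draining the
# accumulated (prefix, event, value) tuples after each send
for chunk in (b'{"songs": [{"id": ', b'"1"}]}'):
    coro.send(chunk)
    for prefix, event, value in events:
        print(prefix, event, value)
    del events[:]

# Closing the coroutine flushes any remaining buffered events
coro.close()
for prefix, event, value in events:
    print(prefix, event, value)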