├── .circleci └── config.yml ├── .gitignore ├── CODEOWNERS ├── LICENSE ├── README.md ├── setup.py ├── tests.py └── tidy_json_to_csv.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | workflows: 3 | version: 2 4 | test: 5 | jobs: 6 | - test-3.8.2 7 | - test-3.8.1 8 | - test-3.8.0 9 | - test-3.7.7 10 | - test-3.7.4 11 | - test-3.7.3 12 | - test-3.7.2 13 | - test-3.7.1 14 | - test-3.7.0 15 | - test-3.6.10 16 | - test-3.6.9 17 | - test-3.6.8 18 | - test-3.6.7 19 | - test-3.6.6 20 | - test-3.6.5 21 | - test-3.6.4 22 | - test-3.6.3 23 | jobs: 24 | test-3.8.2: &template 25 | docker: 26 | - image: python:3.8.2 27 | steps: 28 | - checkout 29 | - run: 30 | name: Run tests 31 | command: | 32 | python3 setup.py test 33 | test-3.8.1: 34 | <<: *template 35 | docker: 36 | - image: python:3.8.1 37 | test-3.8.0: 38 | <<: *template 39 | docker: 40 | - image: python:3.8.0 41 | test-3.7.7: 42 | <<: *template 43 | docker: 44 | - image: python:3.7.7 45 | test-3.7.6: 46 | <<: *template 47 | docker: 48 | - image: python:3.7.6 49 | test-3.7.5: 50 | <<: *template 51 | docker: 52 | - image: python:3.7.5 53 | test-3.7.4: 54 | <<: *template 55 | docker: 56 | - image: python:3.7.4 57 | test-3.7.3: 58 | <<: *template 59 | docker: 60 | - image: python:3.7.3 61 | test-3.7.2: 62 | <<: *template 63 | docker: 64 | - image: python:3.7.2 65 | test-3.7.1: 66 | <<: *template 67 | docker: 68 | - image: python:3.7.1 69 | test-3.7.0: 70 | <<: *template 71 | docker: 72 | - image: python:3.7.0 73 | test-3.6.10: 74 | <<: *template 75 | docker: 76 | - image: python:3.6.10 77 | test-3.6.9: 78 | <<: *template 79 | docker: 80 | - image: python:3.6.9 81 | test-3.6.8: 82 | <<: *template 83 | docker: 84 | - image: python:3.6.8 85 | test-3.6.7: 86 | <<: *template 87 | docker: 88 | - image: python:3.6.7 89 | test-3.6.6: 90 | <<: *template 91 | docker: 92 | - image: python:3.6.6 93 | test-3.6.5: 94 | <<: *template 95 | docker: 96 | - image: python:3.6.5 97 | test-3.6.4: 98 | <<: *template 99 | docker: 100 | - image: python:3.6.4 101 | test-3.6.3: 102 | <<: *template 103 | docker: 104 | - image: python:3.6.3 105 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @uktrade/data-infrastructure 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Department for International Trade 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tidy-json-to-csv [![CircleCI](https://circleci.com/gh/uktrade/tidy-json-to-csv.svg?style=svg)](https://circleci.com/gh/uktrade/tidy-json-to-csv) 2 | 3 | Converts a subset of JSON to a set of tidy CSVs. Supports streaming of both the input JSON and the output CSV, and so is suitable for large files in memory-constrained environments. 4 | 5 | 6 | ## What problem does this solve? 7 | 8 | Most JSON to CSV converters do not result in data suitable for immediate analysis. They usually output a single CSV, and to do this, result in some combination of: 9 | 10 | - JSON inside CSV fields; 11 | - values in lists presented as columns; 12 | - data duplicated in multiple rows, or a row's position in the CSV determining its context. 13 | 14 | Often these require subsequent manual, and so error-prone, data manipulation. This library aims to do all the conversion up-front, so you end up with a set of [tidy](https://vita.had.co.nz/papers/tidy-data.pdf) tables, which is often a great place from which to start analysis. 15 | 16 | 17 | ## Example input and output 18 | 19 | The JSON 20 | 21 | ```json 22 | { 23 | "songs": [ 24 | { 25 | "id": "1", 26 | "title": "Walk through the fire", 27 | "categories": [ 28 | {"id": "1", "name": "musicals"}, 29 | {"id": "2", "name": "television-shows"} 30 | ], 31 | "comments": [ 32 | {"content": "I love it"}, 33 | {"content": "I've heard better"} 34 | ], 35 | "artist": { 36 | "name": "Slayer" 37 | } 38 | }, 39 | { 40 | "id": "2", 41 | "title": "I could have danced all night", 42 | "categories": [ 43 | {"id": "1", "name": "musicals"}, 44 | {"id": "3", "name": "films"} 45 | ], 46 | "comments": [ 47 | {"content": "I also could have danced all night"} 48 | ], 49 | "artist": { 50 | "name": "Doolitle" 51 | } 52 | } 53 | ] 54 | } 55 | ``` 56 | 57 | maps to four files: 58 | 59 | ### `songs.csv` 60 | 61 | ```csv 62 | "id","title","artist__name" 63 | "1","Walk through the fire","Slayer" 64 | "2","I could have danced all night","Doolitle" 65 | ``` 66 | 67 | ### `songs__categories__id.csv` 68 | 69 | ```csv 70 | "songs__id","categories__id" 71 | "1","1" 72 | "1","2" 73 | "2","1" 74 | "2","3" 75 | ``` 76 | 77 | ### `songs__comments.csv` 78 | 79 | ```csv 80 | "songs__id","content" 81 | "1","I love it" 82 | "1","I've heard better" 83 | "2","I also could have danced all night" 84 | ``` 85 | 86 | ### `categories.csv` 87 | 88 | ```csv 89 | "id","name" 90 | "1","musicals" 91 | "2","television-shows" 92 | "3","films" 93 | ``` 94 | 95 | 96 | ## Installation 97 | 98 | ```bash 99 | pip install tidy-json-to-csv 100 | ``` 101 | 102 | 103 | ## Usage: Convert JSON to multiple CSV files (Command line) 104 | 105 | ```bash 106 | cat songs.json | tidy_json_to_csv 107 | ``` 108 | 109 | 110 | ## Usage: Convert JSON to multiple CSV files (Python) 111 | 112 | ```python 113 | from tidy_json_to_csv import to_csvs 114 | 115 | # A save function, called by to_csvs for each CSV file to be generated.
116 | # Will be run in a separate thread, started by to_csvs 117 | def save_csv_bytes(path, chunks): 118 | with open(f'{path}.csv', 'wb') as f: 119 | for chunk in chunks: 120 | f.write(chunk) 121 | 122 | def json_bytes(): 123 | # Stream the file in 64 KB chunks until EOF 124 | with open('file.json', 'rb') as f: 125 | for chunk in iter(lambda: f.read(65536), b''): 126 | yield chunk 127 | 128 | to_csvs(json_bytes(), save_csv_bytes, null='#NA', output_chunk_size=65536) 129 | ``` 130 | 131 | 132 | ## Usage: Convert JSON to multiple Pandas data frames (Python) 133 | 134 | ```python 135 | import io 136 | import queue 137 | 138 | import pandas as pd 139 | from tidy_json_to_csv import to_csvs 140 | 141 | def json_to_pandas(json_filename): 142 | q = queue.Queue() 143 | 144 | class StreamedIterable(io.RawIOBase): 145 | def __init__(self, iterable): 146 | self.iterable = iterable 147 | self.remainder = b'' 148 | def readable(self): 149 | return True 150 | def readinto(self, b): 151 | buffer_size = len(b) 152 | 153 | while len(self.remainder) < buffer_size: 154 | try: 155 | self.remainder = self.remainder + next(self.iterable) 156 | except StopIteration: 157 | if self.remainder: 158 | break 159 | return 0 160 | 161 | chunk, self.remainder = self.remainder[:buffer_size], self.remainder[buffer_size:] 162 | b[:len(chunk)] = chunk 163 | return len(chunk) 164 | 165 | def save_csv_bytes(path, chunks): 166 | q.put((path, pd.read_csv(io.BufferedReader(StreamedIterable(chunks), buffer_size=65536), na_values=['#NA']))) 167 | 168 | def json_bytes(): 169 | # Stream the file in 64 KB chunks until EOF 170 | with open(json_filename, 'rb') as f: 171 | for chunk in iter(lambda: f.read(65536), b''): 172 | yield chunk 173 | 174 | to_csvs(json_bytes(), save_csv_bytes, null='#NA') 175 | 176 | dfs = {} 177 | while not q.empty(): 178 | path, df = q.get() 179 | dfs[path] = df 180 | 181 | return dfs 182 | 183 | dfs = json_to_pandas('songs.json') 184 | for path, df in dfs.items(): 185 | print(path) 186 | print(df) 187 | ``` 188 | 189 | 190 | ## Constraints 191 | 192 | Denormalised input JSON is assumed, and the output is normalised. If a nested object has an `id` field, it is assumed to be the primary key of a top-level table. All objects that have a nested object or array _must_ have an `id` field that serves as its primary key in the final output. If present, `id` must be the _first_ key in a map. All arrays must be arrays of objects rather than primitives. 193 | 194 | Although _mostly_ streaming, to support denormalised input JSON and to avoid repeating the same rows in normalised CSVs, an internal record of output IDs is maintained during processing.
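
Where the source JSON does not already meet these constraints, a small pre-processing step can sometimes get it into shape before piping it in. The sketch below is illustrative only and not part of the library: it assumes the whole document fits in memory (so it gives up the streaming benefit), moves any `id` key to the front of each object, and wraps arrays of primitives into arrays of objects under a hypothetical `value` key. It does not add missing `id` fields, and the `songs-conformed.json` filename is just an example.

```python
import json

def conform(node):
    # Recursively rewrite parsed JSON so that 'id' is the first key in every
    # object, and arrays of primitives become arrays of objects under a
    # (hypothetical) 'value' key
    if isinstance(node, dict):
        ordered = {'id': node['id']} if 'id' in node else {}
        for key, value in node.items():
            if key != 'id':
                ordered[key] = conform(value)
        return ordered
    if isinstance(node, list):
        return [
            conform(item) if isinstance(item, dict) else {'value': item}
            for item in node
        ]
    return node

with open('songs.json', 'r') as f:
    data = json.load(f)

with open('songs-conformed.json', 'w') as f:
    # json.dump preserves dict insertion order, so 'id' stays first
    json.dump(conform(data), f)
```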
195 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | 4 | def long_description(): 5 | with open('README.md', 'r') as file: 6 | return file.read() 7 | 8 | 9 | setuptools.setup( 10 | name='tidy-json-to-csv', 11 | version='0.0.13', 12 | author='Department for International Trade', 13 | author_email='webops@digital.trade.gov.uk', 14 | description='Convert JSON to a set of tidy CSV files', 15 | long_description=long_description(), 16 | long_description_content_type='text/markdown', 17 | url='https://github.com/uktrade/tidy-json-to-csv', 18 | py_modules=[ 19 | 'tidy_json_to_csv', 20 | ], 21 | python_requires='>=3.6.3', 22 | install_requires=[ 23 | 'ijson>=3.0.4,<4', 24 | ], 25 | entry_points={ 26 | 'console_scripts': [ 27 | 'tidy_json_to_csv=tidy_json_to_csv:main' 28 | ], 29 | }, 30 | test_suite='tests', 31 | classifiers=[ 32 | 'Programming Language :: Python :: 3', 33 | 'License :: OSI Approved :: MIT License', 34 | 'Operating System :: OS Independent', 35 | ] 36 | ) 37 | -------------------------------------------------------------------------------- /tests.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from tidy_json_to_csv import to_csvs 4 | 5 | 6 | class TestIntegration(unittest.TestCase): 7 | 8 | def test_basic(self): 9 | total_received = {} 10 | 11 | def save_csv(path, chunks): 12 | total_received[path] = [] 13 | for chunk in chunks: 14 | total_received[path].append(chunk) 15 | 16 | for output_chunk_size in range(1, 200): 17 | to_csvs(json_bytes(50), save_csv, output_chunk_size=output_chunk_size) 18 | files = { 19 | path: b''.join(contents) 20 | for path, contents in total_received.items() 21 | } 22 | self.assertEqual(files, json_bytes_songs_parsed) 23 | 24 | for input_chunk_size in range(1, 200): 25 | to_csvs(json_bytes(input_chunk_size), save_csv, output_chunk_size=50) 26 | files = { 27 | path: b''.join(contents) 28 | for path, contents in total_received.items() 29 | } 30 | self.assertEqual(files, json_bytes_songs_parsed) 31 | 32 | def test_exception_during_input_propagates(self): 33 | total_received = {} 34 | 35 | class MyException(Exception): 36 | pass 37 | 38 | def json_bytes_with_exception(): 39 | raise MyException() 40 | 41 | def save_csv(path, chunks): 42 | pass 43 | 44 | with self.assertRaises(MyException): 45 | to_csvs(json_bytes_with_exception(), save_csv) 46 | 47 | 48 | def test_exception_during_output_propagates(self): 49 | total_received = {} 50 | 51 | class MyException(Exception): 52 | pass 53 | 54 | def save_csv(path, chunks): 55 | raise MyException() 56 | 57 | with self.assertRaises(MyException): 58 | to_csvs(json_bytes(50), save_csv) 59 | 60 | 61 | def json_bytes(chunk_size): 62 | remaining = json_bytes_songs 63 | while remaining: 64 | yield remaining[:chunk_size] 65 | remaining = remaining[chunk_size:] 66 | 67 | 68 | json_bytes_songs = b'''{ 69 | "songs": [ 70 | { 71 | "id": "1", 72 | "title": "Walk through the fire", 73 | "categories": [ 74 | {"id": 1, "name": "musicals"}, 75 | {"id": 2, "name": "television-shows"} 76 | ], 77 | "comments": [ 78 | {"content": "I love it"}, 79 | {"content": "I've heard better"} 80 | ], 81 | "artist": { 82 | "name": "Slayer" 83 | } 84 | }, 85 | { 86 | "id": "2", 87 | "title": "I could have danced all night", 88 | "categories": [ 89 | {"id": 1, "name": "musicals"}, 90 | {"id": 3, "name": "films"} 91 | ], 92 | "comments": [ 
93 | {"content": "I also could have danced all night"} 94 | ], 95 | "artist": { 96 | "name": "Dolittle" 97 | } 98 | } 99 | ] 100 | }''' 101 | 102 | json_bytes_songs_parsed = { 103 | 'songs__categories__id': b'"songs__id","categories__id"\r\n"1",1\r\n"1",2\r\n"2",1\r\n"2",3\r\n', 104 | 'songs__comments': b'"songs__id","content"\r\n"1","I love it"\r\n"1","I\'ve heard better"\r\n"2","I also could have danced all night"\r\n', 105 | 'songs': b'"id","title","artist__name"\r\n"1","Walk through the fire","Slayer"\r\n"2","I could have danced all night","Dolittle"\r\n', 106 | 'categories': b'"id","name"\r\n1,"musicals"\r\n2,"television-shows"\r\n3,"films"\r\n', 107 | } 108 | -------------------------------------------------------------------------------- /tidy_json_to_csv.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import codecs 3 | import concurrent.futures 4 | import csv 5 | import re 6 | import ijson 7 | import queue 8 | import sys 9 | 10 | 11 | def to_csvs(json_bytes, save_csv_bytes, null='#NA', output_chunk_size=65536, save_chunk_timeout=5, max_files=1024): 12 | STOP_SENTINAL = object() 13 | top_level_saved = defaultdict(set) 14 | open_maps = {} 15 | parent_ids = [] 16 | open_csv_qs = {} 17 | 18 | class PseudoBuffer: 19 | def write(self, value): 20 | return value 21 | 22 | csv_writer = csv.writer(PseudoBuffer(), quoting=csv.QUOTE_NONNUMERIC) 23 | 24 | class QueuedIterable(): 25 | def __init__(self, q): 26 | self.q = q 27 | 28 | def __iter__(self): 29 | return self 30 | 31 | def __next__(self): 32 | item = self.q.get() 33 | self.q.task_done() 34 | if item is STOP_SENTINAL: 35 | raise StopIteration() 36 | return item 37 | 38 | def buffer(chunks): 39 | queue = [] 40 | queue_length = 0 41 | 42 | for chunk in chunks: 43 | queue.append(chunk) 44 | queue_length += len(chunk) 45 | 46 | while queue_length >= output_chunk_size: 47 | to_send_later = b''.join(queue) 48 | chunk, to_send_later = \ 49 | to_send_later[:output_chunk_size], to_send_later[output_chunk_size:] 50 | 51 | queue = \ 52 | [to_send_later] if to_send_later else \ 53 | [] 54 | queue_length = len(to_send_later) 55 | 56 | yield chunk 57 | 58 | if queue_length: 59 | yield b''.join(queue) 60 | 61 | def save(executor, path, dict_data): 62 | try: 63 | f, q = open_csv_qs[path] 64 | except KeyError: 65 | if len(open_csv_qs) >= max_files: 66 | raise Exception('Too many open files') 67 | 68 | q = queue.Queue(maxsize=1) 69 | f = executor.submit(save_csv_bytes, path, buffer(QueuedIterable(q))) 70 | open_csv_qs[path] = (f, q) 71 | q.put(csv_writer.writerow(dict_data.keys()).encode('utf-8'), timeout=save_chunk_timeout) 72 | 73 | q.put(csv_writer.writerow(dict_data.values()).encode('utf-8'), timeout=save_chunk_timeout) 74 | 75 | def to_path(prefix): 76 | return re.sub(r'([^.]+)\.item\.?', r'\1__', prefix).rstrip('_') 77 | 78 | def handle_start_map(executor, prefix, value): 79 | open_maps[prefix] = {} 80 | 81 | def handle_end_map(executor, prefix, value): 82 | key = prefix.rpartition('.item')[0].rpartition('.')[2] 83 | is_top_level = 'id' in open_maps[prefix] 84 | is_sub_object = not prefix.endswith('.item') 85 | 86 | # If a plain object, append to parent 87 | if is_sub_object: 88 | parent_prefix = prefix[:prefix.rfind('.')] 89 | sub_object_key = prefix[prefix.rfind('.') + 1:] 90 | parent = open_maps[parent_prefix] 91 | for sub_value_key, value in open_maps[prefix].items(): 92 | parent[sub_object_key + '__' + sub_value_key] = value 93 | 94 | # IDs of parents so the user 
can do JOINs 95 | parent_id_dict = { 96 | f'{parent_key}__id': parent_id 97 | for (parent_key, parent_id) in parent_ids 98 | } 99 | 100 | # ... and only save these for nested top level 101 | if not is_sub_object and is_top_level and len(parent_ids) > 1: 102 | save(executor, to_path(prefix) + '__id', parent_id_dict) 103 | 104 | # ... but if _not_ top level (i.e. no ID), save the IDs and other data 105 | if not is_sub_object and not is_top_level and len(parent_ids): 106 | save(executor, to_path(prefix), {**parent_id_dict, **open_maps[prefix]}) 107 | 108 | # ... and if top level, but not yet saved it, save it 109 | if not is_sub_object and is_top_level and open_maps[prefix]['id'] not in top_level_saved[key]: 110 | save(executor, f'{key}', open_maps[prefix]) 111 | top_level_saved[key].add(open_maps[prefix]['id']) 112 | 113 | # We're going to be moving up a level, so no need for last ID 114 | if is_top_level: 115 | parent_ids.pop() 116 | 117 | del open_maps[prefix] 118 | 119 | def handle_map_key(executor, prefix, value): 120 | pass 121 | 122 | def handle_start_array(executor, prefix, value): 123 | pass 124 | 125 | def handle_end_array(executor, prefix, value): 126 | pass 127 | 128 | def handle_null(executor, prefix, _): 129 | parent, _, key = prefix.rpartition('.') 130 | open_maps[parent][key] = null 131 | 132 | def handle_boolean(executor, prefix, value): 133 | parent, _, key = prefix.rpartition('.') 134 | open_maps[parent][key] = value 135 | 136 | def handle_number(executor, prefix, value): 137 | parent, _, key = prefix.rpartition('.') 138 | open_maps[parent][key] = value 139 | 140 | if key == 'id': 141 | parent_key = prefix.rpartition('.item.')[0].rpartition('.item.')[2] 142 | parent_ids.append((parent_key, value)) 143 | 144 | def handle_string(executor, prefix, value): 145 | parent, _, key = prefix.rpartition('.') 146 | open_maps[parent][key] = value 147 | 148 | if key == 'id': 149 | parent_key = prefix.rpartition('.item.')[0].rpartition('.item.')[2] 150 | parent_ids.append((parent_key, value)) 151 | 152 | handlers = locals() 153 | 154 | def process(executor, events): 155 | for prefix, event, value in events: 156 | handlers[f'handle_{event}'](executor, prefix, value) 157 | 158 | with concurrent.futures.ThreadPoolExecutor(max_workers=max_files) as executor: 159 | try: 160 | events = ijson.sendable_list() 161 | coro = ijson.parse_coro(events) 162 | for chunk in json_bytes: 163 | coro.send(chunk) 164 | process(executor, events) 165 | del events[:] 166 | 167 | coro.close() 168 | process(executor, events) 169 | 170 | # Close all open CSVs 171 | finally: 172 | for _, q in open_csv_qs.values(): 173 | try: 174 | q.put(STOP_SENTINAL, timeout=save_chunk_timeout) 175 | except: 176 | pass 177 | 178 | for f, _ in open_csv_qs.values(): 179 | exception = f.exception(timeout=save_chunk_timeout) 180 | if exception: 181 | raise exception 182 | 183 | def main(): 184 | def json_bytes_from_stdin(): 185 | while True: 186 | chunk = sys.stdin.buffer.read(65536) 187 | if not chunk: 188 | break 189 | yield chunk 190 | 191 | def save_csv_bytes(path, chunks): 192 | with open(f'{path}.csv', 'wb') as f: 193 | for chunk in chunks: 194 | f.write(chunk) 195 | 196 | to_csvs(json_bytes_from_stdin(), save_csv_bytes) 197 | 198 | 199 | if __name__ == '__main__': 200 | main() 201 | --------------------------------------------------------------------------------
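
A note on the parsing pattern used in tidy_json_to_csv.py above: to_csvs drives ijson's push-based coroutine API, sending raw byte chunks into ijson.parse_coro and draining the (prefix, event, value) tuples that accumulate in an ijson.sendable_list between sends. The following is a minimal standalone sketch of that pattern, using an arbitrary two-chunk document purely for illustration:

import ijson

events = ijson.sendable_list()
coro = ijson.parse_coro(events)

# Push the document into the parser a chunk at a time, draining the
# accumulated (prefix, event, value) tuples after each send
for chunk in (b'{"songs": [{"id": ', b'"1"}]}'):
    coro.send(chunk)
    for prefix, event, value in events:
        print(prefix, event, value)
    del events[:]

# Closing the coroutine flushes any remaining buffered events
coro.close()
for prefix, event, value in events:
    print(prefix, event, value)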