├── .github └── workflows │ ├── publish.yml │ └── test.yml ├── .gitignore ├── LICENSE ├── README.md ├── pyproject.toml ├── src ├── overrides.py └── partial_json_parser │ ├── __init__.py │ ├── core │ ├── api.py │ ├── complete.py │ ├── exceptions.py │ ├── myelin.py │ └── options.py │ ├── options.py │ ├── playground.py │ └── version.py └── tests ├── test_examples.py ├── test_hypotheses.py └── test_performance.py /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: publish 2 | 3 | on: [push, workflow_dispatch] 4 | 5 | jobs: 6 | diff: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v4 10 | with: 11 | fetch-depth: 2 12 | - name: Check for changes 13 | id: changes-check 14 | run: | 15 | if git diff --quiet HEAD~1 HEAD -- src/partial_json_parser/version.py; then 16 | echo "CHANGED=false" >> $GITHUB_OUTPUT 17 | else 18 | echo "CHANGED=true" >> $GITHUB_OUTPUT 19 | fi 20 | outputs: 21 | CHANGED: ${{ steps.changes-check.outputs.CHANGED }} 22 | publish: 23 | runs-on: ubuntu-latest 24 | needs: diff 25 | environment: release 26 | if: needs.diff.outputs.CHANGED == 'true' 27 | permissions: 28 | id-token: write 29 | steps: 30 | - uses: actions/checkout@v4 31 | - name: Set up python 32 | uses: astral-sh/setup-uv@v5 33 | with: 34 | python-version: 3.13 35 | - name: Build package 36 | run: | 37 | python src/overrides.py 3.6 38 | uv build 39 | - name: Publish to PyPI 40 | continue-on-error: true 41 | run: uv publish 42 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: pytest & hypothesis 2 | 3 | on: [push, pull_request, workflow_dispatch] 4 | 5 | jobs: 6 | test: 7 | runs-on: ${{ matrix.os }} 8 | strategy: 9 | fail-fast: false 10 | matrix: 11 | os: [ubuntu-latest, macos-latest, windows-latest] 12 | python-version: ["3.7", "3.8", "3.9", "3.10", 
"3.11", "3.12", "3.13", "pypy3.7", "pypy3.8", "pypy3.9", "pypy3.10"] 13 | exclude: 14 | - os: windows-latest 15 | python-version: "3.7" 16 | - os: windows-latest 17 | python-version: "pypy3.7" 18 | - os: windows-latest 19 | python-version: "pypy3.8" 20 | steps: 21 | - uses: actions/checkout@v4 22 | - name: Setup Python 23 | uses: actions/setup-python@v5 24 | with: 25 | python-version: 3.12 26 | - name: Install uv 27 | run: pip install uv --disable-pip-version-check 28 | - name: Install pdm 29 | if: matrix.os != 'windows-latest' 30 | run: uv tool install pdm 31 | - name: Setup Python ${{ matrix.python-version }} 32 | if: matrix.os != 'windows-latest' && matrix.python-version != '3.7' && matrix.python-version != 'pypy3.7' 33 | uses: actions/setup-python@v5 34 | with: 35 | python-version: ${{ matrix.python-version }} 36 | - name: Setup PDM and Python ${{ matrix.python-version }} 37 | uses: pdm-project/setup-pdm@v4 38 | if: matrix.os == 'windows-latest' 39 | with: 40 | python-version: ${{ matrix.python-version }} 41 | - name: Create venv 42 | run: uv v -p ${{ matrix.python-version }} || pdm venv create --force 43 | - name: Install dependencies 44 | run: pdm export --pyproject | uv pip install -r - .[playground] 45 | - name: Run tests 46 | run: | 47 | pdm test 48 | - name: Run static type checking 49 | run: | 50 | npm i -g pyright 51 | pdm run pyright ./src/ 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | dist/ 3 | 4 | pdm.lock 5 | .pdm.toml 6 | .pdm-python 7 | .pdm-build/ 8 | .hypothesis 9 | .venv 10 | .vscode 11 | .idea -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Promplate 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 
a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Partial JSON Parser 2 | 3 | Sometimes we need **LLM (Large Language Models)** to produce **structural information** instead of natural language. The easiest way is to use JSON. 4 | 5 | But before receiving the last token of response, the JSON is broken, which means you can't use `JSON.parse` to decode it. But we still want to stream the data to the user. 6 | 7 | Here comes `partial-json-parser`, a lightweight and customizable library for parsing partial JSON strings. Here is a [demo](https://promplate.dev/partial-json-parser). 
8 | 9 | (Note that there is [a JavaScript implementation](https://github.com/promplate/partial-json-parser-js) too) 10 | 11 | ## Installation 12 | 13 | ```sh 14 | pip install partial-json-parser # or poetry / pdm / uv 15 | ``` 16 | 17 | `partial-json-parser` is implemented purely in Python, with good type hints. It is zero-dependency and works with Python 3.6+. 18 | 19 | You can run its demo playground by also installing `rich`, or by installing the `playground` extra: 20 | 21 | ```sh 22 | pip install partial-json-parser[playground] 23 | ``` 24 | 25 | Then run the `json-playground` in your terminal, and you can try the parser interactively. 26 | 27 | ## Usage 28 | 29 | ```py 30 | from partial_json_parser import loads 31 | 32 | >>> loads('{"key": "v') # {'key': 'v'} 33 | ``` 34 | 35 | Alternatively, you can use `ensure_json` to get the completed JSON string: 36 | 37 | ```py 38 | from partial_json_parser import ensure_json 39 | 40 | >>> ensure_json('{"key": "v') # '{"key": "v"}' 41 | ``` 42 | 43 | ### Detailed Usage 44 | 45 | You can import the `loads` function and the `Allow` object from the library like this: 46 | 47 | ```py 48 | from partial_json_parser import loads, Allow 49 | ``` 50 | 51 | The `Allow` object is just an Enum for options. It determines what types can be partial. Types not included in `allow` only appear after their completion can be ensured. 52 | 53 | ### Parsing complete / partial JSON strings 54 | 55 | The `loads` function works just like the built-in `json.loads` when parsing a complete JSON string: 56 | 57 | ```py 58 | result = loads('{"key":"value"}') 59 | print(result) # Outputs: {'key': 'value'} 60 | ``` 61 | 62 | You can parse a partial JSON string by passing an additional parameter to the `loads` function. 
This parameter is a **bitwise OR** of the constants from the `Allow` flag: 63 | 64 | (Note that you can directly import the constants you need from `partial_json_parser`) 65 | 66 | ```py 67 | from partial_json_parser import loads, Allow, STR, OBJ 68 | 69 | result = loads('{"key": "v', STR | OBJ) 70 | print(result) # Outputs: {'key': 'v'} 71 | ``` 72 | 73 | In this example, `Allow.STR` tells the parser that it's okay if a string is incomplete, and `Allow.OBJ` tells the parser that it's okay if an object (dict) is incomplete. The parser then tries to return as much data as it can. 74 | 75 | If you don't allow partial strings, then it will not add `"key"` to the object because `"v` is not closed: 76 | 77 | ```py 78 | result = loads('{"key": "v', OBJ) 79 | print(result) # Outputs: {} 80 | 81 | result = loads('{"key": "value"', OBJ) 82 | print(result) # Outputs: {'key': 'value'} 83 | ``` 84 | 85 | Similarly, you can parse partial lists or even partial special values if you allow them: 86 | 87 | (Note that `allow` defaults to `Allow.ALL`) 88 | 89 | ```py 90 | result = loads('[ {"key1": "value1", "key2": [ "value2') 91 | print(result) # Outputs: [{'key1': 'value1', 'key2': ['value2']}] 92 | 93 | result = loads("-Inf") 94 | print(result) # Outputs: -inf 95 | ``` 96 | 97 | ### Handling malformed JSON 98 | 99 | If the JSON string is malformed, the `loads` function will raise an error: 100 | 101 | ```py 102 | loads("wrong") # MalformedJSON: Unexpected character w 103 | ``` 104 | 105 | ## API Reference 106 | 107 | ### loads(json_string, [allow_partial], [parser]) 108 | 109 | - `json_string` `str`: The (incomplete) JSON string to parse. 110 | - `allow_partial` `Allow | int`: Specify what kind of partialness is allowed during JSON parsing (default: `Allow.ALL`). 111 | - `parser` `(str) -> JSON`: An ordinary JSON parser. Default is `json.loads`. 112 | 113 | Complete the JSON string and parse it with the `parser` function. 114 | 115 | Returns the parsed Python value. 116 | 117 | Alias: `decode`, `parse_json`. 
118 | 119 | ### ensure_json(json_string, [allow_partial]) 120 | 121 | - `json_string` `str`: The (incomplete) JSON string to complete. 122 | - `allow_partial` `Allow | int`: Specify what kind of partialness is allowed during JSON parsing (default: `Allow.ALL`). 123 | 124 | Returns the completed JSON string. 125 | 126 | ### fix(json_string, [allow_partial]) 127 | 128 | - `json_string` `str`: The (incomplete) JSON string to complete. 129 | - `allow_partial` `Allow | int`: Specify what kind of partialness is allowed during JSON parsing (default: `Allow.ALL`). 130 | 131 | Returns a tuple of a slice of the input string and the completion. 132 | 133 | Note that this is a low-level API, only useful for debugging and demonstration. 134 | 135 | ### Allow 136 | 137 | Enum class that specifies what kind of partialness is allowed during JSON parsing. It has the following members: 138 | 139 | - `STR`: Allow partial string. 140 | - `NUM`: Allow partial number. 141 | - `ARR`: Allow partial array. 142 | - `OBJ`: Allow partial object. 143 | - `NULL`: Allow partial null. 144 | - `BOOL`: Allow partial boolean. 145 | - `NAN`: Allow partial NaN. 146 | - `INFINITY`: Allow partial Infinity. 147 | - `_INFINITY`: Allow partial -Infinity. 148 | - `INF`: Allow both partial Infinity and -Infinity. 149 | - `SPECIAL`: Allow all special values. 150 | - `ATOM`: Allow all atomic values. 151 | - `COLLECTION`: Allow all collection values. 152 | - `ALL`: Allow all values. 
153 | 154 | ## Testing 155 | 156 | To run the tests for this library, you should clone the repository and install the dependencies: 157 | 158 | ```sh 159 | git clone https://github.com/promplate/partial-json-parser.git 160 | cd partial-json-parser 161 | pdm install 162 | ``` 163 | 164 | Then, you can run the tests using [Hypothesis](https://hypothesis.works/) and [Pytest](https://pytest.org/): 165 | 166 | ```sh 167 | pdm test 168 | ``` 169 | 170 | Please note that while we strive to cover as many edge cases as possible, it's always possible that some cases might not be covered. 171 | 172 | ## License 173 | 174 | This project is licensed under the MIT License. 175 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "partial-json-parser" 3 | dynamic = ["version"] 4 | description = "Parse partial JSON generated by LLM" 5 | authors = [{ name = "Muspi Merol", email = "me@promplate.dev" }] 6 | optional-dependencies = { playground = ["rich"] } 7 | requires-python = ">=3.7" # 3.6 in production indeed 8 | readme = "README.md" 9 | license = { text = "MIT" } 10 | scripts = { json-playground = "partial_json_parser.playground:main" } 11 | keywords = ["JSON", "parser", "LLM", "nlp"] 12 | classifiers = [ 13 | "Development Status :: 5 - Production/Stable", 14 | "Intended Audience :: Developers", 15 | ] 16 | 17 | [project.urls] 18 | repository = "https://github.com/promplate/partial-json-parser" 19 | homepage = "https://promplate.dev/partial-json-parser" 20 | 21 | [build-system] 22 | requires = ["pdm-backend"] 23 | build-backend = "pdm.backend" 24 | 25 | [tool.pdm.build] 26 | excludes = ["tests"] 27 | 28 | [tool.pdm.dev-dependencies] 29 | dev = ["hypothesis", "tqdm", "pytest"] 30 | 31 | [tool.pdm.scripts] 32 | test-performance = { call = "tests.test_performance:main" } 33 | test-hypotheses = { call = "tests.test_hypotheses:main" } 
34 | test-examples = "pytest tests/test_examples.py" 35 | test = { composite = ["test-examples", "test-hypotheses", "test-performance"] } 36 | format = { composite = ["isort ./{args}", "black ./{args}"] } 37 | playground = { call = "partial_json_parser.playground:main" } 38 | pre_build = { composite = ["format", "python src/overrides.py 3.6"] } 39 | post_build = "python src/overrides.py 3.8" 40 | 41 | [tool.pdm.version] 42 | source = "file" 43 | path = "src/partial_json_parser/version.py" 44 | 45 | [tool.black] 46 | line-length = 160 47 | 48 | [tool.isort] 49 | profile = "black" 50 | 51 | [tool.pyright] 52 | reportPossiblyUnboundVariable = false 53 | -------------------------------------------------------------------------------- /src/overrides.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from re import sub 3 | from sys import argv 4 | 5 | file = Path(__file__).parent.parent / "pyproject.toml" 6 | 7 | file.write_text(sub(r'requires-python = ".*?"', f'requires-python = ">={argv[-1]}"', file.read_text())) 8 | -------------------------------------------------------------------------------- /src/partial_json_parser/__init__.py: -------------------------------------------------------------------------------- 1 | from .core.api import JSON, ensure_json, parse_json 2 | from .core.complete import fix 3 | from .core.exceptions import * 4 | from .core.myelin import fix_fast 5 | from .core.options import * 6 | 7 | loads = decode = parse_json 8 | -------------------------------------------------------------------------------- /src/partial_json_parser/core/api.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Dict, List, Optional, Union 2 | 3 | from .complete import fix 4 | from .myelin import fix_fast 5 | from .options import * 6 | 7 | Number = Union[int, float] 8 | JSON = Union[str, bool, Number, List["JSON"], Dict[str, "JSON"], None] 9 
# (continued) /src/partial_json_parser/core/api.py


def parse_json(json_string: str, allow_partial: Union[Allow, int] = ALL, parser: Optional[Callable[[str], JSON]] = None, use_fast_fix=True) -> JSON:
    """Complete a possibly partial JSON string and parse the result.

    The string is first completed by `ensure_json` and then handed to `parser`,
    which defaults to `json.loads` when None is given.
    """
    if parser is None:
        from json import loads as parser

    return parser(ensure_json(json_string, allow_partial, use_fast_fix))


def ensure_json(json_string: str, allow_partial: Union[Allow, int] = ALL, use_fast_fix=True) -> str:
    """get the completed JSON string

    `use_fast_fix` selects `fix_fast` (the default) over the plain `fix`.
    """

    if use_fast_fix:
        head, tail = fix_fast(json_string, allow_partial)
    else:
        head, tail = fix(json_string, allow_partial)

    return head + tail


# --------------------------------------------------------------------------------
# /src/partial_json_parser/core/complete.py:
# --------------------------------------------------------------------------------
from typing import TYPE_CHECKING, Tuple, Union

from .exceptions import MalformedJSON, PartialJSON
from .options import *

if TYPE_CHECKING:
    from typing import Literal

CompleteResult = Union[Tuple[int, Union[str, "Literal[True]"]], "Literal[False]"]  # (length, complete_string / already completed) / partial


def fix(json_string: str, allow_partial: Union[Allow, int] = ALL):
    """get the original slice and the trailing suffix separately"""

    return _fix(json_string, Allow(allow_partial), True)


def _fix(json_string: str, allow: Allow, is_top_level=False):
    """Return `(head, tail)` where `head` is a prefix of `json_string` and `tail` the completion.

    Raises:
        PartialJSON: the value is incomplete and its kind is not enabled in `allow`.
        MalformedJSON: the completers hit an assertion or ran out of input unexpectedly.
    """
    stripped = json_string.strip()
    # The completers report indices relative to `stripped`; remember how much
    # leading whitespace was dropped so the slice below is taken at the right
    # position of the *original* string (otherwise `' 123 '` would yield `' 12'`).
    offset = len(json_string) - len(json_string.lstrip())

    try:
        result = complete_any(stripped, allow, is_top_level)
        if result is False:
            raise PartialJSON

        index, completion = result
        return json_string[: offset + index], ("" if completion is True else completion)

    except (AssertionError, IndexError) as err:
        raise MalformedJSON(*err.args) from err


def skip_blank(text: str, index: int):
    """Return the first position at or after `index` that is not whitespace.

    Running off the end of `text` is not an error: the out-of-range index is
    returned and callers rely on the subsequent `IndexError` when they use it.
    """
    try:
        while text[index].isspace():
            index += 1
    except IndexError:
        # End of text reached. Only this exception is expected here; anything
        # else must propagate instead of being swallowed by a `finally: return`.
        pass
    return index


def complete_any(json_string: str, allow: Allow, is_top_level=False) -> CompleteResult:
    """Dispatch on the first character and complete whatever JSON value starts there.

    Returns `(length, True)` for an already complete value, `(length, suffix)` when
    the value is partial but allowed (keep `length` characters, append `suffix`),
    or `False` when it is partial and its kind is not enabled in `allow`.
    """
    i = skip_blank(json_string, 0)
    char = json_string[i]

    if char == '"':
        return complete_str(json_string, allow)

    if char in "1234567890":
        return complete_num(json_string, allow, is_top_level)

    if char == "[":
        return complete_arr(json_string, allow)

    if char == "{":
        return complete_obj(json_string, allow)

    if json_string.startswith("null"):
        return (4, True)
    if "null".startswith(json_string):
        return (0, "null") if NULL in allow else False

    if json_string.startswith("true"):
        return (4, True)
    if "true".startswith(json_string):
        return (0, "true") if BOOL in allow else False

    if json_string.startswith("false"):
        return (5, True)
    if "false".startswith(json_string):
        return (0, "false") if BOOL in allow else False

    if json_string.startswith("Infinity"):
        return (8, True)
    if "Infinity".startswith(json_string):
        return (0, "Infinity") if INFINITY in allow else False

    if char == "-":
        if len(json_string) == 1:
            return False
        elif json_string[1] != "I":
            return complete_num(json_string, allow, is_top_level)

    if json_string.startswith("-Infinity"):
        return (9, True)
    if "-Infinity".startswith(json_string):
        return (0, "-Infinity") if _INFINITY in allow else False

    if json_string.startswith("NaN"):
        return (3, True)
    if "NaN".startswith(json_string):
        return (0, "NaN") if NAN in allow else False

    raise MalformedJSON(f"Unexpected character {char}")


def complete_str(json_string: str, allow: Allow) -> CompleteResult:
    """Complete a string literal that starts at index 0 of `json_string`."""
    assert json_string[0] == '"'

    length = len(json_string)

    i = 1

    try:
        while True:
            char = json_string[i]

            if char == "\\":
                if i + 1 == length:  # a dangling backslash cannot be kept
                    raise IndexError
                i += 2
                continue
            if char == '"':
                return i + 1, True

            i += 1

    except IndexError:
        # The closing quote was never found: the string is partial.
        if STR not in allow:
            return False

        def not_escaped(index: int):
            # an even number of backslashes right before `index` means the
            # character at `index` starts a fresh escape sequence
            text_before = json_string[:index]
            count = index - len(text_before.rstrip("\\"))
            return count % 2 == 0

        # Cut before a trailing escape sequence that may still be incomplete.

        # \uXXXX
        _u = json_string.rfind("\\u", max(0, i - 5), i)
        if _u != -1 and not_escaped(_u):
            return _u, '"'

        # \UXXXXXXXX
        _U = json_string.rfind("\\U", max(0, i - 9), i)
        if _U != -1 and not_escaped(_U):
            return _U, '"'

        # \xXX
        _x = json_string.rfind("\\x", max(0, i - 3), i)
        if _x != -1 and not_escaped(_x):
            return _x, '"'

        return i, '"'


def complete_arr(json_string: str, allow: Allow) -> CompleteResult:
    """Complete an array that starts at index 0 of `json_string`.

    `i` tracks the end of the last fully complete element, `j` the scan position.
    """
    assert json_string[0] == "["
    i = j = 1

    try:
        while True:
            j = skip_blank(json_string, j)

            if json_string[j] == "]":
                return j + 1, True

            result = complete_any(json_string[j:], allow)

            if result is False:  # incomplete
                return (i, "]") if ARR in allow else False
            if result[1] is True:  # complete
                i = j = j + result[0]
            else:  # incomplete
                return (j + result[0], result[1] + "]") if ARR in allow else False

            j = skip_blank(json_string, j)

            if json_string[j] == ",":
                j += 1
            elif json_string[j] == "]":
                return j + 1, True
            else:
                raise MalformedJSON(f"Expected ',' or ']', got {json_string[j]}")
    except IndexError:
        # ran out of input: close the array right after the last complete element
        return (i, "]") if ARR in allow else False


def complete_obj(json_string: str, allow: Allow) -> CompleteResult:
    """Complete an object that starts at index 0 of `json_string`.

    `i` tracks the end of the last fully complete key/value pair, `j` the scan position.
    """
    assert json_string[0] == "{"
    i = j = 1

    try:
        while True:
            j = skip_blank(json_string, j)

            if json_string[j] == "}":
                return j + 1, True

            # keys must be complete strings; a partial key drops the whole pair
            result = complete_str(json_string[j:], allow)
            if result and result[1] is True:  # complete
                j += result[0]
            else:  # incomplete
                return (i, "}") if OBJ in allow else False

            j = skip_blank(json_string, j)

            if json_string[j] != ":":
                raise MalformedJSON(f"Expected ':', got {json_string[j]}")
            j += 1

            j = skip_blank(json_string, j)

            result = complete_any(json_string[j:], allow)
            if result is False:  # incomplete
                return (i, "}") if OBJ in allow else False
            if result[1] is True:  # complete
                i = j = j + result[0]
            else:  # incomplete
                return (j + result[0], result[1] + "}") if OBJ in allow else False

            j = skip_blank(json_string, j)

            if json_string[j] == ",":
                j += 1
            elif json_string[j] == "}":
                return j + 1, True
            else:
                raise MalformedJSON(f"Expected ',' or '}}', got {json_string[j]}")
    except IndexError:
        # ran out of input: close the object right after the last complete pair
        return (i, "}") if OBJ in allow else False


def complete_num(json_string: str, allow: Allow, is_top_level=False) -> CompleteResult:
    """Complete a number that starts at index 0 of `json_string`.

    A number that had trailing `.-+eE` characters trimmed, or that runs to the
    very end of a non-top-level input, is reported as partial.
    """
    i = 1
    length = len(json_string)

    # forward
    while i < length and json_string[i] in "1234567890.-+eE":
        i += 1

    modified = False

    # backward
    # (input made only of these characters eventually raises IndexError,
    # which `_fix` turns into MalformedJSON)
    while json_string[i - 1] in ".-+eE":
        modified = True
        i -= 1

    if modified or i == length and not is_top_level:
        return (i, "") if NUM in allow else False
    else:
        return i, True


# --------------------------------------------------------------------------------
# /src/partial_json_parser/core/exceptions.py:
# --------------------------------------------------------------------------------
class JSONDecodeError(ValueError):
    """Base class of the errors raised by this package."""

    pass


class PartialJSON(JSONDecodeError):
    """The input is incomplete in a way the `allow` flags do not permit."""

    pass


class MalformedJSON(JSONDecodeError):
    """The input cannot be a prefix of valid JSON."""

    pass


# --------------------------------------------------------------------------------
# /src/partial_json_parser/core/myelin.py:
# --------------------------------------------------------------------------------
"""Myelin acts as the highway among neurons, epitomizing the leapfrog methodology within this algorithm."""

from re import compile
from typing import List, Tuple, Union

from .complete import _fix
from .exceptions import PartialJSON
from .options import *

# matches every double quote and every square / curly bracket
finditer = compile(r'["\[\]{}]').finditer


def scan(json_string: str):
    """Return `(index, char)` for every quote or bracket character in `json_string`."""
    return [(match.start(), match.group()) for match in finditer(json_string)]


def join_closing_tokens(stack: List[Tuple[int, str]]):
    """Return the closing brackets for the open containers in `stack`, innermost first."""
    return "".join("}" if char == "{" else "]" for _, char in reversed(stack))


def fix_fast(json_string: str, allow_partial: Union[Allow, int] = ALL):
    """Return `(head, tail)` such that `head + tail` is the completed JSON string.

    Only quotes and brackets are scanned to find the containers that are still
    open; `_fix` is then called on the trailing fragment instead of the whole input.

    Raises `PartialJSON` when the outermost open container is of a kind not
    enabled in `allow_partial`, plus whatever `_fix` raises.

    NOTE(review): the bracket-matching `assert`s and `stack.pop()` below are not
    wrapped here, so mismatched or unopened closers appear to surface as
    AssertionError / IndexError rather than MalformedJSON - confirm.
    """
    allow = Allow(allow_partial)

    def is_escaped(index: int):
        # odd number of backslashes right before `index` -> truthy
        text_before = json_string[:index]
        count = index - len(text_before.rstrip("\\"))
        return count % 2

    stack = []  # (index, "{" or "[") of every container still open
    in_string = False
    last_string_start = -1
    last_string_end = -1

    tokens = scan(json_string)

    # no containers at all, or a top-level string: nothing to leapfrog over
    if not tokens or tokens[0][1] == '"':
        return _fix(json_string, allow, True)

    for i, char in tokens:
        if char == '"':
            if not in_string:
                in_string = True
                last_string_start = i
            elif not is_escaped(i):
                in_string = False
                last_string_end = i

        elif not in_string:
            if char == "}":
                _i, _char = stack.pop()
                assert _char == "{", f"Expected '{{' at index {_i}, got '{_char}'"
            elif char == "]":
                _i, _char = stack.pop()
                assert _char == "[", f"Expected '[' at index {_i}, got '{_char}'"
            else:
                stack.append((i, char))

    # every container is closed: the input is returned unchanged
    if not stack:
        return json_string, ""

    # check if the opening tokens are allowed

    if (STR | COLLECTION) not in allow:

        def truncate_before_last_key_start(container_start: int, last_string_end: int, stack):
            last_key_start = last_string_end  # backtrace the last key's start and retry finding the last comma
            while True:
                last_key_start = json_string.rfind('"', container_start, last_key_start)
                if last_key_start == -1:  # this is the only key
                    # { "key": "v
                    return json_string[: container_start + 1], join_closing_tokens(stack)
                if is_escaped(last_key_start):
                    last_key_start -= 1
                else:
                    last_comma = json_string.rfind(",", container_start, last_key_start)
                    if last_comma == -1:
                        # { "key": "
                        return json_string[: container_start + 1], join_closing_tokens(stack)
                    # # { ... "key": ... , "
                    return json_string[:last_comma], join_closing_tokens(stack)

        if COLLECTION not in allow:
            for index, [_i, _char] in enumerate(stack):
                if _char == "{" and OBJ not in allow or _char == "[" and ARR not in allow:
                    if index == 0:
                        raise PartialJSON

                    # to truncate before the last container token and the last comma (if exists) of its parent container

                    # reset `last_string_end` to before `_i`
                    # NOTE(review): as nested here the inner condition contradicts the
                    # enclosing one, so only the `else` branch can run - confirm intent.
                    if _i < last_string_start:
                        if last_string_start < _i:  # ... { "k
                            last_string_end = json_string.rfind('"', last_string_end, _i)
                        else:  # ... { "" ...
                            last_string_end = json_string.rfind('"', None, _i)

                    last_comma = json_string.rfind(",", max(stack[index - 1][0], last_string_end) + 1, _i)

                    if last_comma == -1:
                        if stack[index - 1][1] == "[":
                            # [ ... [
                            return json_string[:_i], join_closing_tokens(stack[:index])

                        # { "key": [ 1, 2, "v
                        # { "key": [ 1, 2, "value"
                        if last_string_start > last_string_end:
                            return truncate_before_last_key_start(stack[index - 1][0], last_string_end, stack[:index])

                        last_comma = json_string.rfind(",", stack[index - 1][0] + 1, last_string_start)
                        if last_comma == -1:
                            return json_string[: stack[index - 1][0] + 1], join_closing_tokens(stack[:index])
                        return json_string[:last_comma], join_closing_tokens(stack[:index])

                    # { ..., "key": {
                    # ..., {
                    return json_string[:last_comma], join_closing_tokens(stack[:index])

        if STR not in allow and in_string:  # truncate before the last key
            if stack[-1][0] > last_string_end and stack[-1][1] == "{":
                # { "k
                return json_string[: stack[-1][0] + 1], join_closing_tokens(stack)

            last_comma = json_string.rfind(",", max(stack[-1][0], last_string_end) + 1, last_string_start - 1)
            if last_comma != -1:
                # { "key": "v", "k
                # { "key": 123, "k
                # [ 1, 2, 3, "k
                return json_string[:last_comma], join_closing_tokens(stack)

            # { ... "key": "v
            return truncate_before_last_key_start(stack[-1][0], last_string_end, stack)

    # only fix the rest of the container in O(1) time complexity

    assert in_string == (last_string_end < last_string_start)

    if in_string:
        if stack[-1][1] == "[":  # [ ... "val
            head, tail = _fix(json_string[last_string_start:], allow)  # fix the last string
            return json_string[:last_string_start] + head, tail + join_closing_tokens(stack)

        assert stack[-1][1] == "{"  # { ... "val

        start = max(last_string_end, stack[-1][0])

        if "," in json_string[start + 1 : last_string_start]:
            # { ... "k": "v", "key
            # { ... "k": 123, "key
            last_comma = json_string.rindex(",", start, last_string_start)
            # re-parse only the last pair, wrapped in a synthetic "{"; `head[1:]` drops that brace again
            head, tail = _fix(stack[-1][1] + json_string[last_comma + 1 :], allow)
            return json_string[:last_comma] + head[1:], tail + join_closing_tokens(stack[:-1])

        if ":" in json_string[start + 1 : last_string_start]:
            # { ... ": "val
            head, tail = _fix(json_string[last_string_start:], allow)  # fix the last string (same as array)
            return json_string[:last_string_start] + head, tail + join_closing_tokens(stack)

        # {"key
        return json_string[:last_string_start], join_closing_tokens(stack)

    # from here on `i` / `char` still hold the last token seen by the loop above
    last_comma = json_string.rfind(",", max(last_string_end, i) + 1)

    if last_comma != -1:
        i, char = stack[-1]

        if not json_string[last_comma + 1 :].strip():  # comma at the end
            # { ... "key": "value",
            return json_string[:last_comma], join_closing_tokens(stack)

        assert char == "[", json_string  # array with many non-string literals

        # [ ..., 1, 2, 3, 4

        head, tail = _fix(char + json_string[last_comma + 1 :], allow)
        if not head[1:] + tail[:-1].strip():  # empty, so trim the last comma
            return json_string[:last_comma] + head[1:], tail + join_closing_tokens(stack[:-1])
        return json_string[: last_comma + 1] + head[1:], tail + join_closing_tokens(stack[:-1])

    # can't find comma after the last string and after the last container token

    if char in "]}":
        # ... [ ... ]
        # ... { ... }
        assert not json_string[i + 1 :].strip()
        return json_string, join_closing_tokens(stack)

    if char in "[{":
        # ... [ ...
        # ... { ...
        head, tail = _fix(json_string[i:], allow)
        return json_string[:i] + head, tail + join_closing_tokens(stack[:-1])

    assert char == '"'

    i, char = stack[-1]

    if char == "[":  # [ ... "val"
        return json_string, join_closing_tokens(stack)

    assert char == "{"
    last_colon = json_string.rfind(":", last_string_end)
    last_comma = json_string.rfind(",", i + 1, last_string_start)

    if last_comma == -1:  # only 1 key
        # ... { "key"
        # ... { "key": "value"
        head, tail = _fix(json_string[i:], allow)
        return json_string[:i] + head, tail + join_closing_tokens(stack[:-1])

    if last_colon == -1:
        if json_string.rfind(":", max(i, last_comma) + 1, last_string_start) != -1:
            # { ... , "key": "value"
            return json_string, join_closing_tokens(stack)

        # { ... , "key"
        head, tail = _fix("{" + json_string[last_comma + 1 :], allow)
        if not head[1:] + tail[:-1].strip():
            return json_string[:last_comma] + head[1:], tail + join_closing_tokens(stack[:-1])
        # NOTE(review): every sibling return that appends `tail` from a synthetic
        # container closes `stack[:-1]`; this one closes `stack` - confirm.
        return json_string[: last_comma + 1] + head[1:], tail + join_closing_tokens(stack)

    assert last_colon > last_comma  # { ... , "key":

    head, tail = _fix("{" + json_string[last_comma + 1 :], allow)
    if not head[1:] + tail[:-1].strip():
        return json_string[:last_comma] + head[1:], tail + join_closing_tokens(stack[:-1])
    return json_string[: last_comma + 1] + head[1:], tail + join_closing_tokens(stack[:-1])


# --------------------------------------------------------------------------------
# /src/partial_json_parser/core/options.py:
# --------------------------------------------------------------------------------
from enum import IntFlag, auto


class Allow(IntFlag):
    """Specify what kind of partialness is allowed during JSON parsing"""

    # one bit per kind of value that may be left partial
    STR = auto()
    NUM = auto()
    ARR = auto()
    OBJ = auto()
    NULL = auto()
    BOOL = auto()
    NAN = auto()
    INFINITY = auto()
    _INFINITY = auto()

    # combinations of the bits above
    INF = INFINITY | _INFINITY
    SPECIAL = NULL | BOOL | INF | NAN
    ATOM = STR | NUM | SPECIAL
    COLLECTION = ARR | OBJ
    ALL = ATOM | COLLECTION


# module-level aliases of the members of `Allow`
STR = Allow.STR
NUM = Allow.NUM
ARR = Allow.ARR
OBJ = Allow.OBJ
NULL = Allow.NULL
BOOL = Allow.BOOL
NAN = Allow.NAN
INFINITY = Allow.INFINITY
_INFINITY = Allow._INFINITY
INF = Allow.INF
SPECIAL = Allow.SPECIAL
ATOM = Allow.ATOM
COLLECTION = Allow.COLLECTION
ALL = Allow.ALL


# `_INFINITY` is listed explicitly so that `import *` exports it despite the underscore
__all__ = [
    "Allow",
    "STR",
    "NUM",
    "ARR",
    "OBJ",
    "NULL",
    "BOOL",
    "NAN",
    "INFINITY",
    "_INFINITY",
    "INF",
    "SPECIAL",
    "ATOM",
    "COLLECTION",
    "ALL",
]

# --------------------------------------------------------------------------------
# /src/partial_json_parser/options.py:
# --------------------------------------------------------------------------------
"""For backward compatibility."""

from .core.options import *

# --------------------------------------------------------------------------------
# /src/partial_json_parser/playground.py:
# --------------------------------------------------------------------------------
from rich.console import Console
from rich.highlighter import JSONHighlighter
from rich.style import Style
from rich.text import Span

from partial_json_parser import fix_fast

console = Console()
highlight = JSONHighlighter()


def main():
    """Run an interactive loop that completes partial JSON typed at the prompt.

    Each input line is passed to ``fix_fast``; the returned head and tail are
    joined, JSON-highlighted, and printed with the tail rendered dim.

    The loop ends on ``KeyboardInterrupt`` or ``EOFError``. Any other
    exception is reported (class name in bold red, arguments in yellow) and
    the prompt is shown again.
    """
    while True:
        try:
            input_str = console.input("[d]>>> ")

            head, tail = fix_fast(input_str)
            fixed = head + tail

            rich_text = highlight(fixed)
            if tail:
                # Dim exactly the span that was appended to complete the input.
                rich_text.spans.append(Span(len(head), len(fixed), Style(dim=True)))

            console.print(" " * 3, rich_text)

        except (KeyboardInterrupt, EOFError):
            # EOFError must terminate the loop as well: once stdin is closed,
            # every further read raises it again, and reporting it like an
            # ordinary error would spin forever.
            return
        except Exception as err:
            # Top-level REPL boundary: report the failure and keep prompting.
            console.print(f"{err.__class__.__name__}:", style="bold red", highlight=False, end=" ")
            console.print(" ".join(map(str, err.args)), style="yellow", highlight=False)


# --------------------------------------------------------------------------------
# /src/partial_json_parser/version.py:
# --------------------------------------------------------------------------------
__version__ = "0.2.1.1.post5"


# --------------------------------------------------------------------------------
# /tests/test_examples.py:
# --------------------------------------------------------------------------------
from itertools import accumulate
from json import dumps
from math import isnan

from hypothesis import given, settings
from hypothesis.strategies import integers
from pytest import raises
from test_hypotheses import json

from partial_json_parser import *
from partial_json_parser.core.options import *


def test_str():
    """An unterminated string parses only when STR is allowed; escapes survive."""
    assert parse_json('"', STR) == ""
    with raises(PartialJSON):
        parse_json('"', ~STR)

    assert parse_json(r'"\\') == "\\"
    assert parse_json(r'"\\u') == "\\u"
    assert parse_json(r'"\\U\\u') == "\\U\\u"


def test_arr():
    """A partial array needs ARR; its partial string element also needs STR."""
    assert parse_json('["', ARR) == []
    assert parse_json('["', ARR | STR) == [""]

    with raises(PartialJSON):
        parse_json("[", STR)
    with raises(PartialJSON):
        parse_json('["', STR)
    with raises(PartialJSON):
        parse_json('[""', STR)
    with raises(PartialJSON):
        parse_json('["",', STR)


def test_obj():
    """A partial object needs OBJ; its partial string value also needs STR."""
    assert parse_json('{"": "', OBJ) == {}
    assert parse_json('{"": "', OBJ | STR) == {"": ""}

    with raises(PartialJSON):
        parse_json("{", STR)
    with raises(PartialJSON):
        parse_json('{"', STR)
    with raises(PartialJSON):
        parse_json('{""', STR)
    with raises(PartialJSON):
        parse_json('{"":', STR)
    with raises(PartialJSON):
        parse_json('{"":"', STR)
    with raises(PartialJSON):
        parse_json('{"":""', STR)


def test_singletons():
    """One-character prefixes of the special literals honour their own flags."""
    assert parse_json("n", NULL) is None
    with raises(PartialJSON):
        parse_json("n", ~NULL)

    assert parse_json("t", BOOL) is True
    with raises(PartialJSON):
        parse_json("t", ~BOOL)

    assert parse_json("f", BOOL) is False
    with raises(PartialJSON):
        parse_json("f", ~BOOL)

    assert parse_json("I", INF) == float("inf")
    with raises(PartialJSON):
        parse_json("I", ~INFINITY)

    assert parse_json("-I", INF) == float("-inf")
    with raises(PartialJSON):
        parse_json("-I", ~_INFINITY)

    assert isnan(parse_json("N", NAN))  # type: ignore
    with raises(PartialJSON):
        parse_json("N", ~NAN)


def test_num():
    """Complete numbers parse without NUM; a dangling exponent is dropped with NUM."""
    assert parse_json("0", ~NUM) == 0
    assert parse_json("-1.25e+4", ~NUM) == -1.25e4
    assert parse_json("-1.25e+", NUM) == -1.25
    assert parse_json("-1.25e", NUM) == -1.25


def test_error():
    """Input that cannot be a prefix of valid JSON raises MalformedJSON."""
    with raises(MalformedJSON):
        parse_json("a")
    with raises(MalformedJSON):
        parse_json("{0")
    with raises(MalformedJSON):
        parse_json("--")
def test_fix():
    """``fix`` and ``fix_fast`` return the same (head, tail) pair on fixed samples."""
    assert fix("[") == fix_fast("[") == ("[", "]")
    assert fix("[0.") == fix_fast("[0.") == ("[0", "]")
    assert fix('{"key": ') == fix_fast('{"key": ') == ("{", "}")
    assert fix("t") == fix_fast("t") == ("", "true")
    assert fix("[1", ~NUM) == fix_fast("[1", ~NUM) == ("[", "]")
    assert fix("1", ~NUM) == fix_fast("1", ~NUM) == ("1", "")
    with raises(PartialJSON):
        fix("-")


def consistent(json_string, allow):
    """Check that ``fix`` and ``fix_fast`` agree on ``json_string`` under ``allow``.

    Returns whether both calls produce equal results. When ``fix`` raises
    ``PartialJSON``, ``fix_fast`` must raise ``PartialJSON`` with a message
    containing the same text, in which case ``True`` is returned.

    A ``PartialJSON`` raised by ``fix_fast`` alone is not swallowed: it
    propagates so the disagreement shows up as a test failure.
    """
    # Function-scope import: keeps this helper self-contained without touching
    # the module's import block.
    from re import escape

    # Only the reference call lives inside ``try``. Guarding the ``fix_fast``
    # call as well would let "fix succeeds, fix_fast raises" pass silently.
    try:
        expected = fix(json_string, allow)
    except PartialJSON as err:
        # ``match`` is applied as a regular expression, so the message has to
        # be escaped to be compared as literal text.
        with raises(PartialJSON, match=escape(str(err))):
            fix_fast(json_string, allow)
        return True

    return expected == fix_fast(json_string, allow)


def test_consistency():
    """Compare both fixers on every prefix of two sample documents, for every flag value."""
    dict_example = {"key1": 123, "key2": "value"}
    list_example = [1, 2, None, float("inf"), float("-inf"), float("nan"), True, False, "string", dict_example]

    dict_json = dumps(dict_example)
    list_json = dumps(list_example)

    # accumulate() over a string yields its successive prefixes.
    for json_string in (*accumulate(dict_json), *accumulate(list_json)):
        for allow in range(ALL + 1):
            assert consistent(json_string, allow), f"{Allow(allow)!r} - {json_string}"


@settings(deadline=None)
@given(json.map(dumps), integers(0, ALL).map(Allow))
def test_consistencies(json_string, allow):
    """Property test: both fixers agree on every prefix of generated JSON."""
    for prefix in accumulate(json_string):
        assert consistent(prefix, allow), f"{Allow(allow)!r} - {prefix}"


# --------------------------------------------------------------------------------
# /tests/test_hypotheses.py:
# --------------------------------------------------------------------------------
from json import dumps

from hypothesis import given, settings
from hypothesis import strategies as st
from tqdm import tqdm

from partial_json_parser.core.api import parse_json

# Strategy for arbitrary JSON-like values: scalars, recursively nested in
# lists and string-keyed dictionaries.
json = st.recursive(
    st.none() | st.booleans() | st.floats() | st.text(),
    lambda children: st.lists(children) | st.dictionaries(st.text(), children),
)


bar = tqdm(ascii=True, ncols=200, leave=False)
FINE_JSON_EXAMPLES = 333
PARTIAL_JSON_EXAMPLES = 333


@settings(max_examples=FINE_JSON_EXAMPLES, deadline=None)
@given(json)
def test_fine_json(anything):
    """Complete JSON round-trips through ``parse_json`` (compared via ``str``)."""
    assert str(anything) == str(parse_json(dumps(anything, ensure_ascii=False)))
    bar.update()


@settings(max_examples=PARTIAL_JSON_EXAMPLES, deadline=None)
@given(json)
def test_partial_json(anything):
    """``parse_json`` accepts about ten evenly spaced prefixes of a document."""
    json_string = dumps(anything, ensure_ascii=False)
    for i in range(1, len(json_string), max(1, len(json_string) // 10)):
        # Prefixes of documents that begin with "-" are skipped.
        if json_string.startswith("-", 0, i):
            continue
        parse_json(json_string[:i])
    bar.update()


def main():
    """Run both property tests in turn, reusing the shared progress bar."""
    bar.set_description(" Testing Partial JSON ", False)
    bar.clear()
    bar.reset(PARTIAL_JSON_EXAMPLES)
    test_partial_json()

    bar.set_description(" Testing F i n e JSON ", False)
    bar.clear()
    bar.reset(FINE_JSON_EXAMPLES)
    test_fine_json()

    bar.close()


# --------------------------------------------------------------------------------
# /tests/test_performance.py:
# --------------------------------------------------------------------------------
from functools import partial
from json import dumps
from timeit import timeit

from hypothesis import HealthCheck, given, settings
from hypothesis import strategies as st

from partial_json_parser import ALL, ARR, COLLECTION, OBJ, SPECIAL, STR, fix, fix_fast


def deep_json(depth: int):
    """Return a strategy for values wrapped in ``depth`` container levels.

    Every level is a list or a string-keyed dictionary holding 5 to 20 entries
    of the previous level; level zero is a scalar (None, bool, float or text).
    """
    strategy: st.SearchStrategy = st.none() | st.booleans() | st.floats() | st.text()
    for _ in range(depth):
        strategy = st.lists(strategy, min_size=5, max_size=20) | st.dictionaries(st.text(), strategy, min_size=5, max_size=20)
    return strategy


dumps = partial(dumps, ensure_ascii=False)


@settings(deadline=None, suppress_health_check={HealthCheck.data_too_large, HealthCheck.too_slow})
@given(deep_json(2).map(dumps))
def test_complete_json_faster(json_string: str):
    """Time ``fix`` against ``fix_fast`` on complete JSON and print the relative speed."""
    t1 = timeit(lambda: fix(json_string, 0), number=500) * 1000
    t2 = timeit(lambda: fix_fast(json_string, 0), number=500) * 1000

    # Speeds are the reciprocals of the measured times.
    v1 = 1 / t1
    v2 = 1 / t2
    if t1 > t2:
        print(f" {len(json_string):>10} chars - {(v2 - v1) / v1:>6.1%} faster")
    else:
        # Show the offending input whenever fix_fast is not faster.
        print()
        print(json_string)
        print()
        print(f" {len(json_string):>10} chars - {(v1 - v2) / v1:>6.1%} slower")


@settings(deadline=None, suppress_health_check={HealthCheck.data_too_large, HealthCheck.too_slow})
@given(deep_json(2).map(dumps).map(lambda s: s[: -len(s) // 2]), st.integers(0, 3).map([ALL, COLLECTION, ARR | STR, OBJ | SPECIAL].__getitem__))
def test_incomplete_json_faster(json_string: str, allow):
    """Time ``fix`` against ``fix_fast`` on truncated JSON under several flag sets."""
    # Skip inputs whose outermost container type is not permitted by ``allow``.
    if json_string.startswith("[") and ARR not in allow or json_string.startswith("{") and OBJ not in allow:
        return

    t1 = timeit(lambda: fix(json_string, allow), number=200) * 1000
    t2 = timeit(lambda: fix_fast(json_string, allow), number=200) * 1000

    # Speeds are the reciprocals of the measured times.
    v1 = 1 / t1
    v2 = 1 / t2
    if t1 > t2:
        print(f" {len(json_string):>10} chars - {(v2 - v1) / v1:>6.1%} faster : {allow!r}")
    else:
        # Show the offending input whenever fix_fast is not faster.
        print()
        print(json_string)
        print()
        print(f" {len(json_string):>10} chars - {(v1 - v2) / v1:>6.1%} slower : {allow!r}")


def main():
    """Run both timing comparisons, framed by blank lines."""
    print()
    test_incomplete_json_faster()
    test_complete_json_faster()
    print()


# --------------------------------------------------------------------------------