├── Makefile ├── streamingjson ├── __init__.py ├── lexer_helper.py ├── lexer_tokens.py └── lexer.py ├── examples ├── simple_json_stream │ └── main.py └── gpt_function_call │ └── main.py ├── pyproject.toml ├── .github └── workflows │ └── coverage.yaml ├── .gitignore ├── LICENSE ├── tox.ini ├── tests ├── test_lexer_helper.py └── test_lexer.py └── README.md /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all manual-build manual-upload-to-testpypi manual-upload-to-pypi test 2 | 3 | all: manual-build 4 | 5 | manual-build: 6 | @python -m build 7 | 8 | manual-upload-to-testpypi: 9 | @python -m twine upload --repository testpypi dist/streamingjson-$(VERSION)* 10 | 11 | manual-upload-to-pypi: 12 | @python -m twine upload --repository pypi dist/streamingjson-$(VERSION)* 13 | 14 | test: 15 | @python -m tox 16 | -------------------------------------------------------------------------------- /streamingjson/__init__.py: -------------------------------------------------------------------------------- 1 | from .lexer import Lexer 2 | 3 | __version__ = "0.0.4" 4 | 5 | __title__ = "streamingjson" 6 | __description__ = ( 7 | "A streamlined, user-friendly JSON streaming preprocessor, crafted in Python." 8 | ) 9 | __url__ = "https://github.com/karminski/streaming-json-py" 10 | __uri__ = __url__ 11 | __doc__ = f"{__description__} <{__uri__}>" 12 | 13 | __author__ = "Karminski" 14 | __email__ = "code.karminski@outlook.com" 15 | 16 | __license__ = "MIT" 17 | __copyright__ = "Copyright 2024 Karminski" 18 | 19 | __all__ = [ 20 | "Lexer", 21 | ] 22 | -------------------------------------------------------------------------------- /streamingjson/lexer_helper.py: -------------------------------------------------------------------------------- 1 | """ 2 | helper method for lexer 3 | """ 4 | 5 | 6 | def is_ignore_token(c): 7 | """ 8 | check if target character is ignore token 9 | """ 10 | return c in "\t\n\v\f\r " 11 | 12 | 13 | def match_stack(stack, tokens): 14 | """ 15 | check if target stack match given tokens 16 | """ 17 | pointer = len(stack) 18 | tokens_left = len(tokens) 19 | 20 | while True: 21 | tokens_left -= 1 22 | pointer -= 1 23 | if tokens_left < 0: 24 | break 25 | if pointer < 0: 26 | return False 27 | if stack[pointer] != tokens[tokens_left]: 28 | return False 29 | return True 30 | -------------------------------------------------------------------------------- /examples/simple_json_stream/main.py: -------------------------------------------------------------------------------- 1 | import streamingjson 2 | 3 | 4 | def main(): 5 | # Case A, complete the incomplete JSON object 6 | json_segment_a = '{"a":' # will complete to `{"a":null}` 7 | lexer = streamingjson.Lexer() 8 | lexer.append_string(json_segment_a) 9 | completed_json = lexer.complete_json() 10 | print(f"completedJSON: {completed_json}") 11 | 12 | # Case B, complete the incomplete JSON array 13 | json_segment_b = "[t" # will complete to `[true]` 14 | lexer = streamingjson.Lexer() 15 | lexer.append_string(json_segment_b) 16 | completed_json = lexer.complete_json() 17 | print(f"completedJSON: {completed_json}") 18 | 19 | 20 | if __name__ == "__main__": 21 | main() 22 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "streamingjson" 7 | version 
= "0.0.5" 8 | authors = [ 9 | { name="Karminski", email="code.karminski@outlook.com" }, 10 | ] 11 | description = "A streamlined, user-friendly JSON streaming preprocessor, crafted in Python." 12 | readme = "README.md" 13 | requires-python = ">=3.7" 14 | classifiers = [ 15 | "Programming Language :: Python :: 3", 16 | "License :: OSI Approved :: MIT License", 17 | "Operating System :: OS Independent", 18 | ] 19 | 20 | [project.urls] 21 | Homepage = "https://github.com/karminski/streaming-json-py" 22 | Issues = "https://github.com/karminski/streaming-json-py/issues" 23 | -------------------------------------------------------------------------------- /.github/workflows/coverage.yaml: -------------------------------------------------------------------------------- 1 | name: Coverage 2 | on: [push, pull_request] 3 | jobs: 4 | test: 5 | runs-on: ubuntu-latest 6 | strategy: 7 | matrix: 8 | python: ["3.9", "3.10", "3.11", "3.12"] 9 | steps: 10 | - name: Checkout 11 | uses: actions/checkout@v3 12 | - name: Setup Python ${{ matrix.python }} 13 | uses: actions/setup-python@v4 14 | with: 15 | python-version: ${{ matrix.python }} 16 | - name: Install dependencies 17 | run: pip install poetry tox tox-gh-actions codecov 18 | - name: Run tox 19 | run: tox 20 | - name: Upload coverage reports to Codecov 21 | uses: codecov/codecov-action@v4.0.1 22 | with: 23 | token: ${{ secrets.CODECOV_TOKEN }} 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.gitignore.io 2 | 3 | ### Python ### 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | .pytest_cache 62 | .mypy_cache 63 | pip-wheel-metadata/ 64 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | 4 | Copyright (c) 2024 Karminski 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = -ra 3 | testpaths = tests 4 | filterwarnings = 5 | once::Warning 6 | ignore:::pympler[.*] 7 | 8 | 9 | [gh-actions] 10 | python = 11 | 3.7: py37, docs 12 | 3.8: py38, typing 13 | 3.9: py39 14 | 3.10: py310 15 | 3.11: py311 16 | pypy-3.8: pypy3 17 | pypy-3.9: pypy3 18 | 19 | 20 | [tox] 21 | envlist = 22 | lint 23 | typing 24 | py{37,38,39,310,311,py3} 25 | docs 26 | pypi-description 27 | coverage-report 28 | isolated_build = True 29 | 30 | 31 | [testenv] 32 | # Prevent random setuptools/pip breakages like 33 | # https://github.com/pypa/setuptools/issues/1042 from breaking our builds. 34 | setenv = 35 | VIRTUALENV_NO_DOWNLOAD=1 36 | deps = 37 | coverage 38 | pytest 39 | extras = 40 | tests 41 | commands = {envpython} -b -m coverage run -m pytest {posargs} 42 | 43 | 44 | [testenv:docs] 45 | basepython = python3.9 46 | extras = docs 47 | commands = 48 | sphinx-build -n -T -W -b html -d {envtmpdir}/doctrees docs docs/_build/html 49 | sphinx-build -n -T -W -b doctest -d {envtmpdir}/doctrees docs docs/_build/html 50 | python -m doctest README.rst 51 | 52 | 53 | [testenv:lint] 54 | basepython = python3.9 55 | extras = dev 56 | passenv = HOMEPATH # needed on Windows 57 | commands = pre-commit run --all-files 58 | 59 | 60 | [testenv:pypi-description] 61 | basepython = python3.9 62 | skip_install = true 63 | deps = 64 | twine 65 | pip >= 18.0.0 66 | commands = 67 | pip wheel -w {envtmpdir}/build --no-deps . 
68 | twine check {envtmpdir}/build/* 69 | 70 | 71 | [testenv:coverage-report] 72 | basepython = python3.9 73 | skip_install = true 74 | deps = coverage[toml]==5.0.4 75 | commands = 76 | coverage combine 77 | coverage report 78 | -------------------------------------------------------------------------------- /tests/test_lexer_helper.py: -------------------------------------------------------------------------------- 1 | """ 2 | test cases for lexer_helper 3 | """ 4 | 5 | from streamingjson import lexer_helper 6 | from streamingjson import lexer_tokens 7 | 8 | 9 | class TestMatchStack: 10 | """ 11 | test cases for match_stack method 12 | """ 13 | 14 | def test_match_stack_0(self): 15 | """ 16 | simple match test 17 | """ 18 | stack = [lexer_tokens.TOKEN_LEFT_BRACE] 19 | tokens = [lexer_tokens.TOKEN_LEFT_BRACE] 20 | match_result = lexer_helper.match_stack(stack, tokens) 21 | assert match_result is True 22 | 23 | def test_match_stack_1(self): 24 | """ 25 | match full stack test 26 | """ 27 | stack = [ 28 | lexer_tokens.TOKEN_RIGHT_BRACE, 29 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L, 30 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L, 31 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_U, 32 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_N, 33 | lexer_tokens.TOKEN_COLON, 34 | ] 35 | tokens = [ 36 | lexer_tokens.TOKEN_RIGHT_BRACE, 37 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L, 38 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L, 39 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_U, 40 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_N, 41 | lexer_tokens.TOKEN_COLON, 42 | ] 43 | match_result = lexer_helper.match_stack(stack, tokens) 44 | assert match_result is True 45 | 46 | def test_match_stack_2(self): 47 | """ 48 | match multi element in stack test 49 | """ 50 | stack = [ 51 | lexer_tokens.TOKEN_LEFT_BRACE, 52 | lexer_tokens.TOKEN_QUOTE, 53 | lexer_tokens.TOKEN_QUOTE, 54 | lexer_tokens.TOKEN_COLON, 55 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_N, 56 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_U, 57 | ] 58 | tokens = [ 59 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_N, 60 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_U, 61 | ] 62 | match_result = lexer_helper.match_stack(stack, tokens) 63 | assert match_result is True 64 | -------------------------------------------------------------------------------- /examples/gpt_function_call/main.py: -------------------------------------------------------------------------------- 1 | import streamingjson 2 | 3 | 4 | # In GPT's chat completion stream mode, the request for tool_calls returns a structure as follows: 5 | # 6 | # { 7 | # "id": "chatcmpl-?", 8 | # "object": "chat.completion.chunk", 9 | # "created": 1712000001, 10 | # "model": "gpt-4-0125-preview", 11 | # "system_fingerprint": "fp_?", 12 | # "choices": [ 13 | # { 14 | # "index": 0, 15 | # "delta": { 16 | # "tool_calls": [ 17 | # { 18 | # "index": 0, 19 | # "function": { 20 | # "arguments": "{\"fi" 21 | # } 22 | # } 23 | # ] 24 | # }, 25 | # "logprobs": null, 26 | # "finish_reason": null 27 | # } 28 | # ] 29 | # } 30 | # 31 | # We need to extract data.choices[0].delta.tool_calls[0].function.arguments. 32 | # The arguments field is a JSON fragment; we can use streaming-json-py to complete it into syntactically correct JSON and then parse it. 33 | 34 | 35 | def main(): 36 | # We use a list of string fragments to simulate the streamed arguments field returned by GPT.
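# In a real client, each fragment comes from the streamed chunk shown above rather than from a hard-coded list.
# As a minimal sketch, assuming an OpenAI-style chunk dict named `chunk`, the fragment could be read as
#     fragment = chunk["choices"][0]["delta"]["tool_calls"][0]["function"]["arguments"]
# and then passed to lexer.append_string(fragment) before calling lexer.complete_json().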
37 | arguments = [ 38 | '{"fu', 39 | "nction", 40 | "_name", 41 | '"', 42 | ":", 43 | '"run', 44 | "_code", 45 | '", ', 46 | '"argu', 47 | 'ments"', 48 | ": ", 49 | '"print(', 50 | '\\"hello', 51 | " world", 52 | '\\"', 53 | ')"', 54 | ] 55 | lexer = streamingjson.Lexer() 56 | 57 | for json_fragment in arguments: 58 | try: 59 | lexer.append_string(json_fragment) 60 | print(lexer.complete_json()) 61 | except ValueError as e: 62 | print(f"invalid JSON string appended: {e}") 63 | 64 | 65 | # will print: 66 | # {"fu":null} 67 | # {"function":null} 68 | # {"function_name":null} 69 | # {"function_name":null} 70 | # {"function_name":null} 71 | # {"function_name":"run"} 72 | # {"function_name":"run_code"} 73 | # {"function_name":"run_code"} 74 | # {"function_name":"run_code", "argu":null} 75 | # {"function_name":"run_code", "arguments":null} 76 | # {"function_name":"run_code", "arguments":null} 77 | # {"function_name":"run_code", "arguments": "print("} 78 | # {"function_name":"run_code", "arguments": "print(\"hello"} 79 | # {"function_name":"run_code", "arguments": "print(\"hello world"} 80 | # {"function_name":"run_code", "arguments": "print(\"hello world\""} 81 | # {"function_name":"run_code", "arguments": "print(\"hello world\")"} 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # streaming-json-py 2 | 3 | [![codecov](https://codecov.io/gh/karminski/streaming-json-py/graph/badge.svg?token=1901RX87FK)](https://codecov.io/gh/karminski/streaming-json-py) 4 | 5 | ```python 6 | import streamingjson 7 | ``` 8 | 9 | Welcome to **streaming-json-py**, a groundbreaking library designed to revolutionize the way we handle streaming JSON parsing. 10 | 11 | In an era dominated by LLMs (Large Language Models), the ability to efficiently parse JSON streams is more critical than ever. Traditionally, JSON parsing libraries have fallen short, requiring JSON data to be fully generated before any parsing can begin. streaming-json-py challenges this limitation head-on. 12 | 13 | ### Key Features 14 | 15 | - **Real-Time JSON Parsing**: With streaming-json-py, you no longer need to wait for the entire JSON data to be generated. This library allows JSON to be parsed as it is being streamed (this means the JSON stream can stop at any position), significantly cutting down the time-to-first-token. 16 | - **Seamless Integration**: Designed to complement existing JSON parsing libraries, streaming-json-py preprocesses incomplete JSON strings, transforming them into valid, parseable JSON. This means you can continue using your preferred JSON library with our tool seamlessly. 17 | - **Enhanced User Experience**: By enabling real-time data processing, our library drastically reduces the wait time for end-users. Display JSON structures to users without the delay typically associated with complete JSON generation. 18 | 19 | ### Example Usage 20 | 21 | Basically, this library is used to complete fragmented JSON, making it into syntactically correct JSON. For example: 22 | 23 | ```{"a":``` will complete to ```{"a":null}``` 24 | 25 | and when the JSON stream continues to output: 26 | 27 | ```{"a":[tr``` will complete to ```{"a":[true]}``` 28 | 29 | Do not worry about the JSON stream stopping anywhere, such as at a comma: 30 | 31 | ```{"a":[true],``` will complete to ```{"a":[true]}``` 32 | 33 | Escaped characters?
No problem: 34 | 35 | ```{"a":[true], "b": "this is unicode \u54"``` will complete to ```{"a":[true], "b": "this is unicode "}``` 36 | 37 | (After the stream outputs the complete Unicode, it will then display.) 38 | 39 | 40 | **Here’s a quick example to get you started:** 41 | 42 | install from pypi: 43 | 44 | ```bash 45 | pip install streamingjson 46 | ``` 47 | 48 | run example: 49 | 50 | ```python 51 | # init, @NOTE: We need to assign a new lexer for each JSON stream. 52 | lexer = streamingjson.Lexer() 53 | 54 | # append your JSON segment 55 | lexer.append_string('{"a":') 56 | 57 | # complete the JSON 58 | print(lexer.complete_json()) # will print `{"a":null}` 59 | 60 | # append more JSON segment 61 | lexer.append_string('[tr') 62 | 63 | # complete the JSON again 64 | print(lexer.complete_json()) # will print `{"a":[true]}` 65 | ``` 66 | 67 | 68 | For more examples please see: [examples](./examples/) 69 | 70 | ### Try to Find This Library in Another Programming Language? 71 | 72 | Please see: 73 | 74 | - [streaming-json-go](https://github.com/karminski/streaming-json-go) 75 | - [streaming-json-py](https://github.com/karminski/streaming-json-py) 76 | - [streaming-json-js](https://github.com/karminski/streaming-json-js) 77 | 78 | 79 | ### License 80 | 81 | This project is licensed under the MIT License - see the [LICENSE](./LICENSE) file for details. 82 | -------------------------------------------------------------------------------- /streamingjson/lexer_tokens.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tokens for lexer 3 | """ 4 | 5 | # Token constants 6 | TOKEN_EOF = 0 # end-of-file 7 | TOKEN_IGNORED = 1 # \t', '\n', '\v', '\f', '\r', ' ' 8 | TOKEN_LEFT_BRACKET = 2 # [ 9 | TOKEN_RIGHT_BRACKET = 3 # ] 10 | TOKEN_LEFT_BRACE = 4 # { 11 | TOKEN_RIGHT_BRACE = 5 # } 12 | TOKEN_COLON = 6 # : 13 | TOKEN_DOT = 7 # . 14 | TOKEN_COMMA = 8 # , 15 | TOKEN_QUOTE = 9 # " 16 | TOKEN_ESCAPE_CHARACTER = 10 # \ 17 | TOKEN_SLASH = 11 # / 18 | TOKEN_NEGATIVE = 12 # - 19 | TOKEN_NULL = 13 # null 20 | TOKEN_TRUE = 14 # true 21 | TOKEN_FALSE = 15 # false 22 | TOKEN_ALPHABET_LOWERCASE_A = 16 # a 23 | TOKEN_ALPHABET_LOWERCASE_B = 17 # b 24 | TOKEN_ALPHABET_LOWERCASE_C = 18 # c 25 | TOKEN_ALPHABET_LOWERCASE_D = 19 # d 26 | TOKEN_ALPHABET_LOWERCASE_E = 20 # e 27 | TOKEN_ALPHABET_LOWERCASE_F = 21 # f 28 | TOKEN_ALPHABET_LOWERCASE_L = 22 # l 29 | TOKEN_ALPHABET_LOWERCASE_N = 23 # n 30 | TOKEN_ALPHABET_LOWERCASE_R = 24 # r 31 | TOKEN_ALPHABET_LOWERCASE_S = 25 # s 32 | TOKEN_ALPHABET_LOWERCASE_T = 26 # t 33 | TOKEN_ALPHABET_LOWERCASE_U = 27 # u 34 | TOKEN_ALPHABET_UPPERCASE_A = 28 # A 35 | TOKEN_ALPHABET_UPPERCASE_B = 29 # B 36 | TOKEN_ALPHABET_UPPERCASE_C = 30 # C 37 | TOKEN_ALPHABET_UPPERCASE_D = 31 # D 38 | TOKEN_ALPHABET_UPPERCASE_E = 32 # E 39 | TOKEN_ALPHABET_UPPERCASE_F = 33 # F 40 | TOKEN_NUMBER = 34 # number 41 | TOKEN_NUMBER_0 = 35 # 0 42 | TOKEN_NUMBER_1 = 36 # 1 43 | TOKEN_NUMBER_2 = 37 # 2 44 | TOKEN_NUMBER_3 = 38 # 3 45 | TOKEN_NUMBER_4 = 39 # 4 46 | TOKEN_NUMBER_5 = 40 # 5 47 | TOKEN_NUMBER_6 = 41 # 6 48 | TOKEN_NUMBER_7 = 42 # 7 49 | TOKEN_NUMBER_8 = 43 # 8 50 | TOKEN_NUMBER_9 = 44 # 9 51 | TOKEN_OTHERS = 45 # anything else in json 52 | 53 | # Token Symbols 54 | TOKEN_LEFT_BRACKET_SYMBOL = "[" 55 | TOKEN_RIGHT_BRACKET_SYMBOL = "]" 56 | TOKEN_LEFT_BRACE_SYMBOL = "{" 57 | TOKEN_RIGHT_BRACE_SYMBOL = "}" 58 | TOKEN_COLON_SYMBOL = ":" 59 | TOKEN_DOT_SYMBOL = "." 
60 | TOKEN_COMMA_SYMBOL = "," 61 | TOKEN_QUOTE_SYMBOL = '"' 62 | TOKEN_ESCAPE_CHARACTER_SYMBOL = "\\" 63 | TOKEN_SLASH_SYMBOL = "/" 64 | TOKEN_NEGATIVE_SYMBOL = "-" 65 | TOKEN_ALPHABET_LOWERCASE_A_SYMBOL = "a" 66 | TOKEN_ALPHABET_LOWERCASE_B_SYMBOL = "b" 67 | TOKEN_ALPHABET_LOWERCASE_C_SYMBOL = "c" 68 | TOKEN_ALPHABET_LOWERCASE_D_SYMBOL = "d" 69 | TOKEN_ALPHABET_LOWERCASE_E_SYMBOL = "e" 70 | TOKEN_ALPHABET_LOWERCASE_F_SYMBOL = "f" 71 | TOKEN_ALPHABET_LOWERCASE_L_SYMBOL = "l" 72 | TOKEN_ALPHABET_LOWERCASE_N_SYMBOL = "n" 73 | TOKEN_ALPHABET_LOWERCASE_R_SYMBOL = "r" 74 | TOKEN_ALPHABET_LOWERCASE_S_SYMBOL = "s" 75 | TOKEN_ALPHABET_LOWERCASE_T_SYMBOL = "t" 76 | TOKEN_ALPHABET_LOWERCASE_U_SYMBOL = "u" 77 | TOKEN_ALPHABET_UPPERCASE_A_SYMBOL = "A" 78 | TOKEN_ALPHABET_UPPERCASE_B_SYMBOL = "B" 79 | TOKEN_ALPHABET_UPPERCASE_C_SYMBOL = "C" 80 | TOKEN_ALPHABET_UPPERCASE_D_SYMBOL = "D" 81 | TOKEN_ALPHABET_UPPERCASE_E_SYMBOL = "E" 82 | TOKEN_ALPHABET_UPPERCASE_F_SYMBOL = "F" 83 | TOKEN_NUMBER_0_SYMBOL = "0" 84 | TOKEN_NUMBER_1_SYMBOL = "1" 85 | TOKEN_NUMBER_2_SYMBOL = "2" 86 | TOKEN_NUMBER_3_SYMBOL = "3" 87 | TOKEN_NUMBER_4_SYMBOL = "4" 88 | TOKEN_NUMBER_5_SYMBOL = "5" 89 | TOKEN_NUMBER_6_SYMBOL = "6" 90 | TOKEN_NUMBER_7_SYMBOL = "7" 91 | TOKEN_NUMBER_8_SYMBOL = "8" 92 | TOKEN_NUMBER_9_SYMBOL = "9" 93 | 94 | 95 | # Token symbol map 96 | token_symbol_map = { 97 | TOKEN_EOF: "EOF", 98 | TOKEN_LEFT_BRACKET: "[", 99 | TOKEN_RIGHT_BRACKET: "]", 100 | TOKEN_LEFT_BRACE: "{", 101 | TOKEN_RIGHT_BRACE: "}", 102 | TOKEN_COLON: ":", 103 | TOKEN_DOT: ".", 104 | TOKEN_COMMA: ",", 105 | TOKEN_QUOTE: '"', 106 | TOKEN_ESCAPE_CHARACTER: "\\", 107 | TOKEN_SLASH: "/", 108 | TOKEN_NEGATIVE: "-", 109 | TOKEN_NULL: "null", 110 | TOKEN_TRUE: "true", 111 | TOKEN_FALSE: "false", 112 | TOKEN_ALPHABET_LOWERCASE_A: "a", 113 | TOKEN_ALPHABET_LOWERCASE_B: "b", 114 | TOKEN_ALPHABET_LOWERCASE_C: "c", 115 | TOKEN_ALPHABET_LOWERCASE_D: "d", 116 | TOKEN_ALPHABET_LOWERCASE_E: "e", 117 | TOKEN_ALPHABET_LOWERCASE_F: "f", 118 | TOKEN_ALPHABET_LOWERCASE_L: "l", 119 | TOKEN_ALPHABET_LOWERCASE_N: "n", 120 | TOKEN_ALPHABET_LOWERCASE_R: "r", 121 | TOKEN_ALPHABET_LOWERCASE_S: "s", 122 | TOKEN_ALPHABET_LOWERCASE_T: "t", 123 | TOKEN_ALPHABET_LOWERCASE_U: "u", 124 | TOKEN_ALPHABET_UPPERCASE_A: "A", 125 | TOKEN_ALPHABET_UPPERCASE_B: "B", 126 | TOKEN_ALPHABET_UPPERCASE_C: "C", 127 | TOKEN_ALPHABET_UPPERCASE_D: "D", 128 | TOKEN_ALPHABET_UPPERCASE_E: "E", 129 | TOKEN_ALPHABET_UPPERCASE_F: "F", 130 | TOKEN_NUMBER_0: "0", 131 | TOKEN_NUMBER_1: "1", 132 | TOKEN_NUMBER_2: "2", 133 | TOKEN_NUMBER_3: "3", 134 | TOKEN_NUMBER_4: "4", 135 | TOKEN_NUMBER_5: "5", 136 | TOKEN_NUMBER_6: "6", 137 | TOKEN_NUMBER_7: "7", 138 | TOKEN_NUMBER_8: "8", 139 | TOKEN_NUMBER_9: "9", 140 | } 141 | -------------------------------------------------------------------------------- /tests/test_lexer.py: -------------------------------------------------------------------------------- 1 | """ 2 | test cases for lexer 3 | """ 4 | 5 | import json 6 | from streamingjson import lexer 7 | 8 | 9 | class TestCompleteJSONBase: 10 | """ 11 | lexer test cases 12 | """ 13 | 14 | def test_complete_json_base(self): 15 | """ 16 | base test cases, will test all case in incomplete json 17 | """ 18 | streaming_json_case = { 19 | # test case: basic object properity 20 | "{": "{}", # mirror stack: [], should remove from stack: [], should push into mirror stack: ['}'] 21 | "{}": "{}", # mirror stack: [], should remove from stack: [], should push into mirror stack: [] 22 | '{"': '{"":null}', # mirror stack: 
['}'], should remove from stack: [], should push into mirror stack: ['"', ':', 'n', 'u', 'l', 'l'] 23 | '{""': '{"":null}', # mirror stack: ['"', ':', 'n', 'u', 'l', 'l','}'], should remove from stack: ['"'], should push into mirror stack: [] 24 | '{"a': '{"a":null}', 25 | '{"a"': '{"a":null}', 26 | '{"a":': '{"a":null}', 27 | '{"a":n': '{"a":null}', 28 | '{"a":nu': '{"a":null}', 29 | '{"a":nul': '{"a":null}', 30 | '{"a":null': '{"a":null}', 31 | '{"a":null , "b': '{"a":null , "b":null}', 32 | '{"a":t': '{"a":true}', 33 | '{"a":tr': '{"a":true}', 34 | '{"a":tru': '{"a":true}', 35 | '{"a":true': '{"a":true}', 36 | '{"a":true,': '{"a":true}', 37 | '{"a":true , "b': '{"a":true , "b":null}', 38 | '{"a":f': '{"a":false}', 39 | '{"a":fa': '{"a":false}', 40 | '{"a":fal': '{"a":false}', 41 | '{"a":fals': '{"a":false}', 42 | '{"a":false': '{"a":false}', 43 | '{"a":false,': '{"a":false}', 44 | '{"a":false , "b': '{"a":false , "b":null}', 45 | '{"a":-': '{"a":0}', 46 | '{"a":12': '{"a":12}', 47 | '{"a":-0': '{"a":-0}', # @TODO: should be 0, not -0 48 | '{"a":-12': '{"a":-12}', 49 | '{"a":12,': '{"a":12}', 50 | '{"a":12.': '{"a":12.0}', 51 | '{"a":12.15': '{"a":12.15}', 52 | '{"a":12.15,': '{"a":12.15}', 53 | '{"a":-12.15,': '{"a":-12.15}', 54 | '{"a":-1.215e,': '{"a":-1.215}', 55 | '{"a":-1.215E,': '{"a":-1.215}', 56 | '{"a":-1.215e1,': '{"a":-1.215e1}', 57 | '{"a":-1.215e-1,': '{"a":-1.215e-1}', 58 | '{"a":-1.215e+1,': '{"a":-1.215e+1}', 59 | '{"a":-1.215E1,': '{"a":-1.215E1}', 60 | '{"a":-1.215E-1,': '{"a":-1.215E-1}', 61 | '{"a":-1.215E+1,': '{"a":-1.215E+1}', 62 | '{"a":-1.215e12': '{"a":-1.215e12}', 63 | '{"a":-1.215E12': '{"a":-1.215E12}', 64 | '{"a":-1.215e12,': '{"a":-1.215e12}', 65 | '{"a":-1.215E12,': '{"a":-1.215E12}', 66 | '{"a":"': '{"a":""}', 67 | '{"a":""': '{"a":""}', 68 | '{"a":"",': '{"a":""}', 69 | '{"a":"string': '{"a":"string"}', 70 | '{"a":"string"': '{"a":"string"}', 71 | '{"a":"string",': '{"a":"string"}', 72 | '{"a":"abcdefghijklmnopqrstuvwxyz",': '{"a":"abcdefghijklmnopqrstuvwxyz"}', 73 | '{"a":"ABCDEFGHIJKLMNOPQRSTUVWXYZ",': '{"a":"ABCDEFGHIJKLMNOPQRSTUVWXYZ"}', 74 | '{"a":"0123456789",': '{"a":"0123456789"}', 75 | '{"a":"https://': '{"a":"https://"}', 76 | '{"a":"\\u0': '{"a":""}', 77 | '{"a":"\\u00': '{"a":""}', 78 | '{"a":"\\u004': '{"a":""}', 79 | '{"a":"\\u0049': '{"a":"\\u0049"}', 80 | '{"a":"\\u0049"': '{"a":"\\u0049"}', 81 | '{"a":"\\u0049",': '{"a":"\\u0049"}', 82 | '{"a":"\\u0049","b":"': '{"a":"\\u0049","b":""}', 83 | '{"a":"\\u0049","b":"\\': '{"a":"\\u0049","b":""}', 84 | '{"a":"\\u0049","b":"\\u': '{"a":"\\u0049","b":""}', 85 | '{"a":"\\u0049","b":"\\u0': '{"a":"\\u0049","b":""}', 86 | '{"a":"\\u0049","b":"\\u00': '{"a":"\\u0049","b":""}', 87 | '{"a":"\\u0049","b":"\\u005': '{"a":"\\u0049","b":""}', 88 | '{"a":"\\u0049","b":"\\u0050': '{"a":"\\u0049","b":"\\u0050"}', 89 | '{"a":"\\u0049","b":"\\u0050"': '{"a":"\\u0049","b":"\\u0050"}', 90 | '{"a":"\\u0049","b":"\\u0050"}': '{"a":"\\u0049","b":"\\u0050"}', 91 | '{"a":"\\u0123",': '{"a":"\\u0123"}', 92 | '{"a":"\\u4567",': '{"a":"\\u4567"}', 93 | '{"a":"\\u89ab",': '{"a":"\\u89ab"}', 94 | '{"a":"\\u89AB",': '{"a":"\\u89AB"}', 95 | '{"a":"\\ucdef",': '{"a":"\\ucdef"}', 96 | '{"a":"\\ucdee",': '{"a":"\\ucdee"}', 97 | '{"a":"\\uaaaa",': '{"a":"\\uaaaa"}', 98 | '{"a":"\\uCDEF",': '{"a":"\\uCDEF"}', 99 | # test case: escape character 100 | '{"\\': '{"":null}', 101 | '{"\\"': '{"\\"":null}', 102 | '{"\\""': '{"\\"":null}', 103 | '{"\\"\\': '{"\\"":null}', 104 | '{"\\"\\""': '{"\\"\\"":null}', 105 | '{"\\"":': 
'{"\\"":null}', 106 | '{"a":"\\"': '{"a":"\\""}', 107 | '{"a":"\\""': '{"a":"\\""}', 108 | '{"a":"\\"\\"': '{"a":"\\"\\""}', 109 | '{"a":"\\"\\""': '{"a":"\\"\\""}', 110 | '{"a":"\\"\\"",': '{"a":"\\"\\""}', 111 | '{"a":"\\"\\""}': '{"a":"\\"\\""}', 112 | '{"\\\\': '{"\\\\":null}', 113 | '{"\\/': '{"\\/":null}', 114 | '{"\\b': '{"\\b":null}', 115 | '{"\\f': '{"\\f":null}', 116 | '{"\\n': '{"\\n":null}', 117 | '{"\\r': '{"\\r":null}', 118 | '{"\\t': '{"\\t":null}', 119 | '{"\\u0111': '{"\\u0111":null}', 120 | # test case: token in string 121 | '{"a":"["': '{"a":"["}', 122 | '{"a":"[]"': '{"a":"[]"}', 123 | '{"a":"]"': '{"a":"]"}', 124 | '{"a":"{"': '{"a":"{"}', 125 | '{"a":"{}"': '{"a":"{}"}', 126 | '{"a":"}"': '{"a":"}"}', 127 | '{"a":","': '{"a":","}', 128 | '{"a":"."': '{"a":"."}', 129 | '{"a":"","': '{"a":"","":null}', 130 | '{"a":"","b': '{"a":"","b":null}', 131 | '{"a":"","b"': '{"a":"","b":null}', 132 | '{"a":"","b":': '{"a":"","b":null}', 133 | '{"a":"","b":"': '{"a":"","b":""}', 134 | '{"a":"","b":""': '{"a":"","b":""}', 135 | '{"a":"","b":""}': '{"a":"","b":""}', 136 | '{"1': '{"1":null}', 137 | '{"1.': '{"1.":null}', 138 | '{"1.1': '{"1.1":null}', 139 | '{"1.10': '{"1.10":null}', 140 | '{"1"': '{"1":null}', 141 | '{"1":': '{"1":null}', 142 | '{"1":"': '{"1":""}', 143 | '{"1":"1': '{"1":"1"}', 144 | '{"1":"1.': '{"1":"1."}', 145 | '{"1":"1.1': '{"1":"1.1"}', 146 | '{"1":"1.10': '{"1":"1.10"}', 147 | '{"1":"1"': '{"1":"1"}', 148 | '{"1":"1"}': '{"1":"1"}', 149 | '{"-1":"-1"}': '{"-1":"-1"}', 150 | '{"t': '{"t":null}', 151 | '{"tr': '{"tr":null}', 152 | '{"tru': '{"tru":null}', 153 | '{"true': '{"true":null}', 154 | '{"true"': '{"true":null}', 155 | '{"true":': '{"true":null}', 156 | '{"true":"t': '{"true":"t"}', 157 | '{"true":"tr': '{"true":"tr"}', 158 | '{"true":"tru': '{"true":"tru"}', 159 | '{"true":"true': '{"true":"true"}', 160 | '{"true":"true"': '{"true":"true"}', 161 | '{"true":"true"}': '{"true":"true"}', 162 | '{"f': '{"f":null}', 163 | '{"fa': '{"fa":null}', 164 | '{"fal': '{"fal":null}', 165 | '{"fals': '{"fals":null}', 166 | '{"false': '{"false":null}', 167 | '{"false"': '{"false":null}', 168 | '{"false":': '{"false":null}', 169 | '{"false":"f': '{"false":"f"}', 170 | '{"false":"fa': '{"false":"fa"}', 171 | '{"false":"fal': '{"false":"fal"}', 172 | '{"false":"fals': '{"false":"fals"}', 173 | '{"false":"false': '{"false":"false"}', 174 | '{"false":"false"': '{"false":"false"}', 175 | '{"false":"false"}': '{"false":"false"}', 176 | '{"n': '{"n":null}', 177 | '{"nu': '{"nu":null}', 178 | '{"nul': '{"nul":null}', 179 | '{"null': '{"null":null}', 180 | '{"null"': '{"null":null}', 181 | '{"null":': '{"null":null}', 182 | '{"null":"n': '{"null":"n"}', 183 | '{"null":"nu': '{"null":"nu"}', 184 | '{"null":"nul': '{"null":"nul"}', 185 | '{"null":"null': '{"null":"null"}', 186 | '{"null":"null"': '{"null":"null"}', 187 | '{"null":"null"}': '{"null":"null"}', 188 | # test case: array as object value 189 | '{"a":[': '{"a":[]}', 190 | '{"a":[]': '{"a":[]}', 191 | '{"a":[1': '{"a":[1]}', 192 | '{"a":[1,': '{"a":[1]}', 193 | '{"a":[-0,': '{"a":[-0]}', # @TODO: should be 0, not -0 194 | '{"a":[-1,': '{"a":[-1]}', 195 | '{"a":[1,0': '{"a":[1,0]}', 196 | '{"a":[1,0.0': '{"a":[1,0.0]}', 197 | '{"a":[1,0.01': '{"a":[1,0.01]}', 198 | '{"a":[1,0.01]': '{"a":[1,0.01]}', 199 | '{"a":[1,0.01]}': '{"a":[1,0.01]}', 200 | '{"a":[-1,0.01]}': '{"a":[-1,0.01]}', 201 | '{"a":[-1,-': '{"a":[-1,0]}', 202 | '{"a":[-1,-0': '{"a":[-1,-0]}', # @TODO: should be 0, not -0 203 | '{"a":[1,-0.01]}': 
'{"a":[1,-0.01]}', 204 | '{"a":[-1,-0.01]}': '{"a":[-1,-0.01]}', 205 | '{"a":[n': '{"a":[null]}', 206 | '{"a":[nu': '{"a":[null]}', 207 | '{"a":[nul': '{"a":[null]}', 208 | '{"a":[null': '{"a":[null]}', 209 | '{"a":[null,': '{"a":[null]}', 210 | '{"a":[null]': '{"a":[null]}', 211 | '{"a":[null]}': '{"a":[null]}', 212 | '{"a":[t': '{"a":[true]}', 213 | '{"a":[tr': '{"a":[true]}', 214 | '{"a":[tru': '{"a":[true]}', 215 | '{"a":[true': '{"a":[true]}', 216 | '{"a":[true,': '{"a":[true]}', 217 | '{"a":[true]': '{"a":[true]}', 218 | '{"a":[true]}': '{"a":[true]}', 219 | '{"a":[f': '{"a":[false]}', 220 | '{"a":[fa': '{"a":[false]}', 221 | '{"a":[fal': '{"a":[false]}', 222 | '{"a":[fals': '{"a":[false]}', 223 | '{"a":[false': '{"a":[false]}', 224 | '{"a":[false,': '{"a":[false]}', 225 | '{"a":[false]': '{"a":[false]}', 226 | '{"a":[false]}': '{"a":[false]}', 227 | '{"a":["': '{"a":[""]}', 228 | '{"a":["b': '{"a":["b"]}', 229 | '{"a":["b"': '{"a":["b"]}', 230 | '{"a":["b",': '{"a":["b"]}', 231 | '{"a":["b"]': '{"a":["b"]}', 232 | '{"a":["b"]}': '{"a":["b"]}', 233 | '{"a":[{': '{"a":[{}]}', 234 | '{"a":[{"': '{"a":[{"":null}]}', 235 | '{"a":[{"b': '{"a":[{"b":null}]}', 236 | '{"a":[{"b"': '{"a":[{"b":null}]}', 237 | '{"a":[{"b":': '{"a":[{"b":null}]}', 238 | '{"a":[{"b":"': '{"a":[{"b":""}]}', 239 | '{"a":[{"b":"c': '{"a":[{"b":"c"}]}', 240 | '{"a":[{"b":"c"': '{"a":[{"b":"c"}]}', 241 | '{"a":[{"b":"c",': '{"a":[{"b":"c"}]}', 242 | '{"a":[{"b":"c"}': '{"a":[{"b":"c"}]}', 243 | '{"a":[{"b":"c"}]': '{"a":[{"b":"c"}]}', 244 | '{"a":[{"b":"c"}]}': '{"a":[{"b":"c"}]}', 245 | # test case: object as object value 246 | '{"a":{': '{"a":{}}', 247 | '{"a":{"': '{"a":{"":null}}', 248 | '{"a":{"b': '{"a":{"b":null}}', 249 | '{"a":{"b"': '{"a":{"b":null}}', 250 | '{"a":{"b":': '{"a":{"b":null}}', 251 | '{"a":{"b":"': '{"a":{"b":""}}', 252 | '{"a":{"b":"c': '{"a":{"b":"c"}}', 253 | '{"a":{"b":"c"': '{"a":{"b":"c"}}', 254 | '{"a":{"b":"c",': '{"a":{"b":"c"}}', 255 | '{"a":{"b":"c"}': '{"a":{"b":"c"}}', 256 | '{"a":{"b":"c"}}': '{"a":{"b":"c"}}', 257 | # test case: multiple object properity 258 | '{"a":1,"b":1.20,"c":0.03,"d":-1,"e":-1.20,"f":-0.03,"g":1.997e3,"h":-1.338e19,"i":"a","j":null,"k":true,"l":false,"m":{},"n":[]]}': '{"a":1,"b":1.20,"c":0.03,"d":-1,"e":-1.20,"f":-0.03,"g":1.997e3,"h":-1.338e19,"i":"a","j":null,"k":true,"l":false,"m":{},"n":[]]}', 259 | # test case: basic array element 260 | "[": "[]", 261 | "[]": "[]", 262 | "[n": "[null]", 263 | "[nu": "[null]", 264 | "[nul": "[null]", 265 | "[null": "[null]", 266 | "[null,": "[null]", 267 | "[null,null": "[null,null]", 268 | "[t": "[true]", 269 | "[tr": "[true]", 270 | "[tru": "[true]", 271 | "[true": "[true]", 272 | "[true,": "[true]", 273 | "[true,true": "[true,true]", 274 | "[f": "[false]", 275 | "[fa": "[false]", 276 | "[fal": "[false]", 277 | "[fals": "[false]", 278 | "[false": "[false]", 279 | "[false,": "[false]", 280 | "[false,false": "[false,false]", 281 | "[0": "[0]", 282 | "[-": "[0]", 283 | "[-1": "[-1]", 284 | "[0,": "[0]", 285 | "[-1,": "[-1]", 286 | "[-1,-": "[-1,0]", 287 | "[0.": "[0.0]", 288 | "[-0.": "[-0.0]", 289 | "[0.1": "[0.1]", 290 | "[0.12,": "[0.12]", 291 | "[-0.12,": "[-0.12]", 292 | "[1,2,": "[1,2]", 293 | "[1,2,0": "[1,2,0]", 294 | "[1,2,0.": "[1,2,0.0]", 295 | "[1,2,0.1": "[1,2,0.1]", 296 | "[1,2,0.10": "[1,2,0.10]", 297 | "[-1,2,0.10": "[-1,2,0.10]", 298 | "[-1,-2,0.10": "[-1,-2,0.10]", 299 | "[-1,-2,-0.10": "[-1,-2,-0.10]", 300 | "[1,-2,-0.10": "[1,-2,-0.10]", 301 | "[1,2,-0.10": "[1,2,-0.10]", 302 | "[1,-2,0.10": 
"[1,-2,0.10]", 303 | "[2.998e": "[2.998]", 304 | "[2.998E": "[2.998]", 305 | "[2.998e1": "[2.998e1]", 306 | "[2.998e-1": "[2.998e-1]", 307 | "[2.998e+1": "[2.998e+1]", 308 | "[2.998E1": "[2.998E1]", 309 | "[2.998E-1": "[2.998E-1]", 310 | "[2.998E+1": "[2.998E+1]", 311 | "[2.998e10": "[2.998e10]", 312 | "[2.998E10": "[2.998E10]", 313 | "[2.998e10,": "[2.998e10]", 314 | "[2.998E10,": "[2.998E10]", 315 | "[-2.998e": "[-2.998]", 316 | "[-2.998E": "[-2.998]", 317 | "[-2.998e1": "[-2.998e1]", 318 | "[-2.998e-1": "[-2.998e-1]", 319 | "[-2.998e+1": "[-2.998e+1]", 320 | "[-2.998E1": "[-2.998E1]", 321 | "[-2.998E-1": "[-2.998E-1]", 322 | "[-2.998E+1": "[-2.998E+1]", 323 | "[-2.998e10": "[-2.998e10]", 324 | "[-2.998E10": "[-2.998E10]", 325 | "[2.998e10,1": "[2.998e10,1]", 326 | "[2.998e10,1.0": "[2.998e10,1.0]", 327 | "[2.998e10,1.02": "[2.998e10,1.02]", 328 | "[2.998e10,1.02e": "[2.998e10,1.02]", 329 | "[2.998e10,1.02e8": "[2.998e10,1.02e8]", 330 | "[2.998E10,1.02E8": "[2.998E10,1.02E8]", 331 | "[2.998e10,1.02e8,": "[2.998e10,1.02e8]", 332 | "[2.998E10,1.02E8,": "[2.998E10,1.02E8]", 333 | '["': '[""]', 334 | '[""': '[""]', 335 | '["",': '[""]', 336 | '["a': '["a"]', 337 | '["a"': '["a"]', 338 | '["a",': '["a"]', 339 | '["a","': '["a",""]', 340 | '["a","b': '["a","b"]', 341 | '["a","b"': '["a","b"]', 342 | '["a","b",': '["a","b"]', 343 | '["a","b"]': '["a","b"]', 344 | '["\\u0': '[""]', 345 | '["\\u00': '[""]', 346 | '["\\u004': '[""]', 347 | '["\\u0049': '["\\u0049"]', 348 | '["\\u0049"': '["\\u0049"]', 349 | '["\\u0049",': '["\\u0049"]', 350 | '["\\u0049","': '["\\u0049",""]', 351 | '["\\u0049","\\': '["\\u0049",""]', 352 | '["\\u0049","\\u': '["\\u0049",""]', 353 | '["\\u0049","\\u0': '["\\u0049",""]', 354 | '["\\u0049","\\u00': '["\\u0049",""]', 355 | '["\\u0049","\\u005': '["\\u0049",""]', 356 | '["\\u0049","\\u0050': '["\\u0049","\\u0050"]', 357 | '["\\u0049","\\u0050"': '["\\u0049","\\u0050"]', 358 | '["\\u0049","\\u0050"]': '["\\u0049","\\u0050"]', 359 | '["\\u0123': '["\\u0123"]', 360 | '["\\u4567': '["\\u4567"]', 361 | '["\\u89ab': '["\\u89ab"]', 362 | '["\\u89AB': '["\\u89AB"]', 363 | '["\\ucdef': '["\\ucdef"]', 364 | '["\\uCDEF': '["\\uCDEF"]', 365 | # test case: object as array element 366 | "[{": "[{}]", 367 | '[{"': '[{"":null}]', 368 | '[{""': '[{"":null}]', 369 | '[{"":': '[{"":null}]', 370 | '[{"":"': '[{"":""}]', 371 | '[{"":""': '[{"":""}]', 372 | '[{"":""}': '[{"":""}]', 373 | '[{"":""}]': '[{"":""}]', 374 | '[{"a': '[{"a":null}]', 375 | '[{"a"': '[{"a":null}]', 376 | '[{"a":': '[{"a":null}]', 377 | '[{"a":"': '[{"a":""}]', 378 | '[{"a":"b': '[{"a":"b"}]', 379 | '[{"a":"b"': '[{"a":"b"}]', 380 | '[{"a":"b"}': '[{"a":"b"}]', 381 | '[{"a":"b"}]': '[{"a":"b"}]', 382 | '[{"a":n': '[{"a":null}]', 383 | '[{"a":nu': '[{"a":null}]', 384 | '[{"a":nul': '[{"a":null}]', 385 | '[{"a":null': '[{"a":null}]', 386 | '[{"a":null,': '[{"a":null}]', 387 | '[{"a":null}': '[{"a":null}]', 388 | '[{"a":null}]': '[{"a":null}]', 389 | '[{"a":t': '[{"a":true}]', 390 | '[{"a":tr': '[{"a":true}]', 391 | '[{"a":tru': '[{"a":true}]', 392 | '[{"a":true': '[{"a":true}]', 393 | '[{"a":true,': '[{"a":true}]', 394 | '[{"a":true}': '[{"a":true}]', 395 | '[{"a":true}]': '[{"a":true}]', 396 | '[{"a":f': '[{"a":false}]', 397 | '[{"a":fa': '[{"a":false}]', 398 | '[{"a":fal': '[{"a":false}]', 399 | '[{"a":fals': '[{"a":false}]', 400 | '[{"a":false': '[{"a":false}]', 401 | '[{"a":false,': '[{"a":false}]', 402 | '[{"a":false}': '[{"a":false}]', 403 | '[{"a":false}]': '[{"a":false}]', 404 | '[{"a":-': '[{"a":0}]', 405 | 
'[{"a":0': '[{"a":0}]', 406 | '[{"a":-0': '[{"a":-0}]', # @TODO: should be 0, not -0 407 | '[{"a":0.': '[{"a":0.0}]', 408 | '[{"a":0.1': '[{"a":0.1}]', 409 | '[{"a":0.10': '[{"a":0.10}]', 410 | '[{"a":0.10,': '[{"a":0.10}]', 411 | '[{"a":0.10}': '[{"a":0.10}]', 412 | '[{"a":0.10}]': '[{"a":0.10}]', 413 | '[{"a":-0.10}]': '[{"a":-0.10}]', 414 | '[{"a":[': '[{"a":[]}]', 415 | '[{"a":[1': '[{"a":[1]}]', 416 | '[{"a":[t': '[{"a":[true]}]', 417 | '[{"a":[f': '[{"a":[false]}]', 418 | '[{"a":[n': '[{"a":[null]}]', 419 | '[{"a":["': '[{"a":[""]}]', 420 | '[{"a":[{': '[{"a":[{}]}]', 421 | '[{"a":[{"b":"c"},{': '[{"a":[{"b":"c"},{}]}]', 422 | '[{"a":[{"b":"c"},{"': '[{"a":[{"b":"c"},{"":null}]}]', 423 | '[{"a":[{"b":"c"},{"d"': '[{"a":[{"b":"c"},{"d":null}]}]', 424 | '[{"a":[{"b":"c"},{"d":-': '[{"a":[{"b":"c"},{"d":0}]}]', 425 | '[{"a":[{"b":"c"},{"d":-0': '[{"a":[{"b":"c"},{"d":-0}]}]', # @TODO: should be 0, not -0 426 | '[{"a":[{"b":"c"},{"d":1.': '[{"a":[{"b":"c"},{"d":1.0}]}]', 427 | '[{"a":[{"b":"c"},{"d":1.1': '[{"a":[{"b":"c"},{"d":1.1}]}]', 428 | '[{"a":[{"b":"c"},{"d":-1.1': '[{"a":[{"b":"c"},{"d":-1.1}]}]', 429 | '[{"a":[{"b":"c"},{"d":[': '[{"a":[{"b":"c"},{"d":[]}]}]', 430 | '[{"a":[{"b":"c"},{"d":[{': '[{"a":[{"b":"c"},{"d":[{}]}]}]', 431 | # test case: multiple array element 432 | '[1,1.20,0.03,-1,-1.20,-0.03,1.997e3,-1.338e19,"a",null,true,false,{},[]]': '[1,1.20,0.03,-1,-1.20,-0.03,1.997e3,-1.338e19,"a",null,true,false,{},[]]', 433 | # test case: array as array element 434 | "[[": "[[]]", 435 | "[[]": "[[]]", 436 | "[[]]": "[[]]", 437 | "[[{": "[[{}]]", 438 | '[["': '[[""]]', 439 | '[[""': '[[""]]', 440 | '[["a': '[["a"]]', 441 | '[["a"': '[["a"]]', 442 | '[["a"]': '[["a"]]', 443 | '[["a"],': '[["a"]]', 444 | '[["a"],[': '[["a"],[]]', 445 | '[["a"],[]': '[["a"],[]]', 446 | '[["a"],[]]': '[["a"],[]]', 447 | '[["a"],{': '[["a"],{}]', 448 | '[["a"],{}': '[["a"],{}]', 449 | '[["a"],{}]': '[["a"],{}]', 450 | '[["a"],{"': '[["a"],{"":null}]', 451 | '[["a"],{"b': '[["a"],{"b":null}]', 452 | '[["a"],{"b"': '[["a"],{"b":null}]', 453 | '[["a"],{"b":': '[["a"],{"b":null}]', 454 | '[["a"],{"b":"': '[["a"],{"b":""}]', 455 | '[["a"],{"b":"c': '[["a"],{"b":"c"}]', 456 | '[["a"],{"b":"c"': '[["a"],{"b":"c"}]', 457 | '[["a"],{"b":"c"}': '[["a"],{"b":"c"}]', 458 | '[["a"],{"b":"c"}]': '[["a"],{"b":"c"}]', 459 | # test case: ignore token 460 | "{ }": "{ }", 461 | '{ " a " : -1.2 , ': '{ " a " : -1.2}', 462 | '{ " a " : -1.2 , " b " : " c " ': '{ " a " : -1.2 , " b " : " c "}', 463 | '{ " a " : -1.2 , " b " : " c " , " d" : true ': '{ " a " : -1.2 , " b " : " c " , " d" : true}', 464 | '{ " a " : -1.2 , " b " : " c " , " d" : true , "e " : { } } ': '{ " a " : -1.2 , " b " : " c " , " d" : true , "e " : { } }', 465 | "[ ]": "[ ]", 466 | "[ 1": "[ 1]", 467 | "[ 1 , -1.020 , true , false, null": "[ 1 , -1.020 , true , false, null]", 468 | "[ 1 , -1.020 , true , false, null, { }": "[ 1 , -1.020 , true , false, null, { }]", 469 | } 470 | for test_case, expect in streaming_json_case.items(): 471 | lexer_instance = lexer.Lexer() 472 | err_in_append_string = lexer_instance.append_string(test_case) 473 | ret = lexer_instance.complete_json() 474 | assert err_in_append_string is None 475 | assert expect == ret, "unexpected JSON" 476 | 477 | def test_complete_json_nestad(self): 478 | """ 479 | test nestad JSON by each caracter 480 | """ 481 | streaming_json_content = '{"string": "这是一个字符串", "integer": 42, "float": 3.14159, "boolean_true": true, "boolean_false": false, "null": null, "object": {"empty_object": {}, 
"non_empty_object": {"key": "value"}, "nested_object": {"nested_key": {"sub_nested_key": "sub_nested_value"}}}, "array":["string in array", 123, 45.67, true, false, null, {"object_in_array": "object_value"},["nested_array"]]}' 482 | lexer_instance = lexer.Lexer() 483 | for char in streaming_json_content: 484 | err_in_append_string = lexer_instance.append_string(char) 485 | assert err_in_append_string is None 486 | ret = lexer_instance.complete_json() 487 | interface_for_json = None 488 | err_in_unmarshal = None 489 | try: 490 | interface_for_json = json.loads(ret) 491 | except Exception as e: 492 | err_in_unmarshal = e 493 | assert err_in_unmarshal is None 494 | 495 | def test_complete_json_nestad2(self): 496 | """ 497 | test nestad JSON by each caracter, new line included 498 | """ 499 | streaming_json_content = """{ 500 | "string_with_escape_chars": "This string contains escape characters like \\\"quotes\\\", \\\\backslashes\\\\, \\/forwardslashes/, \\bbackspace\\b, \\fformfeed\\f, \\nnewline\\n, \\rcarriage return\\r, \\ttab\\t.", 501 | "scientific_notation": 2.998e8, 502 | "unicode_characters": "Some unicode characters: \\u0041\\u0042\\u0043\\u0044", 503 | "multiple_lang_strings": { 504 | "english": "Hello, World!", 505 | "chinese": "你好,世界!", 506 | "spanish": "¡Hola, mundo!", 507 | "russian": "Привет, мир!" 508 | }, 509 | "json_tokens_as_strings": "{\\"key_with_invalid_token\\": \\"value_with_invalid_separator\\": \\"a\\"}", 510 | "nested_objects": { 511 | "nested_object1": { 512 | "key1": "value1", 513 | "key2": "value2", 514 | "nested_object2": { 515 | "inner_key1": "inner_value1", 516 | "inner_key2": "inner_value2" 517 | } 518 | }, 519 | "nested_object2": { 520 | "name": "John Doe", 521 | "age": 30, 522 | "address": { 523 | "street": "123 Main St", 524 | "city": "Anytown" 525 | } 526 | } 527 | }, 528 | "array_test": { 529 | "simple_array": [10, 20, 30, 40, 50], 530 | "array_of_objects": [ 531 | { 532 | "name": "Alice", 533 | "age": 25 534 | }, 535 | { 536 | "name": "Bob", 537 | "age": 30 538 | } 539 | ], 540 | "nested_arrays": [ 541 | [1, 2, 3], 542 | [true, false, null] 543 | ], 544 | "empty_objects": {}, 545 | "empty_arrays": [] 546 | } 547 | } 548 | """ 549 | lexer_instance = lexer.Lexer() 550 | for char in streaming_json_content: 551 | err_in_append_string = lexer_instance.append_string(char) 552 | assert err_in_append_string is None 553 | ret = lexer_instance.complete_json() 554 | interface_for_json = None 555 | err_in_unmarshal = None 556 | try: 557 | interface_for_json = json.loads(ret) 558 | except Exception as e: 559 | err_in_unmarshal = e 560 | assert err_in_unmarshal is None 561 | 562 | def test_complete_json_escape_and_etc(self): 563 | """ 564 | test escape caracter and unicode 565 | """ 566 | streaming_json_content = """{ 567 | "string": "含有转义字符的字符串:\\"\\\\\\/\\b\\f\\n\\r\\t", 568 | "string_unicode": "含Unicode字符:\\u6211\\u662F", 569 | "negative_integer": -42, 570 | "float_scientific_notation": 6.02e23, 571 | "negative_float": -3.14159, 572 | "array_with_various_numbers": [ 573 | 0, 574 | -1, 575 | 2.99792458e8, 576 | -6.62607015e-34 577 | ], 578 | "special_characters": "\\u003C\\u003E\\u0026\\u0027\\u0022", 579 | "nested_structure": { 580 | "nested_key_with_escaped_chars": "这是一个带有转义字符的字符串:\\\\n\\\\r\\\\t", 581 | "nested_object": { 582 | "bool_true": true, 583 | "bool_false": false, 584 | "null_value": null, 585 | "complex_number": 3.14e-10 586 | } 587 | } 588 | } 589 | """ 590 | lexer_instance = lexer.Lexer() 591 | for char in streaming_json_content: 592 | 
err_in_append_string = lexer_instance.append_string(char) 593 | assert err_in_append_string is None 594 | ret = lexer_instance.complete_json() 595 | interface_for_json = None 596 | err_in_unmarshal = None 597 | try: 598 | interface_for_json = json.loads(ret) 599 | except Exception as e: 600 | err_in_unmarshal = e 601 | assert err_in_unmarshal is None 602 | -------------------------------------------------------------------------------- /streamingjson/lexer.py: -------------------------------------------------------------------------------- 1 | """ 2 | streaming-json-py main lexer method 3 | This method will 4 | """ 5 | 6 | from streamingjson import lexer_tokens 7 | from streamingjson import lexer_helper 8 | 9 | 10 | class Lexer: 11 | """ 12 | lexer for json fragment 13 | """ 14 | 15 | def __init__(self): 16 | self.json_content = [] # input JSON content 17 | self.padding_content = ( 18 | [] 19 | ) # padding content for ignored characters and escape characters, etc. 20 | self.json_segment = "" # appended JSON segment by the AppendString() method. 21 | self.token_stack = [] # token stack for input JSON 22 | self.mirror_token_stack = [] # token stack for auto-completed tokens 23 | 24 | def __get_top_token_on_stack(self): 25 | """ 26 | get token on the stack top 27 | """ 28 | if not self.token_stack: 29 | return lexer_tokens.TOKEN_EOF 30 | return self.token_stack[-1] 31 | 32 | def __get_top_token_on_mirror_stack(self): 33 | """ 34 | get token on the mirror stack top 35 | """ 36 | if not self.mirror_token_stack: 37 | return lexer_tokens.TOKEN_EOF 38 | return self.mirror_token_stack[-1] 39 | 40 | def __pop_token_stack(self): 41 | """ 42 | pop token on the stack top 43 | """ 44 | if not self.token_stack: 45 | return lexer_tokens.TOKEN_EOF 46 | return self.token_stack.pop() 47 | 48 | def __pop_mirror_token_stack(self): 49 | """ 50 | pop token on the mirror stack top 51 | """ 52 | if not self.mirror_token_stack: 53 | return lexer_tokens.TOKEN_EOF 54 | return self.mirror_token_stack.pop() 55 | 56 | def __push_token_stack(self, token): 57 | """ 58 | push token into the stack 59 | """ 60 | self.token_stack.append(token) 61 | 62 | def __push_mirror_token_stack(self, token): 63 | """ 64 | push token into the mirror stack 65 | """ 66 | self.mirror_token_stack.append(token) 67 | 68 | def __dump_mirror_token_stack_to_string(self): 69 | """ 70 | convert mirror stack token into string 71 | """ 72 | return "".join( 73 | [ 74 | lexer_tokens.token_symbol_map[x] 75 | for x in reversed(self.mirror_token_stack) 76 | ] 77 | ) 78 | 79 | def __skip_json_segment(self, n): 80 | """ 81 | skip JSON segment by length n 82 | """ 83 | self.json_segment = self.json_segment[n:] 84 | 85 | def __push_negative_into_json_content(self): 86 | """ 87 | push negative symbol `-` into JSON content 88 | """ 89 | self.json_content.append(lexer_tokens.TOKEN_NEGATIVE_SYMBOL) 90 | 91 | def __push_byte_into_padding_content(self, b): 92 | """ 93 | push byte into JSON content by given 94 | """ 95 | self.padding_content.append(b) 96 | 97 | def __append_padding_content_to_json_content(self): 98 | """ 99 | append padding content into JSON content 100 | """ 101 | self.json_content.extend(self.padding_content) 102 | self.padding_content = [] 103 | 104 | def __have_padding_content(self): 105 | """ 106 | check if padding content is empty 107 | """ 108 | return bool(self.padding_content) 109 | 110 | def __clean_padding_content(self): 111 | """ 112 | set padding content to empty 113 | """ 114 | self.padding_content = [] 115 | 116 | def 
__stream_stopped_in_an_object_key_start(self) -> bool: 117 | """ 118 | check if JSON stream stopped at an object properity's key start, like `{"` 119 | """ 120 | # `{`, `"` in stack, or `,`, `"` in stack 121 | case1 = [lexer_tokens.TOKEN_LEFT_BRACE, lexer_tokens.TOKEN_QUOTE] 122 | case2 = [lexer_tokens.TOKEN_COMMA, lexer_tokens.TOKEN_QUOTE] 123 | # `}` in mirror stack 124 | case3 = [lexer_tokens.TOKEN_RIGHT_BRACE] 125 | return ( 126 | lexer_helper.match_stack(self.token_stack, case1) 127 | or lexer_helper.match_stack(self.token_stack, case2) 128 | ) and lexer_helper.match_stack(self.mirror_token_stack, case3) 129 | 130 | def __stream_stopped_in_an_object_key_end(self) -> bool: 131 | """ 132 | check if JSON stream stopped in an object properity's key, like `{"field` 133 | """ 134 | # // `{`, `"`, `"` in stack, or `,`, `"`, `"` in stack 135 | case1 = [ 136 | lexer_tokens.TOKEN_LEFT_BRACE, 137 | lexer_tokens.TOKEN_QUOTE, 138 | lexer_tokens.TOKEN_QUOTE, 139 | ] 140 | case2 = [ 141 | lexer_tokens.TOKEN_COMMA, 142 | lexer_tokens.TOKEN_QUOTE, 143 | lexer_tokens.TOKEN_QUOTE, 144 | ] 145 | # // `"`, `:`, `n`, `u`, `l`, `l`, `}` in mirror stack 146 | case3 = [ 147 | lexer_tokens.TOKEN_RIGHT_BRACE, 148 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L, 149 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L, 150 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_U, 151 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_N, 152 | lexer_tokens.TOKEN_COLON, 153 | lexer_tokens.TOKEN_QUOTE, 154 | ] 155 | return ( 156 | lexer_helper.match_stack(self.token_stack, case1) 157 | or lexer_helper.match_stack(self.token_stack, case2) 158 | ) and lexer_helper.match_stack(self.mirror_token_stack, case3) 159 | 160 | def __stream_stopped_in_an_object_string_value_start(self) -> bool: 161 | """ 162 | check if JSON stream stopped in an object properity's value start, 163 | like `{"field": "` 164 | """ 165 | 166 | # `:`, `"` in stack 167 | case1 = [lexer_tokens.TOKEN_COLON, lexer_tokens.TOKEN_QUOTE] 168 | # // `n`, `u`, `l`, `l`, `}` in mirror stack 169 | case2 = [ 170 | lexer_tokens.TOKEN_RIGHT_BRACE, 171 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L, 172 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L, 173 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_U, 174 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_N, 175 | ] 176 | return lexer_helper.match_stack( 177 | self.token_stack, case1 178 | ) and lexer_helper.match_stack(self.mirror_token_stack, case2) 179 | 180 | def __stream_stopped_in_an_object_value_end(self) -> bool: 181 | """ 182 | check if JSON stream stopped in an object properity's value finish, 183 | like `{"field": "value"` 184 | """ 185 | # `"`, `}` left 186 | tokens = [lexer_tokens.TOKEN_RIGHT_BRACE, lexer_tokens.TOKEN_QUOTE] 187 | return lexer_helper.match_stack(self.mirror_token_stack, tokens) 188 | 189 | def __stream_stopped_in_an_object_array_value_start(self) -> bool: 190 | """ 191 | check if JSON stream stopped in an object properity's value start by array, 192 | like `{"field":[` 193 | """ 194 | # `:`, `[` in stack 195 | case1 = [lexer_tokens.TOKEN_COLON, lexer_tokens.TOKEN_LEFT_BRACKET] 196 | # `n`, `u`, `l`, `l`, `}` in mirror stack 197 | case2 = [ 198 | lexer_tokens.TOKEN_RIGHT_BRACE, 199 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L, 200 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L, 201 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_U, 202 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_N, 203 | ] 204 | return lexer_helper.match_stack( 205 | self.token_stack, case1 206 | ) and lexer_helper.match_stack(self.mirror_token_stack, case2) 207 | 208 | def 
__stream_stopped_in_an_object_object_value_start(self) -> bool: 209 | """ 210 | check if JSON stream stopped in an object properity's value start by array, 211 | like `{"field":{` 212 | """ 213 | # `:`, `{` in stack 214 | case1 = [lexer_tokens.TOKEN_COLON, lexer_tokens.TOKEN_LEFT_BRACE] 215 | # `n`, `u`, `l`, `l`, `}` in mirror stack 216 | case2 = [ 217 | lexer_tokens.TOKEN_RIGHT_BRACE, 218 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L, 219 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L, 220 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_U, 221 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_N, 222 | ] 223 | return lexer_helper.match_stack( 224 | self.token_stack, case1 225 | ) and lexer_helper.match_stack(self.mirror_token_stack, case2) 226 | 227 | def __stream_stopped_in_an_object_negative_number_value_start(self) -> bool: 228 | """ 229 | check if JSON stream stopped in an object properity's negative number value, 230 | like `:-` 231 | """ 232 | # `:`, `-` in stack 233 | case1 = [lexer_tokens.TOKEN_COLON, lexer_tokens.TOKEN_NEGATIVE] 234 | return lexer_helper.match_stack(self.token_stack, case1) 235 | 236 | def __stream_stopped_in_a_negative_number_value_start(self) -> bool: 237 | """ 238 | check if JSON stream stopped in an object properity's negative number value, 239 | like `-` 240 | """ 241 | # `-` in stack 242 | case1 = [lexer_tokens.TOKEN_NEGATIVE] 243 | # `0`in mirror stack 244 | case2 = [lexer_tokens.TOKEN_NUMBER_0] 245 | return lexer_helper.match_stack( 246 | self.token_stack, case1 247 | ) and lexer_helper.match_stack(self.mirror_token_stack, case2) 248 | 249 | def __stream_stopped_in_an_array(self) -> bool: 250 | """ 251 | check if JSON stream stopped in an array 252 | """ 253 | return ( 254 | self.__get_top_token_on_mirror_stack() == lexer_tokens.TOKEN_RIGHT_BRACKET 255 | ) 256 | 257 | def __stream_stopped_in_an_array_string_value_end(self) -> bool: 258 | """ 259 | check if JSON stream stopped in an array's string value end, like `["value"` 260 | """ 261 | # `"`, `"` in stack 262 | case1 = [lexer_tokens.TOKEN_QUOTE, lexer_tokens.TOKEN_QUOTE] 263 | # `"`, `]` in mirror stack 264 | case2 = [lexer_tokens.TOKEN_RIGHT_BRACKET, lexer_tokens.TOKEN_QUOTE] 265 | return lexer_helper.match_stack( 266 | self.token_stack, case1 267 | ) and lexer_helper.match_stack(self.mirror_token_stack, case2) 268 | 269 | def __stream_stopped_in_an_object_null_value_placeholder_start(self) -> bool: 270 | """ 271 | check if JSON stream stopped in an object properity's value start by array, 272 | like `{"field":{` 273 | """ 274 | # `n`, `u`, `l`, `l`, `}` in mirror stack 275 | case1 = [ 276 | lexer_tokens.TOKEN_RIGHT_BRACE, 277 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L, 278 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L, 279 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_U, 280 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_N, 281 | ] 282 | return lexer_helper.match_stack(self.mirror_token_stack, case1) 283 | 284 | def __stream_stopped_in_a_string(self) -> bool: 285 | """ 286 | check if JSON stream stopped in a string, like `""` 287 | """ 288 | return ( 289 | self.__get_top_token_on_stack() == lexer_tokens.TOKEN_QUOTE 290 | and self.__get_top_token_on_mirror_stack() == lexer_tokens.TOKEN_QUOTE 291 | ) 292 | 293 | def __stream_stopped_in_an_string_unicode_escape(self) -> bool: 294 | """ 295 | check if JSON stream stopped in a string's unicode escape, like `\u0001` 296 | """ 297 | # `\`, `u` in stack 298 | case1 = [ 299 | lexer_tokens.TOKEN_ESCAPE_CHARACTER, 300 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_U, 301 | ] 302 | # `"` in mirror stack 303 | 
case2 = [lexer_tokens.TOKEN_QUOTE] 304 | return lexer_helper.match_stack( 305 | self.token_stack, case1 306 | ) and lexer_helper.match_stack(self.mirror_token_stack, case2) 307 | 308 | def __stream_stopped_in_a_number(self) -> bool: 309 | """ 310 | check if JSON stream stopped in a number, like `[0-9]` 311 | """ 312 | return self.__get_top_token_on_stack() == lexer_tokens.TOKEN_NUMBER 313 | 314 | def __stream_stopped_in_a_number_decimal_part(self) -> bool: 315 | """ 316 | check if JSON stream stopped in a number first decimal place, like `.?` 317 | """ 318 | # `.`, lexer_tokens.TOKEN_NUMBER in stack 319 | return self.__get_top_token_on_stack() == lexer_tokens.TOKEN_DOT 320 | 321 | def __stream_stopped_in_a_number_decimal_part_middle(self) -> bool: 322 | """ 323 | check if JSON stream stopped in a number other decimal place (except first place), 324 | like `.[0-9]?` 325 | """ 326 | case1 = [lexer_tokens.TOKEN_DOT, lexer_tokens.TOKEN_NUMBER] 327 | return lexer_helper.match_stack(self.token_stack, case1) 328 | 329 | def __stream_stopped_with_leading_escape_character(self) -> bool: 330 | """ 331 | check if JSON stream stopped in escape character, like \ 332 | """ 333 | return self.__get_top_token_on_stack() == lexer_tokens.TOKEN_ESCAPE_CHARACTER 334 | 335 | def __match_token(self): 336 | """ 337 | lexer match JSON token method, convert JSON segment to JSON tokens 338 | """ 339 | # Segment end 340 | if len(self.json_segment) == 0: 341 | return lexer_tokens.TOKEN_EOF, 0 342 | 343 | token_symbol = self.json_segment[0] 344 | 345 | # Check if ignored token 346 | if lexer_helper.is_ignore_token(token_symbol): 347 | self.__skip_json_segment(1) 348 | return lexer_tokens.TOKEN_IGNORED, token_symbol 349 | # Match token 350 | token_mapping = { 351 | lexer_tokens.TOKEN_LEFT_BRACKET_SYMBOL: lexer_tokens.TOKEN_LEFT_BRACKET, 352 | lexer_tokens.TOKEN_RIGHT_BRACKET_SYMBOL: lexer_tokens.TOKEN_RIGHT_BRACKET, 353 | lexer_tokens.TOKEN_LEFT_BRACE_SYMBOL: lexer_tokens.TOKEN_LEFT_BRACE, 354 | lexer_tokens.TOKEN_RIGHT_BRACE_SYMBOL: lexer_tokens.TOKEN_RIGHT_BRACE, 355 | lexer_tokens.TOKEN_COLON_SYMBOL: lexer_tokens.TOKEN_COLON, 356 | lexer_tokens.TOKEN_DOT_SYMBOL: lexer_tokens.TOKEN_DOT, 357 | lexer_tokens.TOKEN_COMMA_SYMBOL: lexer_tokens.TOKEN_COMMA, 358 | lexer_tokens.TOKEN_QUOTE_SYMBOL: lexer_tokens.TOKEN_QUOTE, 359 | lexer_tokens.TOKEN_ESCAPE_CHARACTER_SYMBOL: lexer_tokens.TOKEN_ESCAPE_CHARACTER, 360 | lexer_tokens.TOKEN_SLASH_SYMBOL: lexer_tokens.TOKEN_SLASH, 361 | lexer_tokens.TOKEN_NEGATIVE_SYMBOL: lexer_tokens.TOKEN_NEGATIVE, 362 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_A_SYMBOL: lexer_tokens.TOKEN_ALPHABET_LOWERCASE_A, 363 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_B_SYMBOL: lexer_tokens.TOKEN_ALPHABET_LOWERCASE_B, 364 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_C_SYMBOL: lexer_tokens.TOKEN_ALPHABET_LOWERCASE_C, 365 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_D_SYMBOL: lexer_tokens.TOKEN_ALPHABET_LOWERCASE_D, 366 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_E_SYMBOL: lexer_tokens.TOKEN_ALPHABET_LOWERCASE_E, 367 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_F_SYMBOL: lexer_tokens.TOKEN_ALPHABET_LOWERCASE_F, 368 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L_SYMBOL: lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L, 369 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_N_SYMBOL: lexer_tokens.TOKEN_ALPHABET_LOWERCASE_N, 370 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_R_SYMBOL: lexer_tokens.TOKEN_ALPHABET_LOWERCASE_R, 371 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_S_SYMBOL: lexer_tokens.TOKEN_ALPHABET_LOWERCASE_S, 372 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_T_SYMBOL: 
lexer_tokens.TOKEN_ALPHABET_LOWERCASE_T, 373 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_U_SYMBOL: lexer_tokens.TOKEN_ALPHABET_LOWERCASE_U, 374 | lexer_tokens.TOKEN_ALPHABET_UPPERCASE_A_SYMBOL: lexer_tokens.TOKEN_ALPHABET_UPPERCASE_A, 375 | lexer_tokens.TOKEN_ALPHABET_UPPERCASE_B_SYMBOL: lexer_tokens.TOKEN_ALPHABET_UPPERCASE_B, 376 | lexer_tokens.TOKEN_ALPHABET_UPPERCASE_C_SYMBOL: lexer_tokens.TOKEN_ALPHABET_UPPERCASE_C, 377 | lexer_tokens.TOKEN_ALPHABET_UPPERCASE_D_SYMBOL: lexer_tokens.TOKEN_ALPHABET_UPPERCASE_D, 378 | lexer_tokens.TOKEN_ALPHABET_UPPERCASE_E_SYMBOL: lexer_tokens.TOKEN_ALPHABET_UPPERCASE_E, 379 | lexer_tokens.TOKEN_ALPHABET_UPPERCASE_F_SYMBOL: lexer_tokens.TOKEN_ALPHABET_UPPERCASE_F, 380 | lexer_tokens.TOKEN_NUMBER_0_SYMBOL: lexer_tokens.TOKEN_NUMBER_0, 381 | lexer_tokens.TOKEN_NUMBER_1_SYMBOL: lexer_tokens.TOKEN_NUMBER_1, 382 | lexer_tokens.TOKEN_NUMBER_2_SYMBOL: lexer_tokens.TOKEN_NUMBER_2, 383 | lexer_tokens.TOKEN_NUMBER_3_SYMBOL: lexer_tokens.TOKEN_NUMBER_3, 384 | lexer_tokens.TOKEN_NUMBER_4_SYMBOL: lexer_tokens.TOKEN_NUMBER_4, 385 | lexer_tokens.TOKEN_NUMBER_5_SYMBOL: lexer_tokens.TOKEN_NUMBER_5, 386 | lexer_tokens.TOKEN_NUMBER_6_SYMBOL: lexer_tokens.TOKEN_NUMBER_6, 387 | lexer_tokens.TOKEN_NUMBER_7_SYMBOL: lexer_tokens.TOKEN_NUMBER_7, 388 | lexer_tokens.TOKEN_NUMBER_8_SYMBOL: lexer_tokens.TOKEN_NUMBER_8, 389 | lexer_tokens.TOKEN_NUMBER_9_SYMBOL: lexer_tokens.TOKEN_NUMBER_9, 390 | } 391 | 392 | token_result = token_mapping.get(token_symbol, lexer_tokens.TOKEN_OTHERS) 393 | self.__skip_json_segment(1) 394 | return token_result, token_symbol 395 | 396 | def append_string( 397 | self, 398 | string: str, 399 | ): 400 | """ 401 | append JSON string to current JSON stream content 402 | this method will traversal all token and generate mirror token for complete full JSON 403 | """ 404 | 405 | self.json_segment = string 406 | while True: 407 | token, token_symbol = self.__match_token() 408 | 409 | if token == lexer_tokens.TOKEN_EOF: 410 | # nothing to do with TOKEN_EOF 411 | pass 412 | elif token == lexer_tokens.TOKEN_IGNORED: 413 | if self.__stream_stopped_in_a_string(): 414 | self.json_content += token_symbol 415 | continue 416 | self.__push_byte_into_padding_content(token_symbol) 417 | elif token == lexer_tokens.TOKEN_OTHERS: 418 | # check if json stream stopped with padding content 419 | if self.__have_padding_content(): 420 | self.__append_padding_content_to_json_content() 421 | self.__clean_padding_content() 422 | # write current token symbol to JSON content 423 | self.json_content += token_symbol 424 | elif token == lexer_tokens.TOKEN_LEFT_BRACKET: 425 | # check if json stream stopped with padding content 426 | if self.__have_padding_content(): 427 | self.__append_padding_content_to_json_content() 428 | self.__clean_padding_content() 429 | self.json_content += token_symbol 430 | if self.__stream_stopped_in_a_string(): 431 | continue 432 | self.__push_token_stack(token) 433 | if self.__stream_stopped_in_an_object_array_value_start(): 434 | # pop `n`, `u`, `l`, `l` from mirror stack 435 | self.__pop_mirror_token_stack() 436 | self.__pop_mirror_token_stack() 437 | self.__pop_mirror_token_stack() 438 | self.__pop_mirror_token_stack() 439 | # push `]` into mirror stack 440 | self.__push_mirror_token_stack(lexer_tokens.TOKEN_RIGHT_BRACKET) 441 | elif token == lexer_tokens.TOKEN_RIGHT_BRACKET: 442 | if self.__stream_stopped_in_a_string(): 443 | self.json_content += token_symbol 444 | continue 445 | # check if json stream stopped with padding content 446 | if 
self.__have_padding_content(): 447 | self.__append_padding_content_to_json_content() 448 | self.__clean_padding_content() 449 | # write current token symbol to JSON content 450 | self.json_content += token_symbol 451 | # push `]` into stack 452 | self.__push_token_stack(token) 453 | # pop `]` from mirror stack 454 | self.__pop_mirror_token_stack() 455 | elif token == lexer_tokens.TOKEN_LEFT_BRACE: 456 | # check if json stream stopped with padding content 457 | if self.__have_padding_content(): 458 | self.__append_padding_content_to_json_content() 459 | self.__clean_padding_content() 460 | # write current token symbol to JSON content 461 | self.json_content += token_symbol 462 | if self.__stream_stopped_in_a_string(): 463 | continue 464 | self.__push_token_stack(token) 465 | if self.__stream_stopped_in_an_object_object_value_start(): 466 | # pop `n`, `u`, `l`, `l` from mirror stack 467 | self.__pop_mirror_token_stack() 468 | self.__pop_mirror_token_stack() 469 | self.__pop_mirror_token_stack() 470 | self.__pop_mirror_token_stack() 471 | # push `}` into mirror stack 472 | self.__push_mirror_token_stack(lexer_tokens.TOKEN_RIGHT_BRACE) 473 | elif token == lexer_tokens.TOKEN_RIGHT_BRACE: 474 | if self.__stream_stopped_in_a_string(): 475 | self.json_content += token_symbol 476 | continue 477 | # check if json stream stopped with padding content 478 | if self.__have_padding_content(): 479 | self.__append_padding_content_to_json_content() 480 | self.__clean_padding_content() 481 | self.json_content += token_symbol 482 | # push `}` into stack 483 | self.__push_token_stack(token) 484 | # pop `}` from mirror stack 485 | self.__pop_mirror_token_stack() 486 | elif token == lexer_tokens.TOKEN_QUOTE: 487 | # check if escape quote `\"` 488 | if self.__stream_stopped_with_leading_escape_character(): 489 | # push padding escape character `\` into JSON content 490 | self.__append_padding_content_to_json_content() 491 | self.__clean_padding_content() 492 | # write current token symbol to JSON content 493 | self.json_content += token_symbol 494 | # pop `\` from stack 495 | self.__pop_token_stack() 496 | continue 497 | # check if json stream stopped with padding content 498 | if self.__have_padding_content(): 499 | self.__append_padding_content_to_json_content() 500 | self.__clean_padding_content() 501 | 502 | # write current token symbol to JSON content 503 | self.json_content += token_symbol 504 | self.__push_token_stack(token) 505 | if self.__stream_stopped_in_an_array(): 506 | # push `"` into mirror stack 507 | self.__push_mirror_token_stack(lexer_tokens.TOKEN_QUOTE) 508 | elif self.__stream_stopped_in_an_array_string_value_end(): 509 | # pop `"` from mirror stack 510 | self.__pop_mirror_token_stack() 511 | elif self.__stream_stopped_in_an_object_key_start(): 512 | # check if stopped in key of object's properity or value of object's properity 513 | # push `"`, `:`, `n`, `u`, `l`, `l` into mirror stack 514 | self.__push_mirror_token_stack( 515 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L 516 | ) 517 | self.__push_mirror_token_stack( 518 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L 519 | ) 520 | self.__push_mirror_token_stack( 521 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_U 522 | ) 523 | self.__push_mirror_token_stack( 524 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_N 525 | ) 526 | self.__push_mirror_token_stack(lexer_tokens.TOKEN_COLON) 527 | self.__push_mirror_token_stack(lexer_tokens.TOKEN_QUOTE) 528 | elif self.__stream_stopped_in_an_object_key_end(): 529 | # check if stopped in key of object's properity or value of 
object's properity 530 | # pop `"` from mirror stack 531 | self.__pop_mirror_token_stack() 532 | elif self.__stream_stopped_in_an_object_string_value_start(): 533 | # pop `n`, `u`, `l`, `l` from mirror stack 534 | self.__pop_mirror_token_stack() 535 | self.__pop_mirror_token_stack() 536 | self.__pop_mirror_token_stack() 537 | self.__pop_mirror_token_stack() 538 | # push `"` into mirror stack 539 | self.__push_mirror_token_stack(lexer_tokens.TOKEN_QUOTE) 540 | elif self.__stream_stopped_in_an_object_value_end(): 541 | # pop `"` from mirror stack 542 | self.__pop_mirror_token_stack() 543 | else: 544 | return "Invalid quote token in JSON stream" 545 | elif token == lexer_tokens.TOKEN_COLON: 546 | if self.__stream_stopped_in_a_string(): 547 | self.json_content += token_symbol 548 | continue 549 | # check if json stream stopped with padding content 550 | if self.__have_padding_content(): 551 | self.__append_padding_content_to_json_content() 552 | self.__clean_padding_content() 553 | # write current token symbol to JSON content 554 | self.json_content += token_symbol 555 | self.__push_token_stack(token) 556 | # pop `:` from mirror stack 557 | self.__pop_mirror_token_stack() 558 | elif token == lexer_tokens.TOKEN_ALPHABET_LOWERCASE_A: 559 | # as hex in unicode 560 | if self.__stream_stopped_in_an_string_unicode_escape(): 561 | self.__push_byte_into_padding_content(token_symbol) 562 | # check if unicode escape is full length 563 | if len(self.padding_content) == 6: 564 | self.__append_padding_content_to_json_content() 565 | self.__clean_padding_content() 566 | # pop `\`, `u` from stack 567 | self.__pop_token_stack() 568 | self.__pop_token_stack() 569 | continue 570 | # write current token symbol to JSON content 571 | self.json_content += token_symbol 572 | # in a string, just skip token 573 | if self.__stream_stopped_in_a_string(): 574 | continue 575 | 576 | # check if `f` in token stack and `a`, `l`, `s`, `e in mirror stack 577 | def it_is_part_of_token_false(): 578 | left = [lexer_tokens.TOKEN_ALPHABET_LOWERCASE_F] 579 | right = [ 580 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_E, 581 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_S, 582 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L, 583 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_A, 584 | ] 585 | return lexer_helper.match_stack( 586 | self.token_stack, left 587 | ) and lexer_helper.match_stack(self.mirror_token_stack, right) 588 | 589 | if not it_is_part_of_token_false(): 590 | continue 591 | 592 | self.__push_token_stack(token) 593 | self.__pop_mirror_token_stack() 594 | elif token == lexer_tokens.TOKEN_ALPHABET_LOWERCASE_B: 595 | # as hex in unicode 596 | if self.__stream_stopped_in_an_string_unicode_escape(): 597 | self.__push_byte_into_padding_content(token_symbol) 598 | # check if unicode escape is full length 599 | if len(self.padding_content) == 6: 600 | self.__append_padding_content_to_json_content() 601 | self.__clean_padding_content() 602 | # pop `\`, `u` from stack 603 | self.__pop_token_stack() 604 | self.__pop_token_stack() 605 | continue 606 | 607 | # \b escape `\`, `b` 608 | if self.__stream_stopped_with_leading_escape_character(): 609 | # push padding escape character `\` into JSON content 610 | self.__append_padding_content_to_json_content() 611 | self.__clean_padding_content() 612 | # write current token symbol to JSON content 613 | self.json_content += token_symbol 614 | # pop `\` from stack 615 | self.__pop_token_stack() 616 | continue 617 | 618 | # write current token symbol to JSON content 619 | self.json_content += token_symbol 620 | 621 | 
# in a string, just skip token 622 | if self.__stream_stopped_in_a_string(): 623 | continue 624 | 625 | elif token == lexer_tokens.TOKEN_ALPHABET_LOWERCASE_E: 626 | # as hex in unicode 627 | if self.__stream_stopped_in_an_string_unicode_escape(): 628 | self.__push_byte_into_padding_content(token_symbol) 629 | # check if unicode escape is full length 630 | if len(self.padding_content) == 6: 631 | self.__append_padding_content_to_json_content() 632 | self.__clean_padding_content() 633 | # pop `\`, `u` from stack 634 | self.__pop_token_stack() 635 | self.__pop_token_stack() 636 | continue 637 | 638 | # check if in a number, as `e` (exponent) in scientific notation 639 | if self.__stream_stopped_in_a_number_decimal_part_middle(): 640 | self.__push_byte_into_padding_content(token_symbol) 641 | continue 642 | 643 | # write current token symbol to JSON content 644 | self.json_content += token_symbol 645 | 646 | # in a string, just skip token 647 | if self.__stream_stopped_in_a_string(): 648 | continue 649 | 650 | # check if `f`, `a`, `l`, `s` in token stack and `e` in mirror stack 651 | def it_is_part_of_token_false(): 652 | left = [ 653 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_F, 654 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_A, 655 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L, 656 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_S, 657 | ] 658 | right = [ 659 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_E, 660 | ] 661 | return lexer_helper.match_stack( 662 | self.token_stack, left 663 | ) and lexer_helper.match_stack(self.mirror_token_stack, right) 664 | 665 | # check if `t`, `r`, `u` in token stack and `e` in mirror stack 666 | def it_is_part_of_token_true(): 667 | left = [ 668 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_T, 669 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_R, 670 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_U, 671 | ] 672 | right = [ 673 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_E, 674 | ] 675 | return lexer_helper.match_stack( 676 | self.token_stack, left 677 | ) and lexer_helper.match_stack(self.mirror_token_stack, right) 678 | 679 | if not it_is_part_of_token_false() and not it_is_part_of_token_true(): 680 | continue 681 | 682 | self.__push_token_stack(token) 683 | self.__pop_mirror_token_stack() 684 | elif token == lexer_tokens.TOKEN_ALPHABET_LOWERCASE_F: 685 | # as hex in unicode 686 | if self.__stream_stopped_in_an_string_unicode_escape(): 687 | self.__push_byte_into_padding_content(token_symbol) 688 | # check if unicode escape is full length 689 | if len(self.padding_content) == 6: 690 | self.__append_padding_content_to_json_content() 691 | self.__clean_padding_content() 692 | # pop `\`, `u` from stack 693 | self.__pop_token_stack() 694 | self.__pop_token_stack() 695 | continue 696 | 697 | # \f escape `\`, `f` 698 | if self.__stream_stopped_with_leading_escape_character(): 699 | # push padding escape character `\` into JSON content 700 | self.__append_padding_content_to_json_content() 701 | self.__clean_padding_content() 702 | # write current token symbol to JSON content 703 | self.json_content.append(token_symbol) 704 | # pop `\` from stack 705 | self.__pop_token_stack() 706 | continue 707 | 708 | # check if json stream stopped with padding content, like case `[true , f` 709 | if self.__have_padding_content(): 710 | self.__append_padding_content_to_json_content() 711 | self.__clean_padding_content() 712 | 713 | # write current token symbol to JSON content 714 | self.json_content.append(token_symbol) 715 | 716 | # in a string, just skip token 717 | if self.__stream_stopped_in_a_string(): 718 | continue 719 
| 720 | # push `f` into stack 721 | self.__push_token_stack(token) 722 | if self.__stream_stopped_in_an_array(): 723 | # in array 724 | # push `a`, `l`, `s`, `e` 725 | self.__push_mirror_token_stack( 726 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_E 727 | ) 728 | self.__push_mirror_token_stack( 729 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_S 730 | ) 731 | self.__push_mirror_token_stack( 732 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L 733 | ) 734 | self.__push_mirror_token_stack( 735 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_A 736 | ) 737 | else: 738 | # in object 739 | # pop `n`, `u`, `l`, `l` 740 | self.__pop_mirror_token_stack() 741 | self.__pop_mirror_token_stack() 742 | self.__pop_mirror_token_stack() 743 | self.__pop_mirror_token_stack() 744 | # push `a`, `l`, `s`, `e` 745 | self.__push_mirror_token_stack( 746 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_E 747 | ) 748 | self.__push_mirror_token_stack( 749 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_S 750 | ) 751 | self.__push_mirror_token_stack( 752 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L 753 | ) 754 | self.__push_mirror_token_stack( 755 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_A 756 | ) 757 | 758 | elif token == lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L: 759 | # write current token symbol to JSON content 760 | self.json_content.append(token_symbol) 761 | # in a string, just skip token 762 | if self.__stream_stopped_in_a_string(): 763 | continue 764 | 765 | # check if `f`, `a` in token stack and, `l`, `s`, `e` in mirror stack 766 | def it_is_part_of_token_false(): 767 | left = [ 768 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_F, 769 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_A, 770 | ] 771 | right = [ 772 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_E, 773 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_S, 774 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L, 775 | ] 776 | return lexer_helper.match_stack( 777 | self.token_stack, left 778 | ) and lexer_helper.match_stack(self.mirror_token_stack, right) 779 | 780 | # check if `n`, `u` in token stack and `l`, `l` in mirror stack 781 | def it_is_part_of_token_null1(): 782 | left = [ 783 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_N, 784 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_U, 785 | ] 786 | right = [ 787 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L, 788 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L, 789 | ] 790 | return lexer_helper.match_stack( 791 | self.token_stack, left 792 | ) and lexer_helper.match_stack(self.mirror_token_stack, right) 793 | 794 | # check if `n`, `u`, `l` in token stack and `l` in mirror stack 795 | def it_is_part_of_token_null2(): 796 | left = [ 797 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_N, 798 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_U, 799 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L, 800 | ] 801 | right = [ 802 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L, 803 | ] 804 | return lexer_helper.match_stack( 805 | self.token_stack, left 806 | ) and lexer_helper.match_stack(self.mirror_token_stack, right) 807 | 808 | if ( 809 | not it_is_part_of_token_false() 810 | and not it_is_part_of_token_null1() 811 | and not it_is_part_of_token_null2() 812 | ): 813 | continue 814 | 815 | self.__push_token_stack(token) 816 | self.__pop_mirror_token_stack() 817 | 818 | elif token == lexer_tokens.TOKEN_ALPHABET_LOWERCASE_N: 819 | # \n escape `\`, `n` 820 | if self.__stream_stopped_with_leading_escape_character(): 821 | # push padding escape character `\` into JSON content 822 | self.__append_padding_content_to_json_content() 823 | self.__clean_padding_content() 824 | # write current token symbol to JSON content 825 | 
self.json_content.append(token_symbol) 826 | # pop `\` from stack 827 | self.__pop_token_stack() 828 | continue 829 | 830 | # check if json stream stopped with padding content, like case `[true , n` 831 | if self.__have_padding_content(): 832 | self.__append_padding_content_to_json_content() 833 | self.__clean_padding_content() 834 | 835 | # write current token symbol to JSON content 836 | self.json_content.append(token_symbol) 837 | 838 | # in a string, just skip token 839 | if self.__stream_stopped_in_a_string(): 840 | continue 841 | 842 | # push `n` 843 | self.__push_token_stack(token) 844 | if self.__stream_stopped_in_an_array(): 845 | # in array, push `u`, `l`, `l` 846 | self.__push_mirror_token_stack( 847 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L 848 | ) 849 | self.__push_mirror_token_stack( 850 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L 851 | ) 852 | self.__push_mirror_token_stack( 853 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_U 854 | ) 855 | else: 856 | # in object, pop `n` 857 | self.__pop_mirror_token_stack() 858 | 859 | elif token == lexer_tokens.TOKEN_ALPHABET_LOWERCASE_R: 860 | # \r escape `\`, `r` 861 | if self.__stream_stopped_with_leading_escape_character(): 862 | # push padding escape character `\` into JSON content 863 | self.__append_padding_content_to_json_content() 864 | self.__clean_padding_content() 865 | # write current token symbol to JSON content 866 | self.json_content.append(token_symbol) 867 | # pop `\` from stack 868 | self.__pop_token_stack() 869 | continue 870 | 871 | # write current token symbol to JSON content 872 | self.json_content.append(token_symbol) 873 | 874 | # in a string, just skip token 875 | if self.__stream_stopped_in_a_string(): 876 | continue 877 | 878 | # check if `t` in token stack and `r`, `u`, `e in mirror stack 879 | def it_is_part_of_token_true(): 880 | left = [ 881 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_T, 882 | ] 883 | right = [ 884 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_E, 885 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_U, 886 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_R, 887 | ] 888 | return lexer_helper.match_stack( 889 | self.token_stack, left 890 | ) and lexer_helper.match_stack(self.mirror_token_stack, right) 891 | 892 | if not it_is_part_of_token_true(): 893 | continue 894 | 895 | self.__push_token_stack(token) 896 | self.__pop_mirror_token_stack() 897 | 898 | elif token == lexer_tokens.TOKEN_ALPHABET_LOWERCASE_S: 899 | # write current token symbol to JSON content 900 | self.json_content.append(token_symbol) 901 | 902 | # in a string, just skip token 903 | if self.__stream_stopped_in_a_string(): 904 | continue 905 | 906 | # check if `f`, `a`, `l` in token stack and `s`, `e in mirror stack 907 | def it_is_part_of_token_false(): 908 | left = [ 909 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_F, 910 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_A, 911 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L, 912 | ] 913 | right = [ 914 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_E, 915 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_S, 916 | ] 917 | return lexer_helper.match_stack( 918 | self.token_stack, left 919 | ) and lexer_helper.match_stack(self.mirror_token_stack, right) 920 | 921 | if not it_is_part_of_token_false(): 922 | continue 923 | 924 | self.__push_token_stack(token) 925 | self.__pop_mirror_token_stack() 926 | 927 | elif token == lexer_tokens.TOKEN_ALPHABET_LOWERCASE_T: 928 | # \t escape `\`, `t` 929 | if self.__stream_stopped_with_leading_escape_character(): 930 | # push padding escape character `\` into JSON content 931 | 
self.__append_padding_content_to_json_content() 932 | self.__clean_padding_content() 933 | # write current token symbol to JSON content 934 | self.json_content.append(token_symbol) 935 | # pop `\` from stack 936 | self.__pop_token_stack() 937 | continue 938 | 939 | # check if json stream stopped with padding content, like case `[true , t` 940 | if self.__have_padding_content(): 941 | self.__append_padding_content_to_json_content() 942 | self.__clean_padding_content() 943 | 944 | # write current token symbol to JSON content 945 | self.json_content.append(token_symbol) 946 | 947 | # in a string, just skip token 948 | if self.__stream_stopped_in_a_string(): 949 | continue 950 | 951 | # push `t` to stack 952 | self.__push_token_stack(token) 953 | if self.__stream_stopped_in_an_array(): 954 | # in array 955 | # push `r`, `u`, `e` 956 | self.__push_mirror_token_stack( 957 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_E 958 | ) 959 | self.__push_mirror_token_stack( 960 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_U 961 | ) 962 | self.__push_mirror_token_stack( 963 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_R 964 | ) 965 | else: 966 | # in object 967 | # pop `n`, `u`, `l`, `l` 968 | self.__pop_mirror_token_stack() 969 | self.__pop_mirror_token_stack() 970 | self.__pop_mirror_token_stack() 971 | self.__pop_mirror_token_stack() 972 | # push `r`, `u`, `e` 973 | self.__push_mirror_token_stack( 974 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_E 975 | ) 976 | self.__push_mirror_token_stack( 977 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_U 978 | ) 979 | self.__push_mirror_token_stack( 980 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_R 981 | ) 982 | 983 | elif token == lexer_tokens.TOKEN_ALPHABET_LOWERCASE_U: 984 | # unicode escape `\`, `u` 985 | if self.__stream_stopped_with_leading_escape_character(): 986 | self.__push_token_stack(token) 987 | self.padding_content.append(token_symbol) 988 | continue 989 | 990 | # write current token symbol to JSON content 991 | self.json_content.append(token_symbol) 992 | 993 | # in a string, just skip token 994 | if self.__stream_stopped_in_a_string(): 995 | continue 996 | 997 | # check if `t`, `r` in token stack and, `u`, `e` in mirror stack 998 | def it_is_part_of_token_true(): 999 | left = [ 1000 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_T, 1001 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_R, 1002 | ] 1003 | right = [ 1004 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_E, 1005 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_U, 1006 | ] 1007 | return lexer_helper.match_stack( 1008 | self.token_stack, left 1009 | ) and lexer_helper.match_stack(self.mirror_token_stack, right) 1010 | 1011 | # check if `n` in token stack and `u`, `l`, `l` in mirror stack 1012 | def it_is_part_of_token_null(): 1013 | left = [lexer_tokens.TOKEN_ALPHABET_LOWERCASE_N] 1014 | right = [ 1015 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L, 1016 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_L, 1017 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_U, 1018 | ] 1019 | return lexer_helper.match_stack( 1020 | self.token_stack, left 1021 | ) and lexer_helper.match_stack(self.mirror_token_stack, right) 1022 | 1023 | if not it_is_part_of_token_true() and not it_is_part_of_token_null(): 1024 | continue 1025 | 1026 | self.__push_token_stack(token) 1027 | self.__pop_mirror_token_stack() 1028 | 1029 | elif token in [ 1030 | lexer_tokens.TOKEN_ALPHABET_UPPERCASE_A, 1031 | lexer_tokens.TOKEN_ALPHABET_UPPERCASE_B, 1032 | lexer_tokens.TOKEN_ALPHABET_UPPERCASE_C, 1033 | lexer_tokens.TOKEN_ALPHABET_UPPERCASE_D, 1034 | lexer_tokens.TOKEN_ALPHABET_LOWERCASE_C, 1035 | 
lexer_tokens.TOKEN_ALPHABET_LOWERCASE_D, 1036 | lexer_tokens.TOKEN_ALPHABET_UPPERCASE_F, 1037 | ]: 1038 | # as hex in unicode 1039 | if self.__stream_stopped_in_an_string_unicode_escape(): 1040 | self.__push_byte_into_padding_content(token_symbol) 1041 | # check if unicode escape is full length 1042 | if len(self.padding_content) == 6: 1043 | self.__append_padding_content_to_json_content() 1044 | self.__clean_padding_content() 1045 | # pop `\`, `u` from stack 1046 | self.__pop_token_stack() 1047 | self.__pop_token_stack() 1048 | continue 1049 | 1050 | # write current token symbol to JSON content 1051 | self.json_content.append(token_symbol) 1052 | 1053 | # in a string, just skip token 1054 | if self.__stream_stopped_in_a_string(): 1055 | continue 1056 | 1057 | elif token == lexer_tokens.TOKEN_ALPHABET_UPPERCASE_E: 1058 | # as hex in unicode 1059 | if self.__stream_stopped_in_an_string_unicode_escape(): 1060 | self.__push_byte_into_padding_content(token_symbol) 1061 | # check if unicode escape is full length 1062 | if len(self.padding_content) == 6: 1063 | self.__append_padding_content_to_json_content() 1064 | self.__clean_padding_content() 1065 | # pop `\`, `u` from stack 1066 | self.__pop_token_stack() 1067 | self.__pop_token_stack() 1068 | continue 1069 | 1070 | # check if in a number, as `E` (exponent) in scientific notation 1071 | if self.__stream_stopped_in_a_number_decimal_part_middle(): 1072 | self.__push_byte_into_padding_content(token_symbol) 1073 | continue 1074 | 1075 | # write current token symbol to JSON content 1076 | self.json_content.append(token_symbol) 1077 | 1078 | # in a string, just skip token 1079 | if self.__stream_stopped_in_a_string(): 1080 | continue 1081 | elif token in [ 1082 | lexer_tokens.TOKEN_NUMBER_0, 1083 | lexer_tokens.TOKEN_NUMBER_1, 1084 | lexer_tokens.TOKEN_NUMBER_2, 1085 | lexer_tokens.TOKEN_NUMBER_3, 1086 | lexer_tokens.TOKEN_NUMBER_4, 1087 | lexer_tokens.TOKEN_NUMBER_5, 1088 | lexer_tokens.TOKEN_NUMBER_6, 1089 | lexer_tokens.TOKEN_NUMBER_7, 1090 | lexer_tokens.TOKEN_NUMBER_8, 1091 | lexer_tokens.TOKEN_NUMBER_9, 1092 | ]: 1093 | # as number in unicode 1094 | if self.__stream_stopped_in_an_string_unicode_escape(): 1095 | self.__push_byte_into_padding_content(token_symbol) 1096 | # check if unicode escape is full length 1097 | if len(self.padding_content) == 6: 1098 | self.__append_padding_content_to_json_content() 1099 | self.__clean_padding_content() 1100 | # pop `\`, `u` from stack 1101 | self.__pop_token_stack() 1102 | self.__pop_token_stack() 1103 | continue 1104 | 1105 | # check if json stream stopped with padding content, like `[1 , 1` 1106 | if self.__have_padding_content(): 1107 | self.__append_padding_content_to_json_content() 1108 | self.__clean_padding_content() 1109 | 1110 | # in negative part of a number 1111 | if self.__stream_stopped_in_a_negative_number_value_start(): 1112 | self.__push_negative_into_json_content() 1113 | # pop `0` from mirror stack 1114 | self.__pop_mirror_token_stack() 1115 | 1116 | # write current token symbol to JSON content 1117 | self.json_content.append(token_symbol) 1118 | 1119 | # in a string or a number, just skip token 1120 | if ( 1121 | self.__stream_stopped_in_a_string() 1122 | or self.__stream_stopped_in_a_number() 1123 | ): 1124 | continue 1125 | 1126 | # in decimal part of a number 1127 | if self.__stream_stopped_in_a_number_decimal_part(): 1128 | self.__push_token_stack(lexer_tokens.TOKEN_NUMBER) 1129 | # pop placeholder `0` in decimal part 1130 | self.__pop_mirror_token_stack() 1131 | continue 1132 | 
1133 | # first number type token, push token into stack 1134 | self.__push_token_stack(lexer_tokens.TOKEN_NUMBER) 1135 | 1136 | # check if we are in an object or an array 1137 | if self.__stream_stopped_in_an_array(): 1138 | continue 1139 | elif self.__stream_stopped_in_an_object_null_value_placeholder_start(): 1140 | # pop `n`, `u`, `l`, `l` 1141 | self.__pop_mirror_token_stack() 1142 | self.__pop_mirror_token_stack() 1143 | self.__pop_mirror_token_stack() 1144 | self.__pop_mirror_token_stack() 1145 | 1146 | elif token == lexer_tokens.TOKEN_COMMA: 1147 | # in a string, just skip token 1148 | if self.__stream_stopped_in_a_string(): 1149 | self.json_content.append(token_symbol) 1150 | continue 1151 | # in an object or an array, keep the comma in the stack and do not write it into the JSON content until the next token arrives 1152 | # the comma must be followed by a token: quote, null, true, false, or a number 1153 | self.__push_byte_into_padding_content(token_symbol) 1154 | self.__push_token_stack(token) 1155 | elif token == lexer_tokens.TOKEN_DOT: 1156 | # write current token symbol to JSON content 1157 | self.json_content.append(token_symbol) 1158 | 1159 | # in a string, just skip token 1160 | if self.__stream_stopped_in_a_string(): 1161 | continue 1162 | 1163 | # use `0` as a placeholder for the decimal part 1164 | self.__push_token_stack(token) 1165 | self.__push_mirror_token_stack(lexer_tokens.TOKEN_NUMBER_0) 1166 | 1167 | elif token == lexer_tokens.TOKEN_SLASH: 1168 | # escape character `\`, `/` 1169 | if self.__stream_stopped_with_leading_escape_character(): 1170 | # push padding escape character `\` into JSON content 1171 | self.__append_padding_content_to_json_content() 1172 | self.__clean_padding_content() 1173 | # write current token symbol to JSON content 1174 | self.json_content.append(token_symbol) 1175 | # pop `\` from stack 1176 | self.__pop_token_stack() 1177 | continue 1178 | elif self.__stream_stopped_in_a_string(): 1179 | self.json_content.append(token_symbol) 1180 | continue 1181 | 1182 | elif token == lexer_tokens.TOKEN_ESCAPE_CHARACTER: 1183 | # double escape character `\`, `\` 1184 | if self.__stream_stopped_with_leading_escape_character(): 1185 | # push padding escape character `\` into JSON content 1186 | self.__append_padding_content_to_json_content() 1187 | self.__clean_padding_content() 1188 | # write current token symbol to JSON content 1189 | self.json_content.append(token_symbol) 1190 | # pop `\` from stack 1191 | self.__pop_token_stack() 1192 | continue 1193 | 1194 | # just push the escape character onto the stack and wait for the next token to trigger the escape handling. 1195 | self.__push_token_stack(token) 1196 | self.__push_byte_into_padding_content( 1197 | lexer_tokens.TOKEN_ESCAPE_CHARACTER_SYMBOL 1198 | ) 1199 | elif token == lexer_tokens.TOKEN_NEGATIVE: 1200 | # in a string, just skip token 1201 | if self.__stream_stopped_in_a_string(): 1202 | self.json_content.append(token_symbol) 1203 | continue 1204 | 1205 | # check if json stream stopped with padding content, like `[1 , -` 1206 | if self.__have_padding_content(): 1207 | self.__append_padding_content_to_json_content() 1208 | self.__clean_padding_content() 1209 | 1210 | # just push the negative sign onto the stack and wait for the next token to trigger it; the `0` pushed onto the mirror stack below is a placeholder so that a trailing `-` still completes to a valid number.
1211 | self.__push_token_stack(token) 1212 | if self.__stream_stopped_in_an_object_negative_number_value_start(): 1213 | # pop `n`, `u`, `l`, `l` from mirror stack 1214 | self.__pop_mirror_token_stack() 1215 | self.__pop_mirror_token_stack() 1216 | self.__pop_mirror_token_stack() 1217 | self.__pop_mirror_token_stack() 1218 | 1219 | # push `0` into mirror stack for placeholder 1220 | self.__push_mirror_token_stack(lexer_tokens.TOKEN_NUMBER_0) 1221 | 1222 | else: 1223 | return f"Unexpected token: {token}, token symbol: {token_symbol}" 1224 | 1225 | if token == lexer_tokens.TOKEN_EOF: 1226 | break 1227 | 1228 | return None 1229 | 1230 | def complete_json(self) -> str: 1231 | """ 1232 | complete the incomplete JSON string by concatenating the JSON content with the mirror tokens 1233 | """ 1234 | # This combines json_content and mirror token stack into a complete JSON string 1235 | return "".join(self.json_content) + self.__dump_mirror_token_stack_to_string() 1236 | --------------------------------------------------------------------------------
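Usage sketch (not part of the repository files above): the snippet below illustrates how append_string and complete_json from lexer.py cooperate when a JSON document arrives in chunks. The chunk contents and the printed results are hypothetical examples chosen for illustration; only the streamingjson.Lexer API shown in this dump is assumed.

    import streamingjson

    # Feed fragments of one JSON object; after every fragment the mirror
    # token stack lets complete_json() close whatever is still open.
    lexer = streamingjson.Lexer()
    for chunk in ['{"name": "str', 'eaming", "ok": tr', 'ue, "items": [1, 2']:
        lexer.append_string(chunk)
        print(lexer.complete_json())

    # Expected progression (illustrative):
    #   {"name": "str"}
    #   {"name": "streaming", "ok": true}
    #   {"name": "streaming", "ok": true, "items": [1, 2]}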