├── .github └── workflows │ ├── deploy.yml │ └── test-deploy.yml ├── .gitignore ├── CHANGELOG ├── LICENSE ├── MANIFEST.in ├── README.md ├── _chompjs ├── buffer.c ├── buffer.h ├── module.c ├── parser.c └── parser.h ├── chompjs ├── __init__.py ├── chompjs.py └── test_parser.py ├── docs └── index.html ├── setup.py └── tox.ini /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: deploy 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | build_wheels: 8 | name: Build wheels on ${{ matrix.os }} 9 | runs-on: ${{ matrix.os }} 10 | strategy: 11 | matrix: 12 | os: [ubuntu-latest, windows-latest, macos-13, macos-latest] 13 | 14 | steps: 15 | - uses: actions/checkout@v4 16 | 17 | - name: Build wheels 18 | uses: pypa/cibuildwheel@v2.22.0 19 | env: 20 | CIBW_SKIP: cp36-* cp37-* cp38-* pp* 21 | 22 | - uses: actions/upload-artifact@v4 23 | with: 24 | name: chompjs-wheels-${{ matrix.os }}-${{ strategy.job-index }} 25 | path: ./wheelhouse/*.whl 26 | 27 | build_sdist: 28 | name: Build source distribution 29 | runs-on: ubuntu-latest 30 | steps: 31 | - uses: actions/checkout@v4 32 | 33 | - name: Build sdist 34 | run: pipx run build --sdist 35 | 36 | - uses: actions/upload-artifact@v4 37 | with: 38 | name: chompjs-sdist 39 | path: dist/*.tar.gz 40 | 41 | upload_pypi: 42 | needs: [build_wheels, build_sdist] 43 | runs-on: ubuntu-latest 44 | environment: pypi 45 | permissions: 46 | id-token: write 47 | steps: 48 | - uses: actions/download-artifact@v4 49 | with: 50 | pattern: chompjs-* 51 | path: dist 52 | merge-multiple: true 53 | 54 | - uses: pypa/gh-action-pypi-publish@release/v1 55 | -------------------------------------------------------------------------------- /.github/workflows/test-deploy.yml: -------------------------------------------------------------------------------- 1 | name: test-deploy 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | build_wheels: 8 | name: Build wheels on ${{ matrix.os }} 9 | 
runs-on: ${{ matrix.os }} 10 | strategy: 11 | matrix: 12 | os: [ubuntu-latest, windows-latest, macos-13, macos-latest] 13 | 14 | steps: 15 | - uses: actions/checkout@v4 16 | 17 | - name: Build wheels 18 | uses: pypa/cibuildwheel@v2.22.0 19 | env: 20 | CIBW_SKIP: cp36-* cp37-* cp38-* pp* 21 | 22 | - uses: actions/upload-artifact@v4 23 | with: 24 | name: chompjs-wheels-${{ matrix.os }}-${{ strategy.job-index }} 25 | path: ./wheelhouse/*.whl 26 | 27 | build_sdist: 28 | name: Build source distribution 29 | runs-on: ubuntu-latest 30 | steps: 31 | - uses: actions/checkout@v4 32 | 33 | - name: Build sdist 34 | run: pipx run build --sdist 35 | 36 | - uses: actions/upload-artifact@v4 37 | with: 38 | name: chompjs-sdist 39 | path: dist/*.tar.gz 40 | 41 | upload_pypi: 42 | needs: [build_wheels, build_sdist] 43 | runs-on: ubuntu-latest 44 | environment: pypi 45 | permissions: 46 | id-token: write 47 | steps: 48 | - uses: actions/download-artifact@v4 49 | with: 50 | pattern: chompjs-* 51 | path: dist 52 | merge-multiple: true 53 | 54 | - uses: pypa/gh-action-pypi-publish@release/v1 55 | with: 56 | repository-url: https://test.pypi.org/legacy/ 57 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Object files 5 | *.o 6 | *.ko 7 | *.obj 8 | *.elf 9 | 10 | # Linker output 11 | *.ilk 12 | *.map 13 | *.exp 14 | 15 | # Precompiled Headers 16 | *.gch 17 | *.pch 18 | 19 | # Libraries 20 | *.lib 21 | *.a 22 | *.la 23 | *.lo 24 | 25 | # Shared objects (inc. 
Windows DLLs) 26 | *.dll 27 | *.so 28 | *.so.* 29 | *.dylib 30 | 31 | # Executables 32 | *.exe 33 | *.out 34 | *.app 35 | *.i*86 36 | *.x86_64 37 | *.hex 38 | 39 | # Debug files 40 | *.dSYM/ 41 | *.su 42 | *.idb 43 | *.pdb 44 | 45 | # Kernel Module Compile Results 46 | *.mod* 47 | *.cmd 48 | .tmp_versions/ 49 | modules.order 50 | Module.symvers 51 | Mkfile.old 52 | dkms.conf 53 | 54 | 55 | # Byte-compiled / optimized / DLL files 56 | __pycache__/ 57 | *.py[cod] 58 | *$py.class 59 | 60 | # C extensions 61 | *.so 62 | 63 | # Distribution / packaging 64 | .Python 65 | build/ 66 | develop-eggs/ 67 | dist/ 68 | downloads/ 69 | eggs/ 70 | .eggs/ 71 | lib/ 72 | lib64/ 73 | parts/ 74 | sdist/ 75 | var/ 76 | wheels/ 77 | pip-wheel-metadata/ 78 | share/python-wheels/ 79 | *.egg-info/ 80 | .installed.cfg 81 | *.egg 82 | MANIFEST 83 | 84 | # PyInstaller 85 | # Usually these files are written by a python script from a template 86 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
87 | *.manifest 88 | *.spec 89 | 90 | # Installer logs 91 | pip-log.txt 92 | pip-delete-this-directory.txt 93 | 94 | # Unit test / coverage reports 95 | htmlcov/ 96 | .tox/ 97 | .nox/ 98 | .coverage 99 | .coverage.* 100 | .cache 101 | nosetests.xml 102 | coverage.xml 103 | *.cover 104 | *.py,cover 105 | .hypothesis/ 106 | .pytest_cache/ 107 | cover/ 108 | 109 | # Translations 110 | *.mo 111 | *.pot 112 | 113 | # Django stuff: 114 | *.log 115 | local_settings.py 116 | db.sqlite3 117 | db.sqlite3-journal 118 | 119 | # Flask stuff: 120 | instance/ 121 | .webassets-cache 122 | 123 | # Scrapy stuff: 124 | .scrapy 125 | 126 | # Sphinx documentation 127 | docs/_build/ 128 | 129 | # PyBuilder 130 | target/ 131 | 132 | # Jupyter Notebook 133 | .ipynb_checkpoints 134 | 135 | # IPython 136 | profile_default/ 137 | ipython_config.py 138 | 139 | # pyenv 140 | # For a library or package, you might want to ignore these files since the code is 141 | # intended to run in multiple environments; otherwise, check them in: 142 | # .python-version 143 | 144 | # pipenv 145 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 146 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 147 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 148 | # install all needed dependencies. 149 | #Pipfile.lock 150 | 151 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 152 | __pypackages__/ 153 | 154 | # Celery stuff 155 | celerybeat-schedule 156 | celerybeat.pid 157 | 158 | # SageMath parsed files 159 | *.sage.py 160 | 161 | # Environments 162 | .env 163 | .venv 164 | env/ 165 | venv/ 166 | ENV/ 167 | env.bak/ 168 | venv.bak/ 169 | 170 | # Spyder project settings 171 | .spyderproject 172 | .spyproject 173 | 174 | # Rope project settings 175 | .ropeproject 176 | 177 | # mkdocs documentation 178 | /site 179 | 180 | # mypy 181 | .mypy_cache/ 182 | .dmypy.json 183 | dmypy.json 184 | 185 | # Pyre type checker 186 | .pyre/ 187 | 188 | # pytype static type analyzer 189 | .pytype/ 190 | 191 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | [1.3.2] 2 | * Release the GIL during parsing in C (#69) 3 | 4 | [1.3.1] 5 | * Introduced CI to build wheels (#68) 6 | 7 | [1.3.0] 8 | * Allow custom load function (https://github.com/Nykakin/chompjs/pull/63) 9 | 10 | [1.2.4] 11 | * Remove trailing whitespaces for unrecognized values (#59) 12 | * Fix segfault on empty string (#62) 13 | 14 | [1.2.3] 15 | * Remove trailing whitespaces from unquoted keys (#57) 16 | 17 | [1.2.2] 18 | * Fix parsing some floating numbers as octal (#52) 19 | * Fix number of digits calculation (#50) 20 | 21 | [1.2.1] 22 | * Fix compilation on Windows (#49) 23 | 24 | [1.2.0] 25 | * Allow trailing dot at the end of numerals (#39) 26 | * Parse hexadecimal, binary and octal literals as numbers (#40) 27 | * Drop support for Python 2.7 (#44) 28 | * Add parse_js_objects function (#45) 29 | * Drop jsonlines flag in parse_json_object in favor of parse_json_objects (#46) 30 | * Improve documentation (#47, #32) 31 | 32 | [1.1.9] 33 | * Handle NaN in input (#37) 34 | 35 | [1.1.8] 36 | * Fixed previous release (package couldn't be installed) 37 | 38 | [1.1.7] 39 | * Handle unquoted properties starting with reserved JS keywords 
(#34) 40 | 41 | [1.1.6] 42 | * Handle bug with parsing arrays like `["","/"]` (#33) 43 | 44 | [1.1.5] 45 | * Correctly handle malformed quotations (#31) 46 | 47 | [1.1.4] 48 | * Performance improvement (#19) 49 | * Handle numeric keys (#20) 50 | * Refactor error handling (#29) 51 | 52 | [1.1.3] 53 | * Avoid an infinite loop on malformed input (#27) 54 | 55 | [1.1.2] 56 | * Handle comments in JavaScript code (#22) 57 | 58 | [1.1.1] 59 | * Fix installation bug (headers moved to a different dir) 60 | 61 | [1.1.0] 62 | * Parser refactored and rewritten in order to simplify code and improve speed 63 | * Allow handling JavaScript functions and other strange stuff such as regexes (#16) 64 | * Allow passing down json.loads parameters 65 | * Allow handling hexadecimal, octal and binary literals (#12) 66 | 67 | [1.0.17] 68 | * Handle memory corruption on unclosed quotations (#13) 69 | 70 | [1.0.16] 71 | * Handle floats with leading zeros (#10) 72 | 73 | [1.0.15] 74 | * Handle $ and _ characters at the beginning of keys (#9) 75 | 76 | [1.0.14] 77 | * Handle "undefined" keyword in JavaScript objects (#7) 78 | 79 | [1.0.13] 80 | * Handle escaped quotations correctly (#6) 81 | 82 | [1.0.12] 83 | * Handle windows newlines (#5) 84 | 85 | [1.0.11] 86 | * Handle jsonlines (#3) 87 | 88 | [1.0.1] 89 | * Handle Unicode in keys (#2) 90 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2020 Mariusz Obajtek 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The 
above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft _chompjs 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Chompjs 2 | 3 | ![license](https://img.shields.io/github/license/Nykakin/chompjs?style=flat-square) 4 | ![pypi version](https://img.shields.io/pypi/v/chompjs.svg) 5 | ![python version](https://img.shields.io/pypi/pyversions/chompjs.svg) 6 | ![downloads](https://img.shields.io/pypi/dm/chompjs.svg) 7 | 8 | Transforms JavaScript objects into Python data structures. 9 | 10 | In web scraping, you sometimes need to transform Javascript objects embedded in HTML pages into valid Python dictionaries. `chompjs` is a library designed to do that as a more powerful replacement of standard `json.loads`: 11 | 12 | ```python 13 | >>> chompjs.parse_js_object("{a: 100}") 14 | {'a': 100} 15 | >>> 16 | >>> json_lines = """ 17 | ... {'a': 12} 18 | ... {'b': 13} 19 | ... {'c': 14} 20 | ... """ 21 | >>> for entry in chompjs.parse_js_objects(json_lines): 22 | ... print(entry) 23 | ... 
24 | {'a': 12} 25 | {'b': 13} 26 | {'c': 14} 27 | ``` 28 | 29 | [Reference documentation](https://nykakin.github.io/chompjs/) 30 | 31 | ## Quickstart 32 | 33 | **1. installation** 34 | 35 | ``` 36 | > pip install chompjs 37 | ``` 38 | 39 | or build from source: 40 | 41 | ```bash 42 | $ git clone https://github.com/Nykakin/chompjs 43 | $ cd chompjs 44 | $ python setup.py build 45 | $ python setup.py install 46 | ``` 47 | 48 | ## Features 49 | 50 | There are two functions available: 51 | * `parse_js_object` - try reading first encountered JSON-like object. Raises `ValueError` on failure 52 | * `parse_js_objects` - returns a generator yielding all encountered JSON-like objects. Can be used to read [JSON Lines](https://jsonlines.org/). Does not raise on invalid input. 53 | 54 | An example usage with `scrapy`: 55 | 56 | ```python 57 | import chompjs 58 | import scrapy 59 | 60 | 61 | class MySpider(scrapy.Spider): 62 | # ... 63 | 64 | def parse(self, response): 65 | script_css = 'script:contains("__NEXT_DATA__")::text' 66 | script_pattern = r'__NEXT_DATA__ = (.*);' 67 | # warning: for some pages you need to pass replace_entities=True 68 | # into re_first to have JSON escaped properly 69 | script_text = response.css(script_css).re_first(script_pattern) 70 | try: 71 | json_data = chompjs.parse_js_object(script_text) 72 | except ValueError: 73 | self.log('Failed to extract data from {}'.format(response.url)) 74 | return 75 | 76 | # work on json_data 77 | ``` 78 | 79 | Parsing of [JSON5 objects](https://json5.org/) is supported: 80 | 81 | ```python 82 | >>> data = """ 83 | ... { 84 | ... // comments 85 | ... unquoted: 'and you can quote me on that', 86 | ... singleQuotes: 'I can use "double quotes" here', 87 | ... lineBreaks: "Look, Mom! \ 88 | ... No \\n's!", 89 | ... hexadecimal: 0xdecaf, 90 | ... leadingDecimalPoint: .8675309, andTrailing: 8675309., 91 | ... positiveSign: +1, 92 | ... trailingComma: 'in objects', andIn: ['arrays',], 93 | ... 
"backwardsCompatible": "with JSON", 94 | ... } 95 | ... """ 96 | >>> chompjs.parse_js_object(data) 97 | {'unquoted': 'and you can quote me on that', 'singleQuotes': 'I can use "double quotes" here', 'lineBreaks': "Look, Mom! No \n's!", 'hexadecimal': 912559, 'leadingDecimalPoint': 0.8675309, 'andTrailing': 8675309.0, 'positiveSign': '+1', 'trailingComma': 'in objects', 'andIn': ['arrays'], 'backwardsCompatible': 'with JSON'} 98 | ``` 99 | 100 | If the input string is not yet escaped and contains a lot of `\\` characters, then `unicode_escape=True` argument might help to sanitize it: 101 | 102 | ```python 103 | >>> chompjs.parse_js_object('{\\\"a\\\": 12}', unicode_escape=True) 104 | {'a': 12} 105 | ``` 106 | 107 | By default `chompjs` tries to start with first `{` or `[` character it founds, omitting the rest: 108 | 109 | ```python 110 | >>> chompjs.parse_js_object('
...
...
') 111 | [1, 2, 3] 112 | ``` 113 | 114 | Post-processed input is parsed using `json.loads` by default. A different loader such as `orjson` can be used with the `loader` argument: 115 | 116 | ```python 117 | >>> import orjson 118 | >>> import chompjs 119 | >>> 120 | >>> chompjs.parse_js_object("{'a': 12}", loader=orjson.loads) 121 | {'a': 12} 122 | ``` 123 | 124 | `loader_args` and `loader_kwargs` arguments can be used to pass options to the underlying loader function. For example for default `json.loads` you can pass down options such as `strict` or `object_hook`: 125 | 126 | ```python 127 | >>> import decimal 128 | >>> import chompjs 129 | >>> chompjs.parse_js_object('[23.2]', loader_kwargs={'parse_float': decimal.Decimal}) 130 | [Decimal('23.2')] 131 | ``` 132 | 133 | # Rationale 134 | 135 | In web scraping data often is not present directly inside HTML, but instead provided as an embedded JavaScript object that is later used to initialize the page, for example: 136 | 137 | ```html 138 | 139 | ... 140 | 141 | ... 142 | 143 | ... 144 | 145 | 146 | ``` 147 | 148 | Standard library function `json.loads` is usually sufficient to extract this data: 149 | 150 | ```python 151 | >>> # scrapy shell file:///tmp/test.html 152 | >>> import json 153 | >>> script_text = response.css('script:contains(__PRELOADED_STATE__)::text').re_first('__PRELOADED_STATE__=(.*)') 154 | >>> json.loads(script_text) 155 | {u'foo': u'bar'} 156 | 157 | ``` 158 | The problem is that not all valid JavaScript objects are also valid JSONs. 
For example all those strings are valid JavaScript objects but not valid JSONs: 159 | 160 | * `"{'a': 'b'}"` is not a valid JSON because it uses `'` character to quote 161 | * `'{a: "b"}'` is not a valid JSON because property name is not quoted at all 162 | * `'{"a": [1, 2, 3,]}'` is not a valid JSON because there is an extra `,` character at the end of the array 163 | * `'{"a": .99}'` is not a valid JSON because float value lacks a leading 0 164 | 165 | As a result, `json.loads` fails to extract any of those: 166 | 167 | ```python 168 | >>> json.loads("{'a': 'b'}") 169 | Traceback (most recent call last): 170 | ... 171 | ValueError: Expecting property name: line 1 column 2 (char 1) 172 | >>> json.loads('{a: "b"}') 173 | Traceback (most recent call last): 174 | ... 175 | ValueError: Expecting property name: line 1 column 2 (char 1) 176 | >>> json.loads('{"a": [1, 2, 3,]}') 177 | Traceback (most recent call last): 178 | ... 179 | ValueError: No JSON object could be decoded 180 | >>> json.loads('{"a": .99}') 181 | Traceback (most recent call last): 182 | ... 183 | json.decoder.JSONDecodeError: Expecting value: line 1 column 7 (char 6) 184 | 185 | ``` 186 | `chompjs` library was designed to bypass this limitation, and it allows scraping such JavaScript objects into proper Python dictionaries: 187 | 188 | ```python 189 | >>> import chompjs 190 | >>> 191 | >>> chompjs.parse_js_object("{'a': 'b'}") 192 | {'a': 'b'} 193 | >>> chompjs.parse_js_object('{a: "b"}') 194 | {'a': 'b'} 195 | >>> chompjs.parse_js_object('{"a": [1, 2, 3,]}') 196 | {'a': [1, 2, 3]} 197 | >>> chompjs.parse_js_object('{"a": .99}') 198 | {'a': 0.99} 199 | ``` 200 | 201 | Internally `chompjs` uses a parser written in C to iterate over the raw string, fixing its issues along the way. The final result is then passed down to standard library's `json.loads`, ensuring a high speed as compared to full-blown JavaScript parsers such as `demjson`. 
202 | 203 | ```python 204 | >>> import json 205 | >>> import _chompjs 206 | >>> 207 | >>> _chompjs.parse('{a: 1}') 208 | '{"a":1}' 209 | >>> json.loads(_) 210 | {'a': 1} 211 | ``` 212 | 213 | # Development 214 | Pull requests are welcome. 215 | 216 | To run unittests 217 | 218 | ``` 219 | $ tox 220 | ``` 221 | -------------------------------------------------------------------------------- /_chompjs/buffer.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020-2025 Mariusz Obajtek. All rights reserved. 3 | * License: https://github.com/Nykakin/chompjs/blob/master/LICENSE 4 | */ 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "buffer.h" 12 | 13 | void init_char_buffer(struct CharBuffer* buffer, size_t initial_depth_buffer_size) { 14 | buffer->data = malloc(initial_depth_buffer_size); 15 | buffer->memory_buffer_length = initial_depth_buffer_size; 16 | buffer->index = 0; 17 | } 18 | 19 | void release_char_buffer(struct CharBuffer* buffer) { 20 | free(buffer->data); 21 | } 22 | 23 | void check_capacity(struct CharBuffer* buffer, size_t to_save) { 24 | if(buffer->index + to_save >= buffer->memory_buffer_length) { 25 | buffer->data = realloc(buffer->data, 2*buffer->memory_buffer_length); 26 | buffer->memory_buffer_length *= 2; 27 | } 28 | } 29 | 30 | void push(struct CharBuffer* buffer, char value) { 31 | check_capacity(buffer, 1); 32 | buffer->data[buffer->index] = value; 33 | buffer->index += 1; 34 | } 35 | 36 | void push_string(struct CharBuffer* buffer, const char* value, size_t len) { 37 | check_capacity(buffer, len); 38 | memcpy(buffer->data + buffer->index, value, len); 39 | buffer->index += len; 40 | } 41 | 42 | void push_number(struct CharBuffer* buffer, long value) { 43 | int size_in_chars; 44 | if (value == 0) { 45 | size_in_chars = 2; 46 | } else { 47 | size_in_chars = floor(log10(value)) + 2; 48 | } 49 | check_capacity(buffer, size_in_chars); 50 | buffer->index += 
sprintf(buffer->data + buffer->index, "%ld", value); 51 | } 52 | 53 | void pop(struct CharBuffer* buffer) { 54 | buffer->index -= 1; 55 | } 56 | 57 | char top(struct CharBuffer* buffer) { 58 | return buffer->data[buffer->index-1]; 59 | } 60 | 61 | bool empty(struct CharBuffer* buffer) { 62 | return buffer->index <= 0; 63 | } 64 | 65 | void clear(struct CharBuffer* buffer) { 66 | buffer->index = 0; 67 | } 68 | 69 | size_t size(struct CharBuffer* buffer) { 70 | return buffer->index; 71 | } 72 | -------------------------------------------------------------------------------- /_chompjs/buffer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020-2025 Mariusz Obajtek. All rights reserved. 3 | * License: https://github.com/Nykakin/chompjs/blob/master/LICENSE 4 | */ 5 | 6 | #ifndef CHOMPJS_BUFFER_H 7 | #define CHOMPJS_BUFFER_H 8 | 9 | #include 10 | #include 11 | 12 | /** 13 | Implements a safe, dynamically growing char buffer 14 | */ 15 | struct CharBuffer { 16 | char* data; 17 | size_t memory_buffer_length; 18 | size_t index; 19 | }; 20 | 21 | void init_char_buffer(struct CharBuffer* buffer, size_t initial_depth_buffer_size); 22 | 23 | void release_char_buffer(struct CharBuffer* buffer); 24 | 25 | void check_capacity(struct CharBuffer* buffer, size_t to_save); 26 | 27 | void push(struct CharBuffer* buffer, char value); 28 | 29 | void push_string(struct CharBuffer* buffer, const char* value, size_t len); 30 | 31 | void push_number(struct CharBuffer* buffer, long value); 32 | 33 | void pop(struct CharBuffer* buffer); 34 | 35 | char top(struct CharBuffer* buffer); 36 | 37 | bool empty(struct CharBuffer* buffer); 38 | 39 | void clear(struct CharBuffer* buffer); 40 | 41 | size_t size(struct CharBuffer* buffer); 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /_chompjs/module.c: -------------------------------------------------------------------------------- 1 | /* 
2 | * Copyright 2020-2025 Mariusz Obajtek. All rights reserved. 3 | * License: https://github.com/Nykakin/chompjs/blob/master/LICENSE 4 | */ 5 | 6 | #define PY_SSIZE_T_CLEAN 7 | #include 8 | #include 9 | #include "parser.h" 10 | 11 | static PyObject* parse_python_object(PyObject *self, PyObject *args) { 12 | const char* string; 13 | if (!PyArg_ParseTuple(args, "s", &string)) { 14 | return NULL; 15 | } 16 | 17 | struct Lexer lexer; 18 | init_lexer(&lexer, string); 19 | Py_BEGIN_ALLOW_THREADS 20 | while(lexer.lexer_status == CAN_ADVANCE) { 21 | advance(&lexer); 22 | } 23 | Py_END_ALLOW_THREADS 24 | 25 | PyObject* ret = Py_BuildValue("s#", lexer.output.data, lexer.output.index-1); 26 | release_lexer(&lexer); 27 | if(lexer.lexer_status == ERROR) { 28 | const char* msg_sting = "Error parsing input near character %d"; 29 | size_t error_buffer_size = snprintf( 30 | NULL, 31 | 0, 32 | msg_sting, 33 | lexer.input_position 34 | ); 35 | char* error_buffer = malloc(error_buffer_size + 1); 36 | sprintf( 37 | error_buffer, 38 | msg_sting, 39 | lexer.input_position - 1 40 | ); 41 | PyErr_SetString(PyExc_ValueError, error_buffer); 42 | free(error_buffer); 43 | return NULL; 44 | } 45 | return ret; 46 | } 47 | 48 | typedef struct { 49 | PyObject_HEAD 50 | struct Lexer lexer; 51 | } JsonIterState; 52 | 53 | static PyObject* json_iter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) { 54 | JsonIterState* json_iter_state = (JsonIterState *)type->tp_alloc(type, 0); 55 | if (!json_iter_state) { 56 | return NULL; 57 | } 58 | 59 | const char* string; 60 | if (!PyArg_ParseTuple(args, "s", &string)) { 61 | return NULL; 62 | } 63 | init_lexer(&json_iter_state->lexer, string); 64 | 65 | return (PyObject* )json_iter_state; 66 | } 67 | 68 | static void json_iter_dealloc(JsonIterState* json_iter_state) { 69 | release_lexer(&json_iter_state->lexer); 70 | Py_TYPE(json_iter_state)->tp_free(json_iter_state); 71 | } 72 | 73 | static PyObject* json_iter_next(JsonIterState* json_iter_state) { 
74 | Py_BEGIN_ALLOW_THREADS 75 | while(json_iter_state->lexer.lexer_status == CAN_ADVANCE) { 76 | advance(&json_iter_state->lexer); 77 | } 78 | Py_END_ALLOW_THREADS 79 | 80 | if(json_iter_state->lexer.output.index == 1) { 81 | return NULL; 82 | } 83 | PyObject* ret = Py_BuildValue( 84 | "s#", 85 | json_iter_state->lexer.output.data, 86 | json_iter_state->lexer.output.index-1 87 | ); 88 | reset_lexer_output(&json_iter_state->lexer); 89 | return ret; 90 | } 91 | 92 | PyTypeObject JSONIter_Type = { 93 | PyVarObject_HEAD_INIT(NULL, 0) 94 | "json_iter", /* tp_name */ 95 | sizeof(JsonIterState), /* tp_basicsize */ 96 | 0, /* tp_itemsize */ 97 | (destructor)json_iter_dealloc, /* tp_dealloc */ 98 | 0, /* tp_print */ 99 | 0, /* tp_getattr */ 100 | 0, /* tp_setattr */ 101 | 0, /* tp_reserved */ 102 | 0, /* tp_repr */ 103 | 0, /* tp_as_number */ 104 | 0, /* tp_as_sequence */ 105 | 0, /* tp_as_mapping */ 106 | 0, /* tp_hash */ 107 | 0, /* tp_call */ 108 | 0, /* tp_str */ 109 | 0, /* tp_getattro */ 110 | 0, /* tp_setattro */ 111 | 0, /* tp_as_buffer */ 112 | Py_TPFLAGS_DEFAULT, /* tp_flags */ 113 | 0, /* tp_doc */ 114 | 0, /* tp_traverse */ 115 | 0, /* tp_clear */ 116 | 0, /* tp_richcompare */ 117 | 0, /* tp_weaklistoffset */ 118 | PyObject_SelfIter, /* tp_iter */ 119 | (iternextfunc)json_iter_next, /* tp_iternext */ 120 | 0, /* tp_methods */ 121 | 0, /* tp_members */ 122 | 0, /* tp_getset */ 123 | 0, /* tp_base */ 124 | 0, /* tp_dict */ 125 | 0, /* tp_descr_get */ 126 | 0, /* tp_descr_set */ 127 | 0, /* tp_dictoffset */ 128 | 0, /* tp_init */ 129 | PyType_GenericAlloc, /* tp_alloc */ 130 | json_iter_new, /* tp_new */ 131 | }; 132 | 133 | static PyObject* parse_python_objects(PyObject *self, PyObject *args) { 134 | PyObject *obj = PyObject_CallObject((PyObject *) &JSONIter_Type, args); 135 | return obj; 136 | } 137 | 138 | static PyMethodDef parser_methods[] = { 139 | { 140 | "parse", parse_python_object, METH_VARARGS, 141 | "Extract JSON object from the string" 142 | }, 143 | 
{ 144 | "parse_objects", parse_python_objects, METH_VARARGS, 145 | "Iterate over all JSON objects in the string" 146 | }, 147 | {NULL, NULL, 0, NULL} 148 | }; 149 | 150 | 151 | static struct PyModuleDef parser_definition = { 152 | PyModuleDef_HEAD_INIT, 153 | "_chompjs", 154 | "C extension for fast JavaScript object parsing", 155 | -1, 156 | parser_methods 157 | }; 158 | 159 | PyMODINIT_FUNC PyInit__chompjs(void) { 160 | Py_Initialize(); 161 | PyObject* module = PyModule_Create(&parser_definition); 162 | if (!module) { 163 | return NULL; 164 | } 165 | if (PyType_Ready(&JSONIter_Type) < 0) { 166 | return NULL; 167 | } 168 | return module; 169 | } 170 | -------------------------------------------------------------------------------- /_chompjs/parser.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020-2025 Mariusz Obajtek. All rights reserved. 3 | * License: https://github.com/Nykakin/chompjs/blob/master/LICENSE 4 | */ 5 | 6 | #include "parser.h" 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #define INITIAL_NESTING_DEPTH 20 15 | 16 | struct State states[] = { 17 | {begin}, 18 | {json}, 19 | {value}, 20 | {end}, 21 | {error}, 22 | }; 23 | 24 | enum StateIndex { 25 | BEGIN_STATE, JSON_STATE, VALUE_STATE, END_STATE, ERROR_STATE 26 | }; 27 | 28 | void advance(struct Lexer* lexer) { 29 | lexer->state = lexer->state->change(lexer); 30 | } 31 | 32 | char next_char(struct Lexer* lexer) { 33 | while(1) { 34 | if(isspace(lexer->input[lexer->input_position])) { 35 | lexer->input_position += 1; 36 | continue; 37 | } 38 | return lexer->input[lexer->input_position]; 39 | } 40 | return '\0'; 41 | } 42 | 43 | char last_char(struct Lexer* lexer) { 44 | return top(&lexer->output); 45 | } 46 | 47 | void emit(char c, struct Lexer* lexer) { 48 | push(&lexer->output, c); 49 | lexer->input_position += 1; 50 | } 51 | 52 | void emit_in_place(char c, struct Lexer* lexer) { 53 | push(&lexer->output, 
c); 54 | } 55 | 56 | void unemit(struct Lexer* lexer) { 57 | pop(&lexer->output); 58 | } 59 | 60 | void emit_string(const char *s, size_t size, struct Lexer* lexer) { 61 | push_string(&lexer->output, s, size); 62 | lexer->input_position += size; 63 | } 64 | 65 | void emit_string_in_place(const char *s, size_t size, struct Lexer* lexer) { 66 | push_string(&lexer->output, s, size); 67 | } 68 | 69 | void emit_number_in_place(long value, struct Lexer* lexer) { 70 | push_number(&lexer->output, value); 71 | } 72 | 73 | void init_lexer(struct Lexer* lexer, const char* string) { 74 | lexer->input = string; 75 | // allocate in advance more memory for output than for input because we might need 76 | // to add extra characters 77 | // for example `{a: undefined}` will be translated as `{"a": "undefined"}` 78 | lexer->output_size = 2 * strlen(string) + 1; 79 | init_char_buffer(&lexer->output, lexer->output_size); 80 | lexer->input_position = 0; 81 | init_char_buffer(&lexer->nesting_depth, INITIAL_NESTING_DEPTH); 82 | lexer->unrecognized_nesting_depth = 0; 83 | lexer->lexer_status = CAN_ADVANCE; 84 | lexer->state = &states[BEGIN_STATE]; 85 | lexer->is_key = false; 86 | } 87 | 88 | void reset_lexer_output(struct Lexer* lexer) { 89 | clear(&lexer->output); 90 | lexer->lexer_status = CAN_ADVANCE; 91 | lexer->state = &states[BEGIN_STATE]; 92 | lexer->is_key = false; 93 | lexer->input_position -= 1; 94 | } 95 | 96 | void release_lexer(struct Lexer* lexer) { 97 | release_char_buffer(&lexer->output); 98 | } 99 | 100 | struct State* begin(struct Lexer* lexer) { 101 | // Ignoring characters until either '{' or '[' appears 102 | for(;;) { 103 | switch(next_char(lexer)) { 104 | case '{': 105 | lexer->is_key = true; 106 | case '[':; 107 | return &states[JSON_STATE]; 108 | break; 109 | case '\0':; 110 | return &states[END_STATE]; 111 | default: 112 | lexer->input_position += 1; 113 | } 114 | } 115 | return &states[ERROR_STATE]; 116 | } 117 | 118 | struct State* json(struct Lexer* lexer) { 
119 | for(;;) { 120 | switch(next_char(lexer)) { 121 | case '{': 122 | push(&lexer->nesting_depth, '{'); 123 | lexer->is_key = true; 124 | emit('{', lexer); 125 | break; 126 | case '[': 127 | push(&lexer->nesting_depth, '['); 128 | emit('[', lexer); 129 | break; 130 | case '}': 131 | if(last_char(lexer) == ',') { 132 | unemit(lexer); 133 | } 134 | pop(&lexer->nesting_depth); 135 | lexer->is_key = top(&lexer->nesting_depth) == '{'; 136 | emit('}', lexer); 137 | if(size(&lexer->nesting_depth) <= 0) { 138 | return &states[END_STATE]; 139 | } 140 | break; 141 | case ']': 142 | if(last_char(lexer) == ',') { 143 | unemit(lexer); 144 | } 145 | pop(&lexer->nesting_depth); 146 | lexer->is_key = top(&lexer->nesting_depth) == '{'; 147 | emit(']', lexer); 148 | if(size(&lexer->nesting_depth) <= 0) { 149 | return &states[END_STATE]; 150 | } 151 | break; 152 | case ':': 153 | lexer->is_key = false; 154 | emit(':', lexer); 155 | break; 156 | case ',': 157 | emit(',', lexer); 158 | lexer->is_key = top(&lexer->nesting_depth) == '{'; 159 | break; 160 | 161 | case '/':; 162 | char next_c = lexer->input[lexer->input_position+1]; 163 | if(next_c == '/' || next_c == '*') { 164 | handle_comments(lexer); 165 | } else { 166 | return &states[VALUE_STATE]; 167 | } 168 | break; 169 | 170 | // This should never happen, but an malformed input can 171 | // cause an infinite loop without this check 172 | case '>': 173 | case ')':; 174 | return &states[ERROR_STATE]; 175 | break; 176 | 177 | default: 178 | return &states[VALUE_STATE]; 179 | } 180 | } 181 | 182 | return &states[ERROR_STATE]; 183 | } 184 | 185 | struct State* _handle_string(struct Lexer* lexer, const char* string, size_t length) { 186 | char next_char = lexer->input[lexer->input_position+length+1]; 187 | if(next_char == '_' || isalnum(next_char)) { 188 | return handle_unrecognized(lexer); 189 | } 190 | emit_string(string, length, lexer); 191 | return &states[JSON_STATE]; 192 | } 193 | 194 | struct State* value(struct Lexer* lexer) { 
195 | char c = next_char(lexer); 196 | const char* position = lexer->input + lexer->input_position; 197 | 198 | if(c == '"' || c == '\'' || c == '`') { 199 | return handle_quoted(lexer); 200 | } else if(isdigit(c) || c == '.' || c == '-') { 201 | if(lexer->is_key) { 202 | return handle_unrecognized(lexer); 203 | } else { 204 | return handle_numeric(lexer); 205 | } 206 | } else if(strncmp(position, "true", 4) == 0) { 207 | return _handle_string(lexer, "true", 4); 208 | } else if(strncmp(position, "false", 5) == 0) { 209 | return _handle_string(lexer, "false", 5); 210 | } else if(strncmp(position, "null", 4) == 0) { 211 | return _handle_string(lexer, "null", 4); 212 | } else if(c == ']' || c == '}' || c == '[' || c == '{') { 213 | return &states[JSON_STATE]; 214 | } else if(strncmp(position, "NaN", 3) == 0) { 215 | return _handle_string(lexer, "NaN", 3); 216 | } else { 217 | return handle_unrecognized(lexer); 218 | } 219 | 220 | return &states[JSON_STATE]; 221 | } 222 | 223 | struct State* end(struct Lexer* lexer) { 224 | emit('\0', lexer); 225 | lexer->lexer_status = FINISHED; 226 | return lexer->state; 227 | } 228 | 229 | struct State* error(struct Lexer* lexer) { 230 | emit('\0', lexer); 231 | lexer->lexer_status = ERROR; 232 | return lexer->state; 233 | } 234 | 235 | struct State* handle_quoted(struct Lexer* lexer) { 236 | char current_quotation = next_char(lexer); 237 | emit('"', lexer); 238 | 239 | for(;;) { 240 | char c = lexer->input[lexer->input_position]; 241 | // handle escape sequences such as \\ and \' 242 | if(c == '\\') { 243 | char escaped = lexer->input[lexer->input_position+1]; 244 | if(escaped == '\'') { 245 | emit('\'', lexer); 246 | lexer->input_position += 1; 247 | } else { 248 | emit('\\', lexer); 249 | emit(escaped, lexer); 250 | } 251 | continue; 252 | } 253 | // in case of malformed quotation we can reach end of the input 254 | if(c == '\0') { 255 | return &states[ERROR_STATE]; 256 | } 257 | // if we're closing the quotations, we're done 
with the string 258 | if(c == current_quotation) { 259 | emit('"', lexer); 260 | return &states[JSON_STATE]; 261 | } 262 | // otherwise, emit character 263 | if(c == '"') { 264 | emit_string_in_place("\\\"", 2, lexer); 265 | lexer->input_position += 1; 266 | } else { 267 | emit(c, lexer); 268 | } 269 | } 270 | 271 | return &states[ERROR_STATE]; 272 | } 273 | 274 | struct State* handle_numeric(struct Lexer* lexer) { 275 | char c = next_char(lexer); 276 | if(c >= 49 && c <= 57) { // 1-9 range 277 | return handle_numeric_standard_base(lexer); 278 | } else if(c == '.') { 279 | emit_in_place('0', lexer); 280 | emit('.', lexer); 281 | return handle_numeric_standard_base(lexer); 282 | } else if(c == '-') { 283 | emit('-', lexer); 284 | return handle_numeric(lexer); 285 | } else if(c == '0') { 286 | char nc = tolower(lexer->input[lexer->input_position+1]); 287 | if(nc == '.') { 288 | emit('0', lexer); 289 | emit('.', lexer); 290 | return handle_numeric_standard_base(lexer); 291 | } else if(nc == 'x' || nc == 'X') { 292 | return handle_numeric_non_standard_base(lexer, 16); 293 | } else if(nc == 'o' || nc == 'O') { 294 | lexer->input_position += 2; 295 | return handle_numeric_non_standard_base(lexer, 8); 296 | } else if(isdigit(nc)) { 297 | return handle_numeric_non_standard_base(lexer, 8); 298 | } else if(nc == 'b' || nc == 'B') { 299 | lexer->input_position += 2; 300 | return handle_numeric_non_standard_base(lexer, 2); 301 | } else { 302 | emit('0', lexer); 303 | return &states[JSON_STATE]; 304 | } 305 | } else { 306 | return &states[ERROR_STATE]; 307 | } 308 | return &states[JSON_STATE]; 309 | } 310 | 311 | struct State* handle_numeric_standard_base(struct Lexer* lexer) { 312 | char c = next_char(lexer); 313 | do { 314 | if(c != '_') { 315 | emit(c, lexer); 316 | } else { 317 | lexer->input_position += 1; 318 | } 319 | c = tolower(lexer->input[lexer->input_position]); 320 | } while(isdigit(c) || c == '.' 
|| c == 'e' || c == 'E' || c == '+' || c =='-' || c == '_'); 321 | if(last_char(lexer) == '.') { 322 | emit_in_place('0', lexer); 323 | } 324 | return &states[JSON_STATE]; 325 | } 326 | 327 | struct State* handle_numeric_non_standard_base(struct Lexer* lexer, int base) { 328 | char* end; 329 | long n = strtol(lexer->input + lexer->input_position, &end, base); 330 | emit_number_in_place(n, lexer); 331 | lexer->input_position = end - lexer->input; 332 | return &states[JSON_STATE]; 333 | } 334 | 335 | struct State* handle_unrecognized(struct Lexer* lexer) { 336 | emit_in_place('"', lexer); 337 | char currently_quoted_with = '\0'; 338 | 339 | lexer->unrecognized_nesting_depth = 0; 340 | do { 341 | char c = lexer->input[lexer->input_position]; 342 | 343 | switch(c) { 344 | case '\\': 345 | emit_in_place('\\', lexer); 346 | emit('\\', lexer); 347 | break; 348 | 349 | case '\'': 350 | case '"': 351 | case '`': 352 | if(c == '"') { 353 | emit_in_place('\\', lexer); 354 | emit('"', lexer); 355 | } else { 356 | emit(c, lexer); 357 | } 358 | 359 | if(!currently_quoted_with) { 360 | currently_quoted_with = c; 361 | } else if (currently_quoted_with == c) { 362 | currently_quoted_with = '\0'; 363 | } 364 | break; 365 | 366 | case '{': 367 | case '[': 368 | case '<': 369 | case '(': 370 | emit(c, lexer); 371 | lexer->unrecognized_nesting_depth += 1; 372 | break; 373 | 374 | case '}': 375 | case ']': 376 | case '>': 377 | case ')': 378 | if(currently_quoted_with && lexer->unrecognized_nesting_depth > 0) { 379 | emit(c, lexer); 380 | } else if(lexer->unrecognized_nesting_depth > 0) { 381 | emit(c, lexer); 382 | lexer->unrecognized_nesting_depth -= 1; 383 | } else { 384 | // remove trailing whitespaces after value 385 | while(isspace(last_char(lexer))) { 386 | pop(&lexer->output); 387 | } 388 | emit_in_place('"', lexer); 389 | return &states[JSON_STATE]; 390 | } 391 | break; 392 | 393 | case ',': 394 | case ':': 395 | if(!currently_quoted_with && lexer->unrecognized_nesting_depth <= 
0) { 396 | // remove trailing whitespaces after key 397 | while(isspace(last_char(lexer))) { 398 | pop(&lexer->output); 399 | } 400 | emit_in_place('"', lexer); 401 | return &states[JSON_STATE]; 402 | } else { 403 | emit(c, lexer); 404 | } 405 | break; 406 | 407 | default: 408 | emit(c, lexer); 409 | } 410 | } while (lexer->input[lexer->input_position] != '\0'); 411 | 412 | return &states[ERROR_STATE]; 413 | } 414 | 415 | void handle_comments(struct Lexer* lexer) { 416 | char c, next_c; 417 | 418 | lexer->input_position += 1; 419 | if(lexer->input[lexer->input_position] == '/' ) { 420 | for(;;) { 421 | lexer->input_position+=1; 422 | c = lexer->input[lexer->input_position]; 423 | if((c == '\0') || (c == '\n')) { 424 | break; 425 | } 426 | } 427 | } else if(lexer->input[lexer->input_position] == '*') { 428 | for(;;) { 429 | lexer->input_position+=1; 430 | c = lexer->input[lexer->input_position]; 431 | next_c = lexer->input[lexer->input_position+1]; 432 | if((c == '\0') || (c == '*' && next_c == '/')) { 433 | break; 434 | } 435 | } 436 | lexer->input_position+=2; 437 | } 438 | } 439 | -------------------------------------------------------------------------------- /_chompjs/parser.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020-2025 Mariusz Obajtek. All rights reserved. 
3 | * License: https://github.com/Nykakin/chompjs/blob/master/LICENSE 4 | */ 5 | 6 | #ifndef CHOMPJS_PARSER_H 7 | #define CHOMPJS_PARSER_H 8 | 9 | #include 10 | #include 11 | 12 | #include "buffer.h" 13 | 14 | struct Lexer; 15 | 16 | /** 17 | States of internal state machine: 18 | * begin - start parsing 19 | * json - handle special characters: "[", "{", "}", "]", ",", ":" 20 | * value - handle a JSON value, such as strings and numbers 21 | * end - finish work 22 | * error - finish work, mark an error 23 | */ 24 | struct State* begin(struct Lexer* lexer); 25 | struct State* json(struct Lexer* lexer); 26 | struct State* value(struct Lexer* lexer); 27 | struct State* end(struct Lexer* lexer); 28 | struct State* error(struct Lexer* lexer); 29 | 30 | /* 31 | Helper functions used in "value" state 32 | * handle_quoted - handles quoted strings 33 | * handle_numeric - handle numbers 34 | * handle_numeric_standard_base - handle numbers in standard base-10 35 | * handle_numeric_non_standard_base - handle numbers in non-standard bases (hex, oct) 36 | * handle_unrecognized - save all unrecognized data as a string 37 | */ 38 | struct State* handle_quoted(struct Lexer* lexer); 39 | struct State* handle_numeric(struct Lexer* lexer); 40 | struct State* handle_numeric_standard_base(struct Lexer* lexer); 41 | struct State* handle_numeric_non_standard_base(struct Lexer* lexer, int base); 42 | struct State* handle_unrecognized(struct Lexer* lexer); 43 | 44 | /** 45 | State wrapper 46 | */ 47 | struct State { 48 | struct State* (*change)(struct Lexer *); 49 | }; 50 | 51 | /** Possible results of internal state machine state change state */ 52 | typedef enum { 53 | CAN_ADVANCE, 54 | FINISHED, 55 | ERROR, 56 | } LexerStatus; 57 | 58 | /** Main object, responsible for everything */ 59 | struct Lexer { 60 | const char* input; 61 | size_t output_size; 62 | struct CharBuffer output; 63 | size_t input_position; 64 | LexerStatus lexer_status; 65 | struct State* state; 66 | struct CharBuffer 
nesting_depth; 67 | size_t unrecognized_nesting_depth; 68 | bool is_key; 69 | }; 70 | 71 | /** Switch state of internal state machine */ 72 | void advance(struct Lexer* lexer); 73 | 74 | /** Get next char, ignore whitespaces */ 75 | char next_char(struct Lexer* lexer); 76 | 77 | /** Get previously handled char */ 78 | char last_char(struct Lexer* lexer); 79 | 80 | /** Send character to output buffer, advance input position */ 81 | void emit(char c, struct Lexer* lexer); 82 | 83 | /** Send character to output buffer, keep old input position */ 84 | void emit_in_place(char c, struct Lexer* lexer); 85 | 86 | /** Remove last character from output buffer */ 87 | void unemit(struct Lexer* lexer); 88 | 89 | /** Send string to output buffer, advance input position */ 90 | void emit_string(const char *s, size_t size, struct Lexer* lexer); 91 | 92 | /** Send string to output buffer, keep old input position */ 93 | void emit_string_in_place(const char *s, size_t size, struct Lexer* lexer); 94 | 95 | /** Send number to output buffer, keep old input position */ 96 | void emit_number_in_place(long value, struct Lexer* lexer); 97 | 98 | /** Handle comments in JSON body */ 99 | void handle_comments(struct Lexer* lexer); 100 | 101 | /** Initialize main lexer object */ 102 | void init_lexer(struct Lexer* lexer, const char* string); 103 | 104 | /** Reset main lexer object output buffer */ 105 | void reset_lexer_output(struct Lexer* lexer); 106 | 107 | /** Release main lexer object and its memory */ 108 | void release_lexer(struct Lexer* lexer); 109 | 110 | #endif 111 | -------------------------------------------------------------------------------- /chompjs/__init__.py: -------------------------------------------------------------------------------- 1 | from .chompjs import parse_js_object, parse_js_objects 2 | -------------------------------------------------------------------------------- /chompjs/chompjs.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import json 4 | import warnings 5 | 6 | from _chompjs import parse, parse_objects 7 | 8 | 9 | def _preprocess(string, unicode_escape=False): 10 | if unicode_escape: 11 | string = string.encode().decode("unicode_escape") 12 | return string 13 | 14 | 15 | def _process_loader_arguments(loader_args, loader_kwargs, json_params): 16 | if json_params: 17 | msg = "json_params argument is deprecated, please use loader_kwargs instead" 18 | warnings.warn(msg, DeprecationWarning) 19 | loader_kwargs = json_params 20 | 21 | if not loader_args: 22 | loader_args = [] 23 | 24 | if not loader_kwargs: 25 | loader_kwargs = {} 26 | 27 | return (loader_args, loader_kwargs) 28 | 29 | 30 | def parse_js_object( 31 | string, 32 | unicode_escape=False, 33 | loader=json.loads, 34 | loader_args=None, 35 | loader_kwargs=None, 36 | json_params=None, 37 | ): 38 | """ 39 | Extracts first JSON object encountered in the input string 40 | 41 | Parameters 42 | ---------- 43 | string: str 44 | Input string 45 | 46 | >>> parse_js_object("{a: 100}") 47 | {'a': 100} 48 | 49 | unicode_escape: bool, optional 50 | Attempt to fix input string if it contains escaped special characters 51 | 52 | >>> parse_js_object('{\\\\"a\\\\": 100}') 53 | {'\\\\"a\\\\"': 100} 54 | >>> parse_js_object('{\\\\"a\\\\": 100}', unicode_escape=True) 55 | {'a': 100} 56 | 57 | loader: func, optional 58 | Function used to load processed input data. 
By default `json.loads` is used 59 | 60 | >>> import orjson 61 | >>> import chompjs 62 | >>> 63 | >>> chompjs.parse_js_object("{'a': 12}", loader=orjson.loads) 64 | {'a': 12} 65 | 66 | loader_args: list, optional 67 | Allow passing down positional arguments to loader function 68 | 69 | loader_kwargs: dict, optional 70 | Allow passing down keyword arguments to loader function 71 | 72 | >>> parse_js_object("{'a': 10.1}") 73 | {'a': 10.1} 74 | >>> import decimal 75 | >>> parse_js_object("{'a': 10.1}", loader_kwargs={'parse_float': decimal.Decimal}) 76 | {'a': Decimal('10.1')} 77 | 78 | .. deprecated:: 1.3.0 79 | json_params: dict, optional 80 | Use `loader_kwargs` instead 81 | 82 | Returns 83 | ------- 84 | list | dict 85 | Extracted JSON object 86 | 87 | Raises 88 | ------ 89 | ValueError 90 | If failed to parse input properly 91 | 92 | ```python 93 | >>> parse_js_object(None) 94 | Traceback (most recent call last): 95 | ... 96 | ValueError: Invalid input 97 | >>> parse_js_object("No JSON objects in sight...") 98 | Traceback (most recent call last): 99 | ... 100 | json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0) 101 | 102 | ``` 103 | 104 | """ 105 | if not string: 106 | raise ValueError("Invalid input") 107 | 108 | loader_args, loader_kwargs = _process_loader_arguments( 109 | loader_args, loader_kwargs, json_params 110 | ) 111 | 112 | string = _preprocess(string, unicode_escape) 113 | parsed_data = parse(string) 114 | return loader(parsed_data, *loader_args, **loader_kwargs) 115 | 116 | 117 | def parse_js_objects( 118 | string, 119 | unicode_escape=False, 120 | omitempty=False, 121 | loader=json.loads, 122 | loader_args=None, 123 | loader_kwargs=None, 124 | json_params=None, 125 | ): 126 | """ 127 | Returns a generator extracting all JSON objects encountered in the input string. 
128 | Can be used to read JSON Lines 129 | 130 | Parameters 131 | ---------- 132 | string: str 133 | Input string 134 | 135 | >>> it = parse_js_objects("{a: 100} {b: 100}") 136 | >>> next(it) 137 | {'a': 100} 138 | >>> next(it) 139 | {'b': 100} 140 | 141 | unicode_escape: bool, optional 142 | Attempt to fix input string if it contains escaped special characters 143 | 144 | >>> next(parse_js_objects('{\\\\"a\\\\": 100}')) 145 | {'\\\\"a\\\\"': 100} 146 | >>> next(parse_js_objects('{\\\\"a\\\\": 100}', unicode_escape=True)) 147 | {'a': 100} 148 | 149 | omitempty: bool, optional 150 | Skip empty dictionaries and lists 151 | 152 | >>> list(parse_js_objects("{a: 12} {} {b: 13}")) 153 | [{'a': 12}, {}, {'b': 13}] 154 | >>> list(parse_js_objects("{a: 12} {} {b: 13}", omitempty=True)) 155 | [{'a': 12}, {'b': 13}] 156 | 157 | loader: func, optional 158 | Function used to load processed input data. By default `json.loads` is used 159 | 160 | >>> import orjson 161 | >>> import chompjs 162 | >>> 163 | >>> next(chompjs.parse_js_objects("{'a': 12}", loader=orjson.loads)) 164 | {'a': 12} 165 | 166 | loader_args: list, optional 167 | Allow passing down positional arguments to loader function 168 | 169 | loader_kwargs: dict, optional 170 | Allow passing down keyword arguments to loader function 171 | 172 | >>> next(parse_js_objects("{'a': 10.1}")) 173 | {'a': 10.1} 174 | >>> import decimal 175 | >>> next(parse_js_objects("{'a': 10.1}", loader_kwargs={'parse_float': decimal.Decimal})) 176 | {'a': Decimal('10.1')} 177 | 178 | .. 
deprecated:: 1.3.0 179 | json_params: dict, optional 180 | Use `loader_kwargs` instead 181 | 182 | Returns 183 | ------- 184 | generator 185 | Iterating over it yields all encountered JSON objects 186 | """ 187 | 188 | if not string: 189 | return 190 | 191 | loader_args, loader_kwargs = _process_loader_arguments( 192 | loader_args, loader_kwargs, json_params 193 | ) 194 | 195 | string = _preprocess(string, unicode_escape) 196 | for raw_data in parse_objects(string): 197 | try: 198 | data = loader(raw_data, *loader_args, **loader_kwargs) 199 | except ValueError: 200 | continue 201 | 202 | if not data and omitempty: 203 | continue 204 | 205 | yield data 206 | -------------------------------------------------------------------------------- /chompjs/test_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | import functools 5 | import math 6 | import unittest 7 | 8 | from chompjs import parse_js_object, parse_js_objects 9 | 10 | 11 | def parametrize_test(*arguments_list): 12 | def decorate(func): 13 | @functools.wraps(func) 14 | def wrapper(self, *args, **kwargs): 15 | for arguments in arguments_list: 16 | func(self, *arguments) 17 | return wrapper 18 | return decorate 19 | 20 | 21 | class TestParser(unittest.TestCase): 22 | @parametrize_test( 23 | ("{'hello': 'world'}", {'hello': 'world'}), 24 | ("{'hello': 'world', 'my': 'master'}", {'hello': 'world', 'my': 'master'}), 25 | ( 26 | "{'hello': 'world', 'my': {'master': 'of Orion'}, 'test': 'xx'}", 27 | {'hello': 'world', 'my': {'master': 'of Orion'}, 'test': 'xx'}, 28 | ), 29 | ("{}", {}), 30 | ) 31 | def test_parse_object(self, in_data, expected_data): 32 | result = parse_js_object(in_data) 33 | self.assertEqual(result, expected_data) 34 | 35 | @parametrize_test( 36 | ("[]", []), 37 | ("[[[]]]", [[[]]]), 38 | ("[[[1]]]", [[[1]]]), 39 | ("[1]", [1]), 40 | ("[1, 2, 3, 4]", [1, 2, 3, 4]), 41 | ("['h', 
'e', 'l', 'l', 'o']", ['h', 'e', 'l', 'l', 'o']), 42 | ("[[[[[[[[[[[[[[[1]]]]]]]]]]]]]]]", [[[[[[[[[[[[[[[1]]]]]]]]]]]]]]]), 43 | ) 44 | def test_parse_list(self, in_data, expected_data): 45 | result = parse_js_object(in_data) 46 | self.assertEqual(result, expected_data) 47 | 48 | @parametrize_test( 49 | ("{'hello': [], 'world': [0]}", {'hello': [], 'world': [0]}), 50 | ("{'hello': [1, 2, 3, 4]}", {'hello': [1, 2, 3, 4]}), 51 | ("[{'a':12}, {'b':33}]", [{'a': 12}, {'b': 33}]), 52 | ( 53 | "[false, {'true': true, `pies`: \"kot\"}, false,]", 54 | [False, {"true": True, 'pies': 'kot'}, False], 55 | ), 56 | ( 57 | "{a:1,b:1,c:1,d:1,e:1,f:1,g:1,h:1,i:1,j:1}", 58 | {k: 1 for k in 'abcdefghij'}, 59 | ), 60 | ( 61 | "{'a':[{'b':1},{'c':[{'d':{'f':{'g':[1,2]}}},{'e':1}]}]}", 62 | {'a': [{'b': 1}, {'c': [{'d': {'f': {'g': [1, 2]}}}, {'e': 1}]}]}, 63 | ), 64 | ) 65 | def test_parse_mixed(self, in_data, expected_data): 66 | result = parse_js_object(in_data) 67 | self.assertEqual(result, expected_data) 68 | 69 | @parametrize_test( 70 | ("{'hello': 12, 'world': 10002.21}", {'hello': 12, 'world': 10002.21}), 71 | ("[12, -323, 0.32, -32.22, .2, - 4]", [12, -323, 0.32, -32.22, 0.2, -4]), 72 | ('{"a": -12, "b": - 5}', {'a': -12, 'b': -5}), 73 | ("{'a': true, 'b': false, 'c': null}", {'a': True, 'b': False, 'c': None}), 74 | ("[\"\\uD834\\uDD1E\"]", [u'𝄞']), 75 | ("{'a': '123\\'456\\n'}", {'a': "123'456\n"}), 76 | ("['\u00E9']", ['é']), 77 | ('{"cache":{"\u002Ftest\u002F": 0}}', {'cache': {'/test/': 0}}), 78 | ('{"a": 3.125e7}', {'a': 3.125e7}), 79 | ('''{"a": "b\\'"}''', {'a': "b'"}), 80 | ('{"a": .99, "b": -.1}', {"a": 0.99, "b": -.1}), 81 | ('["/* ... */", "// ..."]', ["/* ... 
*/", "// ..."]), 82 | ('{"inclusions":["/*","/"]}', {'inclusions': ['/*', '/']}), 83 | ) 84 | def test_parse_standard_values(self, in_data, expected_data): 85 | result = parse_js_object(in_data) 86 | self.assertEqual(result, expected_data) 87 | 88 | def test_parse_nan(self): 89 | in_data = '{"A": NaN}' 90 | result = parse_js_object(in_data) 91 | self.assertTrue(math.isnan(result["A"])) 92 | 93 | @parametrize_test( 94 | ("{abc: 100, dev: 200}", {'abc': 100, 'dev': 200}), 95 | ("{abcdefghijklmnopqrstuvwxyz: 12}", {"abcdefghijklmnopqrstuvwxyz": 12}), 96 | ( 97 | "{age: function(yearBorn,thisYear) {return thisYear - yearBorn;}}", 98 | {"age": "function(yearBorn,thisYear) {return thisYear - yearBorn;}"} 99 | ), 100 | ( 101 | "{\"abc\": function() {return '])))))))))))))))';}}", 102 | {"abc": "function() {return '])))))))))))))))';}"}, 103 | ), 104 | ('{"a": undefined}', {"a": "undefined"}), 105 | ('[undefined, undefined]', ["undefined", "undefined"]), 106 | ("{_a: 1, $b: 2}", {"_a": 1, "$b": 2}), 107 | ("{regex: /a[^d]{1,12}/i}", {'regex': '/a[^d]{1,12}/i'}), 108 | ("{'a': function(){return '\"'}}", {'a': 'function(){return \'"\'}'}), 109 | ("{1: 1, 2: 2, 3: 3, 4: 4}", {'1': 1, '2': 2, '3': 3, '4': 4}), 110 | ("{'a': 121.}", {'a': 121.0}), 111 | ("{abc : 100}", {'abc': 100}), 112 | ("{abc : 100}", {'abc': 100}), 113 | ("{abc: name }", {'abc': "name"}), 114 | ("{abc: name\t}", {'abc': "name"}), 115 | ("{abc: value\n}", {'abc': "value"}), 116 | ("{abc: name}", {'abc': "name"}), 117 | ("{abc: \tname}", {'abc': "name"}), 118 | ("{abc: \nvalue}", {'abc': "value"}), 119 | ) 120 | def test_parse_strange_values(self, in_data, expected_data): 121 | result = parse_js_object(in_data) 122 | self.assertEqual(result, expected_data) 123 | 124 | @parametrize_test( 125 | ('{"a": {"b": [12, 13, 14]}}text text', {"a": {"b": [12, 13, 14]}}), 126 | ('var test = {"a": {"b": [12, 13, 14]}}', {"a": {"b": [12, 13, 14]}}), 127 | ('{"a":\r\n10}', {'a': 10}), 128 | ("{'foo': 0,\r\n}", {'foo': 0}), 
129 | ("{truefalse: 0, falsefalse: 1, nullnull: 2}", {'truefalse': 0, 'falsefalse': 1, 'nullnull': 2}), 130 | ) 131 | def test_strange_input(self, in_data, expected_data): 132 | result = parse_js_object(in_data) 133 | self.assertEqual(result, expected_data) 134 | 135 | @parametrize_test( 136 | ("[0]", [0]), 137 | ("[1]", [1]), 138 | ("[12]", [12]), 139 | ("[12_12]", [1212]), 140 | ("[0x12]", [18]), 141 | ("[0xab]", [171]), 142 | ("[0xAB]", [171]), 143 | ("[0X12]", [18]), 144 | ("[0Xab]", [171]), 145 | ("[0XAB]", [171]), 146 | ("[01234]", [668]), 147 | ("[0o1234]", [668]), 148 | ("[0O1234]", [668]), 149 | ("[0b1111]", [15]), 150 | ("[0B1111]", [15]), 151 | ("[-0]", [-0]), 152 | ("[-1]", [-1]), 153 | ("[-12]", [-12]), 154 | ("[-12_12]", [-1212]), 155 | ("[-0x12]", [-18]), 156 | ("[-0xab]", [-171]), 157 | ("[-0xAB]", [-171]), 158 | ("[-0X12]", [-18]), 159 | ("[-0Xab]", [-171]), 160 | ("[-0XAB]", [-171]), 161 | ("[-01234]", [-668]), 162 | ("[-0o1234]", [-668]), 163 | ("[-0O1234]", [-668]), 164 | ("[-0b1111]", [-15]), 165 | ("[-0B1111]", [-15]), 166 | ) 167 | def test_integer_numeric_values(self, in_data, expected_data): 168 | result = parse_js_object(in_data) 169 | self.assertEqual(result, expected_data) 170 | 171 | @parametrize_test( 172 | ("[0.32]", [0.32]), 173 | ("[-0.32]", [-0.32]), 174 | ("[.32]", [0.32]), 175 | ("[-.32]", [-0.32]), 176 | ("[12.]", [12.0]), 177 | ("[-12.]", [-12.0]), 178 | ("[12.32]", [12.32]), 179 | ("[-12.12]", [-12.12]), 180 | ("[3.1415926]", [3.1415926]), 181 | ("[.123456789]", [.123456789]), 182 | ("[.0123]", [0.0123]), 183 | ("[0.0123]", [0.0123]), 184 | ("[-.0123]", [-0.0123]), 185 | ("[-0.0123]", [-0.0123]), 186 | ("[3.1E+12]", [3.1E+12]), 187 | ("[3.1e+12]", [3.1E+12]), 188 | ("[.1E-23]", [.1e-23]), 189 | ("[.1e-23]", [.1e-23]), 190 | ) 191 | def test_float_numeric_values(self, in_data, expected_data): 192 | result = parse_js_object(in_data) 193 | self.assertEqual(result, expected_data) 194 | 195 | 196 | @parametrize_test( 197 | ( 198 | 
""" 199 | var obj = { 200 | // Comment 201 | x: "X", // Comment 202 | }; 203 | """, 204 | {"x": "X"}, 205 | ), 206 | ( 207 | """ 208 | var /* Comment */ obj = /* Comment */ { 209 | /* Comment */ 210 | x: /* Comment */ "X", /* Comment */ 211 | }; 212 | """, 213 | {"x": "X"}, 214 | ), 215 | ( 216 | """[/*...*/1,2,3,/*...*/4,5,6]""", 217 | [1, 2, 3, 4, 5, 6], 218 | ), 219 | ) 220 | def test_comments(self, in_data, expected_data): 221 | result = parse_js_object(in_data) 222 | self.assertEqual(result, expected_data) 223 | 224 | @parametrize_test( 225 | ('["Test\\nDrive"]\n{"Test": "Drive"}', [['Test\nDrive'], {'Test': 'Drive'}]), 226 | ) 227 | def test_jsonlines(self, in_data, expected_data): 228 | result = list(parse_js_objects(in_data)) 229 | self.assertEqual(result, expected_data) 230 | 231 | 232 | class TestParserExceptions(unittest.TestCase): 233 | @parametrize_test( 234 | ('}{', ValueError), 235 | ('', ValueError), 236 | (None, ValueError), 237 | ) 238 | def test_exceptions(self, in_data, expected_exception): 239 | with self.assertRaises(expected_exception): 240 | parse_js_object(in_data) 241 | 242 | @parametrize_test( 243 | ("{whose: 's's', category_name: '>'}", ValueError), 244 | ) 245 | def test_malformed_input(self, in_data, expected_exception): 246 | with self.assertRaises(expected_exception): 247 | parse_js_object(in_data) 248 | 249 | @parametrize_test( 250 | ( 251 | '{"test": """}', 252 | ValueError, 253 | 'Error parsing input near character 13', 254 | ), 255 | ) 256 | def test_error_messages(self, in_data, expected_exception, expected_exception_text): 257 | with self.assertRaisesRegex(expected_exception, expected_exception_text): 258 | parse_js_object(in_data) 259 | 260 | 261 | class TestOptions(unittest.TestCase): 262 | @parametrize_test( 263 | ('{\\\"a\\\": 12}', {'a': 12}), 264 | ) 265 | def test_unicode_escape(self, in_data, expected_data): 266 | result = parse_js_object(in_data, unicode_escape=True) 267 | self.assertEqual(result, expected_data) 268 | 
269 | @parametrize_test( 270 | ('["\n"]', ["\n"]), 271 | ("{'a': '\"\"', 'b': '\\\\', 'c': '\t\n'}", {'a': '""', 'b': '\\', 'c': '\t\n'}), 272 | ( 273 | """var myObj = { 274 | myMethod: function(params) { 275 | // ... 276 | }, 277 | myValue: 100 278 | }""", 279 | {'myMethod': 'function(params) {\n // ...\n }', 'myValue': 100}, 280 | ), 281 | ) 282 | def test_json_non_strict(self, in_data, expected_data): 283 | result = parse_js_object(in_data, loader_kwargs={'strict': False}) 284 | self.assertEqual(result, expected_data) 285 | 286 | @parametrize_test( 287 | ("[]", []), 288 | ("[1, 2, 3]", [1, 2, 3]), 289 | ('var x = [1, 2, 3, 4, 5,]', [1, 2, 3, 4, 5]), 290 | ('{}', {}), 291 | ("{'a': 12, 'b': 13, 'c': 14}", {'a': 12, 'b': 13, 'c': 14}), 292 | ("var x = {'a': 12, 'b': 13, 'c': 14}", {'a': 12, 'b': 13, 'c': 14}), 293 | ) 294 | def test_loader(self, in_data, expected_data): 295 | import ast 296 | result = parse_js_object(in_data, loader=ast.literal_eval) 297 | self.assertEqual(result, expected_data) 298 | 299 | 300 | class TestParseJsonObjects(unittest.TestCase): 301 | @parametrize_test( 302 | ("", []), 303 | ("aaaaaaaaaaaaaaaa", []), 304 | (" ", []), 305 | (" {'a': 12}", [{'a': 12}]), 306 | ("[1, 2, 3, 4]xxxxxxxxxxxxxxxxxxxxxxxx", [[1, 2, 3, 4]]), 307 | ("[12] [13] [14]", [[12], [13], [14]]), 308 | ("[10] {'a': [1, 1, 1,]}", [[10], {'a': [1, 1, 1]}]), 309 | ("[1][1][1]", [[1], [1], [1]]), 310 | ("[1] [2] {'a': ", [[1], [2]]), 311 | ("[]", [[]]), 312 | ("[][][][]", [[], [], [], []]), 313 | ("{}", [{}]), 314 | ("{}{}{}{}", [{}, {}, {}, {}]), 315 | ("{{}}{{}}", []), 316 | ("[[]][[]]", [[[]], [[]]]), 317 | ("{am: 'ab'}\n{'ab': 'xx'}", [{'am': 'ab'}, {'ab': 'xx'}]), 318 | ( 319 | 'function(a, b, c){ /* ... 
*/ }({"a": 12}, Null, [1, 2, 3])', 320 | [{}, {'a': 12}, [1, 2, 3]], 321 | ), 322 | ('{"a": 12, broken}{"c": 100}', [{'c': 100}]), 323 | ('[12,,,,21][211,,,][12,12][12,,,21]', [[12, 12]]), 324 | ) 325 | def test_parse_json_objects(self, in_data, expected_data): 326 | result = list(parse_js_objects(in_data)) 327 | self.assertEqual(result, expected_data) 328 | 329 | @parametrize_test( 330 | ("[1][][2]", [[1], [2]]), 331 | ("{'a': 12}{}{'b': 13}", [{'a': 12}, {'b': 13}]), 332 | ("[][][][][][][][][]", []), 333 | ("{}{}{}{}{}{}{}{}{}", []), 334 | ) 335 | def test_parse_json_objects_without_empty(self, in_data, expected_data): 336 | result = list(parse_js_objects(in_data, omitempty=True)) 337 | self.assertEqual(result, expected_data) 338 | 339 | 340 | if __name__ == '__main__': 341 | unittest.main() 342 | -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | chompjs API documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 |
20 |
21 |
22 |

Module chompjs

23 |
24 |
25 |
26 | 27 | Expand source code 28 | 29 |
# -*- coding: utf-8 -*-
 30 | 
 31 | import json
 32 | import warnings
 33 | 
 34 | from _chompjs import parse, parse_objects
 35 | 
 36 | 
 37 | def _preprocess(string, unicode_escape=False):
 38 |     if unicode_escape:
 39 |         string = string.encode().decode("unicode_escape")
 40 |     return string
 41 | 
 42 | 
 43 | def _process_loader_arguments(loader_args, loader_kwargs, json_params):
 44 |     if json_params:
 45 |         msg = "json_params argument is deprecated, please use loader_kwargs instead"
 46 |         warnings.warn(msg, DeprecationWarning)
 47 |         loader_kwargs = json_params
 48 | 
 49 |     if not loader_args:
 50 |         loader_args = []
 51 | 
 52 |     if not loader_kwargs:
 53 |         loader_kwargs = {}
 54 | 
 55 |     return (loader_args, loader_kwargs)
 56 | 
 57 | 
 58 | def parse_js_object(
 59 |     string,
 60 |     unicode_escape=False,
 61 |     loader=json.loads,
 62 |     loader_args=None,
 63 |     loader_kwargs=None,
 64 |     json_params=None,
 65 | ):
 66 |     """
 67 |     Extracts first JSON object encountered in the input string
 68 | 
 69 |     Parameters
 70 |     ----------
 71 |     string: str
 72 |         Input string
 73 | 
 74 |     >>> parse_js_object("{a: 100}")
 75 |     {'a': 100}
 76 | 
 77 |     unicode_escape: bool, optional
 78 |         Attempt to fix input string if it contains escaped special characters
 79 | 
 80 |     >>> parse_js_object('{\\\\"a\\\\": 100}')
 81 |     {'\\\\"a\\\\"': 100}
 82 |     >>> parse_js_object('{\\\\"a\\\\": 100}', unicode_escape=True)
 83 |     {'a': 100}
 84 | 
 85 |     loader: func, optional
 86 |         Function used to load processed input data. By default `json.loads` is used
 87 | 
 88 |     >>> import orjson
 89 |     >>> import chompjs
 90 |     >>> 
 91 |     >>> chompjs.parse_js_object("{'a': 12}", loader=orjson.loads)
 92 |     {'a': 12}
 93 | 
 94 |     loader_args: list, optional
 95 |         Allow passing down positional arguments to loader function
 96 | 
 97 |     loader_kwargs: dict, optional
 98 |         Allow passing down keyword arguments to loader function
 99 | 
100 |     >>> parse_js_object("{'a': 10.1}")
101 |     {'a': 10.1}
102 |     >>> import decimal
103 |     >>> parse_js_object("{'a': 10.1}", loader_kwargs={'parse_float': decimal.Decimal})
104 |     {'a': Decimal('10.1')}
105 | 
106 |     .. deprecated:: 1.3.0
107 |     json_params: dict, optional
108 |         Use `loader_kwargs` instead
109 | 
110 |     Returns
111 |     -------
112 |     list | dict
113 |         Extracted JSON object
114 | 
115 |     Raises
116 |     ------
117 |     ValueError
118 |         If failed to parse input properly
119 | 
120 |     ```python
121 |     >>> parse_js_object(None)
122 |     Traceback (most recent call last):
123 |       ...
124 |     ValueError: Invalid input
125 |     >>> parse_js_object("No JSON objects in sight...")
126 |     Traceback (most recent call last):
127 |       ...
128 |     json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
129 | 
130 |     ```
131 | 
132 |     """
133 |     if not string:
134 |         raise ValueError("Invalid input")
135 | 
136 |     loader_args, loader_kwargs = _process_loader_arguments(
137 |         loader_args, loader_kwargs, json_params
138 |     )
139 | 
140 |     if json_params:
141 |         msg = "json_params argument is deprecated, please use loader_kwargs instead"
142 |         warnings.warn(msg, DeprecationWarning)
143 | 
144 |     string = _preprocess(string, unicode_escape)
145 |     parsed_data = parse(string)
146 |     return loader(parsed_data, *loader_args, **loader_kwargs)
147 | 
148 | 
def parse_js_objects(
    string,
    unicode_escape=False,
    omitempty=False,
    loader=json.loads,
    loader_args=None,
    loader_kwargs=None,
    json_params=None,
):
    """
    Returns a generator extracting all JSON objects encountered in the input string.
    Can be used to read JSON Lines

    The function is a generator: nothing is processed (and no deprecation
    warning is emitted) until iteration starts. An empty or ``None`` input
    yields nothing. Fragments for which ``loader`` raises ``ValueError``
    are silently skipped.

    Parameters
    ----------
    string: str
        Input string

    >>> it = parse_js_objects("{a: 100} {b: 100}")
    >>> next(it)
    {'a': 100}
    >>> next(it)
    {'b': 100}

    unicode_escape: bool, optional
        Attempt to fix input string if it contains escaped special characters

    >>> next(parse_js_objects('{\\\\"a\\\\": 100}'))
    {'\\\\"a\\\\"': 100}
    >>> next(parse_js_objects('{\\\\"a\\\\": 100}', unicode_escape=True))
    {'a': 100}

    omitempty: bool, optional
        Skip empty dictionaries and lists

    >>> list(parse_js_objects("{a: 12} {} {b: 13}"))
    [{'a': 12}, {}, {'b': 13}]
    >>> list(parse_js_objects("{a: 12} {} {b: 13}", omitempty=True))
    [{'a': 12}, {'b': 13}]

    loader: func, optional
        Function used to load processed input data. By default `json.loads` is used

    >>> import orjson
    >>> import chompjs
    >>> next(chompjs.parse_js_objects("{'a': 12}", loader=orjson.loads))
    {'a': 12}

    loader_args: list, optional
        Allow passing down positional arguments to loader function

    loader_kwargs: dict, optional
        Allow passing down keyword arguments to loader function

    >>> next(parse_js_objects("{'a': 10.1}"))
    {'a': 10.1}
    >>> import decimal
    >>> next(parse_js_objects("{'a': 10.1}", loader_kwargs={'parse_float': decimal.Decimal}))
    {'a': Decimal('10.1')}

    .. deprecated:: 1.3.0
    json_params: dict, optional
        Use `loader_kwargs` instead. Passing it emits a `DeprecationWarning`
        once iteration starts

    Returns
    -------
    generator
        Iterating over it yields all encountered JSON objects
    """

    if not string:
        return

    loader_args, loader_kwargs = _process_loader_arguments(
        loader_args, loader_kwargs, json_params
    )

    if json_params:
        # Keep the deprecation behaviour in line with parse_js_object.
        # stacklevel=2 attributes the warning to the code driving the
        # generator rather than to this module.
        msg = "json_params argument is deprecated, please use loader_kwargs instead"
        warnings.warn(msg, DeprecationWarning, stacklevel=2)

    string = _preprocess(string, unicode_escape)
    for raw_data in parse_objects(string):
        try:
            data = loader(raw_data, *loader_args, **loader_kwargs)
        except ValueError:
            # Best effort: a fragment the loader rejects is skipped so the
            # remaining objects can still be extracted
            # (json.JSONDecodeError is a ValueError subclass).
            continue

        if omitempty and not data:
            continue

        yield data
238 |
239 |
240 |
241 |
242 |
243 |
244 |
245 |

Functions

246 |
247 |
248 | def parse_js_object(string, unicode_escape=False, loader=<function loads>, loader_args=None, loader_kwargs=None, json_params=None) 249 |
250 |
251 |

Extracts the first JSON object encountered in the input string

252 |

Parameters

253 |
254 |
string : str
255 |
Input string
256 |
257 |
>>> parse_js_object("{a: 100}")
258 | {'a': 100}
259 | 
260 |
261 |
unicode_escape : bool, optional
262 |
Attempt to fix input string if it contains escaped special characters
263 |
264 |
>>> parse_js_object('{\\"a\\": 100}')
265 | {'\\"a\\"': 100}
266 | >>> parse_js_object('{\\"a\\": 100}', unicode_escape=True)
267 | {'a': 100}
268 | 
269 |
270 |
loader : func, optional
271 |
Function used to load processed input data. By default json.loads is used
272 |
273 |
>>> import orjson
274 | >>> import chompjs
275 | >>> 
276 | >>> chompjs.parse_js_object("{'a': 12}", loader=orjson.loads)
277 | {'a': 12}
278 | 
279 |
280 |
loader_args : list, optional
281 |
Allow passing down positional arguments to loader function
282 |
loader_kwargs : dict, optional
283 |
Allow passing down keyword arguments to loader function
284 |
285 |
>>> parse_js_object("{'a': 10.1}")
286 | {'a': 10.1}
287 | >>> import decimal
288 | >>> parse_js_object("{'a': 10.1}", loader_kwargs={'parse_float': decimal.Decimal})
289 | {'a': Decimal('10.1')}
290 | 
291 |
292 |

Deprecated since version: 1.3.0

293 |
294 |
295 |
json_params : dict, optional
296 |
Use loader_kwargs instead
297 |
298 |

Returns

299 |
300 |
list | dict
301 |
Extracted JSON object
302 |
303 |

Raises

304 |
305 |
ValueError
306 |
If the input could not be parsed properly
307 |
308 |
>>> parse_js_object(None)
309 | Traceback (most recent call last):
310 |   ...
311 | ValueError: Invalid input
312 | >>> parse_js_object("No JSON objects in sight...")
313 | Traceback (most recent call last):
314 |   ...
315 | json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
316 | 
317 | 
318 |
319 | 320 | Expand source code 321 | 322 |
def parse_js_object(
    string,
    unicode_escape=False,
    loader=json.loads,
    loader_args=None,
    loader_kwargs=None,
    json_params=None,
):
    """
    Extracts the first JSON object encountered in the input string

    Parameters
    ----------
    string: str
        Input string

    >>> parse_js_object("{a: 100}")
    {'a': 100}

    unicode_escape: bool, optional
        Attempt to fix input string if it contains escaped special characters

    >>> parse_js_object('{\\\\"a\\\\": 100}')
    {'\\\\"a\\\\"': 100}
    >>> parse_js_object('{\\\\"a\\\\": 100}', unicode_escape=True)
    {'a': 100}

    loader: func, optional
        Function used to load processed input data. By default `json.loads` is used

    >>> import orjson
    >>> import chompjs
    >>> chompjs.parse_js_object("{'a': 12}", loader=orjson.loads)
    {'a': 12}

    loader_args: list, optional
        Allow passing down positional arguments to loader function

    loader_kwargs: dict, optional
        Allow passing down keyword arguments to loader function

    >>> parse_js_object("{'a': 10.1}")
    {'a': 10.1}
    >>> import decimal
    >>> parse_js_object("{'a': 10.1}", loader_kwargs={'parse_float': decimal.Decimal})
    {'a': Decimal('10.1')}

    .. deprecated:: 1.3.0
    json_params: dict, optional
        Use `loader_kwargs` instead. Passing it emits a `DeprecationWarning`

    Returns
    -------
    list | dict
        Extracted JSON object

    Raises
    ------
    ValueError
        If the input is empty or ``None``. Errors raised by `loader` are
        propagated unchanged; with the default loader that is
        `json.JSONDecodeError`, itself a `ValueError` subclass

    ```python
    >>> parse_js_object(None)
    Traceback (most recent call last):
      ...
    ValueError: Invalid input
    >>> parse_js_object("No JSON objects in sight...")
    Traceback (most recent call last):
      ...
    json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

    ```

    """
    if not string:
        raise ValueError("Invalid input")

    loader_args, loader_kwargs = _process_loader_arguments(
        loader_args, loader_kwargs, json_params
    )

    if json_params:
        msg = "json_params argument is deprecated, please use loader_kwargs instead"
        # stacklevel=2 attributes the warning to the caller's line, so it is
        # reported where the deprecated argument is actually passed and is
        # visible under the default filter for code run from __main__.
        warnings.warn(msg, DeprecationWarning, stacklevel=2)

    string = _preprocess(string, unicode_escape)
    parsed_data = parse(string)
    return loader(parsed_data, *loader_args, **loader_kwargs)
411 |
412 |
413 |
414 | def parse_js_objects(string, unicode_escape=False, omitempty=False, loader=<function loads>, loader_args=None, loader_kwargs=None, json_params=None) 415 |
416 |
417 |

Returns a generator extracting all JSON objects encountered in the input string. 418 | Can be used to read JSON Lines

419 |

Parameters

420 |
421 |
string : str
422 |
Input string
423 |
424 |
>>> it = parse_js_objects("{a: 100} {b: 100}")
425 | >>> next(it)
426 | {'a': 100}
427 | >>> next(it)
428 | {'b': 100}
429 | 
430 |
431 |
unicode_escape : bool, optional
432 |
Attempt to fix input string if it contains escaped special characters
433 |
434 |
>>> next(parse_js_objects('{\\"a\\": 100}'))
435 | {'\\"a\\"': 100}
436 | >>> next(parse_js_objects('{\\"a\\": 100}', unicode_escape=True))
437 | {'a': 100}
438 | 
439 |
440 |
omitempty : bool, optional
441 |
Skip empty dictionaries and lists
442 |
443 |
>>> list(parse_js_objects("{a: 12} {} {b: 13}"))
444 | [{'a': 12}, {}, {'b': 13}]
445 | >>> list(parse_js_objects("{a: 12} {} {b: 13}", omitempty=True))
446 | [{'a': 12}, {'b': 13}]
447 | 
448 |
449 |
loader : func, optional
450 |
Function used to load processed input data. By default json.loads is used
451 |
452 |
>>> import orjson
453 | >>> import chompjs
454 | >>> 
455 | >>> next(chompjs.parse_js_objects("{'a': 12}", loader=orjson.loads))
456 | {'a': 12}
457 | 
458 |
459 |
loader_args : list, optional
460 |
Allow passing down positional arguments to loader function
461 |
loader_kwargs : dict, optional
462 |
Allow passing down keyword arguments to loader function
463 |
464 |
>>> next(parse_js_objects("{'a': 10.1}"))
465 | {'a': 10.1}
466 | >>> import decimal
467 | >>> next(parse_js_objects("{'a': 10.1}", loader_kwargs={'parse_float': decimal.Decimal}))
468 | {'a': Decimal('10.1')}
469 | 
470 |
471 |

Deprecated since version: 1.3.0

472 |
473 |
474 |
json_params : dict, optional
475 |
Use loader_kwargs instead
476 |
477 |

Returns

478 |
479 |
generator
480 |
Iterating over it yields all encountered JSON objects
481 |
482 |
483 | 484 | Expand source code 485 | 486 |
def parse_js_objects(
    string,
    unicode_escape=False,
    omitempty=False,
    loader=json.loads,
    loader_args=None,
    loader_kwargs=None,
    json_params=None,
):
    """
    Returns a generator extracting all JSON objects encountered in the input string.
    Can be used to read JSON Lines

    The function is a generator: nothing is processed (and no deprecation
    warning is emitted) until iteration starts. An empty or ``None`` input
    yields nothing. Fragments for which ``loader`` raises ``ValueError``
    are silently skipped.

    Parameters
    ----------
    string: str
        Input string

    >>> it = parse_js_objects("{a: 100} {b: 100}")
    >>> next(it)
    {'a': 100}
    >>> next(it)
    {'b': 100}

    unicode_escape: bool, optional
        Attempt to fix input string if it contains escaped special characters

    >>> next(parse_js_objects('{\\\\"a\\\\": 100}'))
    {'\\\\"a\\\\"': 100}
    >>> next(parse_js_objects('{\\\\"a\\\\": 100}', unicode_escape=True))
    {'a': 100}

    omitempty: bool, optional
        Skip empty dictionaries and lists

    >>> list(parse_js_objects("{a: 12} {} {b: 13}"))
    [{'a': 12}, {}, {'b': 13}]
    >>> list(parse_js_objects("{a: 12} {} {b: 13}", omitempty=True))
    [{'a': 12}, {'b': 13}]

    loader: func, optional
        Function used to load processed input data. By default `json.loads` is used

    >>> import orjson
    >>> import chompjs
    >>> next(chompjs.parse_js_objects("{'a': 12}", loader=orjson.loads))
    {'a': 12}

    loader_args: list, optional
        Allow passing down positional arguments to loader function

    loader_kwargs: dict, optional
        Allow passing down keyword arguments to loader function

    >>> next(parse_js_objects("{'a': 10.1}"))
    {'a': 10.1}
    >>> import decimal
    >>> next(parse_js_objects("{'a': 10.1}", loader_kwargs={'parse_float': decimal.Decimal}))
    {'a': Decimal('10.1')}

    .. deprecated:: 1.3.0
    json_params: dict, optional
        Use `loader_kwargs` instead. Passing it emits a `DeprecationWarning`
        once iteration starts

    Returns
    -------
    generator
        Iterating over it yields all encountered JSON objects
    """

    if not string:
        return

    loader_args, loader_kwargs = _process_loader_arguments(
        loader_args, loader_kwargs, json_params
    )

    if json_params:
        # Keep the deprecation behaviour in line with parse_js_object.
        # stacklevel=2 attributes the warning to the code driving the
        # generator rather than to this module.
        msg = "json_params argument is deprecated, please use loader_kwargs instead"
        warnings.warn(msg, DeprecationWarning, stacklevel=2)

    string = _preprocess(string, unicode_escape)
    for raw_data in parse_objects(string):
        try:
            data = loader(raw_data, *loader_args, **loader_kwargs)
        except ValueError:
            # Best effort: a fragment the loader rejects is skipped so the
            # remaining objects can still be extracted
            # (json.JSONDecodeError is a ValueError subclass).
            continue

        if omitempty and not data:
            continue

        yield data
575 |
576 |
577 |
578 |
579 |
580 |
581 |
582 | 596 |
597 | 600 | 601 | 602 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # encoding: utf-8 3 | 4 | from io import open 5 | from os import path 6 | from platform import system 7 | from setuptools import setup, Extension 8 | 9 | 10 | this_directory = path.abspath(path.dirname(__file__)) 11 | with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f: 12 | long_description = f.read() 13 | 14 | extra_compile_args = [] 15 | extra_link_args = [] 16 | if system() == 'Linux': 17 | extra_compile_args = ['-Wl,-Bsymbolic-functions'] 18 | extra_link_args = ['-Wl,-Bsymbolic-functions'] 19 | 20 | chompjs_extension = Extension( 21 | '_chompjs', 22 | sources=['_chompjs/module.c', '_chompjs/parser.c', '_chompjs/buffer.c'], 23 | extra_compile_args=extra_compile_args, 24 | extra_link_args=extra_link_args, 25 | ) 26 | 27 | setup( 28 | name='chompjs', 29 | version='1.3.2', 30 | description='Parsing JavaScript objects into Python dictionaries', 31 | author='Mariusz Obajtek', 32 | author_email='nykakin@gmail.com', 33 | keywords='parsing parser JavaScript json json5 webscrapping', 34 | python_requires='>=3.8', 35 | ext_modules=[chompjs_extension], 36 | classifiers=[ 37 | "Programming Language :: Python :: 3", 38 | "Programming Language :: JavaScript", 39 | "Intended Audience :: Developers", 40 | "License :: OSI Approved :: MIT License", 41 | "Operating System :: OS Independent", 42 | "Topic :: Software Development :: Libraries :: Python Modules", 43 | "Topic :: Text Processing :: General", 44 | "Topic :: Text Processing :: Linguistic", 45 | "Development Status :: 5 - Production/Stable", 46 | "Environment :: Console", 47 | "Environment :: Web Environment", 48 | ], 49 | url='https://github.com/Nykakin/chompjs', 50 | long_description=long_description, 51 | long_description_content_type='text/markdown', 52 | 
include_package_data=True, 53 | packages=['chompjs'], 54 | ) 55 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py39,py310,py311,py312,py313 3 | 4 | [testenv] 5 | deps = orjson 6 | commands = 7 | python -m unittest discover 8 | python -m doctest chompjs/chompjs.py 9 | --------------------------------------------------------------------------------