├── .github
└── workflows
│ ├── deploy.yml
│ └── test-deploy.yml
├── .gitignore
├── CHANGELOG
├── LICENSE
├── MANIFEST.in
├── README.md
├── _chompjs
├── buffer.c
├── buffer.h
├── module.c
├── parser.c
└── parser.h
├── chompjs
├── __init__.py
├── chompjs.py
└── test_parser.py
├── docs
└── index.html
├── setup.py
└── tox.ini
/.github/workflows/deploy.yml:
--------------------------------------------------------------------------------
1 | name: deploy
2 |
3 | on:
4 | workflow_dispatch:
5 |
6 | jobs:
7 | build_wheels:
8 | name: Build wheels on ${{ matrix.os }}
9 | runs-on: ${{ matrix.os }}
10 | strategy:
11 | matrix:
12 | os: [ubuntu-latest, windows-latest, macos-13, macos-latest]
13 |
14 | steps:
15 | - uses: actions/checkout@v4
16 |
17 | - name: Build wheels
18 | uses: pypa/cibuildwheel@v2.22.0
19 | env:
20 | CIBW_SKIP: cp36-* cp37-* cp38-* pp*
21 |
22 | - uses: actions/upload-artifact@v4
23 | with:
24 | name: chompjs-wheels-${{ matrix.os }}-${{ strategy.job-index }}
25 | path: ./wheelhouse/*.whl
26 |
27 | build_sdist:
28 | name: Build source distribution
29 | runs-on: ubuntu-latest
30 | steps:
31 | - uses: actions/checkout@v4
32 |
33 | - name: Build sdist
34 | run: pipx run build --sdist
35 |
36 | - uses: actions/upload-artifact@v4
37 | with:
38 | name: chompjs-sdist
39 | path: dist/*.tar.gz
40 |
41 | upload_pypi:
42 | needs: [build_wheels, build_sdist]
43 | runs-on: ubuntu-latest
44 | environment: pypi
45 | permissions:
46 | id-token: write
47 | steps:
48 | - uses: actions/download-artifact@v4
49 | with:
50 | pattern: chompjs-*
51 | path: dist
52 | merge-multiple: true
53 |
54 | - uses: pypa/gh-action-pypi-publish@release/v1
55 |
--------------------------------------------------------------------------------
/.github/workflows/test-deploy.yml:
--------------------------------------------------------------------------------
1 | name: test-deploy
2 |
3 | on:
4 | workflow_dispatch:
5 |
6 | jobs:
7 | build_wheels:
8 | name: Build wheels on ${{ matrix.os }}
9 | runs-on: ${{ matrix.os }}
10 | strategy:
11 | matrix:
12 | os: [ubuntu-latest, windows-latest, macos-13, macos-latest]
13 |
14 | steps:
15 | - uses: actions/checkout@v4
16 |
17 | - name: Build wheels
18 | uses: pypa/cibuildwheel@v2.22.0
19 | env:
20 | CIBW_SKIP: cp36-* cp37-* cp38-* pp*
21 |
22 | - uses: actions/upload-artifact@v4
23 | with:
24 | name: chompjs-wheels-${{ matrix.os }}-${{ strategy.job-index }}
25 | path: ./wheelhouse/*.whl
26 |
27 | build_sdist:
28 | name: Build source distribution
29 | runs-on: ubuntu-latest
30 | steps:
31 | - uses: actions/checkout@v4
32 |
33 | - name: Build sdist
34 | run: pipx run build --sdist
35 |
36 | - uses: actions/upload-artifact@v4
37 | with:
38 | name: chompjs-sdist
39 | path: dist/*.tar.gz
40 |
41 | upload_pypi:
42 | needs: [build_wheels, build_sdist]
43 | runs-on: ubuntu-latest
44 | environment: pypi
45 | permissions:
46 | id-token: write
47 | steps:
48 | - uses: actions/download-artifact@v4
49 | with:
50 | pattern: chompjs-*
51 | path: dist
52 | merge-multiple: true
53 |
54 | - uses: pypa/gh-action-pypi-publish@release/v1
55 | with:
56 | repository-url: https://test.pypi.org/legacy/
57 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Prerequisites
2 | *.d
3 |
4 | # Object files
5 | *.o
6 | *.ko
7 | *.obj
8 | *.elf
9 |
10 | # Linker output
11 | *.ilk
12 | *.map
13 | *.exp
14 |
15 | # Precompiled Headers
16 | *.gch
17 | *.pch
18 |
19 | # Libraries
20 | *.lib
21 | *.a
22 | *.la
23 | *.lo
24 |
25 | # Shared objects (inc. Windows DLLs)
26 | *.dll
27 | *.so
28 | *.so.*
29 | *.dylib
30 |
31 | # Executables
32 | *.exe
33 | *.out
34 | *.app
35 | *.i*86
36 | *.x86_64
37 | *.hex
38 |
39 | # Debug files
40 | *.dSYM/
41 | *.su
42 | *.idb
43 | *.pdb
44 |
45 | # Kernel Module Compile Results
46 | *.mod*
47 | *.cmd
48 | .tmp_versions/
49 | modules.order
50 | Module.symvers
51 | Mkfile.old
52 | dkms.conf
53 |
54 |
55 | # Byte-compiled / optimized / DLL files
56 | __pycache__/
57 | *.py[cod]
58 | *$py.class
59 |
60 | # C extensions
61 | *.so
62 |
63 | # Distribution / packaging
64 | .Python
65 | build/
66 | develop-eggs/
67 | dist/
68 | downloads/
69 | eggs/
70 | .eggs/
71 | lib/
72 | lib64/
73 | parts/
74 | sdist/
75 | var/
76 | wheels/
77 | pip-wheel-metadata/
78 | share/python-wheels/
79 | *.egg-info/
80 | .installed.cfg
81 | *.egg
82 | MANIFEST
83 |
84 | # PyInstaller
85 | # Usually these files are written by a python script from a template
86 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
87 | *.manifest
88 | *.spec
89 |
90 | # Installer logs
91 | pip-log.txt
92 | pip-delete-this-directory.txt
93 |
94 | # Unit test / coverage reports
95 | htmlcov/
96 | .tox/
97 | .nox/
98 | .coverage
99 | .coverage.*
100 | .cache
101 | nosetests.xml
102 | coverage.xml
103 | *.cover
104 | *.py,cover
105 | .hypothesis/
106 | .pytest_cache/
107 | cover/
108 |
109 | # Translations
110 | *.mo
111 | *.pot
112 |
113 | # Django stuff:
114 | *.log
115 | local_settings.py
116 | db.sqlite3
117 | db.sqlite3-journal
118 |
119 | # Flask stuff:
120 | instance/
121 | .webassets-cache
122 |
123 | # Scrapy stuff:
124 | .scrapy
125 |
126 | # Sphinx documentation
127 | docs/_build/
128 |
129 | # PyBuilder
130 | target/
131 |
132 | # Jupyter Notebook
133 | .ipynb_checkpoints
134 |
135 | # IPython
136 | profile_default/
137 | ipython_config.py
138 |
139 | # pyenv
140 | # For a library or package, you might want to ignore these files since the code is
141 | # intended to run in multiple environments; otherwise, check them in:
142 | # .python-version
143 |
144 | # pipenv
145 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
146 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
147 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
148 | # install all needed dependencies.
149 | #Pipfile.lock
150 |
151 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
152 | __pypackages__/
153 |
154 | # Celery stuff
155 | celerybeat-schedule
156 | celerybeat.pid
157 |
158 | # SageMath parsed files
159 | *.sage.py
160 |
161 | # Environments
162 | .env
163 | .venv
164 | env/
165 | venv/
166 | ENV/
167 | env.bak/
168 | venv.bak/
169 |
170 | # Spyder project settings
171 | .spyderproject
172 | .spyproject
173 |
174 | # Rope project settings
175 | .ropeproject
176 |
177 | # mkdocs documentation
178 | /site
179 |
180 | # mypy
181 | .mypy_cache/
182 | .dmypy.json
183 | dmypy.json
184 |
185 | # Pyre type checker
186 | .pyre/
187 |
188 | # pytype static type analyzer
189 | .pytype/
190 |
191 |
--------------------------------------------------------------------------------
/CHANGELOG:
--------------------------------------------------------------------------------
1 | [1.3.2]
2 | * Release the GIL during parsing in C (#69)
3 |
4 | [1.3.1]
5 | * Introduced CI to build wheels (#68)
6 |
7 | [1.3.0]
8 | * Allow custom load function (https://github.com/Nykakin/chompjs/pull/63)
9 |
10 | [1.2.4]
11 | * Remove trailing whitespaces for unrecognized values (#59)
12 | * Fix segfault on empty string (#62)
13 |
14 | [1.2.3]
15 | * Remove trailing whitespaces from unquoted keys (#57)
16 |
17 | [1.2.2]
18 | * Fix parsing some floating numbers as octal (#52)
19 | * Fix number of digits calculation (#50)
20 |
21 | [1.2.1]
22 | * Fix compilation on Windows (#49)
23 |
24 | [1.2.0]
25 | * Allow trailing dot at the end of numerals (#39)
26 | * Parse hexadecimal, binary and octal literals as numbers (#40)
27 | * Drop support for Python 2.7 (#44)
28 | * Add parse_js_objects function (#45)
29 | * Drop jsonlines flag in parse_js_object in favor of parse_js_objects (#46)
30 | * Improve documentation (#47, #32)
31 |
32 | [1.1.9]
33 | * Handle NaN in input (#37)
34 |
35 | [1.1.8]
36 | * Fixed previous release (package couldn't be installed)
37 |
38 | [1.1.7]
39 | * Handle unquoted properties starting with reserved JS keywords (#34)
40 |
41 | [1.1.6]
42 | * Handle bug with parsing arrays like `["","/"]` (#33)
43 |
44 | [1.1.5]
45 | * Correctly handle malformed quotations (#31)
46 |
47 | [1.1.4]
48 | * Performance improvement (#19)
49 | * Handle numeric keys (#20)
50 | * Refactor error handling (#29)
51 |
52 | [1.1.3]
53 | * Avoid an infinite loop on malformed input (#27)
54 |
55 | [1.1.2]
56 | * Handle comments in JavaScript code (#22)
57 |
58 | [1.1.1]
59 | * Fix installation bug (headers moved to a different dir)
60 |
61 | [1.1.0]
62 | * Parser refactored and rewritten in order to simplify code and improve speed
63 | * Allow handling JavaScript functions and other strange stuff such as regexes (#16)
64 | * Allow passing down json.loads parameters
65 | * Allow handling hexadecimal, octal and binary literals (#12)
66 |
67 | [1.0.17]
68 | * Handle memory corruption on unclosed quotations (#13)
69 |
70 | [1.0.16]
71 | * Handle floats with leading zeros (#10)
72 |
73 | [1.0.15]
74 | * Handle $ and _ characters at the beginning of keys (#9)
75 |
76 | [1.0.14]
77 | * Handle "undefined" keyword in JavaScript objects (#7)
78 |
79 | [1.0.13]
80 | * Handle escaped quotations correctly (#6)
81 |
82 | [1.0.12]
83 | * Handle windows newlines (#5)
84 |
85 | [1.0.11]
86 | * Handle jsonlines (#3)
87 |
88 | [1.0.1]
89 | * Handle Unicode in keys (#2)
90 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2020 Mariusz Obajtek
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 |
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 |
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | graft _chompjs
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Chompjs
2 |
3 | 
4 | 
5 | 
6 | 
7 |
8 | Transforms JavaScript objects into Python data structures.
9 |
10 | In web scraping, you sometimes need to transform Javascript objects embedded in HTML pages into valid Python dictionaries. `chompjs` is a library designed to do that as a more powerful replacement of standard `json.loads`:
11 |
12 | ```python
13 | >>> chompjs.parse_js_object("{a: 100}")
14 | {'a': 100}
15 | >>>
16 | >>> json_lines = """
17 | ... {'a': 12}
18 | ... {'b': 13}
19 | ... {'c': 14}
20 | ... """
21 | >>> for entry in chompjs.parse_js_objects(json_lines):
22 | ... print(entry)
23 | ...
24 | {'a': 12}
25 | {'b': 13}
26 | {'c': 14}
27 | ```
28 |
29 | [Reference documentation](https://nykakin.github.io/chompjs/)
30 |
31 | ## Quickstart
32 |
33 | **1. installation**
34 |
35 | ```
36 | > pip install chompjs
37 | ```
38 |
39 | or build from source:
40 |
41 | ```bash
42 | $ git clone https://github.com/Nykakin/chompjs
43 | $ cd chompjs
44 | $ python setup.py build
45 | $ python setup.py install
46 | ```
47 |
48 | ## Features
49 |
50 | There are two functions available:
51 | * `parse_js_object` - tries to read the first encountered JSON-like object. Raises `ValueError` on failure.
52 | * `parse_js_objects` - returns a generator yielding all encountered JSON-like objects. Can be used to read [JSON Lines](https://jsonlines.org/). Does not raise on invalid input.
53 |
54 | An example usage with `scrapy`:
55 |
56 | ```python
57 | import chompjs
58 | import scrapy
59 |
60 |
61 | class MySpider(scrapy.Spider):
62 | # ...
63 |
64 | def parse(self, response):
65 | script_css = 'script:contains("__NEXT_DATA__")::text'
66 | script_pattern = r'__NEXT_DATA__ = (.*);'
67 | # warning: for some pages you need to pass replace_entities=True
68 | # into re_first to have JSON escaped properly
69 | script_text = response.css(script_css).re_first(script_pattern)
70 | try:
71 | json_data = chompjs.parse_js_object(script_text)
72 | except ValueError:
73 | self.log('Failed to extract data from {}'.format(response.url))
74 | return
75 |
76 | # work on json_data
77 | ```
78 |
79 | Parsing of [JSON5 objects](https://json5.org/) is supported:
80 |
81 | ```python
82 | >>> data = """
83 | ... {
84 | ... // comments
85 | ... unquoted: 'and you can quote me on that',
86 | ... singleQuotes: 'I can use "double quotes" here',
87 | ... lineBreaks: "Look, Mom! \
88 | ... No \\n's!",
89 | ... hexadecimal: 0xdecaf,
90 | ... leadingDecimalPoint: .8675309, andTrailing: 8675309.,
91 | ... positiveSign: +1,
92 | ... trailingComma: 'in objects', andIn: ['arrays',],
93 | ... "backwardsCompatible": "with JSON",
94 | ... }
95 | ... """
96 | >>> chompjs.parse_js_object(data)
97 | {'unquoted': 'and you can quote me on that', 'singleQuotes': 'I can use "double quotes" here', 'lineBreaks': "Look, Mom! No \n's!", 'hexadecimal': 912559, 'leadingDecimalPoint': 0.8675309, 'andTrailing': 8675309.0, 'positiveSign': '+1', 'trailingComma': 'in objects', 'andIn': ['arrays'], 'backwardsCompatible': 'with JSON'}
98 | ```
99 |
100 | If the input string is not yet escaped and contains a lot of `\\` characters, then `unicode_escape=True` argument might help to sanitize it:
101 |
102 | ```python
103 | >>> chompjs.parse_js_object('{\\\"a\\\": 12}', unicode_escape=True)
104 | {'a': 12}
105 | ```
106 |
107 | By default `chompjs` starts with the first `{` or `[` character it finds, omitting the rest:
108 |
109 | ```python
110 | >>> chompjs.parse_js_object('<div>...</div><script>foo = [1, 2, 3];</script><div>...</div>')
111 | [1, 2, 3]
112 | ```
113 |
114 | Post-processed input is parsed using `json.loads` by default. A different loader such as `orjson` can be used with the `loader` argument:
115 |
116 | ```python
117 | >>> import orjson
118 | >>> import chompjs
119 | >>>
120 | >>> chompjs.parse_js_object("{'a': 12}", loader=orjson.loads)
121 | {'a': 12}
122 | ```
123 |
124 | `loader_args` and `loader_kwargs` arguments can be used to pass options to underlying loader function. For example for default `json.loads` you can pass down options such as `strict` or `object_hook`:
125 |
126 | ```python
127 | >>> import decimal
128 | >>> import chompjs
129 | >>> chompjs.parse_js_object('[23.2]', loader_kwargs={'parse_float': decimal.Decimal})
130 | [Decimal('23.2')]
131 | ```
132 |
133 | # Rationale
134 |
135 | In web scraping data often is not present directly inside HTML, but instead provided as an embedded JavaScript object that is later used to initialize the page, for example:
136 |
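137 | ```html
138 | <html>
139 | <head>...</head>
140 | <body>
141 | ...
142 | <script type="text/javascript">window.__PRELOADED_STATE__={"foo": "bar"}</script>
143 | ...
144 | </body>
145 | </html>
146 | ```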
147 |
148 | Standard library function `json.loads` is usually sufficient to extract this data:
149 |
150 | ```python
151 | >>> # scrapy shell file:///tmp/test.html
152 | >>> import json
153 | >>> script_text = response.css('script:contains(__PRELOADED_STATE__)::text').re_first('__PRELOADED_STATE__=(.*)')
154 | >>> json.loads(script_text)
155 | {u'foo': u'bar'}
156 |
157 | ```
158 | The problem is that not all valid JavaScript objects are also valid JSONs. For example all those strings are valid JavaScript objects but not valid JSONs:
159 |
160 | * `"{'a': 'b'}"` is not a valid JSON because it uses `'` character to quote
161 | * `'{a: "b"}'` is not a valid JSON because the property name is not quoted at all
162 | * `'{"a": [1, 2, 3,]}'` is not a valid JSON because there is an extra `,` character at the end of the array
163 | * `'{"a": .99}'` is not a valid JSON because float value lacks a leading 0
164 |
165 | As a result, `json.loads` fails to extract any of those:
166 |
167 | ```python
168 | >>> json.loads("{'a': 'b'}")
169 | Traceback (most recent call last):
170 | ...
171 | ValueError: Expecting property name: line 1 column 2 (char 1)
172 | >>> json.loads('{a: "b"}')
173 | Traceback (most recent call last):
174 | ...
175 | ValueError: Expecting property name: line 1 column 2 (char 1)
176 | >>> json.loads('{"a": [1, 2, 3,]}')
177 | Traceback (most recent call last):
178 | ...
179 | ValueError: No JSON object could be decoded
180 | >>> json.loads('{"a": .99}')
181 | Traceback (most recent call last):
182 | ...
183 | json.decoder.JSONDecodeError: Expecting value: line 1 column 7 (char 6)
184 |
185 | ```
186 | The `chompjs` library was designed to bypass this limitation, and it allows scraping such JavaScript objects into proper Python dictionaries:
187 |
188 | ```python
189 | >>> import chompjs
190 | >>>
191 | >>> chompjs.parse_js_object("{'a': 'b'}")
192 | {'a': 'b'}
193 | >>> chompjs.parse_js_object('{a: "b"}')
194 | {'a': 'b'}
195 | >>> chompjs.parse_js_object('{"a": [1, 2, 3,]}')
196 | {'a': [1, 2, 3]}
197 | >>> chompjs.parse_js_object('{"a": .99}')
198 | {'a': 0.99}
199 | ```
200 |
201 | Internally `chompjs` uses a parser written in C to iterate over the raw string, fixing its issues along the way. The final result is then passed down to the standard library's `json.loads`, ensuring a high speed as compared to full-blown JavaScript parsers such as `demjson`.
202 |
203 | ```python
204 | >>> import json
205 | >>> import _chompjs
206 | >>>
207 | >>> _chompjs.parse('{a: 1}')
208 | '{"a":1}'
209 | >>> json.loads(_)
210 | {'a': 1}
211 | ```
212 |
213 | # Development
214 | Pull requests are welcome.
215 |
216 | To run the unit tests:
217 |
218 | ```
219 | $ tox
220 | ```
221 |
--------------------------------------------------------------------------------
/_chompjs/buffer.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020-2025 Mariusz Obajtek. All rights reserved.
3 | * License: https://github.com/Nykakin/chompjs/blob/master/LICENSE
4 | */
5 |
6 | #include
7 | #include
8 | #include
9 | #include
10 |
11 | #include "buffer.h"
12 |
13 | void init_char_buffer(struct CharBuffer* buffer, size_t initial_depth_buffer_size) {
14 | buffer->data = malloc(initial_depth_buffer_size);
15 | buffer->memory_buffer_length = initial_depth_buffer_size;
16 | buffer->index = 0;
17 | }
18 |
19 | void release_char_buffer(struct CharBuffer* buffer) {
20 | free(buffer->data);
21 | }
22 |
23 | void check_capacity(struct CharBuffer* buffer, size_t to_save) {
24 | if(buffer->index + to_save >= buffer->memory_buffer_length) {
25 | buffer->data = realloc(buffer->data, 2*buffer->memory_buffer_length);
26 | buffer->memory_buffer_length *= 2;
27 | }
28 | }
29 |
30 | void push(struct CharBuffer* buffer, char value) {
31 | check_capacity(buffer, 1);
32 | buffer->data[buffer->index] = value;
33 | buffer->index += 1;
34 | }
35 |
36 | void push_string(struct CharBuffer* buffer, const char* value, size_t len) {
37 | check_capacity(buffer, len);
38 | memcpy(buffer->data + buffer->index, value, len);
39 | buffer->index += len;
40 | }
41 |
42 | void push_number(struct CharBuffer* buffer, long value) {
43 | int size_in_chars;
44 | if (value == 0) {
45 | size_in_chars = 2;
46 | } else {
47 | size_in_chars = floor(log10(value)) + 2;
48 | }
49 | check_capacity(buffer, size_in_chars);
50 | buffer->index += sprintf(buffer->data + buffer->index, "%ld", value);
51 | }
52 |
53 | void pop(struct CharBuffer* buffer) {
54 | buffer->index -= 1;
55 | }
56 |
57 | char top(struct CharBuffer* buffer) {
58 | return buffer->data[buffer->index-1];
59 | }
60 |
61 | bool empty(struct CharBuffer* buffer) {
62 | return buffer->index <= 0;
63 | }
64 |
65 | void clear(struct CharBuffer* buffer) {
66 | buffer->index = 0;
67 | }
68 |
69 | size_t size(struct CharBuffer* buffer) {
70 | return buffer->index;
71 | }
72 |
--------------------------------------------------------------------------------
/_chompjs/buffer.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020-2025 Mariusz Obajtek. All rights reserved.
3 | * License: https://github.com/Nykakin/chompjs/blob/master/LICENSE
4 | */
5 |
6 | #ifndef CHOMPJS_BUFFER_H
7 | #define CHOMPJS_BUFFER_H
8 |
9 | #include
10 | #include
11 |
12 | /**
13 | Dynamically growing char buffer; `index` also serves as the top-of-stack position for push/pop/top
14 | */
15 | struct CharBuffer {
16 | char* data; /* heap storage allocated by init_char_buffer */
17 | size_t memory_buffer_length; /* allocated size of `data` in bytes */
18 | size_t index; /* count of stored bytes, i.e. the next write position */
19 | };
20 |
21 | void init_char_buffer(struct CharBuffer* buffer, size_t initial_depth_buffer_size);
22 |
23 | void release_char_buffer(struct CharBuffer* buffer);
24 |
25 | void check_capacity(struct CharBuffer* buffer, size_t to_save);
26 |
27 | void push(struct CharBuffer* buffer, char value);
28 |
29 | void push_string(struct CharBuffer* buffer, const char* value, size_t len);
30 |
31 | void push_number(struct CharBuffer* buffer, long value);
32 |
33 | void pop(struct CharBuffer* buffer);
34 |
35 | char top(struct CharBuffer* buffer);
36 |
37 | bool empty(struct CharBuffer* buffer);
38 |
39 | void clear(struct CharBuffer* buffer);
40 |
41 | size_t size(struct CharBuffer* buffer);
42 |
43 | #endif
44 |
--------------------------------------------------------------------------------
/_chompjs/module.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020-2025 Mariusz Obajtek. All rights reserved.
3 | * License: https://github.com/Nykakin/chompjs/blob/master/LICENSE
4 | */
5 |
6 | #define PY_SSIZE_T_CLEAN
7 | #include
8 | #include
9 | #include "parser.h"
10 |
11 | static PyObject* parse_python_object(PyObject *self, PyObject *args) {
12 | const char* string;
13 | if (!PyArg_ParseTuple(args, "s", &string)) {
14 | return NULL;
15 | }
16 |
17 | struct Lexer lexer;
18 | init_lexer(&lexer, string);
19 | Py_BEGIN_ALLOW_THREADS
20 | while(lexer.lexer_status == CAN_ADVANCE) {
21 | advance(&lexer);
22 | }
23 | Py_END_ALLOW_THREADS
24 |
25 | PyObject* ret = Py_BuildValue("s#", lexer.output.data, lexer.output.index-1);
26 | release_lexer(&lexer);
27 | if(lexer.lexer_status == ERROR) {
28 | const char* msg_sting = "Error parsing input near character %d";
29 | size_t error_buffer_size = snprintf(
30 | NULL,
31 | 0,
32 | msg_sting,
33 | lexer.input_position
34 | );
35 | char* error_buffer = malloc(error_buffer_size + 1);
36 | sprintf(
37 | error_buffer,
38 | msg_sting,
39 | lexer.input_position - 1
40 | );
41 | PyErr_SetString(PyExc_ValueError, error_buffer);
42 | free(error_buffer);
43 | return NULL;
44 | }
45 | return ret;
46 | }
47 |
48 | typedef struct {
49 | PyObject_HEAD
50 | struct Lexer lexer;
51 | } JsonIterState;
52 |
53 | static PyObject* json_iter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) {
54 | JsonIterState* json_iter_state = (JsonIterState *)type->tp_alloc(type, 0);
55 | if (!json_iter_state) {
56 | return NULL;
57 | }
58 |
59 | const char* string;
60 | if (!PyArg_ParseTuple(args, "s", &string)) {
61 | return NULL;
62 | }
63 | init_lexer(&json_iter_state->lexer, string);
64 |
65 | return (PyObject* )json_iter_state;
66 | }
67 |
68 | static void json_iter_dealloc(JsonIterState* json_iter_state) {
69 | release_lexer(&json_iter_state->lexer);
70 | Py_TYPE(json_iter_state)->tp_free(json_iter_state);
71 | }
72 |
73 | static PyObject* json_iter_next(JsonIterState* json_iter_state) {
74 | Py_BEGIN_ALLOW_THREADS
75 | while(json_iter_state->lexer.lexer_status == CAN_ADVANCE) {
76 | advance(&json_iter_state->lexer);
77 | }
78 | Py_END_ALLOW_THREADS
79 |
80 | if(json_iter_state->lexer.output.index == 1) {
81 | return NULL;
82 | }
83 | PyObject* ret = Py_BuildValue(
84 | "s#",
85 | json_iter_state->lexer.output.data,
86 | json_iter_state->lexer.output.index-1
87 | );
88 | reset_lexer_output(&json_iter_state->lexer);
89 | return ret;
90 | }
91 |
92 | PyTypeObject JSONIter_Type = {
93 | PyVarObject_HEAD_INIT(NULL, 0)
94 | "json_iter", /* tp_name */
95 | sizeof(JsonIterState), /* tp_basicsize */
96 | 0, /* tp_itemsize */
97 | (destructor)json_iter_dealloc, /* tp_dealloc */
98 | 0, /* tp_print */
99 | 0, /* tp_getattr */
100 | 0, /* tp_setattr */
101 | 0, /* tp_reserved */
102 | 0, /* tp_repr */
103 | 0, /* tp_as_number */
104 | 0, /* tp_as_sequence */
105 | 0, /* tp_as_mapping */
106 | 0, /* tp_hash */
107 | 0, /* tp_call */
108 | 0, /* tp_str */
109 | 0, /* tp_getattro */
110 | 0, /* tp_setattro */
111 | 0, /* tp_as_buffer */
112 | Py_TPFLAGS_DEFAULT, /* tp_flags */
113 | 0, /* tp_doc */
114 | 0, /* tp_traverse */
115 | 0, /* tp_clear */
116 | 0, /* tp_richcompare */
117 | 0, /* tp_weaklistoffset */
118 | PyObject_SelfIter, /* tp_iter */
119 | (iternextfunc)json_iter_next, /* tp_iternext */
120 | 0, /* tp_methods */
121 | 0, /* tp_members */
122 | 0, /* tp_getset */
123 | 0, /* tp_base */
124 | 0, /* tp_dict */
125 | 0, /* tp_descr_get */
126 | 0, /* tp_descr_set */
127 | 0, /* tp_dictoffset */
128 | 0, /* tp_init */
129 | PyType_GenericAlloc, /* tp_alloc */
130 | json_iter_new, /* tp_new */
131 | };
132 |
133 | static PyObject* parse_python_objects(PyObject *self, PyObject *args) {
134 | PyObject *obj = PyObject_CallObject((PyObject *) &JSONIter_Type, args);
135 | return obj;
136 | }
137 |
138 | static PyMethodDef parser_methods[] = {
139 | {
140 | "parse", parse_python_object, METH_VARARGS,
141 | "Extract JSON object from the string"
142 | },
143 | {
144 | "parse_objects", parse_python_objects, METH_VARARGS,
145 | "Iterate over all JSON objects in the string"
146 | },
147 | {NULL, NULL, 0, NULL}
148 | };
149 |
150 |
151 | static struct PyModuleDef parser_definition = {
152 | PyModuleDef_HEAD_INIT,
153 | "_chompjs",
154 | "C extension for fast JavaScript object parsing",
155 | -1,
156 | parser_methods
157 | };
158 |
159 | PyMODINIT_FUNC PyInit__chompjs(void) {
160 | Py_Initialize();
161 | PyObject* module = PyModule_Create(&parser_definition);
162 | if (!module) {
163 | return NULL;
164 | }
165 | if (PyType_Ready(&JSONIter_Type) < 0) {
166 | return NULL;
167 | }
168 | return module;
169 | }
170 |
--------------------------------------------------------------------------------
/_chompjs/parser.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020-2025 Mariusz Obajtek. All rights reserved.
3 | * License: https://github.com/Nykakin/chompjs/blob/master/LICENSE
4 | */
5 |
6 | #include "parser.h"
7 |
8 | #include
9 | #include
10 | #include
11 | #include
12 | #include
13 |
14 | #define INITIAL_NESTING_DEPTH 20
15 |
16 | struct State states[] = {
17 | {begin},
18 | {json},
19 | {value},
20 | {end},
21 | {error},
22 | };
23 |
24 | enum StateIndex {
25 | BEGIN_STATE, JSON_STATE, VALUE_STATE, END_STATE, ERROR_STATE
26 | };
27 |
28 | void advance(struct Lexer* lexer) {
29 | lexer->state = lexer->state->change(lexer);
30 | }
31 |
32 | char next_char(struct Lexer* lexer) {
33 | while(1) {
34 | if(isspace(lexer->input[lexer->input_position])) {
35 | lexer->input_position += 1;
36 | continue;
37 | }
38 | return lexer->input[lexer->input_position];
39 | }
40 | return '\0';
41 | }
42 |
43 | char last_char(struct Lexer* lexer) {
44 | return top(&lexer->output);
45 | }
46 |
47 | void emit(char c, struct Lexer* lexer) {
48 | push(&lexer->output, c);
49 | lexer->input_position += 1;
50 | }
51 |
52 | void emit_in_place(char c, struct Lexer* lexer) {
53 | push(&lexer->output, c);
54 | }
55 |
56 | void unemit(struct Lexer* lexer) {
57 | pop(&lexer->output);
58 | }
59 |
60 | void emit_string(const char *s, size_t size, struct Lexer* lexer) {
61 | push_string(&lexer->output, s, size);
62 | lexer->input_position += size;
63 | }
64 |
65 | void emit_string_in_place(const char *s, size_t size, struct Lexer* lexer) {
66 | push_string(&lexer->output, s, size);
67 | }
68 |
69 | void emit_number_in_place(long value, struct Lexer* lexer) {
70 | push_number(&lexer->output, value);
71 | }
72 |
73 | void init_lexer(struct Lexer* lexer, const char* string) {
74 | lexer->input = string;
75 | // allocate in advance more memory for output than for input because we might need
76 | // to add extra characters
77 | // for example `{a: undefined}` will be translated as `{"a": "undefined"}`
78 | lexer->output_size = 2 * strlen(string) + 1;
79 | init_char_buffer(&lexer->output, lexer->output_size);
80 | lexer->input_position = 0;
81 | init_char_buffer(&lexer->nesting_depth, INITIAL_NESTING_DEPTH);
82 | lexer->unrecognized_nesting_depth = 0;
83 | lexer->lexer_status = CAN_ADVANCE;
84 | lexer->state = &states[BEGIN_STATE];
85 | lexer->is_key = false;
86 | }
87 |
88 | void reset_lexer_output(struct Lexer* lexer) {
89 | clear(&lexer->output);
90 | lexer->lexer_status = CAN_ADVANCE;
91 | lexer->state = &states[BEGIN_STATE];
92 | lexer->is_key = false;
93 | lexer->input_position -= 1;
94 | }
95 |
96 | void release_lexer(struct Lexer* lexer) {
97 | release_char_buffer(&lexer->output);
98 | }
99 |
100 | struct State* begin(struct Lexer* lexer) {
101 | // Ignoring characters until either '{' or '[' appears
102 | for(;;) {
103 | switch(next_char(lexer)) {
104 | case '{':
105 | lexer->is_key = true;
106 | case '[':;
107 | return &states[JSON_STATE];
108 | break;
109 | case '\0':;
110 | return &states[END_STATE];
111 | default:
112 | lexer->input_position += 1;
113 | }
114 | }
115 | return &states[ERROR_STATE];
116 | }
117 |
118 | struct State* json(struct Lexer* lexer) {
119 | for(;;) {
120 | switch(next_char(lexer)) {
121 | case '{':
122 | push(&lexer->nesting_depth, '{');
123 | lexer->is_key = true;
124 | emit('{', lexer);
125 | break;
126 | case '[':
127 | push(&lexer->nesting_depth, '[');
128 | emit('[', lexer);
129 | break;
130 | case '}':
131 | if(last_char(lexer) == ',') {
132 | unemit(lexer);
133 | }
134 | pop(&lexer->nesting_depth);
135 | lexer->is_key = top(&lexer->nesting_depth) == '{';
136 | emit('}', lexer);
137 | if(size(&lexer->nesting_depth) <= 0) {
138 | return &states[END_STATE];
139 | }
140 | break;
141 | case ']':
142 | if(last_char(lexer) == ',') {
143 | unemit(lexer);
144 | }
145 | pop(&lexer->nesting_depth);
146 | lexer->is_key = top(&lexer->nesting_depth) == '{';
147 | emit(']', lexer);
148 | if(size(&lexer->nesting_depth) <= 0) {
149 | return &states[END_STATE];
150 | }
151 | break;
152 | case ':':
153 | lexer->is_key = false;
154 | emit(':', lexer);
155 | break;
156 | case ',':
157 | emit(',', lexer);
158 | lexer->is_key = top(&lexer->nesting_depth) == '{';
159 | break;
160 |
161 | case '/':;
162 | char next_c = lexer->input[lexer->input_position+1];
163 | if(next_c == '/' || next_c == '*') {
164 | handle_comments(lexer);
165 | } else {
166 | return &states[VALUE_STATE];
167 | }
168 | break;
169 |
170 | // This should never happen, but an malformed input can
171 | // cause an infinite loop without this check
172 | case '>':
173 | case ')':;
174 | return &states[ERROR_STATE];
175 | break;
176 |
177 | default:
178 | return &states[VALUE_STATE];
179 | }
180 | }
181 |
182 | return &states[ERROR_STATE];
183 | }
184 |
185 | struct State* _handle_string(struct Lexer* lexer, const char* string, size_t length) {
186 | char next_char = lexer->input[lexer->input_position+length+1];
187 | if(next_char == '_' || isalnum(next_char)) {
188 | return handle_unrecognized(lexer);
189 | }
190 | emit_string(string, length, lexer);
191 | return &states[JSON_STATE];
192 | }
193 |
194 | struct State* value(struct Lexer* lexer) {
195 | char c = next_char(lexer);
196 | const char* position = lexer->input + lexer->input_position;
197 |
198 | if(c == '"' || c == '\'' || c == '`') {
199 | return handle_quoted(lexer);
200 | } else if(isdigit(c) || c == '.' || c == '-') {
201 | if(lexer->is_key) {
202 | return handle_unrecognized(lexer);
203 | } else {
204 | return handle_numeric(lexer);
205 | }
206 | } else if(strncmp(position, "true", 4) == 0) {
207 | return _handle_string(lexer, "true", 4);
208 | } else if(strncmp(position, "false", 5) == 0) {
209 | return _handle_string(lexer, "false", 5);
210 | } else if(strncmp(position, "null", 4) == 0) {
211 | return _handle_string(lexer, "null", 4);
212 | } else if(c == ']' || c == '}' || c == '[' || c == '{') {
213 | return &states[JSON_STATE];
214 | } else if(strncmp(position, "NaN", 3) == 0) {
215 | return _handle_string(lexer, "NaN", 3);
216 | } else {
217 | return handle_unrecognized(lexer);
218 | }
219 |
220 | return &states[JSON_STATE];
221 | }
222 |
223 | struct State* end(struct Lexer* lexer) {
224 | emit('\0', lexer);
225 | lexer->lexer_status = FINISHED;
226 | return lexer->state;
227 | }
228 |
229 | struct State* error(struct Lexer* lexer) {
230 | emit('\0', lexer);
231 | lexer->lexer_status = ERROR;
232 | return lexer->state;
233 | }
234 |
235 | struct State* handle_quoted(struct Lexer* lexer) {
236 | char current_quotation = next_char(lexer);
237 | emit('"', lexer);
238 |
239 | for(;;) {
240 | char c = lexer->input[lexer->input_position];
241 | // handle escape sequences such as \\ and \'
242 | if(c == '\\') {
243 | char escaped = lexer->input[lexer->input_position+1];
244 | if(escaped == '\'') {
245 | emit('\'', lexer);
246 | lexer->input_position += 1;
247 | } else {
248 | emit('\\', lexer);
249 | emit(escaped, lexer);
250 | }
251 | continue;
252 | }
253 | // in case of malformed quotation we can reach end of the input
254 | if(c == '\0') {
255 | return &states[ERROR_STATE];
256 | }
257 | // if we're closing the quotations, we're done with the string
258 | if(c == current_quotation) {
259 | emit('"', lexer);
260 | return &states[JSON_STATE];
261 | }
262 | // otherwise, emit character
263 | if(c == '"') {
264 | emit_string_in_place("\\\"", 2, lexer);
265 | lexer->input_position += 1;
266 | } else {
267 | emit(c, lexer);
268 | }
269 | }
270 |
271 | return &states[ERROR_STATE];
272 | }
273 |
274 | struct State* handle_numeric(struct Lexer* lexer) {
275 | char c = next_char(lexer);
276 | if(c >= 49 && c <= 57) { // 1-9 range
277 | return handle_numeric_standard_base(lexer);
278 | } else if(c == '.') {
279 | emit_in_place('0', lexer);
280 | emit('.', lexer);
281 | return handle_numeric_standard_base(lexer);
282 | } else if(c == '-') {
283 | emit('-', lexer);
284 | return handle_numeric(lexer);
285 | } else if(c == '0') {
286 | char nc = tolower(lexer->input[lexer->input_position+1]);
287 | if(nc == '.') {
288 | emit('0', lexer);
289 | emit('.', lexer);
290 | return handle_numeric_standard_base(lexer);
291 | } else if(nc == 'x' || nc == 'X') {
292 | return handle_numeric_non_standard_base(lexer, 16);
293 | } else if(nc == 'o' || nc == 'O') {
294 | lexer->input_position += 2;
295 | return handle_numeric_non_standard_base(lexer, 8);
296 | } else if(isdigit(nc)) {
297 | return handle_numeric_non_standard_base(lexer, 8);
298 | } else if(nc == 'b' || nc == 'B') {
299 | lexer->input_position += 2;
300 | return handle_numeric_non_standard_base(lexer, 2);
301 | } else {
302 | emit('0', lexer);
303 | return &states[JSON_STATE];
304 | }
305 | } else {
306 | return &states[ERROR_STATE];
307 | }
308 | return &states[JSON_STATE];
309 | }
310 |
311 | struct State* handle_numeric_standard_base(struct Lexer* lexer) {
312 | char c = next_char(lexer);
313 | do {
314 | if(c != '_') {
315 | emit(c, lexer);
316 | } else {
317 | lexer->input_position += 1;
318 | }
319 | c = tolower(lexer->input[lexer->input_position]);
320 | } while(isdigit(c) || c == '.' || c == 'e' || c == 'E' || c == '+' || c =='-' || c == '_');
321 | if(last_char(lexer) == '.') {
322 | emit_in_place('0', lexer);
323 | }
324 | return &states[JSON_STATE];
325 | }
326 |
327 | struct State* handle_numeric_non_standard_base(struct Lexer* lexer, int base) {
328 | char* end;
329 | long n = strtol(lexer->input + lexer->input_position, &end, base);
330 | emit_number_in_place(n, lexer);
331 | lexer->input_position = end - lexer->input;
332 | return &states[JSON_STATE];
333 | }
334 |
335 | struct State* handle_unrecognized(struct Lexer* lexer) {
336 | emit_in_place('"', lexer);
337 | char currently_quoted_with = '\0';
338 |
339 | lexer->unrecognized_nesting_depth = 0;
340 | do {
341 | char c = lexer->input[lexer->input_position];
342 |
343 | switch(c) {
344 | case '\\':
345 | emit_in_place('\\', lexer);
346 | emit('\\', lexer);
347 | break;
348 |
349 | case '\'':
350 | case '"':
351 | case '`':
352 | if(c == '"') {
353 | emit_in_place('\\', lexer);
354 | emit('"', lexer);
355 | } else {
356 | emit(c, lexer);
357 | }
358 |
359 | if(!currently_quoted_with) {
360 | currently_quoted_with = c;
361 | } else if (currently_quoted_with == c) {
362 | currently_quoted_with = '\0';
363 | }
364 | break;
365 |
366 | case '{':
367 | case '[':
368 | case '<':
369 | case '(':
370 | emit(c, lexer);
371 | lexer->unrecognized_nesting_depth += 1;
372 | break;
373 |
374 | case '}':
375 | case ']':
376 | case '>':
377 | case ')':
378 | if(currently_quoted_with && lexer->unrecognized_nesting_depth > 0) {
379 | emit(c, lexer);
380 | } else if(lexer->unrecognized_nesting_depth > 0) {
381 | emit(c, lexer);
382 | lexer->unrecognized_nesting_depth -= 1;
383 | } else {
384 | // remove trailing whitespaces after value
385 | while(isspace(last_char(lexer))) {
386 | pop(&lexer->output);
387 | }
388 | emit_in_place('"', lexer);
389 | return &states[JSON_STATE];
390 | }
391 | break;
392 |
393 | case ',':
394 | case ':':
395 | if(!currently_quoted_with && lexer->unrecognized_nesting_depth <= 0) {
396 | // remove trailing whitespaces after key
397 | while(isspace(last_char(lexer))) {
398 | pop(&lexer->output);
399 | }
400 | emit_in_place('"', lexer);
401 | return &states[JSON_STATE];
402 | } else {
403 | emit(c, lexer);
404 | }
405 | break;
406 |
407 | default:
408 | emit(c, lexer);
409 | }
410 | } while (lexer->input[lexer->input_position] != '\0');
411 |
412 | return &states[ERROR_STATE];
413 | }
414 |
415 | void handle_comments(struct Lexer* lexer) {
416 | char c, next_c;
417 |
418 | lexer->input_position += 1;
419 | if(lexer->input[lexer->input_position] == '/' ) {
420 | for(;;) {
421 | lexer->input_position+=1;
422 | c = lexer->input[lexer->input_position];
423 | if((c == '\0') || (c == '\n')) {
424 | break;
425 | }
426 | }
427 | } else if(lexer->input[lexer->input_position] == '*') {
428 | for(;;) {
429 | lexer->input_position+=1;
430 | c = lexer->input[lexer->input_position];
431 | next_c = lexer->input[lexer->input_position+1];
432 | if((c == '\0') || (c == '*' && next_c == '/')) {
433 | break;
434 | }
435 | }
436 | lexer->input_position+=2;
437 | }
438 | }
439 |
--------------------------------------------------------------------------------
/_chompjs/parser.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020-2025 Mariusz Obajtek. All rights reserved.
3 | * License: https://github.com/Nykakin/chompjs/blob/master/LICENSE
4 | */
5 |
6 | #ifndef CHOMPJS_PARSER_H
7 | #define CHOMPJS_PARSER_H
8 |
9 | #include <stdbool.h>
10 | #include <stddef.h>
11 |
12 | #include "buffer.h"
13 |
14 | struct Lexer;
15 |
16 | /**
17 | States of internal state machine:
18 | * begin - start parsing
19 | * json - handle special characters: "[", "{", "}", "]", ",", ":"
20 | * value - handle a JSON value, such as strings and numbers
21 | * end - finish work
22 | * error - finish work, mark an error
23 | */
24 | struct State* begin(struct Lexer* lexer);
25 | struct State* json(struct Lexer* lexer);
26 | struct State* value(struct Lexer* lexer);
27 | struct State* end(struct Lexer* lexer);
28 | struct State* error(struct Lexer* lexer);
29 |
30 | /*
31 | Helper functions used in "value" state
32 | * handle_quoted - handles quoted strings
33 | * handle_numeric - handle numbers
34 | * handle_numeric_standard_base - handle numbers in standard base-10
35 | * handle_numeric_non_standard_base - handle numbers in non-standard bases (hex, octal, binary)
36 | * handle_unrecognized - save all unrecognized data as a string
37 | */
38 | struct State* handle_quoted(struct Lexer* lexer);
39 | struct State* handle_numeric(struct Lexer* lexer);
40 | struct State* handle_numeric_standard_base(struct Lexer* lexer);
41 | struct State* handle_numeric_non_standard_base(struct Lexer* lexer, int base);
42 | struct State* handle_unrecognized(struct Lexer* lexer);
43 |
/**
    State wrapper: one node of the internal state machine
*/
struct State {
    /* Handler of this state; the handlers declared above return a pointer
       to a State, which the parser code uses as the state to go to next */
    struct State* (*change)(struct Lexer *);
};
50 |
/** Possible outcomes of a state change of the internal state machine */
typedef enum {
    /* NOTE(review): presumably "keep running"; not set anywhere in the
       visible parser code - confirm */
    CAN_ADVANCE,
    /* stored in Lexer.lexer_status by the "end" state */
    FINISHED,
    /* stored in Lexer.lexer_status by the "error" state */
    ERROR,
} LexerStatus;
57 |
58 | /** Main object, responsible for everything */
59 | struct Lexer {
60 | const char* input;
61 | size_t output_size;
62 | struct CharBuffer output;
63 | size_t input_position;
64 | LexerStatus lexer_status;
65 | struct State* state;
66 | struct CharBuffer nesting_depth;
67 | size_t unrecognized_nesting_depth;
68 | bool is_key;
69 | };
70 |
71 | /** Switch state of internal state machine */
72 | void advance(struct Lexer* lexer);
73 |
74 | /** Get next char, ignore whitespaces */
75 | char next_char(struct Lexer* lexer);
76 |
77 | /** Get previously handled char */
78 | char last_char(struct Lexer* lexer);
79 |
80 | /** Send character to output buffer, advance input position */
81 | void emit(char c, struct Lexer* lexer);
82 |
83 | /** Send character to output buffer, keep old input position */
84 | void emit_in_place(char c, struct Lexer* lexer);
85 |
86 | /** Remove last character from output buffer */
87 | void unemit(struct Lexer* lexer);
88 |
89 | /** Send string to output buffer, advance input position */
90 | void emit_string(const char *s, size_t size, struct Lexer* lexer);
91 |
92 | /** Send string to output buffer, keep old input position */
93 | void emit_string_in_place(const char *s, size_t size, struct Lexer* lexer);
94 |
95 | /** Send number to output buffer, keep old input position */
96 | void emit_number_in_place(long value, struct Lexer* lexer);
97 |
98 | /** Handle comments in JSON body */
99 | void handle_comments(struct Lexer* lexer);
100 |
101 | /** Initialize main lexer object */
102 | void init_lexer(struct Lexer* lexer, const char* string);
103 |
104 | /** Reset main lexer object output buffer */
105 | void reset_lexer_output(struct Lexer* lexer);
106 |
107 | /** Release main lexer object and its memory */
108 | void release_lexer(struct Lexer* lexer);
109 |
110 | #endif
111 |
--------------------------------------------------------------------------------
/chompjs/__init__.py:
--------------------------------------------------------------------------------
1 | from .chompjs import parse_js_object, parse_js_objects
2 |
--------------------------------------------------------------------------------
/chompjs/chompjs.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import json
4 | import warnings
5 |
6 | from _chompjs import parse, parse_objects
7 |
8 |
9 | def _preprocess(string, unicode_escape=False):
10 | if unicode_escape:
11 | string = string.encode().decode("unicode_escape")
12 | return string
13 |
14 |
15 | def _process_loader_arguments(loader_args, loader_kwargs, json_params):
16 | if json_params:
17 | msg = "json_params argument is deprecated, please use loader_kwargs instead"
18 | warnings.warn(msg, DeprecationWarning)
19 | loader_kwargs = json_params
20 |
21 | if not loader_args:
22 | loader_args = []
23 |
24 | if not loader_kwargs:
25 | loader_kwargs = {}
26 |
27 | return (loader_args, loader_kwargs)
28 |
29 |
def parse_js_object(
    string,
    unicode_escape=False,
    loader=json.loads,
    loader_args=None,
    loader_kwargs=None,
    json_params=None,
):
    """
    Extracts first JSON object encountered in the input string

    Parameters
    ----------
    string: str
        Input string

    >>> parse_js_object("{a: 100}")
    {'a': 100}

    unicode_escape: bool, optional
        Attempt to fix input string if it contains escaped special characters

    >>> parse_js_object('{\\\\"a\\\\": 100}')
    {'\\\\"a\\\\"': 100}
    >>> parse_js_object('{\\\\"a\\\\": 100}', unicode_escape=True)
    {'a': 100}

    loader: func, optional
        Function used to load processed input data. By default `json.loads` is used

    >>> import orjson
    >>> import chompjs
    >>>
    >>> chompjs.parse_js_object("{'a': 12}", loader=orjson.loads)
    {'a': 12}

    loader_args: list, optional
        Allow passing down positional arguments to loader function

    loader_kwargs: dict, optional
        Allow passing down keyword arguments to loader function

    >>> parse_js_object("{'a': 10.1}")
    {'a': 10.1}
    >>> import decimal
    >>> parse_js_object("{'a': 10.1}", loader_kwargs={'parse_float': decimal.Decimal})
    {'a': Decimal('10.1')}

    .. deprecated:: 1.3.0
    json_params: dict, optional
        Use `loader_kwargs` instead

    Returns
    -------
    list | dict
        Extracted JSON object

    Raises
    ------
    ValueError
        If failed to parse input properly

    ```python
    >>> parse_js_object(None)
    Traceback (most recent call last):
    ...
    ValueError: Invalid input
    >>> parse_js_object("No JSON objects in sight...")
    Traceback (most recent call last):
    ...
    json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

    ```

    """
    # empty or missing input is rejected before any other work is done
    if not string:
        raise ValueError("Invalid input")

    args, kwargs = _process_loader_arguments(loader_args, loader_kwargs, json_params)

    # unescape (optional) -> convert to JSON text -> load
    json_text = parse(_preprocess(string, unicode_escape))
    return loader(json_text, *args, **kwargs)
115 |
116 |
def parse_js_objects(
    string,
    unicode_escape=False,
    omitempty=False,
    loader=json.loads,
    loader_args=None,
    loader_kwargs=None,
    json_params=None,
):
    """
    Returns a generator extracting all JSON objects encountered in the input string.
    Can be used to read JSON Lines

    Parameters
    ----------
    string: str
        Input string

    >>> it = parse_js_objects("{a: 100} {b: 100}")
    >>> next(it)
    {'a': 100}
    >>> next(it)
    {'b': 100}

    unicode_escape: bool, optional
        Attempt to fix input string if it contains escaped special characters

    >>> next(parse_js_objects('{\\\\"a\\\\": 100}'))
    {'\\\\"a\\\\"': 100}
    >>> next(parse_js_objects('{\\\\"a\\\\": 100}', unicode_escape=True))
    {'a': 100}

    omitempty: bool, optional
        Skip empty dictionaries and lists

    >>> list(parse_js_objects("{a: 12} {} {b: 13}"))
    [{'a': 12}, {}, {'b': 13}]
    >>> list(parse_js_objects("{a: 12} {} {b: 13}", omitempty=True))
    [{'a': 12}, {'b': 13}]

    loader: func, optional
        Function used to load processed input data. By default `json.loads` is used

    >>> import orjson
    >>> import chompjs
    >>>
    >>> next(chompjs.parse_js_objects("{'a': 12}", loader=orjson.loads))
    {'a': 12}

    loader_args: list, optional
        Allow passing down positional arguments to loader function

    loader_kwargs: dict, optional
        Allow passing down keyword arguments to loader function

    >>> next(parse_js_objects("{'a': 10.1}"))
    {'a': 10.1}
    >>> import decimal
    >>> next(parse_js_objects("{'a': 10.1}", loader_kwargs={'parse_float': decimal.Decimal}))
    {'a': Decimal('10.1')}

    .. deprecated:: 1.3.0
    json_params: dict, optional
        Use `loader_kwargs` instead

    Returns
    -------
    generator
        Iterating over it yields all encountered JSON objects
    """

    # nothing to scan: the generator finishes without yielding anything
    if not string:
        return

    args, kwargs = _process_loader_arguments(loader_args, loader_kwargs, json_params)

    for chunk in parse_objects(_preprocess(string, unicode_escape)):
        try:
            obj = loader(chunk, *args, **kwargs)
        except ValueError:
            # a chunk the loader rejects is skipped; scanning goes on
            continue

        if not obj and omitempty:
            continue

        yield obj
206 |
--------------------------------------------------------------------------------
/chompjs/test_parser.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import unicode_literals
3 |
4 | import functools
5 | import math
6 | import unittest
7 |
8 | from chompjs import parse_js_object, parse_js_objects
9 |
10 |
def parametrize_test(*arguments_list):
    """
    Decorator factory turning one test method into a table-driven test.

    Each positional argument is a tuple of arguments for a single case. The
    decorated method is called once per tuple, in order, with the tuple
    unpacked after `self`. Arguments given to the wrapper itself are ignored.
    """
    def decorate(func):
        @functools.wraps(func)
        def wrapper(self, *args, **kwargs):
            for case in arguments_list:
                func(self, *case)

        return wrapper

    return decorate
19 |
20 |
# Table-driven tests of parse_js_object and parse_js_objects.
# Every tuple handed to @parametrize_test is unpacked into the positional
# arguments of the decorated method: (input text, expected Python value).
21 | class TestParser(unittest.TestCase):
# Plain object literals, including a nested one and an empty one.
22 | @parametrize_test(
23 | ("{'hello': 'world'}", {'hello': 'world'}),
24 | ("{'hello': 'world', 'my': 'master'}", {'hello': 'world', 'my': 'master'}),
25 | (
26 | "{'hello': 'world', 'my': {'master': 'of Orion'}, 'test': 'xx'}",
27 | {'hello': 'world', 'my': {'master': 'of Orion'}, 'test': 'xx'},
28 | ),
29 | ("{}", {}),
30 | )
31 | def test_parse_object(self, in_data, expected_data):
32 | result = parse_js_object(in_data)
33 | self.assertEqual(result, expected_data)
34 |
# Lists: empty, flat and deeply nested.
35 | @parametrize_test(
36 | ("[]", []),
37 | ("[[[]]]", [[[]]]),
38 | ("[[[1]]]", [[[1]]]),
39 | ("[1]", [1]),
40 | ("[1, 2, 3, 4]", [1, 2, 3, 4]),
41 | ("['h', 'e', 'l', 'l', 'o']", ['h', 'e', 'l', 'l', 'o']),
42 | ("[[[[[[[[[[[[[[[1]]]]]]]]]]]]]]]", [[[[[[[[[[[[[[[1]]]]]]]]]]]]]]]),
43 | )
44 | def test_parse_list(self, in_data, expected_data):
45 | result = parse_js_object(in_data)
46 | self.assertEqual(result, expected_data)
47 |
# Objects and lists nested in each other; also a trailing comma, unquoted
# keys and all three quote styles.
48 | @parametrize_test(
49 | ("{'hello': [], 'world': [0]}", {'hello': [], 'world': [0]}),
50 | ("{'hello': [1, 2, 3, 4]}", {'hello': [1, 2, 3, 4]}),
51 | ("[{'a':12}, {'b':33}]", [{'a': 12}, {'b': 33}]),
52 | (
53 | "[false, {'true': true, `pies`: \"kot\"}, false,]",
54 | [False, {"true": True, 'pies': 'kot'}, False],
55 | ),
56 | (
57 | "{a:1,b:1,c:1,d:1,e:1,f:1,g:1,h:1,i:1,j:1}",
58 | {k: 1 for k in 'abcdefghij'},
59 | ),
60 | (
61 | "{'a':[{'b':1},{'c':[{'d':{'f':{'g':[1,2]}}},{'e':1}]}]}",
62 | {'a': [{'b': 1}, {'c': [{'d': {'f': {'g': [1, 2]}}}, {'e': 1}]}]},
63 | ),
64 | )
65 | def test_parse_mixed(self, in_data, expected_data):
66 | result = parse_js_object(in_data)
67 | self.assertEqual(result, expected_data)
68 |
# Scalar values: numbers, true/false/null, escape sequences, and comment
# markers that appear inside string literals.
69 | @parametrize_test(
70 | ("{'hello': 12, 'world': 10002.21}", {'hello': 12, 'world': 10002.21}),
71 | ("[12, -323, 0.32, -32.22, .2, - 4]", [12, -323, 0.32, -32.22, 0.2, -4]),
72 | ('{"a": -12, "b": - 5}', {'a': -12, 'b': -5}),
73 | ("{'a': true, 'b': false, 'c': null}", {'a': True, 'b': False, 'c': None}),
74 | ("[\"\\uD834\\uDD1E\"]", [u'𝄞']),
75 | ("{'a': '123\\'456\\n'}", {'a': "123'456\n"}),
76 | ("['\u00E9']", ['é']),
77 | ('{"cache":{"\u002Ftest\u002F": 0}}', {'cache': {'/test/': 0}}),
78 | ('{"a": 3.125e7}', {'a': 3.125e7}),
79 | ('''{"a": "b\\'"}''', {'a': "b'"}),
80 | ('{"a": .99, "b": -.1}', {"a": 0.99, "b": -.1}),
81 | ('["/* ... */", "// ..."]', ["/* ... */", "// ..."]),
82 | ('{"inclusions":["/*","/"]}', {'inclusions': ['/*', '/']}),
83 | )
84 | def test_parse_standard_values(self, in_data, expected_data):
85 | result = parse_js_object(in_data)
86 | self.assertEqual(result, expected_data)
87 |
# NaN cannot be compared with assertEqual, hence the separate isnan check.
88 | def test_parse_nan(self):
89 | in_data = '{"A": NaN}'
90 | result = parse_js_object(in_data)
91 | self.assertTrue(math.isnan(result["A"]))
92 |
# Values that are not JSON (identifiers, functions, regular expressions,
# undefined) are expected back as strings; numeric keys become string keys.
# NOTE(review): several inputs below look identical in this copy of the file;
# they presumably differed in whitespace originally - confirm against the
# repository.
93 | @parametrize_test(
94 | ("{abc: 100, dev: 200}", {'abc': 100, 'dev': 200}),
95 | ("{abcdefghijklmnopqrstuvwxyz: 12}", {"abcdefghijklmnopqrstuvwxyz": 12}),
96 | (
97 | "{age: function(yearBorn,thisYear) {return thisYear - yearBorn;}}",
98 | {"age": "function(yearBorn,thisYear) {return thisYear - yearBorn;}"}
99 | ),
100 | (
101 | "{\"abc\": function() {return '])))))))))))))))';}}",
102 | {"abc": "function() {return '])))))))))))))))';}"},
103 | ),
104 | ('{"a": undefined}', {"a": "undefined"}),
105 | ('[undefined, undefined]', ["undefined", "undefined"]),
106 | ("{_a: 1, $b: 2}", {"_a": 1, "$b": 2}),
107 | ("{regex: /a[^d]{1,12}/i}", {'regex': '/a[^d]{1,12}/i'}),
108 | ("{'a': function(){return '\"'}}", {'a': 'function(){return \'"\'}'}),
109 | ("{1: 1, 2: 2, 3: 3, 4: 4}", {'1': 1, '2': 2, '3': 3, '4': 4}),
110 | ("{'a': 121.}", {'a': 121.0}),
111 | ("{abc : 100}", {'abc': 100}),
112 | ("{abc : 100}", {'abc': 100}),
113 | ("{abc: name }", {'abc': "name"}),
114 | ("{abc: name\t}", {'abc': "name"}),
115 | ("{abc: value\n}", {'abc': "value"}),
116 | ("{abc: name}", {'abc': "name"}),
117 | ("{abc: \tname}", {'abc': "name"}),
118 | ("{abc: \nvalue}", {'abc': "value"}),
119 | )
120 | def test_parse_strange_values(self, in_data, expected_data):
121 | result = parse_js_object(in_data)
122 | self.assertEqual(result, expected_data)
123 |
# Text around the object, CRLF line breaks, and keys that merely start with
# true / false / null.
124 | @parametrize_test(
125 | ('{"a": {"b": [12, 13, 14]}}text text', {"a": {"b": [12, 13, 14]}}),
126 | ('var test = {"a": {"b": [12, 13, 14]}}', {"a": {"b": [12, 13, 14]}}),
127 | ('{"a":\r\n10}', {'a': 10}),
128 | ("{'foo': 0,\r\n}", {'foo': 0}),
129 | ("{truefalse: 0, falsefalse: 1, nullnull: 2}", {'truefalse': 0, 'falsefalse': 1, 'nullnull': 2}),
130 | )
131 | def test_strange_input(self, in_data, expected_data):
132 | result = parse_js_object(in_data)
133 | self.assertEqual(result, expected_data)
134 |
# Integers: decimal, digit separators, hex, octal (legacy and 0o) and binary,
# each in positive and negative form.
135 | @parametrize_test(
136 | ("[0]", [0]),
137 | ("[1]", [1]),
138 | ("[12]", [12]),
139 | ("[12_12]", [1212]),
140 | ("[0x12]", [18]),
141 | ("[0xab]", [171]),
142 | ("[0xAB]", [171]),
143 | ("[0X12]", [18]),
144 | ("[0Xab]", [171]),
145 | ("[0XAB]", [171]),
146 | ("[01234]", [668]),
147 | ("[0o1234]", [668]),
148 | ("[0O1234]", [668]),
149 | ("[0b1111]", [15]),
150 | ("[0B1111]", [15]),
151 | ("[-0]", [-0]),
152 | ("[-1]", [-1]),
153 | ("[-12]", [-12]),
154 | ("[-12_12]", [-1212]),
155 | ("[-0x12]", [-18]),
156 | ("[-0xab]", [-171]),
157 | ("[-0xAB]", [-171]),
158 | ("[-0X12]", [-18]),
159 | ("[-0Xab]", [-171]),
160 | ("[-0XAB]", [-171]),
161 | ("[-01234]", [-668]),
162 | ("[-0o1234]", [-668]),
163 | ("[-0O1234]", [-668]),
164 | ("[-0b1111]", [-15]),
165 | ("[-0B1111]", [-15]),
166 | )
167 | def test_integer_numeric_values(self, in_data, expected_data):
168 | result = parse_js_object(in_data)
169 | self.assertEqual(result, expected_data)
170 |
# Floats: missing integer or fractional part, and exponent notation.
171 | @parametrize_test(
172 | ("[0.32]", [0.32]),
173 | ("[-0.32]", [-0.32]),
174 | ("[.32]", [0.32]),
175 | ("[-.32]", [-0.32]),
176 | ("[12.]", [12.0]),
177 | ("[-12.]", [-12.0]),
178 | ("[12.32]", [12.32]),
179 | ("[-12.12]", [-12.12]),
180 | ("[3.1415926]", [3.1415926]),
181 | ("[.123456789]", [.123456789]),
182 | ("[.0123]", [0.0123]),
183 | ("[0.0123]", [0.0123]),
184 | ("[-.0123]", [-0.0123]),
185 | ("[-0.0123]", [-0.0123]),
186 | ("[3.1E+12]", [3.1E+12]),
187 | ("[3.1e+12]", [3.1E+12]),
188 | ("[.1E-23]", [.1e-23]),
189 | ("[.1e-23]", [.1e-23]),
190 | )
191 | def test_float_numeric_values(self, in_data, expected_data):
192 | result = parse_js_object(in_data)
193 | self.assertEqual(result, expected_data)
194 |
195 |
# Line (//) and block (/* */) comments around and inside the data are skipped.
196 | @parametrize_test(
197 | (
198 | """
199 | var obj = {
200 | // Comment
201 | x: "X", // Comment
202 | };
203 | """,
204 | {"x": "X"},
205 | ),
206 | (
207 | """
208 | var /* Comment */ obj = /* Comment */ {
209 | /* Comment */
210 | x: /* Comment */ "X", /* Comment */
211 | };
212 | """,
213 | {"x": "X"},
214 | ),
215 | (
216 | """[/*...*/1,2,3,/*...*/4,5,6]""",
217 | [1, 2, 3, 4, 5, 6],
218 | ),
219 | )
220 | def test_comments(self, in_data, expected_data):
221 | result = parse_js_object(in_data)
222 | self.assertEqual(result, expected_data)
223 |
# JSON Lines: parse_js_objects yields one object per line.
224 | @parametrize_test(
225 | ('["Test\\nDrive"]\n{"Test": "Drive"}', [['Test\nDrive'], {'Test': 'Drive'}]),
226 | )
227 | def test_jsonlines(self, in_data, expected_data):
228 | result = list(parse_js_objects(in_data))
229 | self.assertEqual(result, expected_data)
231 |
# Error paths of parse_js_object. Each tuple is (input, expected exception
# type), plus a pattern for the exception text in the last test.
232 | class TestParserExceptions(unittest.TestCase):
# Reversed brackets, empty input and None are all rejected with ValueError.
233 | @parametrize_test(
234 | ('}{', ValueError),
235 | ('', ValueError),
236 | (None, ValueError),
237 | )
238 | def test_exceptions(self, in_data, expected_exception):
239 | with self.assertRaises(expected_exception):
240 | parse_js_object(in_data)
241 |
# Unbalanced quotes followed by a stray '>' must end in an error.
242 | @parametrize_test(
243 | ("{whose: 's's', category_name: '>'}", ValueError),
244 | )
245 | def test_malformed_input(self, in_data, expected_exception):
246 | with self.assertRaises(expected_exception):
247 | parse_js_object(in_data)
248 |
# The exception text is expected to name the position of the failure.
249 | @parametrize_test(
250 | (
251 | '{"test": """}',
252 | ValueError,
253 | 'Error parsing input near character 13',
254 | ),
255 | )
256 | def test_error_messages(self, in_data, expected_exception, expected_exception_text):
257 | with self.assertRaisesRegex(expected_exception, expected_exception_text):
258 | parse_js_object(in_data)
260 |
# Optional arguments of parse_js_object: unicode_escape, loader_kwargs and
# loader.
261 | class TestOptions(unittest.TestCase):
# Backslash-escaped quotes are unescaped before parsing.
262 | @parametrize_test(
263 | ('{\\\"a\\\": 12}', {'a': 12}),
264 | )
265 | def test_unicode_escape(self, in_data, expected_data):
266 | result = parse_js_object(in_data, unicode_escape=True)
267 | self.assertEqual(result, expected_data)
268 |
# loader_kwargs is forwarded to the loader: strict=False lets json.loads
# accept raw control characters inside strings.
# NOTE(review): the expected value of the last case contains indentation that
# is not visible in the input as shown in this copy of the file - confirm
# against the repository.
269 | @parametrize_test(
270 | ('["\n"]', ["\n"]),
271 | ("{'a': '\"\"', 'b': '\\\\', 'c': '\t\n'}", {'a': '""', 'b': '\\', 'c': '\t\n'}),
272 | (
273 | """var myObj = {
274 | myMethod: function(params) {
275 | // ...
276 | },
277 | myValue: 100
278 | }""",
279 | {'myMethod': 'function(params) {\n // ...\n }', 'myValue': 100},
280 | ),
281 | )
282 | def test_json_non_strict(self, in_data, expected_data):
283 | result = parse_js_object(in_data, loader_kwargs={'strict': False})
284 | self.assertEqual(result, expected_data)
285 |
# A different loader can be used; here ast.literal_eval replaces json.loads.
286 | @parametrize_test(
287 | ("[]", []),
288 | ("[1, 2, 3]", [1, 2, 3]),
289 | ('var x = [1, 2, 3, 4, 5,]', [1, 2, 3, 4, 5]),
290 | ('{}', {}),
291 | ("{'a': 12, 'b': 13, 'c': 14}", {'a': 12, 'b': 13, 'c': 14}),
292 | ("var x = {'a': 12, 'b': 13, 'c': 14}", {'a': 12, 'b': 13, 'c': 14}),
293 | )
294 | def test_loader(self, in_data, expected_data):
295 | import ast
296 | result = parse_js_object(in_data, loader=ast.literal_eval)
297 | self.assertEqual(result, expected_data)
298 |
299 |
# Tests of the parse_js_objects generator. Each tuple is (input text, list of
# all objects the generator is expected to yield).
300 | class TestParseJsonObjects(unittest.TestCase):
# Input without any object yields nothing; objects that fail to load are
# left out of the result while the remaining ones are still returned.
301 | @parametrize_test(
302 | ("", []),
303 | ("aaaaaaaaaaaaaaaa", []),
304 | (" ", []),
305 | (" {'a': 12}", [{'a': 12}]),
306 | ("[1, 2, 3, 4]xxxxxxxxxxxxxxxxxxxxxxxx", [[1, 2, 3, 4]]),
307 | ("[12] [13] [14]", [[12], [13], [14]]),
308 | ("[10] {'a': [1, 1, 1,]}", [[10], {'a': [1, 1, 1]}]),
309 | ("[1][1][1]", [[1], [1], [1]]),
310 | ("[1] [2] {'a': ", [[1], [2]]),
311 | ("[]", [[]]),
312 | ("[][][][]", [[], [], [], []]),
313 | ("{}", [{}]),
314 | ("{}{}{}{}", [{}, {}, {}, {}]),
315 | ("{{}}{{}}", []),
316 | ("[[]][[]]", [[[]], [[]]]),
317 | ("{am: 'ab'}\n{'ab': 'xx'}", [{'am': 'ab'}, {'ab': 'xx'}]),
318 | (
319 | 'function(a, b, c){ /* ... */ }({"a": 12}, Null, [1, 2, 3])',
320 | [{}, {'a': 12}, [1, 2, 3]],
321 | ),
322 | ('{"a": 12, broken}{"c": 100}', [{'c': 100}]),
323 | ('[12,,,,21][211,,,][12,12][12,,,21]', [[12, 12]]),
324 | )
325 | def test_parse_json_objects(self, in_data, expected_data):
326 | result = list(parse_js_objects(in_data))
327 | self.assertEqual(result, expected_data)
328 |
# With omitempty=True empty lists and dictionaries are dropped.
329 | @parametrize_test(
330 | ("[1][][2]", [[1], [2]]),
331 | ("{'a': 12}{}{'b': 13}", [{'a': 12}, {'b': 13}]),
332 | ("[][][][][][][][][]", []),
333 | ("{}{}{}{}{}{}{}{}{}", []),
334 | )
335 | def test_parse_json_objects_without_empty(self, in_data, expected_data):
336 | result = list(parse_js_objects(in_data, omitempty=True))
337 | self.assertEqual(result, expected_data)
338 |
339 |
# Run the whole suite when the file is executed as a script.
340 | if __name__ == '__main__':
341 | unittest.main()
342 |
--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | chompjs API documentation
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
24 |
25 |
26 |
27 | Expand source code
28 |
29 | # -*- coding: utf-8 -*-
30 |
31 | import json
32 | import warnings
33 |
34 | from _chompjs import parse, parse_objects
35 |
36 |
37 | def _preprocess(string, unicode_escape=False):
38 | if unicode_escape:
39 | string = string.encode().decode("unicode_escape")
40 | return string
41 |
42 |
43 | def _process_loader_arguments(loader_args, loader_kwargs, json_params):
44 | if json_params:
45 | msg = "json_params argument is deprecated, please use loader_kwargs instead"
46 | warnings.warn(msg, DeprecationWarning)
47 | loader_kwargs = json_params
48 |
49 | if not loader_args:
50 | loader_args = []
51 |
52 | if not loader_kwargs:
53 | loader_kwargs = {}
54 |
55 | return (loader_args, loader_kwargs)
56 |
57 |
58 | def parse_js_object(
59 | string,
60 | unicode_escape=False,
61 | loader=json.loads,
62 | loader_args=None,
63 | loader_kwargs=None,
64 | json_params=None,
65 | ):
66 | """
67 | Extracts first JSON object encountered in the input string
68 |
69 | Parameters
70 | ----------
71 | string: str
72 | Input string
73 |
74 | >>> parse_js_object("{a: 100}")
75 | {'a': 100}
76 |
77 | unicode_escape: bool, optional
78 | Attempt to fix input string if it contains escaped special characters
79 |
80 | >>> parse_js_object('{\\\\"a\\\\": 100}')
81 | {'\\\\"a\\\\"': 100}
82 | >>> parse_js_object('{\\\\"a\\\\": 100}', unicode_escape=True)
83 | {'a': 100}
84 |
85 | loader: func, optional
86 | Function used to load processed input data. By default `json.loads` is used
87 |
88 | >>> import orjson
89 | >>> import chompjs
90 | >>>
91 | >>> chompjs.parse_js_object("{'a': 12}", loader=orjson.loads)
92 | {'a': 12}
93 |
94 | loader_args: list, optional
95 | Allow passing down positional arguments to loader function
96 |
97 | loader_kwargs: dict, optional
98 | Allow passing down keyword arguments to loader function
99 |
100 | >>> parse_js_object("{'a': 10.1}")
101 | {'a': 10.1}
102 | >>> import decimal
103 | >>> parse_js_object("{'a': 10.1}", loader_kwargs={'parse_float': decimal.Decimal})
104 | {'a': Decimal('10.1')}
105 |
106 | .. deprecated:: 1.3.0
107 | json_params: dict, optional
108 | Use `loader_kwargs` instead
109 |
110 | Returns
111 | -------
112 | list | dict
113 | Extracted JSON object
114 |
115 | Raises
116 | ------
117 | ValueError
118 | If failed to parse input properly
119 |
120 | ```python
121 | >>> parse_js_object(None)
122 | Traceback (most recent call last):
123 | ...
124 | ValueError: Invalid input
125 | >>> parse_js_object("No JSON objects in sight...")
126 | Traceback (most recent call last):
127 | ...
128 | json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
129 |
130 | ```
131 |
132 | """
133 | if not string:
134 | raise ValueError("Invalid input")
135 |
136 | loader_args, loader_kwargs = _process_loader_arguments(
137 | loader_args, loader_kwargs, json_params
138 | )
139 |
140 | if json_params:
141 | msg = "json_params argument is deprecated, please use loader_kwargs instead"
142 | warnings.warn(msg, DeprecationWarning)
143 |
144 | string = _preprocess(string, unicode_escape)
145 | parsed_data = parse(string)
146 | return loader(parsed_data, *loader_args, **loader_kwargs)
147 |
148 |
def parse_js_objects(
    string,
    unicode_escape=False,
    omitempty=False,
    loader=json.loads,
    loader_args=None,
    loader_kwargs=None,
    json_params=None,
):
    """
    Lazily extract every JSON object found in the input string.

    Each fragment produced by `parse_objects` is decoded with `loader` and
    yielded in turn, which makes the function usable for reading JSON Lines.
    Fragments for which `loader` raises `ValueError` are skipped.

    Parameters
    ----------
    string: str
        Input string. A falsy value (`None`, empty string) yields nothing

        >>> it = parse_js_objects("{a: 100} {b: 100}")
        >>> next(it)
        {'a': 100}
        >>> next(it)
        {'b': 100}

    unicode_escape: bool, optional
        Attempt to fix input string if it contains escaped special characters

        >>> next(parse_js_objects('{\\\\"a\\\\": 100}'))
        {'\\\\"a\\\\"': 100}
        >>> next(parse_js_objects('{\\\\"a\\\\": 100}', unicode_escape=True))
        {'a': 100}

    omitempty: bool, optional
        Skip results that are falsy, such as empty dictionaries and lists

        >>> list(parse_js_objects("{a: 12} {} {b: 13}"))
        [{'a': 12}, {}, {'b': 13}]
        >>> list(parse_js_objects("{a: 12} {} {b: 13}", omitempty=True))
        [{'a': 12}, {'b': 13}]

    loader: func, optional
        Function used to load processed input data. By default `json.loads` is used

        >>> import orjson
        >>> import chompjs
        >>>
        >>> next(chompjs.parse_js_objects("{'a': 12}", loader=orjson.loads))
        {'a': 12}

    loader_args: list, optional
        Positional arguments handed over to the loader function

    loader_kwargs: dict, optional
        Keyword arguments handed over to the loader function

        >>> next(parse_js_objects("{'a': 10.1}"))
        {'a': 10.1}
        >>> import decimal
        >>> next(parse_js_objects("{'a': 10.1}", loader_kwargs={'parse_float': decimal.Decimal}))
        {'a': Decimal('10.1')}

    .. deprecated:: 1.3.0
    json_params: dict, optional
        Use `loader_kwargs` instead

    Returns
    -------
    generator
        Iterating over it yields all encountered JSON objects
    """
    if string:
        positional, keyword = _process_loader_arguments(
            loader_args, loader_kwargs, json_params
        )
        prepared = _preprocess(string, unicode_escape)

        for fragment in parse_objects(prepared):
            try:
                loaded = loader(fragment, *positional, **keyword)
            except ValueError:
                # The loader rejected this fragment; move on to the next one.
                continue

            # Falsy results are dropped only when the caller asked for it.
            if loaded or not omitempty:
                yield loaded
238 |
239 |
240 |
242 |
244 |
245 |
246 |
247 |
248 | def parse_js_object(string, unicode_escape=False, loader=json.loads, loader_args=None, loader_kwargs=None, json_params=None)
249 |
250 | -
251 |
Extracts first JSON object encountered in the input string
252 |
Parameters
253 |
254 | string
: str
255 | - Input string
256 |
257 |
>>> parse_js_object("{a: 100}")
258 | {'a': 100}
259 |
260 |
261 | unicode_escape
: bool
, optional
262 | - Attempt to fix input string if it contains escaped special characters
263 |
264 |
>>> parse_js_object('{\\"a\\": 100}')
265 | {'\\"a\\"': 100}
266 | >>> parse_js_object('{\\"a\\": 100}', unicode_escape=True)
267 | {'a': 100}
268 |
269 |
270 | loader
: func
, optional
271 | - Function used to load processed input data. By default
json.loads
is used
272 |
273 |
>>> import orjson
274 | >>> import chompjs
275 | >>>
276 | >>> chompjs.parse_js_object("{'a': 12}", loader=orjson.loads)
277 | {'a': 12}
278 |
279 |
280 | loader_args
: list
, optional
281 | - Allow passing down positional arguments to loader function
282 | loader_kwargs
: dict
, optional
283 | - Allow passing down keyword arguments to loader function
284 |
285 |
>>> parse_js_object("{'a': 10.1}")
286 | {'a': 10.1}
287 | >>> import decimal
288 | >>> parse_js_object("{'a': 10.1}", loader_kwargs={'parse_float': decimal.Decimal})
289 | {'a': Decimal('10.1')}
290 |
291 |
292 |
Deprecated since version: 1.3.0
293 |
294 |
295 | json_params
: dict
, optional
296 | - Use
loader_kwargs
instead
297 |
298 |
Returns
299 |
300 | list | dict
301 | - Extracted JSON object
302 |
303 |
Raises
304 |
305 | ValueError
306 | - If failed to parse input properly
307 |
308 |
>>> parse_js_object(None)
309 | Traceback (most recent call last):
310 | ...
311 | ValueError: Invalid input
312 | >>> parse_js_object("No JSON objects in sight...")
313 | Traceback (most recent call last):
314 | ...
315 | json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
316 |
317 |
318 |
319 |
320 | Expand source code
321 |
def parse_js_object(
    string,
    unicode_escape=False,
    loader=json.loads,
    loader_args=None,
    loader_kwargs=None,
    json_params=None,
):
    """
    Extracts first JSON object encountered in the input string

    Parameters
    ----------
    string: str
        Input string

        >>> parse_js_object("{a: 100}")
        {'a': 100}

    unicode_escape: bool, optional
        Attempt to fix input string if it contains escaped special characters

        >>> parse_js_object('{\\\\"a\\\\": 100}')
        {'\\\\"a\\\\"': 100}
        >>> parse_js_object('{\\\\"a\\\\": 100}', unicode_escape=True)
        {'a': 100}

    loader: func, optional
        Function used to load processed input data. By default `json.loads` is used

        >>> import orjson
        >>> import chompjs
        >>>
        >>> chompjs.parse_js_object("{'a': 12}", loader=orjson.loads)
        {'a': 12}

    loader_args: list, optional
        Allow passing down positional arguments to loader function

    loader_kwargs: dict, optional
        Allow passing down keyword arguments to loader function

        >>> parse_js_object("{'a': 10.1}")
        {'a': 10.1}
        >>> import decimal
        >>> parse_js_object("{'a': 10.1}", loader_kwargs={'parse_float': decimal.Decimal})
        {'a': Decimal('10.1')}

    .. deprecated:: 1.3.0
    json_params: dict, optional
        Use `loader_kwargs` instead

    Returns
    -------
    list | dict
        Extracted JSON object

    Raises
    ------
    ValueError
        If the input is empty/`None`, or if the loader fails to parse it

        ```python
        >>> parse_js_object(None)
        Traceback (most recent call last):
            ...
        ValueError: Invalid input
        >>> parse_js_object("No JSON objects in sight...")
        Traceback (most recent call last):
            ...
        json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

        ```

    Warns
    -----
    DeprecationWarning
        If a non-empty `json_params` is passed

    """
    # Reject None and empty input before doing any work.
    if not string:
        raise ValueError("Invalid input")

    loader_args, loader_kwargs = _process_loader_arguments(
        loader_args, loader_kwargs, json_params
    )

    if json_params:
        msg = "json_params argument is deprecated, please use loader_kwargs instead"
        # stacklevel=2 attributes the warning to the caller's line instead of
        # this module, so the default DeprecationWarning filter can show it
        # and the reported location is the code that needs changing.
        warnings.warn(msg, DeprecationWarning, stacklevel=2)

    string = _preprocess(string, unicode_escape)
    parsed_data = parse(string)
    return loader(parsed_data, *loader_args, **loader_kwargs)
411 |
412 |
413 |
414 | def parse_js_objects(string, unicode_escape=False, omitempty=False, loader=json.loads, loader_args=None, loader_kwargs=None, json_params=None)
415 |
416 | -
417 |
Returns a generator extracting all JSON objects encountered in the input string.
418 | Can be used to read JSON Lines
419 |
Parameters
420 |
421 | string
: str
422 | - Input string
423 |
424 |
>>> it = parse_js_objects("{a: 100} {b: 100}")
425 | >>> next(it)
426 | {'a': 100}
427 | >>> next(it)
428 | {'b': 100}
429 |
430 |
431 | unicode_escape
: bool
, optional
432 | - Attempt to fix input string if it contains escaped special characters
433 |
434 |
>>> next(parse_js_objects('{\\"a\\": 100}'))
435 | {'\\"a\\"': 100}
436 | >>> next(parse_js_objects('{\\"a\\": 100}', unicode_escape=True))
437 | {'a': 100}
438 |
439 |
440 | omitempty
: bool
, optional
441 | - Skip empty dictionaries and lists
442 |
443 |
>>> list(parse_js_objects("{a: 12} {} {b: 13}"))
444 | [{'a': 12}, {}, {'b': 13}]
445 | >>> list(parse_js_objects("{a: 12} {} {b: 13}", omitempty=True))
446 | [{'a': 12}, {'b': 13}]
447 |
448 |
449 | loader
: func
, optional
450 | - Function used to load processed input data. By default
json.loads
is used
451 |
452 |
>>> import orjson
453 | >>> import chompjs
454 | >>>
455 | >>> next(chompjs.parse_js_objects("{'a': 12}", loader=orjson.loads))
456 | {'a': 12}
457 |
458 |
459 | loader_args
: list
, optional
460 | - Allow passing down positional arguments to loader function
461 | loader_kwargs
: dict
, optional
462 | - Allow passing down keyword arguments to loader function
463 |
464 |
>>> next(parse_js_objects("{'a': 10.1}"))
465 | {'a': 10.1}
466 | >>> import decimal
467 | >>> next(parse_js_objects("{'a': 10.1}", loader_kwargs={'parse_float': decimal.Decimal}))
468 | {'a': Decimal('10.1')}
469 |
470 |
471 |
Deprecated since version: 1.3.0
472 |
473 |
474 | json_params
: dict
, optional
475 | - Use
loader_kwargs
instead
476 |
477 |
Returns
478 |
479 | generator
480 | - Iterating over it yields all encountered JSON objects
481 |
482 |
483 |
484 | Expand source code
485 |
def parse_js_objects(
    string,
    unicode_escape=False,
    omitempty=False,
    loader=json.loads,
    loader_args=None,
    loader_kwargs=None,
    json_params=None,
):
    """
    Lazily extract every JSON object found in the input string.

    Each fragment produced by `parse_objects` is decoded with `loader` and
    yielded in turn, which makes the function usable for reading JSON Lines.
    Fragments for which `loader` raises `ValueError` are skipped.

    Parameters
    ----------
    string: str
        Input string. A falsy value (`None`, empty string) yields nothing

        >>> it = parse_js_objects("{a: 100} {b: 100}")
        >>> next(it)
        {'a': 100}
        >>> next(it)
        {'b': 100}

    unicode_escape: bool, optional
        Attempt to fix input string if it contains escaped special characters

        >>> next(parse_js_objects('{\\\\"a\\\\": 100}'))
        {'\\\\"a\\\\"': 100}
        >>> next(parse_js_objects('{\\\\"a\\\\": 100}', unicode_escape=True))
        {'a': 100}

    omitempty: bool, optional
        Skip results that are falsy, such as empty dictionaries and lists

        >>> list(parse_js_objects("{a: 12} {} {b: 13}"))
        [{'a': 12}, {}, {'b': 13}]
        >>> list(parse_js_objects("{a: 12} {} {b: 13}", omitempty=True))
        [{'a': 12}, {'b': 13}]

    loader: func, optional
        Function used to load processed input data. By default `json.loads` is used

        >>> import orjson
        >>> import chompjs
        >>>
        >>> next(chompjs.parse_js_objects("{'a': 12}", loader=orjson.loads))
        {'a': 12}

    loader_args: list, optional
        Positional arguments handed over to the loader function

    loader_kwargs: dict, optional
        Keyword arguments handed over to the loader function

        >>> next(parse_js_objects("{'a': 10.1}"))
        {'a': 10.1}
        >>> import decimal
        >>> next(parse_js_objects("{'a': 10.1}", loader_kwargs={'parse_float': decimal.Decimal}))
        {'a': Decimal('10.1')}

    .. deprecated:: 1.3.0
    json_params: dict, optional
        Use `loader_kwargs` instead

    Returns
    -------
    generator
        Iterating over it yields all encountered JSON objects
    """
    if string:
        positional, keyword = _process_loader_arguments(
            loader_args, loader_kwargs, json_params
        )
        prepared = _preprocess(string, unicode_escape)

        for fragment in parse_objects(prepared):
            try:
                loaded = loader(fragment, *positional, **keyword)
            except ValueError:
                # The loader rejected this fragment; move on to the next one.
                continue

            # Falsy results are dropped only when the caller asked for it.
            if loaded or not omitempty:
                yield loaded
575 |
576 |
577 |
578 |
579 |
581 |
582 |
596 |
597 |
600 |
601 |
602 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # encoding: utf-8
3 |
4 | from io import open
5 | from os import path
6 | from platform import system
7 | from setuptools import setup, Extension
8 |
9 |
# Use the README as the long description shown on the package index page.
this_directory = path.abspath(path.dirname(__file__))
with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f:
    long_description = f.read()

# `-Wl,...` options are forwarded to the linker, so the flag belongs only in
# the link step. Passing it to the compile step has no effect on the object
# files and makes some compilers warn about an unused argument.
extra_compile_args = []
extra_link_args = []
if system() == 'Linux':
    extra_link_args = ['-Wl,-Bsymbolic-functions']

# C extension built from the sources in the `_chompjs` directory.
chompjs_extension = Extension(
    '_chompjs',
    sources=['_chompjs/module.c', '_chompjs/parser.c', '_chompjs/buffer.c'],
    extra_compile_args=extra_compile_args,
    extra_link_args=extra_link_args,
)

setup(
    name='chompjs',
    version='1.3.2',
    description='Parsing JavaScript objects into Python dictionaries',
    author='Mariusz Obajtek',
    author_email='nykakin@gmail.com',
    keywords='parsing parser JavaScript json json5 webscrapping',
    python_requires='>=3.8',
    ext_modules=[chompjs_extension],
    classifiers=[
        "Programming Language :: Python :: 3",
        "Programming Language :: JavaScript",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Topic :: Software Development :: Libraries :: Python Modules",
        "Topic :: Text Processing :: General",
        "Topic :: Text Processing :: Linguistic",
        "Development Status :: 5 - Production/Stable",
        "Environment :: Console",
        "Environment :: Web Environment",
    ],
    url='https://github.com/Nykakin/chompjs',
    long_description=long_description,
    long_description_content_type='text/markdown',
    include_package_data=True,
    packages=['chompjs'],
)
55 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
# Interpreters the test suite is run against.
[tox]
envlist =
    py39
    py310
    py311
    py312
    py313

[testenv]
# orjson is imported by docstring examples in the chompjs module.
deps =
    orjson
# Run the unit tests first, then the doctests embedded in the module.
commands =
    python -m unittest discover
    python -m doctest chompjs/chompjs.py
9 |
--------------------------------------------------------------------------------