├── .github
    └── workflows
    │   ├── deploy.yml
    │   └── test-deploy.yml
├── .gitignore
├── CHANGELOG
├── LICENSE
├── MANIFEST.in
├── README.md
├── _chompjs
    ├── buffer.c
    ├── buffer.h
    ├── module.c
    ├── parser.c
    └── parser.h
├── chompjs
    ├── __init__.py
    ├── chompjs.py
    └── test_parser.py
├── docs
    └── index.html
├── setup.py
└── tox.ini


/.github/workflows/deploy.yml:
--------------------------------------------------------------------------------
 1 | # Release pipeline: builds wheels on every runner in the matrix plus one
 2 | # source distribution, then publishes all of them.
 3 | # Runs only when triggered manually (workflow_dispatch).
 4 | name: deploy
 5 | 
 6 | on:
 7 |   workflow_dispatch:
 8 | 
 9 | jobs:
10 |   build_wheels:
11 |     name: Build wheels on ${{ matrix.os }}
12 |     strategy:
13 |       matrix:
14 |         os:
15 |           - ubuntu-latest
16 |           - windows-latest
17 |           - macos-13
18 |           - macos-latest
19 |     runs-on: ${{ matrix.os }}
20 | 
21 |     steps:
22 |       - uses: actions/checkout@v4
23 | 
24 |       - name: Build wheels
25 |         uses: pypa/cibuildwheel@v2.22.0
26 |         env:
27 |           # Build identifiers that cibuildwheel is told to skip
28 |           CIBW_SKIP: 'cp36-* cp37-* cp38-* pp*'
29 | 
30 |       # Artifact name carries the runner and the job index, so it is
31 |       # different for every matrix job
32 |       - uses: actions/upload-artifact@v4
33 |         with:
34 |           name: chompjs-wheels-${{ matrix.os }}-${{ strategy.job-index }}
35 |           path: ./wheelhouse/*.whl
36 | 
37 |   build_sdist:
38 |     name: Build source distribution
39 |     runs-on: ubuntu-latest
40 |     steps:
41 |       - uses: actions/checkout@v4
42 | 
43 |       - name: Build sdist
44 |         run: pipx run build --sdist
45 | 
46 |       - uses: actions/upload-artifact@v4
47 |         with:
48 |           name: chompjs-sdist
49 |           path: dist/*.tar.gz
50 | 
51 |   upload_pypi:
52 |     # Publishing waits for both build jobs
53 |     needs:
54 |       - build_wheels
55 |       - build_sdist
56 |     runs-on: ubuntu-latest
57 |     environment: pypi
58 |     permissions:
59 |       # No API token is passed to the publish step below; the job is
60 |       # instead allowed to request an OIDC id-token
61 |       id-token: write
62 |     steps:
63 |       # Collect every chompjs-* artifact (wheels and sdist) flat into dist/
64 |       - uses: actions/download-artifact@v4
65 |         with:
66 |           pattern: chompjs-*
67 |           path: dist
68 |           merge-multiple: true
69 | 
70 |       - uses: pypa/gh-action-pypi-publish@release/v1
71 | 


--------------------------------------------------------------------------------
/.github/workflows/test-deploy.yml:
--------------------------------------------------------------------------------
 1 | # Dry-run of the release pipeline: same build jobs as deploy.yml, but the
 2 | # final step uploads to test.pypi.org.
 3 | # Runs only when triggered manually (workflow_dispatch).
 4 | name: test-deploy
 5 | 
 6 | on:
 7 |   workflow_dispatch:
 8 | 
 9 | jobs:
10 |   build_wheels:
11 |     name: Build wheels on ${{ matrix.os }}
12 |     strategy:
13 |       matrix:
14 |         os:
15 |           - ubuntu-latest
16 |           - windows-latest
17 |           - macos-13
18 |           - macos-latest
19 |     runs-on: ${{ matrix.os }}
20 | 
21 |     steps:
22 |       - uses: actions/checkout@v4
23 | 
24 |       - name: Build wheels
25 |         uses: pypa/cibuildwheel@v2.22.0
26 |         env:
27 |           # Build identifiers that cibuildwheel is told to skip
28 |           CIBW_SKIP: 'cp36-* cp37-* cp38-* pp*'
29 | 
30 |       # Artifact name carries the runner and the job index, so it is
31 |       # different for every matrix job
32 |       - uses: actions/upload-artifact@v4
33 |         with:
34 |           name: chompjs-wheels-${{ matrix.os }}-${{ strategy.job-index }}
35 |           path: ./wheelhouse/*.whl
36 | 
37 |   build_sdist:
38 |     name: Build source distribution
39 |     runs-on: ubuntu-latest
40 |     steps:
41 |       - uses: actions/checkout@v4
42 | 
43 |       - name: Build sdist
44 |         run: pipx run build --sdist
45 | 
46 |       - uses: actions/upload-artifact@v4
47 |         with:
48 |           name: chompjs-sdist
49 |           path: dist/*.tar.gz
50 | 
51 |   upload_pypi:
52 |     # Publishing waits for both build jobs
53 |     needs:
54 |       - build_wheels
55 |       - build_sdist
56 |     runs-on: ubuntu-latest
57 |     # NOTE(review): this is the same `pypi` environment name the production
58 |     # deploy workflow uses - confirm that is intended for TestPyPI uploads
59 |     environment: pypi
60 |     permissions:
61 |       # No API token is passed to the publish step below; the job is
62 |       # instead allowed to request an OIDC id-token
63 |       id-token: write
64 |     steps:
65 |       # Collect every chompjs-* artifact (wheels and sdist) flat into dist/
66 |       - uses: actions/download-artifact@v4
67 |         with:
68 |           pattern: chompjs-*
69 |           path: dist
70 |           merge-multiple: true
71 | 
72 |       - uses: pypa/gh-action-pypi-publish@release/v1
73 |         with:
74 |           repository-url: https://test.pypi.org/legacy/
75 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Prerequisites
  2 | *.d
  3 | 
  4 | # Object files
  5 | *.o
  6 | *.ko
  7 | *.obj
  8 | *.elf
  9 | 
 10 | # Linker output
 11 | *.ilk
 12 | *.map
 13 | *.exp
 14 | 
 15 | # Precompiled Headers
 16 | *.gch
 17 | *.pch
 18 | 
 19 | # Libraries
 20 | *.lib
 21 | *.a
 22 | *.la
 23 | *.lo
 24 | 
 25 | # Shared objects (inc. Windows DLLs)
 26 | *.dll
 27 | *.so
 28 | *.so.*
 29 | *.dylib
 30 | 
 31 | # Executables
 32 | *.exe
 33 | *.out
 34 | *.app
 35 | *.i*86
 36 | *.x86_64
 37 | *.hex
 38 | 
 39 | # Debug files
 40 | *.dSYM/
 41 | *.su
 42 | *.idb
 43 | *.pdb
 44 | 
 45 | # Kernel Module Compile Results
 46 | *.mod*
 47 | *.cmd
 48 | .tmp_versions/
 49 | modules.order
 50 | Module.symvers
 51 | Mkfile.old
 52 | dkms.conf
 53 | 
 54 | 
 55 | # Byte-compiled / optimized / DLL files
 56 | __pycache__/
 57 | *.py[cod]
 58 | *$py.class
 59 | 
 60 | # C extensions
 61 | *.so
 62 | 
 63 | # Distribution / packaging
 64 | .Python
 65 | build/
 66 | develop-eggs/
 67 | dist/
 68 | downloads/
 69 | eggs/
 70 | .eggs/
 71 | lib/
 72 | lib64/
 73 | parts/
 74 | sdist/
 75 | var/
 76 | wheels/
 77 | pip-wheel-metadata/
 78 | share/python-wheels/
 79 | *.egg-info/
 80 | .installed.cfg
 81 | *.egg
 82 | MANIFEST
 83 | 
 84 | # PyInstaller
 85 | #  Usually these files are written by a python script from a template
 86 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 87 | *.manifest
 88 | *.spec
 89 | 
 90 | # Installer logs
 91 | pip-log.txt
 92 | pip-delete-this-directory.txt
 93 | 
 94 | # Unit test / coverage reports
 95 | htmlcov/
 96 | .tox/
 97 | .nox/
 98 | .coverage
 99 | .coverage.*
100 | .cache
101 | nosetests.xml
102 | coverage.xml
103 | *.cover
104 | *.py,cover
105 | .hypothesis/
106 | .pytest_cache/
107 | cover/
108 | 
109 | # Translations
110 | *.mo
111 | *.pot
112 | 
113 | # Django stuff:
114 | *.log
115 | local_settings.py
116 | db.sqlite3
117 | db.sqlite3-journal
118 | 
119 | # Flask stuff:
120 | instance/
121 | .webassets-cache
122 | 
123 | # Scrapy stuff:
124 | .scrapy
125 | 
126 | # Sphinx documentation
127 | docs/_build/
128 | 
129 | # PyBuilder
130 | target/
131 | 
132 | # Jupyter Notebook
133 | .ipynb_checkpoints
134 | 
135 | # IPython
136 | profile_default/
137 | ipython_config.py
138 | 
139 | # pyenv
140 | #   For a library or package, you might want to ignore these files since the code is
141 | #   intended to run in multiple environments; otherwise, check them in:
142 | # .python-version
143 | 
144 | # pipenv
145 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
146 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
147 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
148 | #   install all needed dependencies.
149 | #Pipfile.lock
150 | 
151 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
152 | __pypackages__/
153 | 
154 | # Celery stuff
155 | celerybeat-schedule
156 | celerybeat.pid
157 | 
158 | # SageMath parsed files
159 | *.sage.py
160 | 
161 | # Environments
162 | .env
163 | .venv
164 | env/
165 | venv/
166 | ENV/
167 | env.bak/
168 | venv.bak/
169 | 
170 | # Spyder project settings
171 | .spyderproject
172 | .spyproject
173 | 
174 | # Rope project settings
175 | .ropeproject
176 | 
177 | # mkdocs documentation
178 | /site
179 | 
180 | # mypy
181 | .mypy_cache/
182 | .dmypy.json
183 | dmypy.json
184 | 
185 | # Pyre type checker
186 | .pyre/
187 | 
188 | # pytype static type analyzer
189 | .pytype/
190 | 
191 | 


--------------------------------------------------------------------------------
/CHANGELOG:
--------------------------------------------------------------------------------
 1 | [1.3.2]
 2 | * Release the GIL during parsing in C (#69)
 3 | 
 4 | [1.3.1]
 5 | * Introduced CI to build wheels (#68)
 6 | 
 7 | [1.3.0]
 8 | * Allow custom load function (https://github.com/Nykakin/chompjs/pull/63)
 9 | 
10 | [1.2.4]
11 | * Remove trailing whitespaces for unrecognized values (#59)
12 | * Fix segfault on empty string (#62)
13 | 
14 | [1.2.3]
15 | * Remove trailing whitespaces from unquoted keys (#57)
16 | 
17 | [1.2.2]
18 | * Fix parsing some floating numbers as octal (#52)
19 | * Fix number of digits calculation (#50)
20 | 
21 | [1.2.1]
22 | * Fix compilation on Windows (#49)
23 | 
24 | [1.2.0]
25 | * Allow trailing dot at the end of numerals (#39)
26 | * Parse hexadecimal, binary and octal literals as numbers (#40)
27 | * Drop support for Python 2.7 (#44)
28 | * Add parse_js_objects function (#45)
29 | * Drop jsonlines flag in parse_js_object in favor of parse_js_objects (#46)
30 | * Improve documentation (#47, #32)
31 | 
32 | [1.1.9]
33 | * Handle NaN in input (#37)
34 | 
35 | [1.1.8]
36 | * Fixed previous release (package couldn't be installed)
37 | 
38 | [1.1.7]
39 | * Handle unquoted properties starting with reserved JS keywords (#34)
40 | 
41 | [1.1.6]
42 | * Handle bug with parsing arrays like `["","/"]` (#33)
43 | 
44 | [1.1.5]
45 | * Correctly handle malformed quotations (#31)
46 | 
47 | [1.1.4]
48 | * Performance improvement (#19)
49 | * Handle numeric keys (#20)
50 | * Refactor error handling (#29)
51 | 
52 | [1.1.3]
53 | * Avoid an infinite loop on malformed input (#27)
54 | 
55 | [1.1.2]
56 | * Handle comments in JavaScript code (#22)
57 | 
58 | [1.1.1]
59 | * Fix installation bug (headers moved to a different dir)
60 | 
61 | [1.1.0]
62 | * Parser refactored and rewritten in order to simplify code and improve speed
63 | * Allow handling JavaScript functions and other strange stuff such as regexes (#16)
64 | * Allow passing down json.loads parameters
65 | * Allow handling hexadecimal, octal and binary literals (#12)
66 | 
67 | [1.0.17]
68 | * Handle memory corruption on unclosed quotations (#13)
69 | 
70 | [1.0.16]
71 | * Handle floats with leading zeros (#10)
72 | 
73 | [1.0.15]
74 | * Handle $ and _ characters at the beginning of keys (#9)
75 | 
76 | [1.0.14]
77 | * Handle "undefined" keyword in JavaScript objects (#7)
78 | 
79 | [1.0.13]
80 | * Handle escaped quotations correctly (#6)
81 | 
82 | [1.0.12]
83 | * Handle windows newlines (#5)
84 | 
85 | [1.0.11]
86 | * Handle jsonlines (#3)
87 | 
88 | [1.0.1]
89 | * Handle Unicode in keys (#2)
90 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2020 Mariusz Obajtek
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | 
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | 
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | graft _chompjs
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Chompjs
  2 | 
  3 | ![license](https://img.shields.io/github/license/Nykakin/chompjs?style=flat-square)
  4 | ![pypi version](https://img.shields.io/pypi/v/chompjs.svg)
  5 | ![python version](https://img.shields.io/pypi/pyversions/chompjs.svg)
  6 | ![downloads](https://img.shields.io/pypi/dm/chompjs.svg)
  7 | 
  8 | Transforms JavaScript objects into Python data structures.
  9 | 
 10 | In web scraping, you sometimes need to transform JavaScript objects embedded in HTML pages into valid Python dictionaries. `chompjs` is a library designed to do that as a more powerful replacement for the standard `json.loads`:
 11 | 
 12 | ```python
 13 | >>> chompjs.parse_js_object("{a: 100}")
 14 | {'a': 100}
 15 | >>>
 16 | >>> json_lines = """
 17 | ... {'a': 12}
 18 | ... {'b': 13}
 19 | ... {'c': 14}
 20 | ... """
 21 | >>> for entry in chompjs.parse_js_objects(json_lines):
 22 | ...     print(entry)
 23 | ... 
 24 | {'a': 12}
 25 | {'b': 13}
 26 | {'c': 14}
 27 | ```
 28 | 
 29 | [Reference documentation](https://nykakin.github.io/chompjs/)
 30 | 
 31 | ## Quickstart
 32 | 
 33 | **1. installation**
 34 | 
 35 | ```
 36 | > pip install chompjs
 37 | ```
 38 | 
 39 | or build from source:
 40 | 
 41 | ```bash
 42 | $ git clone https://github.com/Nykakin/chompjs
 43 | $ cd chompjs
 44 | $ python setup.py build
 45 | $ python setup.py install
 46 | ```
 47 | 
 48 | ## Features
 49 | 
 50 | There are two functions available:
 51 | * `parse_js_object` - try reading first encountered JSON-like object. Raises `ValueError` on failure
 52 | * `parse_js_objects` - returns a generator yielding all encountered JSON-like objects. Can be used to read [JSON Lines](https://jsonlines.org/). Does not raise on invalid input.
 53 | 
 54 | An example usage with `scrapy`:
 55 | 
 56 | ```python
 57 | import chompjs
 58 | import scrapy
 59 | 
 60 | 
 61 | class MySpider(scrapy.Spider):
 62 |     # ...
 63 | 
 64 |     def parse(self, response):
 65 |         script_css = 'script:contains("__NEXT_DATA__")::text'
 66 |         script_pattern = r'__NEXT_DATA__ = (.*);'
 67 |         # warning: for some pages you need to pass replace_entities=True
 68 |         # into re_first to have JSON escaped properly
 69 |         script_text = response.css(script_css).re_first(script_pattern)
 70 |         try:
 71 |             json_data = chompjs.parse_js_object(script_text)
 72 |         except ValueError:
 73 |             self.log('Failed to extract data from {}'.format(response.url))
 74 |             return
 75 | 
 76 |         # work on json_data
 77 | ```
 78 | 
 79 | Parsing of [JSON5 objects](https://json5.org/) is supported:
 80 | 
 81 | ```python
 82 | >>> data = """
 83 | ... {
 84 | ...   // comments
 85 | ...   unquoted: 'and you can quote me on that',
 86 | ...   singleQuotes: 'I can use "double quotes" here',
 87 | ...   lineBreaks: "Look, Mom! \
 88 | ... No \\n's!",
 89 | ...   hexadecimal: 0xdecaf,
 90 | ...   leadingDecimalPoint: .8675309, andTrailing: 8675309.,
 91 | ...   positiveSign: +1,
 92 | ...   trailingComma: 'in objects', andIn: ['arrays',],
 93 | ...   "backwardsCompatible": "with JSON",
 94 | ... }
 95 | ... """
 96 | >>> chompjs.parse_js_object(data)
 97 | {'unquoted': 'and you can quote me on that', 'singleQuotes': 'I can use "double quotes" here', 'lineBreaks': "Look, Mom! No \n's!", 'hexadecimal': 912559, 'leadingDecimalPoint': 0.8675309, 'andTrailing': 8675309.0, 'positiveSign': '+1', 'trailingComma': 'in objects', 'andIn': ['arrays'], 'backwardsCompatible': 'with JSON'}
 98 | ```
 99 | 
100 | If the input string is not yet escaped and contains a lot of `\\` characters, then the `unicode_escape=True` argument might help to sanitize it:
101 | 
102 | ```python
103 | >>> chompjs.parse_js_object('{\\\"a\\\": 12}', unicode_escape=True)
104 | {'a': 12}
105 | ```
106 | 
107 | By default `chompjs` starts parsing at the first `{` or `[` character it finds, ignoring everything before it:
108 | 
109 | ```python
110 | >>> chompjs.parse_js_object('<div>...</div><script>foo = [1, 2, 3];</script><div>...</div>')
111 | [1, 2, 3]
112 | ```
113 | 
114 | Post-processed input is parsed using `json.loads` by default. A different loader such as `orjson` can be used with the `loader` argument:
115 | 
116 | ```python
117 | >>> import orjson
118 | >>> import chompjs
119 | >>> 
120 | >>> chompjs.parse_js_object("{'a': 12}", loader=orjson.loads)
121 | {'a': 12}
122 | ```
123 | 
124 | The `loader_args` and `loader_kwargs` arguments can be used to pass options to the underlying loader function. For example, for the default `json.loads` you can pass down options such as `strict` or `object_hook`:
125 | 
126 | ```python
127 | >>> import decimal
128 | >>> import chompjs
129 | >>> chompjs.parse_js_object('[23.2]', loader_kwargs={'parse_float': decimal.Decimal})
130 | [Decimal('23.2')]
131 | ```
132 | 
133 | # Rationale
134 | 
135 | In web scraping data often is not present directly inside HTML, but instead provided as an embedded JavaScript object that is later used to initialize the page, for example:
136 | 
137 | ```html
138 | <html>
139 | <head>...</head>
140 | <body>
141 | ...
142 | <script type="text/javascript">window.__PRELOADED_STATE__={"foo": "bar"}</script>
143 | ...
144 | </body>
145 | </html>
146 | ```
147 | 
148 | Standard library function `json.loads` is usually sufficient to extract this data:
149 | 
150 | ```python
151 | >>> # scrapy shell file:///tmp/test.html
152 | >>> import json
153 | >>> script_text = response.css('script:contains(__PRELOADED_STATE__)::text').re_first('__PRELOADED_STATE__=(.*)')
154 | >>> json.loads(script_text)
155 | {u'foo': u'bar'}
156 | 
157 | ```
158 | The problem is that not all valid JavaScript objects are also valid JSONs. For example all those strings are valid JavaScript objects but not valid JSONs:
159 | 
160 | * `"{'a': 'b'}"` is not a valid JSON because it uses `'` character to quote
161 | * `'{a: "b"}'` is not a valid JSON because property name is not quoted at all
162 | * `'{"a": [1, 2, 3,]}'` is not a valid JSON because there is an extra `,` character at the end of the array
163 | * `'{"a": .99}'` is not a valid JSON because float value lacks a leading 0
164 | 
165 | As a result, `json.loads` fails to extract any of those:
166 | 
167 | ```python
168 | >>> json.loads("{'a': 'b'}")
169 | Traceback (most recent call last):
170 |   ...
171 | ValueError: Expecting property name: line 1 column 2 (char 1)
172 | >>> json.loads('{a: "b"}')
173 | Traceback (most recent call last):
174 |   ...
175 | ValueError: Expecting property name: line 1 column 2 (char 1)
176 | >>> json.loads('{"a": [1, 2, 3,]}')
177 | Traceback (most recent call last):
178 |   ...
179 | ValueError: No JSON object could be decoded
180 | >>> json.loads('{"a": .99}')
181 | Traceback (most recent call last):
182 |   ...
183 | json.decoder.JSONDecodeError: Expecting value: line 1 column 7 (char 6)
184 | 
185 | ```
186 | The `chompjs` library was designed to bypass this limitation, and it allows scraping such JavaScript objects into proper Python dictionaries:
187 | 
188 | ```python
189 | >>> import chompjs
190 | >>> 
191 | >>> chompjs.parse_js_object("{'a': 'b'}")
192 | {'a': 'b'}
193 | >>> chompjs.parse_js_object('{a: "b"}')
194 | {'a': 'b'}
195 | >>> chompjs.parse_js_object('{"a": [1, 2, 3,]}')
196 | {'a': [1, 2, 3]}
197 | >>> chompjs.parse_js_object('{"a": .99}')
198 | {'a': 0.99}
199 | ```
200 | 
201 | Internally `chompjs` uses a parser written in C to iterate over the raw string, fixing its issues along the way. The final result is then passed down to the standard library's `json.loads`, ensuring a high speed as compared to full-blown JavaScript parsers such as `demjson`.
202 | 
203 | ```python
204 | >>> import json
205 | >>> import _chompjs
206 | >>> 
207 | >>> _chompjs.parse('{a: 1}')
208 | '{"a":1}'
209 | >>> json.loads(_)
210 | {'a': 1}
211 | ```
212 | 
213 | # Development
214 | Pull requests are welcome. 
215 | 
216 | To run the unit tests:
217 | 
218 | ```
219 | $ tox
220 | ```
221 | 


--------------------------------------------------------------------------------
/_chompjs/buffer.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2020-2025 Mariusz Obajtek. All rights reserved.
 3 |  * License: https://github.com/Nykakin/chompjs/blob/master/LICENSE
 4 |  */
 5 | 
 6 | #include <string.h>
 7 | #include <stdio.h>
 8 | #include <stdlib.h>
 9 | #include <math.h>
10 | 
11 | #include "buffer.h"
12 | 
13 | /*
14 |  * Prepare `buffer` for use: reserve `initial_depth_buffer_size` bytes of
15 |  * storage and mark the buffer as empty.
16 |  * The result of malloc is stored as-is and is not checked here.
17 |  */
18 | void init_char_buffer(struct CharBuffer* buffer, size_t initial_depth_buffer_size) {
19 |     buffer->index = 0;
20 |     buffer->memory_buffer_length = initial_depth_buffer_size;
21 |     buffer->data = malloc(buffer->memory_buffer_length);
22 | }
18 | 
19 | void release_char_buffer(struct CharBuffer* buffer) {  /* Free the storage owned by `buffer`. */
20 |     free(buffer->data);  /* `data` is not reset to NULL: the buffer must not be used or released again */
21 | }
22 | 
23 | /*
24 |  * Make sure `to_save` more chars fit into `buffer`, growing it if needed.
25 |  *
26 |  * As before, growth is triggered while `index + to_save` is not strictly
27 |  * below the allocated length, so one spare byte is always left. The length
28 |  * is doubled as many times as necessary: a single doubling is not enough
29 |  * when `to_save` exceeds the current size, and would leave the following
30 |  * write out of bounds.
31 |  *
32 |  * NOTE: a failed realloc is still not handled here.
33 |  */
34 | void check_capacity(struct CharBuffer* buffer, size_t to_save) {
35 |     const size_t required = buffer->index + to_save;
36 |     if(required < buffer->memory_buffer_length) {
37 |         return;
38 |     }
39 | 
40 |     /* start from 1 so that a zero-sized buffer can grow at all */
41 |     size_t new_length = buffer->memory_buffer_length > 0 ? buffer->memory_buffer_length : 1;
42 |     while(new_length <= required) {
43 |         new_length *= 2;
44 |     }
45 |     buffer->data = realloc(buffer->data, new_length);
46 |     buffer->memory_buffer_length = new_length;
47 | }
29 | 
30 | /* Append one char to `buffer`, growing the storage first if required. */
31 | void push(struct CharBuffer* buffer, char value) {
32 |     check_capacity(buffer, 1);
33 |     buffer->data[buffer->index++] = value;
34 | }
35 | 
36 | /*
37 |  * Append `len` bytes starting at `value` to `buffer`.
38 |  * No terminating NUL is added.
39 |  */
40 | void push_string(struct CharBuffer* buffer, const char* value, size_t len) {
41 |     check_capacity(buffer, len);
42 |     /* take the address only after check_capacity, which may move `data` */
43 |     char* destination = &buffer->data[buffer->index];
44 |     memcpy(destination, value, len);
45 |     buffer->index = buffer->index + len;
46 | }
41 | 
42 | /*
43 |  * Append the decimal representation of `value` to `buffer`.
44 |  *
45 |  * The required size is measured with snprintf(NULL, 0, ...) instead of
46 |  * log10(): the logarithm is undefined for zero and negative values, and it
47 |  * does not account for a minus sign.
48 |  * sprintf also writes a terminating NUL, so one extra byte is reserved for
49 |  * it; `index` advances past the digits only.
50 |  */
51 | void push_number(struct CharBuffer* buffer, long value) {
52 |     const int digits = snprintf(NULL, 0, "%ld", value);
53 |     if(digits < 0) {
54 |         /* formatting failed, nothing to append */
55 |         return;
56 |     }
57 |     check_capacity(buffer, (size_t)digits + 1);
58 |     buffer->index += sprintf(buffer->data + buffer->index, "%ld", value);
59 | }
52 | 
53 | /*
54 |  * Drop the most recently stored char.
55 |  * Does nothing on an empty buffer: `index` is unsigned, so decrementing it
56 |  * from zero would wrap around to a huge value.
57 |  */
58 | void pop(struct CharBuffer* buffer) {
59 |     if(buffer->index > 0) {
60 |         buffer->index -= 1;
61 |     }
62 | }
56 | 
57 | /*
58 |  * Return the most recently stored char without removing it.
59 |  * An empty buffer yields '\0' instead of reading the byte before `data`.
60 |  */
61 | char top(struct CharBuffer* buffer) {
62 |     if(buffer->index == 0) {
63 |         return '\0';
64 |     }
65 |     return buffer->data[buffer->index-1];
66 | }
60 | 
61 | /* Tell whether `buffer` currently stores no chars. */
62 | bool empty(struct CharBuffer* buffer) {
63 |     /* `index` is a size_t, so "not above zero" is the same as "zero" */
64 |     return buffer->index == 0;
65 | }
64 | 
65 | void clear(struct CharBuffer* buffer) {  /* Discard all stored chars. */
66 |     buffer->index = 0;  /* storage and its length are left untouched for reuse */
67 | }
68 | 
69 | /* Report how many chars `buffer` currently stores. */
70 | size_t size(struct CharBuffer* buffer) {
71 |     const size_t stored = buffer->index;
72 |     return stored;
73 | }
72 | 


--------------------------------------------------------------------------------
/_chompjs/buffer.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2020-2025 Mariusz Obajtek. All rights reserved.
 3 |  * License: https://github.com/Nykakin/chompjs/blob/master/LICENSE
 4 |  */
 5 | 
 6 | #ifndef CHOMPJS_BUFFER_H
 7 | #define CHOMPJS_BUFFER_H
 8 | 
 9 | #include <stdbool.h>
10 | #include <stddef.h>
11 | 
12 | /**
13 |     Implements a safe, dynamically growing char buffer
14 | */
15 | struct CharBuffer {
16 |     char* data;                   /* heap storage, obtained with malloc/realloc */
17 |     size_t memory_buffer_length;  /* allocated size of `data`, in bytes */
18 |     size_t index;                 /* number of chars stored; also the next write position */
19 | };
20 | /* Allocate `initial_depth_buffer_size` bytes for `buffer` and mark it empty. */
21 | void init_char_buffer(struct CharBuffer* buffer, size_t initial_depth_buffer_size);
22 | /* Free the storage held by `buffer`. */
23 | void release_char_buffer(struct CharBuffer* buffer);
24 | /* Grow the storage by doubling when `to_save` more chars would not fit. */
25 | void check_capacity(struct CharBuffer* buffer, size_t to_save);
26 | /* Append a single char. */
27 | void push(struct CharBuffer* buffer, char value);
28 | /* Append `len` bytes starting at `value`; no NUL terminator is added. */
29 | void push_string(struct CharBuffer* buffer, const char* value, size_t len);
30 | /* Append the decimal text of `value`. */
31 | void push_number(struct CharBuffer* buffer, long value);
32 | /* Drop the most recently stored char. */
33 | void pop(struct CharBuffer* buffer);
34 | /* Return the most recently stored char without removing it. */
35 | char top(struct CharBuffer* buffer);
36 | /* True when no chars are stored. */
37 | bool empty(struct CharBuffer* buffer);
38 | /* Discard all stored chars; the allocation is kept. */
39 | void clear(struct CharBuffer* buffer);
40 | /* Number of chars currently stored. */
41 | size_t size(struct CharBuffer* buffer);
42 | 
43 | #endif
44 | 


--------------------------------------------------------------------------------
/_chompjs/module.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 2020-2025 Mariusz Obajtek. All rights reserved.
  3 |  * License: https://github.com/Nykakin/chompjs/blob/master/LICENSE
  4 |  */
  5 | 
  6 | #define PY_SSIZE_T_CLEAN
  7 | #include <Python.h>
  8 | #include <stdio.h>
  9 | #include "parser.h"
 10 | 
 11 | /*
 12 |  * Implementation of `_chompjs.parse(string)`.
 13 |  *
 14 |  * Runs the lexer over `string` until it can no longer advance and returns
 15 |  * the lexer output as a new Python str.
 16 |  *
 17 |  * Returns NULL with an exception set when the argument is not a string, or
 18 |  * with ValueError("Error parsing input near character N") when the lexer
 19 |  * stops in the ERROR state.
 20 |  */
 21 | static PyObject* parse_python_object(PyObject *self, PyObject *args) {
 22 |     const char* string;
 23 |     if (!PyArg_ParseTuple(args, "s", &string)) {
 24 |         return NULL;
 25 |     }
 26 | 
 27 |     struct Lexer lexer;
 28 |     init_lexer(&lexer, string);
 29 |     /* the loop below touches no Python objects, so the GIL is released */
 30 |     Py_BEGIN_ALLOW_THREADS
 31 |     while(lexer.lexer_status == CAN_ADVANCE) {
 32 |         advance(&lexer);
 33 |     }
 34 |     Py_END_ALLOW_THREADS
 35 | 
 36 |     /*
 37 |      * Check for failure before building the result: building it first
 38 |      * leaked the new str object whenever the ERROR branch was taken.
 39 |      */
 40 |     if(lexer.lexer_status == ERROR) {
 41 |         const long error_position = (long)lexer.input_position - 1;
 42 |         release_lexer(&lexer);
 43 |         /*
 44 |          * PyErr_Format sizes its own buffer. The hand-rolled version sized
 45 |          * the buffer for `input_position` but printed `input_position - 1`,
 46 |          * which is one char longer at position 0 ("-1" vs "0").
 47 |          */
 48 |         PyErr_Format(
 49 |             PyExc_ValueError,
 50 |             "Error parsing input near character %ld",
 51 |             error_position
 52 |         );
 53 |         return NULL;
 54 |     }
 55 | 
 56 |     /* the last output char is left out of the returned string */
 57 |     PyObject* ret = Py_BuildValue(
 58 |         "s#",
 59 |         lexer.output.data,
 60 |         (Py_ssize_t)(lexer.output.index - 1)
 61 |     );
 62 |     release_lexer(&lexer);
 63 |     return ret;
 64 | }
 47 | 
 48 | typedef struct {  /* instance layout of the `json_iter` iterator type */
 49 |     PyObject_HEAD
 50 |     struct Lexer lexer;  /* lexer state kept between successive next() calls */
 51 | } JsonIterState;
 52 | 
 53 | /*
 54 |  * tp_new of `json_iter`: create an iterator over the objects in a string.
 55 |  *
 56 |  * Expects a single str argument; `kwargs` is not used. Returns the new
 57 |  * iterator, or NULL with an exception set.
 58 |  *
 59 |  * The argument is parsed before the object is allocated: parsing it
 60 |  * afterwards leaked the freshly allocated object on a bad argument.
 61 |  *
 62 |  * NOTE(review): the lexer stores the `string` pointer itself, which belongs
 63 |  * to the argument object - the caller presumably has to keep that str alive
 64 |  * for as long as the iterator is used; confirm against chompjs.py.
 65 |  */
 66 | static PyObject* json_iter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) {
 67 |     const char* string;
 68 |     if (!PyArg_ParseTuple(args, "s", &string)) {
 69 |         return NULL;
 70 |     }
 71 | 
 72 |     JsonIterState* json_iter_state = (JsonIterState *)type->tp_alloc(type, 0);
 73 |     if (!json_iter_state) {
 74 |         return NULL;
 75 |     }
 76 |     init_lexer(&json_iter_state->lexer, string);
 77 | 
 78 |     return (PyObject* )json_iter_state;
 79 | }
 67 | 
 68 | /* tp_dealloc of `json_iter`: release the lexer, then the object itself. */
 69 | static void json_iter_dealloc(JsonIterState* json_iter_state) {
 70 |     freefunc release_object = Py_TYPE(json_iter_state)->tp_free;
 71 |     release_lexer(&json_iter_state->lexer);
 72 |     release_object(json_iter_state);
 73 | }
 72 | 
 73 | /*
 74 |  * tp_iternext of `json_iter`: produce the next object as a Python str.
 75 |  *
 76 |  * Runs the lexer until it can no longer advance. Returns NULL without
 77 |  * setting an exception (which ends the iteration) when the output holds
 78 |  * exactly one char; otherwise returns the output minus its last char and
 79 |  * resets the lexer output for the following call.
 80 |  */
 81 | static PyObject* json_iter_next(JsonIterState* json_iter_state) {
 82 |     struct Lexer* lexer = &json_iter_state->lexer;
 83 | 
 84 |     /* no Python objects are touched while lexing, so the GIL is released */
 85 |     Py_BEGIN_ALLOW_THREADS
 86 |     for(; lexer->lexer_status == CAN_ADVANCE; ) {
 87 |         advance(lexer);
 88 |     }
 89 |     Py_END_ALLOW_THREADS
 90 | 
 91 |     /* NOTE(review): presumably the single char is a terminator - confirm in parser.c */
 92 |     if(lexer->output.index == 1) {
 93 |         return NULL;
 94 |     }
 95 | 
 96 |     PyObject* ret = Py_BuildValue("s#", lexer->output.data, lexer->output.index-1);
 97 |     reset_lexer_output(lexer);
 98 |     return ret;
 99 | }
 91 | 
 92 | /*
 93 |  * Type object of the `json_iter` iterator.
 94 |  * Only the slots that are actually used are spelled out; every slot not
 95 |  * named here is zero-initialized, exactly as in a positional initializer
 96 |  * filled with zeros.
 97 |  */
 98 | PyTypeObject JSONIter_Type = {
 99 |     PyVarObject_HEAD_INIT(NULL, 0)
100 |     .tp_name = "json_iter",
101 |     .tp_basicsize = sizeof(JsonIterState),
102 |     .tp_dealloc = (destructor)json_iter_dealloc,
103 |     .tp_flags = Py_TPFLAGS_DEFAULT,
104 |     .tp_iter = PyObject_SelfIter,                 /* the object is its own iterator */
105 |     .tp_iternext = (iternextfunc)json_iter_next,
106 |     .tp_alloc = PyType_GenericAlloc,
107 |     .tp_new = json_iter_new,
108 | };
132 | 
133 | /*
134 |  * Implementation of `_chompjs.parse_objects(string)`.
135 |  * Calls the `json_iter` type with the received argument tuple and returns
136 |  * the resulting iterator (or NULL if the call failed).
137 |  */
138 | static PyObject* parse_python_objects(PyObject *self, PyObject *args) {
139 |     return PyObject_CallObject((PyObject *) &JSONIter_Type, args);
140 | }
137 | 
138 | /* Functions exported by the module; the all-NULL entry ends the table. */
139 | static PyMethodDef parser_methods[] = {
140 |     {"parse", parse_python_object, METH_VARARGS, "Extract JSON object from the string"},
141 |     {"parse_objects", parse_python_objects, METH_VARARGS, "Iterate over all JSON objects in the string"},
142 |     {NULL, NULL, 0, NULL}
143 | };
149 | 
150 | 
151 | /* Module definition handed to PyModule_Create in PyInit__chompjs. */
152 | static struct PyModuleDef parser_definition = {
153 |     .m_base = PyModuleDef_HEAD_INIT,
154 |     .m_name = "_chompjs",
155 |     .m_doc = "C extension for fast JavaScript object parsing",
156 |     .m_size = -1,
157 |     .m_methods = parser_methods
158 | };
158 | 
159 | /*
160 |  * Module initialization entry point, called by the interpreter on import.
161 |  *
162 |  * Returns the new module object, or NULL with an exception set.
163 |  *
164 |  * The iterator type is finalized before the module is created, so a failing
165 |  * PyType_Ready no longer leaks an already created module object. There is
166 |  * no Py_Initialize() call: the interpreter is necessarily running when it
167 |  * imports an extension module.
168 |  */
169 | PyMODINIT_FUNC PyInit__chompjs(void) {
170 |     if (PyType_Ready(&JSONIter_Type) < 0) {
171 |         return NULL;
172 |     }
173 |     return PyModule_Create(&parser_definition);
174 | }
170 | 


--------------------------------------------------------------------------------
/_chompjs/parser.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 2020-2025 Mariusz Obajtek. All rights reserved.
  3 |  * License: https://github.com/Nykakin/chompjs/blob/master/LICENSE
  4 |  */
  5 | 
  6 | #include "parser.h"
  7 | 
  8 | #include <stdio.h>
  9 | #include <stdlib.h>
 10 | #include <string.h>
 11 | #include <ctype.h>
 12 | #include <string.h>
 13 | 
 14 | #define INITIAL_NESTING_DEPTH 20
 15 | 
 16 | struct State states[] = {  // state table; entries are looked up by enum StateIndex
 17 |     {begin},  // BEGIN_STATE
 18 |     {json},   // JSON_STATE
 19 |     {value},  // VALUE_STATE
 20 |     {end},    // END_STATE
 21 |     {error},  // ERROR_STATE
 22 | };
 23 | 
 24 | enum StateIndex {  // indices into states[]; must stay in the same order as that table
 25 |     BEGIN_STATE, JSON_STATE, VALUE_STATE, END_STATE, ERROR_STATE
 26 | };
 27 | 
 28 | // Perform one step of the state machine: run the handler of the current
 29 | // state and continue from whichever state it returns.
 30 | void advance(struct Lexer* lexer) {
 31 |     struct State* current = lexer->state;
 32 |     lexer->state = current->change(lexer);
 33 | }
 31 | 
 32 | // Skip whitespace at the current input position and return the first
 33 | // non-whitespace character, without consuming it. The terminating '\0' is
 34 | // not whitespace, so the scan always stops at the end of the input.
 35 | char next_char(struct Lexer* lexer) {
 36 |     // isspace() is only defined for values representable as unsigned char
 37 |     // (or EOF); a plain char can be negative for non-ASCII bytes such as
 38 |     // UTF-8 sequences, hence the cast
 39 |     while(isspace((unsigned char)lexer->input[lexer->input_position])) {
 40 |         lexer->input_position += 1;
 41 |     }
 42 |     return lexer->input[lexer->input_position];
 43 | }
 42 | 
 43 | // Return the character most recently written to the output.
 44 | char last_char(struct Lexer* lexer) {
 45 |     struct CharBuffer* out = &lexer->output;
 46 |     return top(out);
 47 | }
 46 | 
 47 | // Write `c` to the output and consume one input character.
 48 | void emit(char c, struct Lexer* lexer) {
 49 |     lexer->input_position++;
 50 |     push(&lexer->output, c);
 51 | }
 51 | 
 52 | // Write `c` to the output; the input position is left where it is.
 53 | void emit_in_place(char c, struct Lexer* lexer) {
 54 |     struct CharBuffer* out = &lexer->output;
 55 |     push(out, c);
 56 | }
 55 | 
 56 | // Take back the character most recently written to the output.
 57 | void unemit(struct Lexer* lexer) {
 58 |     struct CharBuffer* out = &lexer->output;
 59 |     pop(out);
 60 | }
 59 | 
 60 | // Write `size` bytes of `s` to the output and consume the same number of
 61 | // input characters.
 62 | void emit_string(const char *s, size_t size, struct Lexer* lexer) {
 63 |     lexer->input_position = lexer->input_position + size;
 64 |     push_string(&lexer->output, s, size);
 65 | }
 64 | 
 65 | void emit_string_in_place(const char *s, size_t size, struct Lexer* lexer) {
 66 |     push_string(&lexer->output, s, size);
 67 | }
 68 | 
 69 | void emit_number_in_place(long value, struct Lexer* lexer) {
 70 |     push_number(&lexer->output, value);
 71 | }
 72 | 
 73 | void init_lexer(struct Lexer* lexer, const char* string) {
 74 |     lexer->input = string;
 75 |     // allocate in advance more memory for output than for input because we might need
 76 |     // to add extra characters
 77 |     // for example `{a: undefined}` will be translated as `{"a": "undefined"}`
 78 |     lexer->output_size = 2 * strlen(string) + 1;
 79 |     init_char_buffer(&lexer->output, lexer->output_size);
 80 |     lexer->input_position = 0;
 81 |     init_char_buffer(&lexer->nesting_depth, INITIAL_NESTING_DEPTH);
 82 |     lexer->unrecognized_nesting_depth = 0;
 83 |     lexer->lexer_status = CAN_ADVANCE;
 84 |     lexer->state = &states[BEGIN_STATE];
 85 |     lexer->is_key = false;
 86 | }
 87 | 
 88 | void reset_lexer_output(struct Lexer* lexer) {
 89 |     clear(&lexer->output);
 90 |     lexer->lexer_status = CAN_ADVANCE;
 91 |     lexer->state = &states[BEGIN_STATE];
 92 |     lexer->is_key = false;
 93 |     lexer->input_position -= 1;
 94 | }
 95 | 
 96 | void release_lexer(struct Lexer* lexer) {
 97 |     release_char_buffer(&lexer->output);
 98 | }
 99 | 
100 | struct State* begin(struct Lexer* lexer) {
101 |     // Ignoring characters until either '{' or '[' appears
102 |     for(;;) {
103 |         switch(next_char(lexer)) {
104 |         case '{':
105 |             lexer->is_key = true;
106 |         case '[':;
107 |             return &states[JSON_STATE];
108 |         break;
109 |         case '\0':;
110 |             return &states[END_STATE];
111 |         default:
112 |             lexer->input_position += 1;
113 |         }
114 |     }
115 |     return &states[ERROR_STATE];
116 | }
117 | 
118 | struct State* json(struct Lexer* lexer) {
119 |     for(;;) {
120 |         switch(next_char(lexer)) {
121 |         case '{':
122 |             push(&lexer->nesting_depth, '{');
123 |             lexer->is_key = true;
124 |             emit('{', lexer);
125 |         break;
126 |         case '[':
127 |             push(&lexer->nesting_depth, '[');
128 |             emit('[', lexer);
129 |         break;
130 |         case '}':
131 |             if(last_char(lexer) == ',') {
132 |                 unemit(lexer);
133 |             }
134 |             pop(&lexer->nesting_depth);
135 |             lexer->is_key = top(&lexer->nesting_depth) == '{';
136 |             emit('}', lexer);
137 |             if(size(&lexer->nesting_depth) <= 0) {
138 |                 return &states[END_STATE];
139 |             }
140 |         break;
141 |         case ']':
142 |             if(last_char(lexer) == ',') {
143 |                 unemit(lexer);
144 |             }
145 |             pop(&lexer->nesting_depth);
146 |             lexer->is_key = top(&lexer->nesting_depth) == '{';
147 |             emit(']', lexer);
148 |             if(size(&lexer->nesting_depth) <= 0) {
149 |                 return &states[END_STATE];
150 |             }
151 |         break;
152 |         case ':':
153 |             lexer->is_key = false;
154 |             emit(':', lexer);
155 |         break;
156 |         case ',':
157 |             emit(',', lexer);
158 |             lexer->is_key = top(&lexer->nesting_depth) == '{';
159 |         break;
160 | 
161 |         case '/':;
162 |             char next_c = lexer->input[lexer->input_position+1];
163 |             if(next_c == '/' || next_c == '*') {
164 |                 handle_comments(lexer);
165 |             } else {
166 |                 return &states[VALUE_STATE];
167 |             }
168 |         break;
169 | 
170 |         // This should never happen, but an malformed input can
171 |         // cause an infinite loop without this check
172 |         case '>':
173 |         case ')':;
174 |             return &states[ERROR_STATE];
175 |         break;
176 | 
177 |         default:
178 |             return &states[VALUE_STATE];
179 |         }
180 |     }
181 | 
182 |     return &states[ERROR_STATE];
183 | }
184 | 
185 | struct State* _handle_string(struct Lexer* lexer, const char* string, size_t length) {
186 |     char next_char = lexer->input[lexer->input_position+length+1];
187 |     if(next_char == '_' || isalnum(next_char)) {
188 |         return handle_unrecognized(lexer);
189 |     }
190 |     emit_string(string, length, lexer);
191 |     return &states[JSON_STATE];
192 | }
193 | 
194 | struct State* value(struct Lexer* lexer) {
195 |     char c = next_char(lexer);
196 |     const char* position = lexer->input + lexer->input_position;
197 | 
198 |     if(c == '"' || c == '\'' || c == '`') {
199 |         return handle_quoted(lexer);
200 |     } else if(isdigit(c) || c == '.' || c == '-') {
201 |         if(lexer->is_key) {
202 |             return handle_unrecognized(lexer);
203 |         } else {
204 |             return handle_numeric(lexer);
205 |         }
206 |     } else if(strncmp(position, "true", 4) == 0) {
207 |         return _handle_string(lexer, "true", 4);
208 |     } else if(strncmp(position, "false", 5) == 0) {
209 |         return _handle_string(lexer, "false", 5);
210 |     } else if(strncmp(position, "null", 4) == 0) {
211 |         return _handle_string(lexer, "null", 4);
212 |     } else if(c == ']' || c == '}' || c == '[' || c == '{') {
213 |         return &states[JSON_STATE];
214 |     } else if(strncmp(position, "NaN", 3) == 0) {
215 |         return _handle_string(lexer, "NaN", 3);
216 |     } else {
217 |         return handle_unrecognized(lexer);
218 |     }
219 | 
220 |     return &states[JSON_STATE];
221 | }
222 | 
223 | struct State* end(struct Lexer* lexer) {
224 |     emit('\0', lexer);
225 |     lexer->lexer_status = FINISHED;
226 |     return lexer->state;
227 | }
228 | 
229 | struct State* error(struct Lexer* lexer) {
230 |     emit('\0', lexer);
231 |     lexer->lexer_status = ERROR;
232 |     return lexer->state;
233 | }
234 | 
235 | struct State* handle_quoted(struct Lexer* lexer) {
236 |     char current_quotation = next_char(lexer);
237 |     emit('"', lexer);
238 | 
239 |     for(;;) {
240 |         char c = lexer->input[lexer->input_position];
241 |         // handle escape sequences such as \\ and \'
242 |         if(c == '\\') {
243 |             char escaped = lexer->input[lexer->input_position+1];
244 |             if(escaped == '\'') {
245 |                 emit('\'', lexer);
246 |                 lexer->input_position += 1;
247 |             } else {
248 |                 emit('\\', lexer);
249 |                 emit(escaped, lexer);   
250 |             }
251 |             continue;
252 |         }
253 |         // in case of malformed quotation we can reach end of the input
254 |         if(c == '\0') {
255 |             return &states[ERROR_STATE];
256 |         }
257 |         // if we're closing the quotations, we're done with the string
258 |         if(c == current_quotation) {
259 |             emit('"', lexer);
260 |             return &states[JSON_STATE];
261 |         }
262 |         // otherwise, emit character
263 |         if(c == '"') {
264 |             emit_string_in_place("\\\"", 2, lexer);
265 |             lexer->input_position += 1;
266 |         } else {
267 |             emit(c, lexer);
268 |         }
269 |     }
270 |             
271 |     return &states[ERROR_STATE];
272 | }
273 | 
274 | struct State* handle_numeric(struct Lexer* lexer) {
275 |     char c = next_char(lexer);
276 |     if(c >= 49 && c <= 57) { // 1-9 range
277 |         return handle_numeric_standard_base(lexer);
278 |     } else if(c == '.') {
279 |         emit_in_place('0', lexer);
280 |         emit('.', lexer);
281 |         return handle_numeric_standard_base(lexer);
282 |     } else if(c == '-') {
283 |         emit('-', lexer);
284 |         return handle_numeric(lexer);
285 |     } else if(c == '0') {
286 |         char nc = tolower(lexer->input[lexer->input_position+1]);
287 |         if(nc == '.') {
288 |             emit('0', lexer);
289 |             emit('.', lexer);
290 |             return handle_numeric_standard_base(lexer);
291 |         } else if(nc == 'x' || nc == 'X') {
292 |             return handle_numeric_non_standard_base(lexer, 16);
293 |         } else if(nc == 'o' || nc == 'O') {
294 |             lexer->input_position += 2;
295 |             return handle_numeric_non_standard_base(lexer, 8);
296 |         } else if(isdigit(nc)) {
297 |             return handle_numeric_non_standard_base(lexer, 8);
298 |         } else if(nc == 'b' || nc == 'B') {
299 |             lexer->input_position += 2;
300 |             return handle_numeric_non_standard_base(lexer, 2);
301 |         } else {
302 |             emit('0', lexer);
303 |             return &states[JSON_STATE];
304 |         }
305 |     } else {
306 |         return &states[ERROR_STATE];
307 |     }
308 |     return &states[JSON_STATE];
309 | }
310 | 
311 | struct State* handle_numeric_standard_base(struct Lexer* lexer) {
312 |     char c = next_char(lexer);
313 |     do {
314 |         if(c != '_') {
315 |             emit(c, lexer);
316 |         } else {
317 |             lexer->input_position += 1;
318 |         }
319 |         c = tolower(lexer->input[lexer->input_position]);
320 |     } while(isdigit(c) || c == '.' || c == 'e' || c == 'E' || c == '+' || c =='-' || c == '_');
321 |     if(last_char(lexer) == '.') {
322 |         emit_in_place('0', lexer);
323 |     }
324 |     return &states[JSON_STATE];
325 | }
326 | 
327 | struct State* handle_numeric_non_standard_base(struct Lexer* lexer, int base) {
328 |     char* end;
329 |     long n = strtol(lexer->input + lexer->input_position, &end, base);
330 |     emit_number_in_place(n, lexer);
331 |     lexer->input_position = end - lexer->input;
332 |     return &states[JSON_STATE];
333 | }
334 | 
335 | struct State* handle_unrecognized(struct Lexer* lexer) {
336 |     emit_in_place('"', lexer);
337 |     char currently_quoted_with = '\0';
338 | 
339 |     lexer->unrecognized_nesting_depth = 0;
340 |     do {
341 |         char c = lexer->input[lexer->input_position];
342 | 
343 |         switch(c) {
344 |             case '\\':
345 |                 emit_in_place('\\', lexer);
346 |                 emit('\\', lexer);
347 |             break;
348 | 
349 |             case '\'':
350 |             case '"':
351 |             case '`':
352 |                 if(c == '"') {
353 |                     emit_in_place('\\', lexer);
354 |                     emit('"', lexer);
355 |                 } else {
356 |                     emit(c, lexer);
357 |                 }
358 | 
359 |                 if(!currently_quoted_with) {
360 |                     currently_quoted_with = c;
361 |                 } else if (currently_quoted_with == c) {
362 |                     currently_quoted_with = '\0';
363 |                 }
364 |             break;
365 | 
366 |             case '{':
367 |             case '[':
368 |             case '<':
369 |             case '(':
370 |                 emit(c, lexer);
371 |                 lexer->unrecognized_nesting_depth += 1;
372 |             break;
373 | 
374 |             case '}':
375 |             case ']':
376 |             case '>':
377 |             case ')':
378 |                 if(currently_quoted_with && lexer->unrecognized_nesting_depth > 0) {
379 |                     emit(c, lexer);
380 |                 } else if(lexer->unrecognized_nesting_depth > 0) {
381 |                     emit(c, lexer);
382 |                     lexer->unrecognized_nesting_depth -= 1;
383 |                 } else {
384 |                     // remove trailing whitespaces after value
385 |                     while(isspace(last_char(lexer))) {
386 |                         pop(&lexer->output);
387 |                     }
388 |                     emit_in_place('"', lexer);
389 |                     return &states[JSON_STATE];
390 |                 }
391 |             break;
392 | 
393 |             case ',':
394 |             case ':':
395 |                 if(!currently_quoted_with && lexer->unrecognized_nesting_depth <= 0) {
396 |                     // remove trailing whitespaces after key
397 |                     while(isspace(last_char(lexer))) {
398 |                         pop(&lexer->output);
399 |                     }
400 |                     emit_in_place('"', lexer);
401 |                     return &states[JSON_STATE];
402 |                 } else {
403 |                     emit(c, lexer);
404 |                 }
405 |             break;
406 | 
407 |             default:
408 |                 emit(c, lexer);
409 |         }
410 |     } while (lexer->input[lexer->input_position] != '\0');
411 | 
412 |     return &states[ERROR_STATE];
413 | }
414 | 
415 | void handle_comments(struct Lexer* lexer) {
416 |     char c, next_c;
417 | 
418 |     lexer->input_position += 1;
419 |     if(lexer->input[lexer->input_position] == '/' ) {
420 |         for(;;) {
421 |             lexer->input_position+=1;
422 |             c = lexer->input[lexer->input_position];
423 |             if((c == '\0') || (c == '\n')) {
424 |                 break;
425 |             }
426 |         }
427 |     } else if(lexer->input[lexer->input_position] == '*') {
428 |         for(;;) {
429 |             lexer->input_position+=1;
430 |             c = lexer->input[lexer->input_position];
431 |             next_c = lexer->input[lexer->input_position+1];
432 |             if((c == '\0') || (c == '*' && next_c == '/')) {
433 |                 break;
434 |             }
435 |         }
436 |         lexer->input_position+=2;
437 |     }
438 | }
439 | 


--------------------------------------------------------------------------------
/_chompjs/parser.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 2020-2025 Mariusz Obajtek. All rights reserved.
  3 |  * License: https://github.com/Nykakin/chompjs/blob/master/LICENSE
  4 |  */
  5 | 
  6 | #ifndef CHOMPJS_PARSER_H
  7 | #define CHOMPJS_PARSER_H
  8 | 
  9 | #include <stddef.h>
 10 | #include <stdbool.h>
 11 | 
 12 | #include "buffer.h"
 13 | 
 14 | struct Lexer;
 15 | 
 16 | /**
 17 |     States of internal state machine:
 18 |     * begin - start parsing
 19 |     * json - handle special characters: "[", "{", "}", "]", ",", ":"
 20 |     * value - handle a JSON value, such as strings and numbers
 21 |     * end - finish work
 22 |     * error - finish work, mark an error
 23 | */
 24 | struct State* begin(struct Lexer* lexer);
 25 | struct State* json(struct Lexer* lexer);
 26 | struct State* value(struct Lexer* lexer);
 27 | struct State* end(struct Lexer* lexer);
 28 | struct State* error(struct Lexer* lexer);
 29 | 
 30 | /*
 31 |     Helper functions used in "value" state
 32 |     * handle_quoted - handles quoted strings
 33 |     * handle_numeric - handle numbers
 34 |     * handle_numeric_standard_base - handle numbers in standard base-10
 35 |     * handle_numeric_non_standard_base - handle numbers in non-standard bases (hex, oct)
 36 |     * handle_unrecognized - save all unrecognized data as a string
 37 | */
 38 | struct State* handle_quoted(struct Lexer* lexer);
 39 | struct State* handle_numeric(struct Lexer* lexer);
 40 | struct State* handle_numeric_standard_base(struct Lexer* lexer);
 41 | struct State* handle_numeric_non_standard_base(struct Lexer* lexer, int base);
 42 | struct State* handle_unrecognized(struct Lexer* lexer);
 43 | 
 44 | /**
 45 |     State wrapper
 46 | */
 47 | struct State {
 48 |     struct State* (*change)(struct Lexer *);
 49 | };
 50 | 
 51 | /** Possible results of internal state machine state change state */
 52 | typedef enum {
 53 |     CAN_ADVANCE,
 54 |     FINISHED,
 55 |     ERROR,
 56 | } LexerStatus;
 57 | 
 58 | /** Main object, responsible for everything */
 59 | struct Lexer {
 60 |     const char* input;
 61 |     size_t output_size;
 62 |     struct CharBuffer output;
 63 |     size_t input_position;
 64 |     LexerStatus lexer_status;
 65 |     struct State* state;
 66 |     struct CharBuffer nesting_depth;
 67 |     size_t unrecognized_nesting_depth;
 68 |     bool is_key;
 69 | };
 70 | 
 71 | /** Switch state of internal state machine */
 72 | void advance(struct Lexer* lexer);
 73 | 
 74 | /** Get next char, ignore whitespaces */
 75 | char next_char(struct Lexer* lexer);
 76 | 
 77 | /** Get previously handled char */
 78 | char last_char(struct Lexer* lexer);
 79 | 
 80 | /** Send character to output buffer, advance input position */
 81 | void emit(char c, struct Lexer* lexer);
 82 | 
 83 | /** Send character to output buffer, keep old input position */
 84 | void emit_in_place(char c, struct Lexer* lexer);
 85 | 
 86 | /** Remove last character from output buffer */
 87 | void unemit(struct Lexer* lexer);
 88 | 
 89 | /** Send string to output buffer, advance input position */
 90 | void emit_string(const char *s, size_t size, struct Lexer* lexer);
 91 | 
 92 | /** Send string to output buffer, keep old input position */
 93 | void emit_string_in_place(const char *s, size_t size, struct Lexer* lexer);
 94 | 
 95 | /** Send number to output buffer, keep old input position */
 96 | void emit_number_in_place(long value, struct Lexer* lexer);
 97 | 
 98 | /** Handle comments in JSON body */
 99 | void handle_comments(struct Lexer* lexer);
100 | 
101 | /** Initialize main lexer object */
102 | void init_lexer(struct Lexer* lexer, const char* string);
103 | 
104 | /** Reset main lexer object output buffer */
105 | void reset_lexer_output(struct Lexer* lexer);
106 | 
107 | /** Release main lexer object and its memory */
108 | void release_lexer(struct Lexer* lexer);
109 | 
110 | #endif
111 | 


--------------------------------------------------------------------------------
/chompjs/__init__.py:
--------------------------------------------------------------------------------
1 | from .chompjs import parse_js_object, parse_js_objects
2 | 


--------------------------------------------------------------------------------
/chompjs/chompjs.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | import json
  4 | import warnings
  5 | 
  6 | from _chompjs import parse, parse_objects
  7 | 
  8 | 
  9 | def _preprocess(string, unicode_escape=False):
 10 |     if unicode_escape:
 11 |         string = string.encode().decode("unicode_escape")
 12 |     return string
 13 | 
 14 | 
 15 | def _process_loader_arguments(loader_args, loader_kwargs, json_params):
 16 |     if json_params:
 17 |         msg = "json_params argument is deprecated, please use loader_kwargs instead"
 18 |         warnings.warn(msg, DeprecationWarning)
 19 |         loader_kwargs = json_params
 20 | 
 21 |     if not loader_args:
 22 |         loader_args = []
 23 | 
 24 |     if not loader_kwargs:
 25 |         loader_kwargs = {}
 26 | 
 27 |     return (loader_args, loader_kwargs)
 28 | 
 29 | 
 30 | def parse_js_object(
 31 |     string,
 32 |     unicode_escape=False,
 33 |     loader=json.loads,
 34 |     loader_args=None,
 35 |     loader_kwargs=None,
 36 |     json_params=None,
 37 | ):
 38 |     """
 39 |     Extracts first JSON object encountered in the input string
 40 | 
 41 |     Parameters
 42 |     ----------
 43 |     string: str
 44 |         Input string
 45 | 
 46 |     >>> parse_js_object("{a: 100}")
 47 |     {'a': 100}
 48 | 
 49 |     unicode_escape: bool, optional
 50 |         Attempt to fix input string if it contains escaped special characters
 51 | 
 52 |     >>> parse_js_object('{\\\\"a\\\\": 100}')
 53 |     {'\\\\"a\\\\"': 100}
 54 |     >>> parse_js_object('{\\\\"a\\\\": 100}', unicode_escape=True)
 55 |     {'a': 100}
 56 | 
 57 |     loader: func, optional
 58 |         Function used to load processed input data. By default `json.loads` is used
 59 | 
 60 |     >>> import orjson
 61 |     >>> import chompjs
 62 |     >>> 
 63 |     >>> chompjs.parse_js_object("{'a': 12}", loader=orjson.loads)
 64 |     {'a': 12}
 65 | 
 66 |     loader_args: list, optional
 67 |         Allow passing down positional arguments to loader function
 68 | 
 69 |     loader_kwargs: dict, optional
 70 |         Allow passing down keyword arguments to loader function
 71 | 
 72 |     >>> parse_js_object("{'a': 10.1}")
 73 |     {'a': 10.1}
 74 |     >>> import decimal
 75 |     >>> parse_js_object("{'a': 10.1}", loader_kwargs={'parse_float': decimal.Decimal})
 76 |     {'a': Decimal('10.1')}
 77 | 
 78 |     .. deprecated:: 1.3.0
 79 |     json_params: dict, optional
 80 |         Use `loader_kwargs` instead
 81 | 
 82 |     Returns
 83 |     -------
 84 |     list | dict
 85 |         Extracted JSON object
 86 | 
 87 |     Raises
 88 |     ------
 89 |     ValueError
 90 |         If failed to parse input properly
 91 | 
 92 |     ```python
 93 |     >>> parse_js_object(None)
 94 |     Traceback (most recent call last):
 95 |       ...
 96 |     ValueError: Invalid input
 97 |     >>> parse_js_object("No JSON objects in sight...")
 98 |     Traceback (most recent call last):
 99 |       ...
100 |     json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
101 | 
102 |     ```
103 | 
104 |     """
105 |     if not string:
106 |         raise ValueError("Invalid input")
107 | 
108 |     loader_args, loader_kwargs = _process_loader_arguments(
109 |         loader_args, loader_kwargs, json_params
110 |     )
111 | 
112 |     string = _preprocess(string, unicode_escape)
113 |     parsed_data = parse(string)
114 |     return loader(parsed_data, *loader_args, **loader_kwargs)
115 | 
116 | 
def parse_js_objects(
    string,
    unicode_escape=False,
    omitempty=False,
    loader=json.loads,
    loader_args=None,
    loader_kwargs=None,
    json_params=None,
):
    """
    Generator yielding, one by one, every JavaScript object or array found in
    the input string. Can be used to read JSON Lines

    Parameters
    ----------
    string: str
        Input string

    >>> it = parse_js_objects("{a: 100} {b: 100}")
    >>> next(it)
    {'a': 100}
    >>> next(it)
    {'b': 100}

    unicode_escape: bool, optional
        Attempt to fix input string if it contains escaped special characters

    >>> next(parse_js_objects('{\\\\"a\\\\": 100}'))
    {'\\\\"a\\\\"': 100}
    >>> next(parse_js_objects('{\\\\"a\\\\": 100}', unicode_escape=True))
    {'a': 100}

    omitempty: bool, optional
        Skip empty dictionaries and lists

    >>> list(parse_js_objects("{a: 12} {} {b: 13}"))
    [{'a': 12}, {}, {'b': 13}]
    >>> list(parse_js_objects("{a: 12} {} {b: 13}", omitempty=True))
    [{'a': 12}, {'b': 13}]

    loader: func, optional
        Function turning the extracted JSON text into Python data.
        `json.loads` is the default

    >>> import orjson
    >>> import chompjs
    >>> 
    >>> next(chompjs.parse_js_objects("{'a': 12}", loader=orjson.loads))
    {'a': 12}

    loader_args: list, optional
        Positional arguments forwarded to the loader function

    loader_kwargs: dict, optional
        Keyword arguments forwarded to the loader function

    >>> next(parse_js_objects("{'a': 10.1}"))
    {'a': 10.1}
    >>> import decimal
    >>> next(parse_js_objects("{'a': 10.1}", loader_kwargs={'parse_float': decimal.Decimal}))
    {'a': Decimal('10.1')}

    .. deprecated:: 1.3.0
    json_params: dict, optional
        Use `loader_kwargs` instead

    Returns
    -------
    generator
        Iterating over it yields all encountered JSON objects
    """

    # nothing to iterate over for None or an empty string
    if not string:
        return

    args, kwargs = _process_loader_arguments(loader_args, loader_kwargs, json_params)

    for raw_data in parse_objects(_preprocess(string, unicode_escape)):
        try:
            data = loader(raw_data, *args, **kwargs)
        except ValueError:
            # best effort: fragments the loader cannot read are skipped
            continue

        skip_as_empty = not data and omitempty
        if not skip_as_empty:
            yield data
206 | 


--------------------------------------------------------------------------------
/chompjs/test_parser.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from __future__ import unicode_literals
  3 | 
  4 | import functools
  5 | import math
  6 | import unittest
  7 | 
  8 | from chompjs import parse_js_object, parse_js_objects
  9 | 
 10 | 
 11 | def parametrize_test(*arguments_list):
 12 |     def decorate(func):
 13 |         @functools.wraps(func)
 14 |         def wrapper(self, *args, **kwargs):
 15 |             for arguments in arguments_list:
 16 |                 func(self, *arguments)
 17 |         return wrapper
 18 |     return decorate
 19 | 
 20 | 
 21 | class TestParser(unittest.TestCase):
 22 |     @parametrize_test(
 23 |         ("{'hello': 'world'}", {'hello': 'world'}),
 24 |         ("{'hello': 'world', 'my': 'master'}", {'hello': 'world', 'my': 'master'}),
 25 |         (
 26 |             "{'hello': 'world', 'my': {'master': 'of Orion'}, 'test': 'xx'}",
 27 |             {'hello': 'world', 'my': {'master': 'of Orion'}, 'test': 'xx'},
 28 |         ),
 29 |         ("{}", {}),
 30 |     )
 31 |     def test_parse_object(self, in_data, expected_data):
 32 |         result = parse_js_object(in_data)
 33 |         self.assertEqual(result, expected_data)
 34 | 
 35 |     @parametrize_test(
 36 |         ("[]", []),
 37 |         ("[[[]]]", [[[]]]),
 38 |         ("[[[1]]]", [[[1]]]),
 39 |         ("[1]", [1]),
 40 |         ("[1, 2, 3, 4]", [1, 2, 3, 4]),
 41 |         ("['h', 'e', 'l', 'l', 'o']",  ['h', 'e', 'l', 'l', 'o']),
 42 |         ("[[[[[[[[[[[[[[[1]]]]]]]]]]]]]]]", [[[[[[[[[[[[[[[1]]]]]]]]]]]]]]]),
 43 |     )
 44 |     def test_parse_list(self, in_data, expected_data):
 45 |         result = parse_js_object(in_data)
 46 |         self.assertEqual(result, expected_data)
 47 | 
 48 |     @parametrize_test(
 49 |         ("{'hello': [], 'world': [0]}", {'hello': [], 'world': [0]}),
 50 |         ("{'hello': [1, 2, 3, 4]}", {'hello': [1, 2, 3, 4]}),
 51 |         ("[{'a':12}, {'b':33}]", [{'a': 12}, {'b': 33}]),
 52 |         (
 53 |             "[false, {'true': true, `pies`: \"kot\"}, false,]",
 54 |             [False, {"true": True, 'pies': 'kot'}, False],
 55 |         ),
 56 |         (
 57 |             "{a:1,b:1,c:1,d:1,e:1,f:1,g:1,h:1,i:1,j:1}",
 58 |             {k: 1 for k in 'abcdefghij'},
 59 |         ),
 60 |         (
 61 |             "{'a':[{'b':1},{'c':[{'d':{'f':{'g':[1,2]}}},{'e':1}]}]}",
 62 |             {'a': [{'b': 1}, {'c': [{'d': {'f': {'g': [1, 2]}}}, {'e': 1}]}]},
 63 |         ),
 64 |     )
 65 |     def test_parse_mixed(self, in_data, expected_data):
 66 |         result = parse_js_object(in_data)
 67 |         self.assertEqual(result, expected_data)
 68 | 
 69 |     @parametrize_test(
 70 |         ("{'hello': 12, 'world': 10002.21}", {'hello': 12, 'world': 10002.21}),
 71 |         ("[12, -323, 0.32, -32.22, .2, - 4]", [12, -323, 0.32, -32.22, 0.2, -4]),
 72 |         ('{"a": -12, "b": - 5}', {'a': -12, 'b': -5}),
 73 |         ("{'a': true, 'b': false, 'c': null}", {'a': True, 'b': False, 'c': None}),
 74 |         ("[\"\\uD834\\uDD1E\"]", [u'𝄞']),
 75 |         ("{'a': '123\\'456\\n'}", {'a': "123'456\n"}),
 76 |         ("['\u00E9']", ['é']),
 77 |         ('{"cache":{"\u002Ftest\u002F": 0}}', {'cache': {'/test/': 0}}),
 78 |         ('{"a": 3.125e7}', {'a': 3.125e7}),
 79 |         ('''{"a": "b\\'"}''', {'a': "b'"}),
 80 |         ('{"a": .99, "b": -.1}', {"a": 0.99, "b": -.1}),
 81 |         ('["/* ... */", "// ..."]', ["/* ... */", "// ..."]),
 82 |         ('{"inclusions":["/*","/"]}', {'inclusions': ['/*', '/']}),
 83 |     )
 84 |     def test_parse_standard_values(self, in_data, expected_data):
 85 |         result = parse_js_object(in_data)
 86 |         self.assertEqual(result, expected_data)
 87 | 
 88 |     def test_parse_nan(self):
 89 |         in_data = '{"A": NaN}'
 90 |         result = parse_js_object(in_data)
 91 |         self.assertTrue(math.isnan(result["A"]))
 92 | 
 93 |     @parametrize_test(
 94 |         ("{abc: 100, dev: 200}", {'abc': 100, 'dev': 200}),
 95 |         ("{abcdefghijklmnopqrstuvwxyz: 12}", {"abcdefghijklmnopqrstuvwxyz": 12}),
 96 |         (
 97 |             "{age: function(yearBorn,thisYear) {return thisYear - yearBorn;}}",
 98 |             {"age": "function(yearBorn,thisYear) {return thisYear - yearBorn;}"}
 99 |         ),
100 |         (
101 |             "{\"abc\": function() {return '])))))))))))))))';}}",
102 |             {"abc": "function() {return '])))))))))))))))';}"},
103 |         ),
104 |         ('{"a": undefined}', {"a": "undefined"}),
105 |         ('[undefined, undefined]', ["undefined", "undefined"]),
106 |         ("{_a: 1, $b: 2}", {"_a": 1, "$b": 2}),
107 |         ("{regex: /a[^d]{1,12}/i}", {'regex': '/a[^d]{1,12}/i'}),
108 |         ("{'a': function(){return '\"'}}", {'a': 'function(){return \'"\'}'}),
109 |         ("{1: 1, 2: 2, 3: 3, 4: 4}", {'1': 1, '2': 2, '3': 3, '4': 4}),
110 |         ("{'a': 121.}", {'a': 121.0}),
111 |         ("{abc : 100}", {'abc': 100}),
112 |         ("{abc     :       100}", {'abc': 100}),
113 |         ("{abc: name }", {'abc': "name"}),
114 |         ("{abc: name\t}", {'abc': "name"}),
115 |         ("{abc: value\n}", {'abc': "value"}),
116 |         ("{abc:  name}", {'abc': "name"}),
117 |         ("{abc: \tname}", {'abc': "name"}),
118 |         ("{abc: \nvalue}", {'abc': "value"}),
119 |     )
120 |     def test_parse_strange_values(self, in_data, expected_data):
121 |         result = parse_js_object(in_data)
122 |         self.assertEqual(result, expected_data)
123 | 
124 |     @parametrize_test(
125 |         ('{"a": {"b": [12, 13, 14]}}text text', {"a": {"b": [12, 13, 14]}}),
126 |         ('var test = {"a": {"b": [12, 13, 14]}}', {"a": {"b": [12, 13, 14]}}),
127 |         ('{"a":\r\n10}', {'a': 10}),
128 |         ("{'foo': 0,\r\n}", {'foo': 0}),
129 |         ("{truefalse: 0, falsefalse: 1, nullnull: 2}", {'truefalse': 0, 'falsefalse': 1, 'nullnull': 2}),
130 |     )
131 |     def test_strange_input(self, in_data, expected_data):
132 |         result = parse_js_object(in_data)
133 |         self.assertEqual(result, expected_data)
134 | 
135 |     @parametrize_test(
136 |         ("[0]", [0]),
137 |         ("[1]", [1]),
138 |         ("[12]", [12]),
139 |         ("[12_12]", [1212]),
140 |         ("[0x12]", [18]),
141 |         ("[0xab]", [171]),
142 |         ("[0xAB]", [171]),
143 |         ("[0X12]", [18]),
144 |         ("[0Xab]", [171]),
145 |         ("[0XAB]", [171]),
146 |         ("[01234]", [668]),
147 |         ("[0o1234]", [668]),
148 |         ("[0O1234]", [668]),
149 |         ("[0b1111]", [15]),
150 |         ("[0B1111]", [15]),
151 |         ("[-0]", [-0]),
152 |         ("[-1]", [-1]),
153 |         ("[-12]", [-12]),
154 |         ("[-12_12]", [-1212]),
155 |         ("[-0x12]", [-18]),
156 |         ("[-0xab]", [-171]),
157 |         ("[-0xAB]", [-171]),
158 |         ("[-0X12]", [-18]),
159 |         ("[-0Xab]", [-171]),
160 |         ("[-0XAB]", [-171]),
161 |         ("[-01234]", [-668]),
162 |         ("[-0o1234]", [-668]),
163 |         ("[-0O1234]", [-668]),
164 |         ("[-0b1111]", [-15]),
165 |         ("[-0B1111]", [-15]),
166 |     )
167 |     def test_integer_numeric_values(self, in_data, expected_data):
168 |         result = parse_js_object(in_data)
169 |         self.assertEqual(result, expected_data)
170 | 
171 |     @parametrize_test(
172 |         ("[0.32]", [0.32]),
173 |         ("[-0.32]", [-0.32]),
174 |         ("[.32]", [0.32]),
175 |         ("[-.32]", [-0.32]),
176 |         ("[12.]", [12.0]),
177 |         ("[-12.]", [-12.0]),
178 |         ("[12.32]", [12.32]),
179 |         ("[-12.12]", [-12.12]),
180 |         ("[3.1415926]", [3.1415926]),
181 |         ("[.123456789]", [.123456789]),
182 |         ("[.0123]", [0.0123]),
183 |         ("[0.0123]", [0.0123]),
184 |         ("[-.0123]", [-0.0123]),
185 |         ("[-0.0123]", [-0.0123]),
186 |         ("[3.1E+12]", [3.1E+12]),
187 |         ("[3.1e+12]", [3.1E+12]),
188 |         ("[.1E-23]", [.1e-23]),
189 |         ("[.1e-23]", [.1e-23]),
190 |     )
191 |     def test_float_numeric_values(self, in_data, expected_data):
192 |         result = parse_js_object(in_data)
193 |         self.assertEqual(result, expected_data)
194 | 
195 | 
196 |     @parametrize_test(
197 |         (
198 |             """
199 |                 var obj = {
200 |                     // Comment
201 |                     x: "X", // Comment
202 |                 };
203 |             """,
204 |             {"x": "X"},
205 |         ),
206 |         (
207 |             """
208 |                 var /* Comment */ obj = /* Comment */ {
209 |                     /* Comment */
210 |                     x: /* Comment */ "X", /* Comment */
211 |                 };
212 |             """,
213 |             {"x": "X"},
214 |         ),
215 |         (
216 |             """[/*...*/1,2,3,/*...*/4,5,6]""",
217 |             [1, 2, 3, 4, 5, 6],
218 |         ),
219 |     )
220 |     def test_comments(self, in_data, expected_data):
221 |         result = parse_js_object(in_data)
222 |         self.assertEqual(result, expected_data)
223 | 
224 |     @parametrize_test(
225 |         ('["Test\\nDrive"]\n{"Test": "Drive"}', [['Test\nDrive'], {'Test': 'Drive'}]),
226 |     )
227 |     def test_jsonlines(self, in_data, expected_data):
228 |         result = list(parse_js_objects(in_data))
229 |         self.assertEqual(result, expected_data)
230 | 
231 | 
232 | class TestParserExceptions(unittest.TestCase):
233 |     @parametrize_test(
234 |         ('}{', ValueError),
235 |         ('', ValueError),
236 |         (None, ValueError),
237 |     )
238 |     def test_exceptions(self, in_data, expected_exception):
239 |         with self.assertRaises(expected_exception):
240 |             parse_js_object(in_data)
241 | 
242 |     @parametrize_test(
243 |         ("{whose: 's's', category_name: '>'}", ValueError),
244 |     )
245 |     def test_malformed_input(self, in_data, expected_exception):
246 |         with self.assertRaises(expected_exception):
247 |             parse_js_object(in_data)
248 | 
249 |     @parametrize_test(
250 |         (
251 |             '{"test": """}',
252 |             ValueError,
253 |             'Error parsing input near character 13',
254 |         ),
255 |     )
256 |     def test_error_messages(self, in_data, expected_exception, expected_exception_text):
257 |         with self.assertRaisesRegex(expected_exception, expected_exception_text):
258 |             parse_js_object(in_data)
259 | 
260 | 
261 | class TestOptions(unittest.TestCase):
262 |     @parametrize_test(
263 |         ('{\\\"a\\\": 12}', {'a': 12}),
264 |     )
265 |     def test_unicode_escape(self, in_data, expected_data):
266 |         result = parse_js_object(in_data, unicode_escape=True)
267 |         self.assertEqual(result, expected_data)
268 | 
269 |     @parametrize_test(
270 |         ('["\n"]', ["\n"]),
271 |         ("{'a': '\"\"', 'b': '\\\\', 'c': '\t\n'}", {'a': '""', 'b': '\\', 'c': '\t\n'}),
272 |         (
273 |             """var myObj = {
274 |                 myMethod: function(params) {
275 |                     // ...
276 |                 },
277 |                 myValue: 100
278 |             }""",
279 |             {'myMethod': 'function(params) {\n                    // ...\n                }', 'myValue': 100},
280 |         ),
281 |     )
282 |     def test_json_non_strict(self, in_data, expected_data):
283 |         result = parse_js_object(in_data, loader_kwargs={'strict': False})
284 |         self.assertEqual(result, expected_data)
285 | 
286 |     @parametrize_test(
287 |         ("[]", []),
288 |         ("[1, 2, 3]", [1, 2, 3]),
289 |         ('var x = [1, 2, 3, 4, 5,]', [1, 2, 3, 4, 5]),
290 |         ('{}', {}),
291 |         ("{'a': 12, 'b': 13, 'c': 14}", {'a': 12, 'b': 13, 'c': 14}),
292 |         ("var x = {'a': 12, 'b': 13, 'c': 14}", {'a': 12, 'b': 13, 'c': 14}),
293 |     )
294 |     def test_loader(self, in_data, expected_data):
295 |         import ast
296 |         result = parse_js_object(in_data, loader=ast.literal_eval)
297 |         self.assertEqual(result, expected_data)
298 | 
299 | 
300 | class TestParseJsonObjects(unittest.TestCase):
301 |     @parametrize_test(
302 |         ("", []),
303 |         ("aaaaaaaaaaaaaaaa", []),
304 |         ("         ", []),
305 |         ("      {'a': 12}", [{'a': 12}]),
306 |         ("[1, 2, 3, 4]xxxxxxxxxxxxxxxxxxxxxxxx", [[1, 2, 3, 4]]),
307 |         ("[12] [13] [14]", [[12], [13], [14]]),
308 |         ("[10] {'a': [1, 1, 1,]}", [[10], {'a': [1, 1, 1]}]),
309 |         ("[1][1][1]", [[1], [1], [1]]),
310 |         ("[1] [2] {'a': ", [[1], [2]]),
311 |         ("[]", [[]]),
312 |         ("[][][][]", [[], [], [], []]),
313 |         ("{}", [{}]),
314 |         ("{}{}{}{}", [{}, {}, {}, {}]),
315 |         ("{{}}{{}}", []),
316 |         ("[[]][[]]", [[[]], [[]]]),
317 |         ("{am: 'ab'}\n{'ab': 'xx'}", [{'am': 'ab'}, {'ab': 'xx'}]),
318 |         (
319 |             'function(a, b, c){ /* ... */ }({"a": 12}, Null, [1, 2, 3])',
320 |             [{}, {'a': 12}, [1, 2, 3]],
321 |         ),
322 |         ('{"a": 12, broken}{"c": 100}', [{'c': 100}]),
323 |         ('[12,,,,21][211,,,][12,12][12,,,21]', [[12, 12]]),
324 |     )
325 |     def test_parse_json_objects(self, in_data, expected_data):
326 |         result = list(parse_js_objects(in_data))
327 |         self.assertEqual(result, expected_data)
328 | 
329 |     @parametrize_test(
330 |         ("[1][][2]", [[1], [2]]),
331 |         ("{'a': 12}{}{'b': 13}", [{'a': 12}, {'b': 13}]),
332 |         ("[][][][][][][][][]", []),
333 |         ("{}{}{}{}{}{}{}{}{}", []),
334 |     )
335 |     def test_parse_json_objects_without_empty(self, in_data, expected_data):
336 |         result = list(parse_js_objects(in_data, omitempty=True))
337 |         self.assertEqual(result, expected_data)
338 | 
339 | 
340 | if __name__ == '__main__':  # only when this file is executed directly, not on import
341 |     unittest.main()  # hand control to unittest's command-line test runner
342 | 


--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------
  1 | <!doctype html>
  2 | <html lang="en">
  3 | <head>
  4 | <meta charset="utf-8">
  5 | <meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1" />
  6 | <meta name="generator" content="pdoc 0.10.0" />
  7 | <title>chompjs API documentation</title>
  8 | <meta name="description" content="" />
  9 | <link rel="preload stylesheet" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/11.0.1/sanitize.min.css" integrity="sha256-PK9q560IAAa6WVRRh76LtCaI8pjTJ2z11v0miyNNjrs=" crossorigin>
 10 | <link rel="preload stylesheet" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/11.0.1/typography.min.css" integrity="sha256-7l/o7C8jubJiy74VsKTidCy1yBkRtiUGbVkYBylBqUg=" crossorigin>
 11 | <link rel="stylesheet preload" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.1.1/styles/github.min.css" crossorigin>
 12 | <style>:root{--highlight-color:#fe9}.flex{display:flex !important}body{line-height:1.5em}#content{padding:20px}#sidebar{padding:30px;overflow:hidden}#sidebar > *:last-child{margin-bottom:2cm}.http-server-breadcrumbs{font-size:130%;margin:0 0 15px 0}#footer{font-size:.75em;padding:5px 30px;border-top:1px solid #ddd;text-align:right}#footer p{margin:0 0 0 1em;display:inline-block}#footer p:last-child{margin-right:30px}h1,h2,h3,h4,h5{font-weight:300}h1{font-size:2.5em;line-height:1.1em}h2{font-size:1.75em;margin:1em 0 .50em 0}h3{font-size:1.4em;margin:25px 0 10px 0}h4{margin:0;font-size:105%}h1:target,h2:target,h3:target,h4:target,h5:target,h6:target{background:var(--highlight-color);padding:.2em 0}a{color:#058;text-decoration:none;transition:color .3s ease-in-out}a:hover{color:#e82}.title code{font-weight:bold}h2[id^="header-"]{margin-top:2em}.ident{color:#900}pre code{background:#f8f8f8;font-size:.8em;line-height:1.4em}code{background:#f2f2f1;padding:1px 4px;overflow-wrap:break-word}h1 code{background:transparent}pre{background:#f8f8f8;border:0;border-top:1px solid #ccc;border-bottom:1px solid #ccc;margin:1em 0;padding:1ex}#http-server-module-list{display:flex;flex-flow:column}#http-server-module-list div{display:flex}#http-server-module-list dt{min-width:10%}#http-server-module-list p{margin-top:0}.toc ul,#index{list-style-type:none;margin:0;padding:0}#index code{background:transparent}#index h3{border-bottom:1px solid #ddd}#index ul{padding:0}#index h4{margin-top:.6em;font-weight:bold}@media (min-width:200ex){#index .two-column{column-count:2}}@media (min-width:300ex){#index .two-column{column-count:3}}dl{margin-bottom:2em}dl dl:last-child{margin-bottom:4em}dd{margin:0 0 1em 3em}#header-classes + dl > dd{margin-bottom:3em}dd dd{margin-left:2em}dd p{margin:10px 0}.name{background:#eee;font-weight:bold;font-size:.85em;padding:5px 10px;display:inline-block;min-width:40%}.name:hover{background:#e0e0e0}dt:target .name{background:var(--highlight-color)}.name > 
span:first-child{white-space:nowrap}.name.class > span:nth-child(2){margin-left:.4em}.inherited{color:#999;border-left:5px solid #eee;padding-left:1em}.inheritance em{font-style:normal;font-weight:bold}.desc h2{font-weight:400;font-size:1.25em}.desc h3{font-size:1em}.desc dt code{background:inherit}.source summary,.git-link-div{color:#666;text-align:right;font-weight:400;font-size:.8em;text-transform:uppercase}.source summary > *{white-space:nowrap;cursor:pointer}.git-link{color:inherit;margin-left:1em}.source pre{max-height:500px;overflow:auto;margin:0}.source pre code{font-size:12px;overflow:visible}.hlist{list-style:none}.hlist li{display:inline}.hlist li:after{content:',\2002'}.hlist li:last-child:after{content:none}.hlist .hlist{display:inline;padding-left:1em}img{max-width:100%}td{padding:0 .5em}.admonition{padding:.1em .5em;margin-bottom:1em}.admonition-title{font-weight:bold}.admonition.note,.admonition.info,.admonition.important{background:#aef}.admonition.todo,.admonition.versionadded,.admonition.tip,.admonition.hint{background:#dfd}.admonition.warning,.admonition.versionchanged,.admonition.deprecated{background:#fd4}.admonition.error,.admonition.danger,.admonition.caution{background:lightpink}</style>
 13 | <style media="screen and (min-width: 700px)">@media screen and (min-width:700px){#sidebar{width:30%;height:100vh;overflow:auto;position:sticky;top:0}#content{width:70%;max-width:100ch;padding:3em 4em;border-left:1px solid #ddd}pre code{font-size:1em}.item .name{font-size:1em}main{display:flex;flex-direction:row-reverse;justify-content:flex-end}.toc ul ul,#index ul{padding-left:1.5em}.toc > ul > li{margin-top:.5em}}</style>
 14 | <style media="print">@media print{#sidebar h1{page-break-before:always}.source{display:none}}@media print{*{background:transparent !important;color:#000 !important;box-shadow:none !important;text-shadow:none !important}a[href]:after{content:" (" attr(href) ")";font-size:90%}a[href][title]:after{content:none}abbr[title]:after{content:" (" attr(title) ")"}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:""}pre,blockquote{border:1px solid #999;page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:100% !important}@page{margin:0.5cm}p,h2,h3{orphans:3;widows:3}h1,h2,h3,h4,h5,h6{page-break-after:avoid}}</style>
 15 | <script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.1.1/highlight.min.js" integrity="sha256-Uv3H6lx7dJmRfRvH8TH6kJD1TSK1aFcwgx+mdg3epi8=" crossorigin></script>
 16 | <script>window.addEventListener('DOMContentLoaded', () => hljs.initHighlighting())</script>
 17 | </head>
 18 | <body>
 19 | <main>
 20 | <article id="content">
 21 | <header>
 22 | <h1 class="title">Module <code>chompjs</code></h1>
 23 | </header>
 24 | <section id="section-intro">
 25 | <details class="source">
 26 | <summary>
 27 | <span>Expand source code</span>
 28 | </summary>
 29 | <pre><code class="python"># -*- coding: utf-8 -*-
 30 | 
 31 | import json
 32 | import warnings
 33 | 
 34 | from _chompjs import parse, parse_objects
 35 | 
 36 | 
 37 | def _preprocess(string, unicode_escape=False):
 38 |     if unicode_escape:
 39 |         string = string.encode().decode(&#34;unicode_escape&#34;)
 40 |     return string
 41 | 
 42 | 
 43 | def _process_loader_arguments(loader_args, loader_kwargs, json_params):
 44 |     if json_params:
 45 |         msg = &#34;json_params argument is deprecated, please use loader_kwargs instead&#34;
 46 |         warnings.warn(msg, DeprecationWarning)
 47 |         loader_kwargs = json_params
 48 | 
 49 |     if not loader_args:
 50 |         loader_args = []
 51 | 
 52 |     if not loader_kwargs:
 53 |         loader_kwargs = {}
 54 | 
 55 |     return (loader_args, loader_kwargs)
 56 | 
 57 | 
 58 | def parse_js_object(
 59 |     string,
 60 |     unicode_escape=False,
 61 |     loader=json.loads,
 62 |     loader_args=None,
 63 |     loader_kwargs=None,
 64 |     json_params=None,
 65 | ):
 66 |     &#34;&#34;&#34;
 67 |     Extracts first JSON object encountered in the input string
 68 | 
 69 |     Parameters
 70 |     ----------
 71 |     string: str
 72 |         Input string
 73 | 
 74 |     &gt;&gt;&gt; parse_js_object(&#34;{a: 100}&#34;)
 75 |     {&#39;a&#39;: 100}
 76 | 
 77 |     unicode_escape: bool, optional
 78 |         Attempt to fix input string if it contains escaped special characters
 79 | 
 80 |     &gt;&gt;&gt; parse_js_object(&#39;{\\\\&#34;a\\\\&#34;: 100}&#39;)
 81 |     {&#39;\\\\&#34;a\\\\&#34;&#39;: 100}
 82 |     &gt;&gt;&gt; parse_js_object(&#39;{\\\\&#34;a\\\\&#34;: 100}&#39;, unicode_escape=True)
 83 |     {&#39;a&#39;: 100}
 84 | 
 85 |     loader: func, optional
 86 |         Function used to load processed input data. By default `json.loads` is used
 87 | 
 88 |     &gt;&gt;&gt; import orjson
 89 |     &gt;&gt;&gt; import chompjs
 90 |     &gt;&gt;&gt; 
 91 |     &gt;&gt;&gt; chompjs.parse_js_object(&#34;{&#39;a&#39;: 12}&#34;, loader=orjson.loads)
 92 |     {&#39;a&#39;: 12}
 93 | 
 94 |     loader_args: list, optional
 95 |         Allow passing down positional arguments to loader function
 96 | 
 97 |     loader_kwargs: dict, optional
 98 |         Allow passing down keyword arguments to loader function
 99 | 
100 |     &gt;&gt;&gt; parse_js_object(&#34;{&#39;a&#39;: 10.1}&#34;)
101 |     {&#39;a&#39;: 10.1}
102 |     &gt;&gt;&gt; import decimal
103 |     &gt;&gt;&gt; parse_js_object(&#34;{&#39;a&#39;: 10.1}&#34;, loader_kwargs={&#39;parse_float&#39;: decimal.Decimal})
104 |     {&#39;a&#39;: Decimal(&#39;10.1&#39;)}
105 | 
106 |     .. deprecated:: 1.3.0
107 |     json_params: dict, optional
108 |         Use `loader_kwargs` instead
109 | 
110 |     Returns
111 |     -------
112 |     list | dict
113 |         Extracted JSON object
114 | 
115 |     Raises
116 |     ------
117 |     ValueError
118 |         If failed to parse input properly
119 | 
120 |     ```python
121 |     &gt;&gt;&gt; parse_js_object(None)
122 |     Traceback (most recent call last):
123 |       ...
124 |     ValueError: Invalid input
125 |     &gt;&gt;&gt; parse_js_object(&#34;No JSON objects in sight...&#34;)
126 |     Traceback (most recent call last):
127 |       ...
128 |     json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
129 | 
130 |     ```
131 | 
132 |     &#34;&#34;&#34;
133 |     if not string:
134 |         raise ValueError(&#34;Invalid input&#34;)
135 | 
136 |     loader_args, loader_kwargs = _process_loader_arguments(
137 |         loader_args, loader_kwargs, json_params
138 |     )
139 | 
140 |     if json_params:
141 |         msg = &#34;json_params argument is deprecated, please use loader_kwargs instead&#34;
142 |         warnings.warn(msg, DeprecationWarning)
143 | 
144 |     string = _preprocess(string, unicode_escape)
145 |     parsed_data = parse(string)
146 |     return loader(parsed_data, *loader_args, **loader_kwargs)
147 | 
148 | 
149 | def parse_js_objects(
150 |     string,
151 |     unicode_escape=False,
152 |     omitempty=False, 
153 |     loader=json.loads,
154 |     loader_args=None,
155 |     loader_kwargs=None,
156 |     json_params=None,
157 | ):
158 |     &#34;&#34;&#34;
159 |     Returns a generator extracting all JSON objects encountered in the input string.
160 |     Can be used to read JSON Lines
161 | 
162 |     Parameters
163 |     ----------
164 |     string: str
165 |         Input string
166 | 
167 |     &gt;&gt;&gt; it = parse_js_objects(&#34;{a: 100} {b: 100}&#34;)
168 |     &gt;&gt;&gt; next(it)
169 |     {&#39;a&#39;: 100}
170 |     &gt;&gt;&gt; next(it)
171 |     {&#39;b&#39;: 100}
172 | 
173 |     unicode_escape: bool, optional
174 |         Attempt to fix input string if it contains escaped special characters
175 | 
176 |     &gt;&gt;&gt; next(parse_js_objects(&#39;{\\\\&#34;a\\\\&#34;: 100}&#39;))
177 |     {&#39;\\\\&#34;a\\\\&#34;&#39;: 100}
178 |     &gt;&gt;&gt; next(parse_js_objects(&#39;{\\\\&#34;a\\\\&#34;: 100}&#39;, unicode_escape=True))
179 |     {&#39;a&#39;: 100}
180 | 
181 |     omitempty: bool, optional
182 |         Skip empty dictionaries and lists
183 | 
184 |     &gt;&gt;&gt; list(parse_js_objects(&#34;{a: 12} {} {b: 13}&#34;))
185 |     [{&#39;a&#39;: 12}, {}, {&#39;b&#39;: 13}]
186 |     &gt;&gt;&gt; list(parse_js_objects(&#34;{a: 12} {} {b: 13}&#34;, omitempty=True))
187 |     [{&#39;a&#39;: 12}, {&#39;b&#39;: 13}]
188 | 
189 |     loader: func, optional
190 |         Function used to load processed input data. By default `json.loads` is used
191 | 
192 |     &gt;&gt;&gt; import orjson
193 |     &gt;&gt;&gt; import chompjs
194 |     &gt;&gt;&gt; 
195 |     &gt;&gt;&gt; next(chompjs.parse_js_objects(&#34;{&#39;a&#39;: 12}&#34;, loader=orjson.loads))
196 |     {&#39;a&#39;: 12}
197 | 
198 |     loader_args: list, optional
199 |         Allow passing down positional arguments to loader function
200 | 
201 |     loader_kwargs: dict, optional
202 |         Allow passing down keyword arguments to loader function
203 | 
204 |     &gt;&gt;&gt; next(parse_js_objects(&#34;{&#39;a&#39;: 10.1}&#34;))
205 |     {&#39;a&#39;: 10.1}
206 |     &gt;&gt;&gt; import decimal
207 |     &gt;&gt;&gt; next(parse_js_objects(&#34;{&#39;a&#39;: 10.1}&#34;, loader_kwargs={&#39;parse_float&#39;: decimal.Decimal}))
208 |     {&#39;a&#39;: Decimal(&#39;10.1&#39;)}
209 | 
210 |     .. deprecated:: 1.3.0
211 |     json_params: dict, optional
212 |         Use `loader_kwargs` instead
213 | 
214 |     Returns
215 |     -------
216 |     generator
217 |         Iterating over it yields all encountered JSON objects
218 |     &#34;&#34;&#34;
219 | 
220 |     if not string:
221 |         return
222 | 
223 |     loader_args, loader_kwargs = _process_loader_arguments(
224 |         loader_args, loader_kwargs, json_params
225 |     )
226 | 
227 |     string = _preprocess(string, unicode_escape)
228 |     for raw_data in parse_objects(string):
229 |         try:
230 |             data = loader(raw_data, *loader_args, **loader_kwargs)
231 |         except ValueError:
232 |             continue
233 | 
234 |         if not data and omitempty:
235 |             continue
236 | 
237 |         yield data</code></pre>
238 | </details>
239 | </section>
240 | <section>
241 | </section>
242 | <section>
243 | </section>
244 | <section>
245 | <h2 class="section-title" id="header-functions">Functions</h2>
246 | <dl>
247 | <dt id="chompjs.parse_js_object"><code class="name flex">
248 | <span>def <span class="ident">parse_js_object</span></span>(<span>string, unicode_escape=False, loader=&lt;function loads&gt;, loader_args=None, loader_kwargs=None, json_params=None)</span>
249 | </code></dt>
250 | <dd>
251 | <div class="desc"><p>Extracts first JSON object encountered in the input string</p>
252 | <h2 id="parameters">Parameters</h2>
253 | <dl>
254 | <dt><strong><code>string</code></strong> :&ensp;<code>str</code></dt>
255 | <dd>Input string</dd>
256 | </dl>
257 | <pre><code class="language-python-repl">&gt;&gt;&gt; parse_js_object(&quot;{a: 100}&quot;)
258 | {'a': 100}
259 | </code></pre>
260 | <dl>
261 | <dt><strong><code>unicode_escape</code></strong> :&ensp;<code>bool</code>, optional</dt>
262 | <dd>Attempt to fix input string if it contains escaped special characters</dd>
263 | </dl>
264 | <pre><code class="language-python-repl">&gt;&gt;&gt; parse_js_object('{\\&quot;a\\&quot;: 100}')
265 | {'\\&quot;a\\&quot;': 100}
266 | &gt;&gt;&gt; parse_js_object('{\\&quot;a\\&quot;: 100}', unicode_escape=True)
267 | {'a': 100}
268 | </code></pre>
269 | <dl>
270 | <dt><strong><code>loader</code></strong> :&ensp;<code>func</code>, optional</dt>
271 | <dd>Function used to load processed input data. By default <code>json.loads</code> is used</dd>
272 | </dl>
273 | <pre><code class="language-python-repl">&gt;&gt;&gt; import orjson
274 | &gt;&gt;&gt; import chompjs
275 | &gt;&gt;&gt; 
276 | &gt;&gt;&gt; chompjs.parse_js_object(&quot;{'a': 12}&quot;, loader=orjson.loads)
277 | {'a': 12}
278 | </code></pre>
279 | <dl>
280 | <dt><strong><code>loader_args</code></strong> :&ensp;<code>list</code>, optional</dt>
281 | <dd>Allow passing down positional arguments to loader function</dd>
282 | <dt><strong><code>loader_kwargs</code></strong> :&ensp;<code>dict</code>, optional</dt>
283 | <dd>Allow passing down keyword arguments to loader function</dd>
284 | </dl>
285 | <pre><code class="language-python-repl">&gt;&gt;&gt; parse_js_object(&quot;{'a': 10.1}&quot;)
286 | {'a': 10.1}
287 | &gt;&gt;&gt; import decimal
288 | &gt;&gt;&gt; parse_js_object(&quot;{'a': 10.1}&quot;, loader_kwargs={'parse_float': decimal.Decimal})
289 | {'a': Decimal('10.1')}
290 | </code></pre>
291 | <div class="admonition deprecated">
292 | <p class="admonition-title">Deprecated since version:&ensp;1.3.0</p>
293 | </div>
294 | <dl>
295 | <dt><strong><code>json_params</code></strong> :&ensp;<code>dict</code>, optional</dt>
296 | <dd>Use <code>loader_kwargs</code> instead</dd>
297 | </dl>
298 | <h2 id="returns">Returns</h2>
299 | <dl>
300 | <dt><code>list | dict</code></dt>
301 | <dd>Extracted JSON object</dd>
302 | </dl>
303 | <h2 id="raises">Raises</h2>
304 | <dl>
305 | <dt><code>ValueError</code></dt>
306 | <dd>If failed to parse input properly</dd>
307 | </dl>
308 | <pre><code class="language-python">&gt;&gt;&gt; parse_js_object(None)
309 | Traceback (most recent call last):
310 |   ...
311 | ValueError: Invalid input
312 | &gt;&gt;&gt; parse_js_object(&quot;No JSON objects in sight...&quot;)
313 | Traceback (most recent call last):
314 |   ...
315 | json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
316 | 
317 | </code></pre></div>
318 | <details class="source">
319 | <summary>
320 | <span>Expand source code</span>
321 | </summary>
322 | <pre><code class="python">def parse_js_object(
323 |     string,
324 |     unicode_escape=False,
325 |     loader=json.loads,
326 |     loader_args=None,
327 |     loader_kwargs=None,
328 |     json_params=None,
329 | ):
330 |     &#34;&#34;&#34;
331 |     Extracts first JSON object encountered in the input string
332 | 
333 |     Parameters
334 |     ----------
335 |     string: str
336 |         Input string
337 | 
338 |     &gt;&gt;&gt; parse_js_object(&#34;{a: 100}&#34;)
339 |     {&#39;a&#39;: 100}
340 | 
341 |     unicode_escape: bool, optional
342 |         Attempt to fix input string if it contains escaped special characters
343 | 
344 |     &gt;&gt;&gt; parse_js_object(&#39;{\\\\&#34;a\\\\&#34;: 100}&#39;)
345 |     {&#39;\\\\&#34;a\\\\&#34;&#39;: 100}
346 |     &gt;&gt;&gt; parse_js_object(&#39;{\\\\&#34;a\\\\&#34;: 100}&#39;, unicode_escape=True)
347 |     {&#39;a&#39;: 100}
348 | 
349 |     loader: func, optional
350 |         Function used to load processed input data. By default `json.loads` is used
351 | 
352 |     &gt;&gt;&gt; import orjson
353 |     &gt;&gt;&gt; import chompjs
354 |     &gt;&gt;&gt; 
355 |     &gt;&gt;&gt; chompjs.parse_js_object(&#34;{&#39;a&#39;: 12}&#34;, loader=orjson.loads)
356 |     {&#39;a&#39;: 12}
357 | 
358 |     loader_args: list, optional
359 |         Allow passing down positional arguments to loader function
360 | 
361 |     loader_kwargs: dict, optional
362 |         Allow passing down keyword arguments to loader function
363 | 
364 |     &gt;&gt;&gt; parse_js_object(&#34;{&#39;a&#39;: 10.1}&#34;)
365 |     {&#39;a&#39;: 10.1}
366 |     &gt;&gt;&gt; import decimal
367 |     &gt;&gt;&gt; parse_js_object(&#34;{&#39;a&#39;: 10.1}&#34;, loader_kwargs={&#39;parse_float&#39;: decimal.Decimal})
368 |     {&#39;a&#39;: Decimal(&#39;10.1&#39;)}
369 | 
370 |     .. deprecated:: 1.3.0
371 |     json_params: dict, optional
372 |         Use `loader_kwargs` instead
373 | 
374 |     Returns
375 |     -------
376 |     list | dict
377 |         Extracted JSON object
378 | 
379 |     Raises
380 |     ------
381 |     ValueError
382 |         If failed to parse input properly
383 | 
384 |     ```python
385 |     &gt;&gt;&gt; parse_js_object(None)
386 |     Traceback (most recent call last):
387 |       ...
388 |     ValueError: Invalid input
389 |     &gt;&gt;&gt; parse_js_object(&#34;No JSON objects in sight...&#34;)
390 |     Traceback (most recent call last):
391 |       ...
392 |     json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
393 | 
394 |     ```
395 | 
396 |     &#34;&#34;&#34;
397 |     if not string:
398 |         raise ValueError(&#34;Invalid input&#34;)
399 | 
400 |     loader_args, loader_kwargs = _process_loader_arguments(
401 |         loader_args, loader_kwargs, json_params
402 |     )
403 | 
404 |     if json_params:
405 |         msg = &#34;json_params argument is deprecated, please use loader_kwargs instead&#34;
406 |         warnings.warn(msg, DeprecationWarning)
407 | 
408 |     string = _preprocess(string, unicode_escape)
409 |     parsed_data = parse(string)
410 |     return loader(parsed_data, *loader_args, **loader_kwargs)</code></pre>
411 | </details>
412 | </dd>
413 | <dt id="chompjs.parse_js_objects"><code class="name flex">
414 | <span>def <span class="ident">parse_js_objects</span></span>(<span>string, unicode_escape=False, omitempty=False, loader=&lt;function loads&gt;, loader_args=None, loader_kwargs=None, json_params=None)</span>
415 | </code></dt>
416 | <dd>
417 | <div class="desc"><p>Returns a generator extracting all JSON objects encountered in the input string.
418 | Can be used to read JSON Lines</p>
419 | <h2 id="parameters">Parameters</h2>
420 | <dl>
421 | <dt><strong><code>string</code></strong> :&ensp;<code>str</code></dt>
422 | <dd>Input string</dd>
423 | </dl>
424 | <pre><code class="language-python-repl">&gt;&gt;&gt; it = parse_js_objects(&quot;{a: 100} {b: 100}&quot;)
425 | &gt;&gt;&gt; next(it)
426 | {'a': 100}
427 | &gt;&gt;&gt; next(it)
428 | {'b': 100}
429 | </code></pre>
430 | <dl>
431 | <dt><strong><code>unicode_escape</code></strong> :&ensp;<code>bool</code>, optional</dt>
432 | <dd>Attempt to fix input string if it contains escaped special characters</dd>
433 | </dl>
434 | <pre><code class="language-python-repl">&gt;&gt;&gt; next(parse_js_objects('{\\&quot;a\\&quot;: 100}'))
435 | {'\\&quot;a\\&quot;': 100}
436 | &gt;&gt;&gt; next(parse_js_objects('{\\&quot;a\\&quot;: 100}', unicode_escape=True))
437 | {'a': 100}
438 | </code></pre>
439 | <dl>
440 | <dt><strong><code>omitempty</code></strong> :&ensp;<code>bool</code>, optional</dt>
441 | <dd>Skip empty dictionaries and lists</dd>
442 | </dl>
443 | <pre><code class="language-python-repl">&gt;&gt;&gt; list(parse_js_objects(&quot;{a: 12} {} {b: 13}&quot;))
444 | [{'a': 12}, {}, {'b': 13}]
445 | &gt;&gt;&gt; list(parse_js_objects(&quot;{a: 12} {} {b: 13}&quot;, omitempty=True))
446 | [{'a': 12}, {'b': 13}]
447 | </code></pre>
448 | <dl>
449 | <dt><strong><code>loader</code></strong> :&ensp;<code>func</code>, optional</dt>
450 | <dd>Function used to load processed input data. By default <code>json.loads</code> is used</dd>
451 | </dl>
452 | <pre><code class="language-python-repl">&gt;&gt;&gt; import orjson
453 | &gt;&gt;&gt; import chompjs
454 | &gt;&gt;&gt; 
455 | &gt;&gt;&gt; next(chompjs.parse_js_objects(&quot;{'a': 12}&quot;, loader=orjson.loads))
456 | {'a': 12}
457 | </code></pre>
458 | <dl>
459 | <dt><strong><code>loader_args</code></strong> :&ensp;<code>list</code>, optional</dt>
460 | <dd>Allow passing down positional arguments to loader function</dd>
461 | <dt><strong><code>loader_kwargs</code></strong> :&ensp;<code>dict</code>, optional</dt>
462 | <dd>Allow passing down keyword arguments to loader function</dd>
463 | </dl>
464 | <pre><code class="language-python-repl">&gt;&gt;&gt; next(parse_js_objects(&quot;{'a': 10.1}&quot;))
465 | {'a': 10.1}
466 | &gt;&gt;&gt; import decimal
467 | &gt;&gt;&gt; next(parse_js_objects(&quot;{'a': 10.1}&quot;, loader_kwargs={'parse_float': decimal.Decimal}))
468 | {'a': Decimal('10.1')}
469 | </code></pre>
470 | <div class="admonition deprecated">
471 | <p class="admonition-title">Deprecated since version:&ensp;1.3.0</p>
472 | </div>
473 | <dl>
474 | <dt><strong><code>json_params</code></strong> :&ensp;<code>dict</code>, optional</dt>
475 | <dd>Use <code>loader_kwargs</code> instead</dd>
476 | </dl>
477 | <h2 id="returns">Returns</h2>
478 | <dl>
479 | <dt><code>generator</code></dt>
480 | <dd>Iterating over it yields all encountered JSON objects</dd>
481 | </dl></div>
482 | <details class="source">
483 | <summary>
484 | <span>Expand source code</span>
485 | </summary>
486 | <pre><code class="python">def parse_js_objects(
487 |     string,
488 |     unicode_escape=False,
489 |     omitempty=False, 
490 |     loader=json.loads,
491 |     loader_args=None,
492 |     loader_kwargs=None,
493 |     json_params=None,
494 | ):
495 |     &#34;&#34;&#34;
496 |     Returns a generator extracting all JSON objects encountered in the input string.
497 |     Can be used to read JSON Lines
498 | 
499 |     Parameters
500 |     ----------
501 |     string: str
502 |         Input string
503 | 
504 |     &gt;&gt;&gt; it = parse_js_objects(&#34;{a: 100} {b: 100}&#34;)
505 |     &gt;&gt;&gt; next(it)
506 |     {&#39;a&#39;: 100}
507 |     &gt;&gt;&gt; next(it)
508 |     {&#39;b&#39;: 100}
509 | 
510 |     unicode_escape: bool, optional
511 |         Attempt to fix input string if it contains escaped special characters
512 | 
513 |     &gt;&gt;&gt; next(parse_js_objects(&#39;{\\\\&#34;a\\\\&#34;: 100}&#39;))
514 |     {&#39;\\\\&#34;a\\\\&#34;&#39;: 100}
515 |     &gt;&gt;&gt; next(parse_js_objects(&#39;{\\\\&#34;a\\\\&#34;: 100}&#39;, unicode_escape=True))
516 |     {&#39;a&#39;: 100}
517 | 
518 |     omitempty: bool, optional
519 |         Skip empty dictionaries and lists
520 | 
521 |     &gt;&gt;&gt; list(parse_js_objects(&#34;{a: 12} {} {b: 13}&#34;))
522 |     [{&#39;a&#39;: 12}, {}, {&#39;b&#39;: 13}]
523 |     &gt;&gt;&gt; list(parse_js_objects(&#34;{a: 12} {} {b: 13}&#34;, omitempty=True))
524 |     [{&#39;a&#39;: 12}, {&#39;b&#39;: 13}]
525 | 
526 |     loader: func, optional
527 |         Function used to load processed input data. By default `json.loads` is used
528 | 
529 |     &gt;&gt;&gt; import orjson
530 |     &gt;&gt;&gt; import chompjs
531 |     &gt;&gt;&gt; 
532 |     &gt;&gt;&gt; next(chompjs.parse_js_objects(&#34;{&#39;a&#39;: 12}&#34;, loader=orjson.loads))
533 |     {&#39;a&#39;: 12}
534 | 
535 |     loader_args: list, optional
536 |         Allow passing down positional arguments to loader function
537 | 
538 |     loader_kwargs: dict, optional
539 |         Allow passing down keyword arguments to loader function
540 | 
541 |     &gt;&gt;&gt; next(parse_js_objects(&#34;{&#39;a&#39;: 10.1}&#34;))
542 |     {&#39;a&#39;: 10.1}
543 |     &gt;&gt;&gt; import decimal
544 |     &gt;&gt;&gt; next(parse_js_objects(&#34;{&#39;a&#39;: 10.1}&#34;, loader_kwargs={&#39;parse_float&#39;: decimal.Decimal}))
545 |     {&#39;a&#39;: Decimal(&#39;10.1&#39;)}
546 | 
547 |     .. deprecated:: 1.3.0
548 |     json_params: dict, optional
549 |         Use `loader_kwargs` instead
550 | 
551 |     Returns
552 |     -------
553 |     generator
554 |         Iterating over it yields all encountered JSON objects
555 |     &#34;&#34;&#34;
556 | 
557 |     if not string:
558 |         return
559 | 
560 |     loader_args, loader_kwargs = _process_loader_arguments(
561 |         loader_args, loader_kwargs, json_params
562 |     )
563 | 
564 |     string = _preprocess(string, unicode_escape)
565 |     for raw_data in parse_objects(string):
566 |         try:
567 |             data = loader(raw_data, *loader_args, **loader_kwargs)
568 |         except ValueError:
569 |             continue
570 | 
571 |         if not data and omitempty:
572 |             continue
573 | 
574 |         yield data</code></pre>
575 | </details>
576 | </dd>
577 | </dl>
578 | </section>
579 | <section>
580 | </section>
581 | </article>
582 | <nav id="sidebar">
583 | <h1>Index</h1>
584 | <div class="toc">
585 | <ul></ul>
586 | </div>
587 | <ul id="index">
588 | <li><h3><a href="#header-functions">Functions</a></h3>
589 | <ul class="">
590 | <li><code><a title="chompjs.parse_js_object" href="#chompjs.parse_js_object">parse_js_object</a></code></li>
591 | <li><code><a title="chompjs.parse_js_objects" href="#chompjs.parse_js_objects">parse_js_objects</a></code></li>
592 | </ul>
593 | </li>
594 | </ul>
595 | </nav>
596 | </main>
597 | <footer id="footer">
598 | <p>Generated by <a href="https://pdoc3.github.io/pdoc" title="pdoc: Python API documentation generator"><cite>pdoc</cite> 0.10.0</a>.</p>
599 | </footer>
600 | </body>
601 | </html>
602 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # encoding: utf-8
 3 | 
 4 | from io import open
 5 | from os import path
 6 | from platform import system
 7 | from setuptools import setup, Extension
 8 | 
 9 | 
10 | this_directory = path.abspath(path.dirname(__file__))
11 | with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f:
12 |     long_description = f.read()
13 | 
14 | extra_compile_args = []
15 | extra_link_args = []
16 | if system() == 'Linux':
17 |     extra_compile_args = ['-Wl,-Bsymbolic-functions']
18 |     extra_link_args = ['-Wl,-Bsymbolic-functions']
19 | 
20 | chompjs_extension = Extension(
21 |     '_chompjs',
22 |     sources=['_chompjs/module.c', '_chompjs/parser.c', '_chompjs/buffer.c'],
23 |     extra_compile_args=extra_compile_args,
24 |     extra_link_args=extra_link_args,
25 | )
26 | 
27 | setup(
28 |     name='chompjs',
29 |     version='1.3.2',
30 |     description='Parsing JavaScript objects into Python dictionaries',
31 |     author='Mariusz Obajtek',
32 |     author_email='nykakin@gmail.com',
33 |     keywords='parsing parser JavaScript json json5 webscrapping',
34 |     python_requires='>=3.8',
35 |     ext_modules=[chompjs_extension],
36 |     classifiers=[
37 |         "Programming Language :: Python :: 3",
38 |         "Programming Language :: JavaScript",
39 |         "Intended Audience :: Developers",
40 |         "License :: OSI Approved :: MIT License",
41 |         "Operating System :: OS Independent",
42 |         "Topic :: Software Development :: Libraries :: Python Modules",
43 |         "Topic :: Text Processing :: General",
44 |         "Topic :: Text Processing :: Linguistic",
45 |         "Development Status :: 5 - Production/Stable",
46 |         "Environment :: Console",
47 |         "Environment :: Web Environment",
48 |     ],
49 |     url='https://github.com/Nykakin/chompjs',
50 |     long_description=long_description,
51 |     long_description_content_type='text/markdown',
52 |     include_package_data=True,
53 |     packages=['chompjs'],
54 | )
55 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py39,py310,py311,py312,py313
3 | 
4 | [testenv]
5 | deps = orjson
6 | commands =
7 |     python -m unittest discover
8 |     python -m doctest chompjs/chompjs.py
9 | 


--------------------------------------------------------------------------------