├── .github └── workflows │ └── semgrep.yml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── bench.lua ├── demo.c ├── json_decoder.lua ├── ljson_parser.h ├── mempool.c ├── mempool.h ├── parse_array.c ├── parse_hashtab.c ├── parser.c ├── parser.h ├── scan_fp.h ├── scan_fp_relax.c ├── scan_fp_strict.c ├── scaner.c ├── scaner.h ├── tests ├── Makefile ├── test.lua ├── test_cmp.lua ├── test_spec │ ├── test_composite.txt │ ├── test_diagnostic.txt │ ├── test_misc.txt │ └── test_token.txt ├── test_util.cxx ├── test_util.h └── unit_test.cxx └── util.h /.github/workflows/semgrep.yml: -------------------------------------------------------------------------------- 1 | on: 2 | pull_request: {} 3 | workflow_dispatch: {} 4 | push: 5 | branches: 6 | - main 7 | - master 8 | schedule: 9 | - cron: '0 0 * * *' 10 | name: Semgrep config 11 | jobs: 12 | semgrep: 13 | name: semgrep/ci 14 | runs-on: ubuntu-latest 15 | env: 16 | SEMGREP_APP_TOKEN: ${{ secrets.SEMGREP_APP_TOKEN }} 17 | SEMGREP_URL: https://cloudflare.semgrep.dev 18 | SEMGREP_APP_URL: https://cloudflare.semgrep.dev 19 | SEMGREP_VERSION_CHECK_URL: https://cloudflare.semgrep.dev/api/check-version 20 | container: 21 | image: returntocorp/semgrep 22 | steps: 23 | - uses: actions/checkout@v4 24 | - run: semgrep ci 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.d 3 | *.so 4 | *.dylib 5 | callgrind.* 6 | .gdb_history 7 | tags 8 | demo 9 | dep.txt 10 | tests/unit_test 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, yangshuxin 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
24 | 25 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ################################################################# 2 | # 3 | # Binaries we are going to build, and its source code 4 | # 5 | ################################################################# 6 | # 7 | OS := $(shell uname) 8 | 9 | SRC := mempool.c scaner.c parse_array.c parse_hashtab.c parser.c scan_fp_strict.c scan_fp_relax.c 10 | OBJ := $(SRC:.c=.o) 11 | 12 | DEMO := demo 13 | 14 | ifeq ($(OS), Darwin) 15 | C_SO_NAME := libljson.dylib 16 | else 17 | C_SO_NAME := libljson.so 18 | endif 19 | 20 | ################################################################# 21 | # 22 | # Compile and link flags 23 | # 24 | ################################################################# 25 | # 26 | CFLAGS := -Wall -O3 -flto -g -DFP_RELAX=0 #-DDEBUG 27 | THE_CFLAGS := $(CFLAGS) -fPIC -MMD -fvisibility=hidden 28 | # --build-id is a GNU ld option, so it is only added for Linux below. 29 | ifeq ($(OS), Linux) 30 | THE_CFLAGS := $(THE_CFLAGS) -Wl,--build-id 31 | endif 32 | 33 | ################################################################# 34 | # 35 | # Installation flags 36 | # 37 | ################################################################# 38 | # 39 | PREFIX := /usr/local 40 | LUA_VERSION = 5.1 41 | SO_TARGET_DIR := $(PREFIX)/lib/lua/$(LUA_VERSION) 42 | LUA_TARGET_DIR := $(PREFIX)/share/lua/$(LUA_VERSION)/ 43 | 44 | ################################################################# 45 | # 46 | # Make recipes 47 | # 48 | ################################################################# 49 | # 50 | .PHONY : all test clean install 51 | 52 | all : $(C_SO_NAME) $(DEMO) 53 | 54 | -include dep.txt 55 | 56 | ${OBJ} : %.o : %.c 57 | $(CC) $(THE_CFLAGS) -DBUILDING_SO -c $< 58 | 59 | ${C_SO_NAME} : ${OBJ} 60 | $(CC) $(THE_CFLAGS) -DBUILDING_SO $^ -shared -o $@ 61 | cat *.d > dep.txt 62 | 63 | demo : ${C_SO_NAME} demo.o 64 | $(CC)
$(THE_CFLAGS) -Wl,-rpath,. demo.o -L. -lljson -o $@ 65 | 66 | test : 67 | $(MAKE) -C tests 68 | 69 | clean:; rm -f *.o *.so a.out *.d dep.txt demo 70 | 71 | install: 72 | install -D -m 755 $(C_SO_NAME) $(DESTDIR)/$(SO_TARGET_DIR)/$(C_SO_NAME) 73 | install -D -m 664 json_decoder.lua $(DESTDIR)/$(LUA_TARGET_DIR)/json_decoder.lua 74 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | lua-resty-json 2 | ============== 3 | 4 | JSON library for Lua and C. The C interface is described by `ljson_parser.h`, 5 | while the Lua interface is implemented by `json_decoder.lua`. The Lua 6 | interface is built on top of the C implementation, and it's implemented 7 | using FFI instead of the Lua C-API. 8 | 9 | Following is an example of Lua usage: 10 | ```lua 11 | local ljson_decoder = require 'json_decoder' 12 | local instance = ljson_decoder.new() 13 | local result, err = instance:decode(line) 14 | ``` 15 | Performance 16 | ----------- 17 | 18 | As of this writing, I have compared this work against cjson using 19 | a few real-world json strings. For string-array intensive jsons, our decoder 20 | is normally 30% - 50% ahead of cjson, while for hash-table intensive 21 | input, we are only 10-30% better. In an extreme example where there is 22 | a super long string, we see a 5X speedup. The performance is measured with 23 | luajit 2.1. 24 | 25 | So far we have paid lots of attention to string handling, and did not get a chance to 26 | improve the following aspects: 27 | - Parse floating point numbers quickly. So far we rely on `strtod()` 28 | to do the dirty job. Unfortunately, `strtod()` seems to be pretty 29 | slow. 30 | 31 | - Efficiently skip white-spaces between tokens. 32 | 33 | - More efficient memory allocation.
We are currently using `mempool`, 34 | which allocates a big chunk; the subsequent memory allocation requests 35 | are served by carving blocks out of the chunk. It works pretty well 36 | for small to medium-sized `JSON` input (say under `100k` in size); 37 | however, the memory allocation overhead is still high (primarily due to 38 | the cost of allocating big chunks) for big `JSON`s. 39 | 40 | Floating Point Number 41 | -------------------- 42 | The way we handle the following situations may not be what you expect, but 43 | the `JSON` SPEC does not seem to articulate how to handle these situations 44 | either. 45 | 46 | - The literal `-0` is interpreted as integer `0`, instead of floating point 47 | `0.0`. 48 | 49 | - `-0.0` is interpreted as floating point `-0.0`. 50 | - If a literal is beyond the range of double-precision, we consider it 51 | as overflow/underflow; we do not try to represent the literal using 52 | `long double` or `quadruple`. 53 | 54 | - We rely on `strtod()` to parse literals (in strict mode), which, I guess, 55 | uses the default rounding mode or the mode designated by the application 56 | which calls the decoder. 57 | 58 | - We try to represent literals as signed 64-bit integers whenever possible. 59 | But numbers like `1E6` are still represented as floating point, as we 60 | currently rely on `strtod()` for handling scientific notation. 61 | 62 | TODO 63 | ---- 64 | - Continue to improve floating point parsing. 65 | - Improve testing, and add more testing. 66 | - Improve hashtab parsing performance (I have hardly had a chance to 67 | tune its performance as of writing this comment).
68 | -------------------------------------------------------------------------------- /bench.lua: -------------------------------------------------------------------------------- 1 | local cjson = require "cjson" 2 | local ljson_decoder = require 'json_decoder' 3 | local f, err = io.open("bench.json", "r") 4 | 5 | local iter = 100000 6 | --local iter = 3 7 | local instance = ljson_decoder.new() 8 | 9 | for line in f:lines() do 10 | local begin = os.clock() 11 | for i = 1, iter do 12 | local result, err = instance:decode(line) 13 | end 14 | 15 | local t1 = os.clock() - begin 16 | begin = os.clock() 17 | for i = 1, iter do 18 | local t = cjson.decode(line) 19 | end 20 | local t2 = os.clock() - begin 21 | 22 | print(t1, t2, (t2-t1)/t2 * 100) 23 | end 24 | -------------------------------------------------------------------------------- /demo.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "ljson_parser.h" 10 | 11 | char* load_json(const char* file_path, size_t* len) { 12 | struct stat buf; 13 | if (stat(file_path, &buf)) { 14 | perror("stat"); 15 | exit(1); 16 | } 17 | 18 | if (!S_ISREG(buf.st_mode)) { 19 | fprintf(stderr, "not regular file"); 20 | exit(1); 21 | } 22 | 23 | size_t file_len = buf.st_size; 24 | 25 | int fd = open(file_path, 0); 26 | if (fd == -1) { 27 | perror("open"); 28 | exit(1); 29 | } 30 | 31 | char *payload = malloc(file_len); 32 | if (payload == NULL) { 33 | perror("malloc"); 34 | exit(1); 35 | } 36 | 37 | if (read(fd, payload, file_len) != file_len) { 38 | perror("read"); 39 | exit(1); 40 | } 41 | 42 | close(fd); 43 | 44 | *len = file_len; 45 | return payload; 46 | } 47 | 48 | int 49 | main (int argc, char** argv) { 50 | if (argc != 2) { 51 | fprintf(stderr, "usage: argv[0] json-file\n"); 52 | return 1; 53 | } 54 | 55 | size_t len; 56 | char* json = load_json(argv[1], &len); 57 | 58 | struct json_parser* 
jp = jp_create(); 59 | if (!jp) { 60 | fprintf(stderr, "WTF\n"); 61 | return 1; 62 | } 63 | 64 | #if 1 65 | int i = 0; 66 | int ret = 0; 67 | for (; i < 10000; i++) { 68 | if (jp_parse(jp, json, len) == NULL) { 69 | ret = 1; 70 | fprintf(stderr, "parsing failed: %s\n", jp_get_err(jp)); 71 | break; 72 | } 73 | } 74 | #else 75 | obj_t* obj = jp_parse(jp, json, len); 76 | if (obj) { 77 | dump_obj(stderr, obj); 78 | } else { 79 | fprintf(stderr, "err: %s\n", jp_get_err(jp)); 80 | } 81 | #endif 82 | jp_destroy(jp); 83 | free(json); 84 | return ret; 85 | } 86 | -------------------------------------------------------------------------------- /json_decoder.lua: -------------------------------------------------------------------------------- 1 | -- Helper wrappring script for loading shared object libac.so (FFI interface) 2 | -- from package.cpath instead of LD_LIBRARTY_PATH. 3 | -- 4 | 5 | local ffi = require 'ffi' 6 | ffi.cdef[[ 7 | typedef enum { 8 | OT_INT64, 9 | OT_FP, 10 | OT_STR, 11 | OT_BOOL, 12 | OT_NULL, 13 | OT_LAST_PRIMITIVE = OT_NULL, 14 | OT_HASHTAB, 15 | OT_ARRAY, 16 | OT_ROOT /* type of dummy object introduced during parsing process */ 17 | } obj_ty_t; 18 | 19 | struct obj_tag; 20 | typedef struct obj_tag obj_t; 21 | 22 | struct obj_tag { 23 | obj_t* next; 24 | int32_t obj_ty; 25 | union { 26 | int32_t str_len; 27 | int32_t elmt_num; /* # of element of array/hashtab */ 28 | }; 29 | }; 30 | 31 | /* primitive object */ 32 | typedef struct { 33 | obj_t common; 34 | union { 35 | char* str_val; 36 | int64_t int_val; 37 | double db_val; 38 | }; 39 | } obj_primitive_t; 40 | 41 | struct obj_composite_tag; 42 | typedef struct obj_composite_tag obj_composite_t; 43 | struct obj_composite_tag { 44 | obj_t common; 45 | obj_t* subobjs; 46 | obj_composite_t* reverse_nesting_order; 47 | uint32_t id; 48 | }; 49 | 50 | struct json_parser; 51 | 52 | /* Export functions */ 53 | struct json_parser* jp_create(void); 54 | obj_t* jp_parse(struct json_parser*, const char* json, 
uint32_t len); 55 | const char* jp_get_err(struct json_parser*); 56 | void jp_destroy(struct json_parser*); 57 | ]] 58 | 59 | local cobj_ptr_t = ffi.typeof("obj_composite_t*") 60 | local pobj_ptr_t = ffi.typeof("obj_primitive_t*") 61 | local obj_ptr_t = ffi.typeof("obj_t*") 62 | 63 | local ffi_cast = ffi.cast 64 | local ffi_string = ffi.string 65 | 66 | local _M = {} 67 | local ok, tab_new = pcall(require, "table.new") 68 | if not ok then 69 | tab_new = function (narr, nrec) return {} end 70 | end 71 | 72 | local jp_lib 73 | 74 | --[[ Find the shared object file via package.cpath, obviating the need of setting 75 | LD_LIBRARY_PATH. Returns the path of the first match, or nil if none is found. 76 | ]] 77 | local function find_shared_obj(cpath, so_name) 78 | local string_gmatch = string.gmatch 79 | local string_match = string.match 80 | local io_open = io.open 81 | 82 | for k in string_gmatch(cpath, "[^;]+") do 83 | local so_path = string_match(k, "(.*/)") or "" -- template without a directory part 84 | so_path = so_path .. so_name 85 | 86 | -- Don't get me wrong, the only way to know if a file exists is trying 87 | -- to open it.
88 | local f = io_open(so_path) 89 | if f ~= nil then 90 | io.close(f) 91 | return so_path 92 | end 93 | end 94 | end 95 | -- Load the parser library found on package.cpath and cache it in jp_lib; returns nil if not found. 96 | local function load_json_parser() 97 | if jp_lib ~= nil then 98 | return jp_lib 99 | else 100 | local so_path = find_shared_obj(package.cpath, ffi.os == "OSX" and "libljson.dylib" or "libljson.so") 101 | if so_path ~= nil then 102 | jp_lib = ffi.load(so_path) 103 | return jp_lib 104 | end 105 | end 106 | end 107 | 108 | function _M.create() 109 | end 110 | 111 | local ty_int64 = 0 112 | local ty_fp = 1 113 | local ty_str = 2 114 | local ty_bool = 3 115 | local ty_null = 4 116 | local ty_last_primitive = 4 117 | local ty_hashtab = 5 118 | local ty_array= 6 119 | 120 | local create_primitive 121 | local create_array 122 | local create_hashtab 123 | local convert_obj 124 | local tonumber = tonumber 125 | -- Convert a primitive C object to a Lua value; returns nil plus a message for an unknown type. 126 | create_primitive = function(obj) 127 | local ty = obj.common.obj_ty 128 | if ty == ty_int64 then 129 | return tonumber(obj.int_val) 130 | elseif ty == ty_str then 131 | return ffi_string(obj.str_val, obj.common.str_len) 132 | elseif ty == ty_null then 133 | return nil 134 | elseif ty == ty_bool then 135 | if obj.int_val == 0 then 136 | return false 137 | else 138 | return true 139 | end 140 | elseif ty == ty_fp then 141 | return tonumber(obj.db_val) 142 | end 143 | 144 | return nil, "Unknown primitive type" 145 | end 146 | 147 | create_array = function(array, cobj_array) 148 | local elmt_num = array.common.elmt_num 149 | local elmt_list = array.subobjs 150 | 151 | -- HINT: The representation of an array-obj [e1, e2,..., en] 152 | -- is en->...->e2->e1 153 | local result = tab_new(elmt_num, 0) 154 | for iter = 1, elmt_num do 155 | local elmt = elmt_list 156 | 157 | local elmt_obj 158 | if elmt.obj_ty <= ty_last_primitive then 159 | local err; 160 | elmt_obj, err = create_primitive(ffi_cast(pobj_ptr_t, elmt)) 161 | if err then 162 | return nil, err 163 | end 164 | else 165 | local cobj = ffi_cast(cobj_ptr_t, elmt); 166 | elmt_obj = cobj_array[cobj.id + 1] 167 | end 168 | 169 | result[elmt_num -
iter + 1] = elmt_obj 170 | elmt_list = elmt_list.next 171 | end 172 | 173 | cobj_array[array.id + 1] = result 174 | 175 | return result; 176 | end 177 | 178 | create_hashtab = function(hashtab, cobj_array) 179 | local elmt_num = hashtab.common.elmt_num 180 | local elmt_list = hashtab.subobjs 181 | 182 | -- HINT: The representation of a hash-obj {k1:v1,...,kn:vn} 183 | -- is vn->kn->...->v1->k1, so each value is followed by its key. 184 | local result = tab_new(0, elmt_num / 2) 185 | for _ = 1, elmt_num, 2 do 186 | local val = elmt_list 187 | elmt_list = elmt_list.next 188 | 189 | local key = ffi_cast(pobj_ptr_t, elmt_list) 190 | local key_obj = ffi_string(key.str_val, key.common.str_len) 191 | 192 | local val_obj = nil 193 | if val.obj_ty <= ty_last_primitive then 194 | local err; 195 | val_obj, err = create_primitive(ffi_cast(pobj_ptr_t, val)) 196 | if err then 197 | return nil, err 198 | end 199 | else 200 | local cobj = ffi_cast(cobj_ptr_t, val); 201 | val_obj = cobj_array[cobj.id + 1] 202 | end 203 | 204 | result[key_obj] = val_obj; 205 | elmt_list = elmt_list.next 206 | end 207 | 208 | cobj_array[hashtab.id + 1] = result 209 | 210 | return result 211 | end 212 | 213 | convert_obj = function(obj, cobj_array) 214 | local ty = obj.obj_ty 215 | if ty <= ty_last_primitive then 216 | return create_primitive(ffi_cast(pobj_ptr_t, obj)) 217 | elseif ty == ty_array then 218 | return create_array(ffi_cast(cobj_ptr_t, obj), cobj_array) 219 | else 220 | return create_hashtab(ffi_cast(cobj_ptr_t, obj), cobj_array) 221 | end 222 | end 223 | 224 | -- Create an array big enough to accommodate elmt_num + 2 elements. 225 | -- If cobj_vect is big enough and its capacity is under 400, return it; otherwise, create a new one.
226 | local function create_cobj_vect(cobj_vect, elmt_num) 227 | local array_size = elmt_num + 2 228 | local cap = cobj_vect[0] 229 | if cap < 400 and cap >= array_size then 230 | return cobj_vect 231 | end 232 | 233 | cobj_vect = tab_new(array_size, 1) 234 | cobj_vect[0] = array_size 235 | return cobj_vect 236 | end 237 | 238 | -- Set each element to nil, such that they can be GC-ed ASAP. 239 | local function clean_cobj_vect(cobj_vect, elmt_num) 240 | for iter = 1, elmt_num + 2 do 241 | cobj_vect[iter] = nil 242 | end 243 | end 244 | 245 | -- ######################################################################### 246 | -- 247 | -- "Export" functions 248 | -- 249 | -- ######################################################################### 250 | local setmetatable = setmetatable 251 | local mt = { __index = _M } 252 | -- Create a decoder instance; returns nil plus an error message on failure. 253 | function _M.new() 254 | if not jp_lib then 255 | load_json_parser() 256 | end 257 | 258 | if not jp_lib then 259 | return nil, "fail to load libljson.so" 260 | end 261 | 262 | local parser_inst = jp_lib.jp_create() 263 | if parser_inst ~= nil then 264 | ffi.gc(parser_inst, jp_lib.jp_destroy) 265 | else 266 | return nil, "Fail to create JSON parser, likely due to OOM" 267 | end 268 | 269 | local cobj_vect = tab_new(100, 1) 270 | if cobj_vect then 271 | cobj_vect[0] = 100 272 | else 273 | return nil, "fail to create intermediate array" 274 | end 275 | 276 | local self = { 277 | cobj_vect = cobj_vect, 278 | parser = parser_inst 279 | } 280 | 281 | return setmetatable(self, mt) 282 | end 283 | 284 | function _M.decode(self, json) 285 | --[[ 286 | if not self then 287 | return nil, "JSON parser was not initialized properly" 288 | end]] 289 | 290 | local objs = jp_lib.jp_parse(self.parser, json, #json) 291 | if objs == nil then 292 | return nil, ffi_string(jp_lib.jp_get_err(self.parser)) 293 | end 294 | 295 | local ty = objs.obj_ty 296 | if ty <= ty_last_primitive then 297 | return convert_obj(objs) 298 | end 299 | 300 | local composite_objs =
ffi_cast(cobj_ptr_t, objs) 301 | local elmt_num = composite_objs.id 302 | local cobj_vect = create_cobj_vect(self.cobj_vect, elmt_num) 303 | self.cobj_vect = cobj_vect 304 | 305 | local last_val 306 | repeat 307 | last_val = convert_obj(ffi_cast(obj_ptr_t, composite_objs), cobj_vect) 308 | composite_objs = composite_objs.reverse_nesting_order 309 | until composite_objs == nil 310 | 311 | clean_cobj_vect(cobj_vect, elmt_num) 312 | 313 | return last_val 314 | end 315 | 316 | -- return: 317 | -- 1). array of strings in the input JSON 318 | -- 2). error message if error occur 319 | -- 320 | -- 1) could be nil if no string at all is found, if 1) is non-nil 321 | -- element with index 0 is the size of the array 322 | -- 323 | function _M.get_strings(self, json) 324 | 325 | -- step 1: decode the input JSON 326 | local objs = jp_lib.jp_parse(self.parser, json, #json) 327 | if objs == nil then 328 | return nil, ffi_string(jp_lib.jp_get_err(self.parser)) 329 | end 330 | 331 | local ty = objs.obj_ty 332 | if ty <= ty_last_primitive then 333 | -- The enclosing object must be either a hashtab or array 334 | return nil, "malformed JSON" 335 | end 336 | 337 | local composite_objs = ffi_cast(cobj_ptr_t, objs) 338 | local str_count = 0 339 | 340 | -- step 2: count the number of strings 341 | repeat 342 | local elmt_num = composite_objs.common.elmt_num 343 | local elmt_list = composite_objs.subobjs 344 | 345 | -- go through all element 346 | for iter = 1, elmt_num do 347 | local elmt = elmt_list 348 | elmt_list = elmt_list.next 349 | 350 | if elmt.obj_ty == ty_str then 351 | str_count = str_count + 1 352 | end 353 | end 354 | composite_objs = composite_objs.reverse_nesting_order 355 | until composite_objs == nil 356 | 357 | if str_count == 0 then 358 | return 359 | end 360 | 361 | -- step 3: collect all strings 362 | local str_array = tab_new(str_count, 1) 363 | composite_objs = ffi_cast(cobj_ptr_t, objs) 364 | local idx = 1 365 | 366 | repeat 367 | local elmt_num = 
composite_objs.common.elmt_num 368 | local elmt_list = composite_objs.subobjs 369 | 370 | -- go through all elements 371 | for iter = 1, elmt_num do 372 | local elmt = elmt_list 373 | elmt_list = elmt_list.next 374 | 375 | if elmt.obj_ty == ty_str then 376 | elmt = ffi_cast(pobj_ptr_t, elmt) 377 | str_array[idx] = ffi_string(elmt.str_val, elmt.common.str_len) 378 | idx = idx + 1 379 | end 380 | end 381 | composite_objs = composite_objs.reverse_nesting_order 382 | until composite_objs == nil 383 | 384 | str_array[0] = str_count; 385 | 386 | return str_array 387 | end 388 | 389 | -- ######################################################################### 390 | -- 391 | -- Debugging and Misc 392 | -- 393 | -- ######################################################################### 394 | local print_primitive 395 | local print_table 396 | local print_var 397 | local print = print 398 | local string_format = string.format 399 | local tostring = tostring 400 | local pairs = pairs 401 | local type = type 402 | local io_write = io.write 403 | 404 | print_primitive = function(luadata) 405 | if type(luadata) == "string" then 406 | io_write(string_format("\"%s\"", luadata)) 407 | else 408 | io_write(tostring(luadata)) 409 | end 410 | end 411 | 412 | print_table = function(array) 413 | io_write("{"); 414 | local elmt_num = 0 415 | for k, v in pairs(array) do 416 | if elmt_num > 0 then 417 | io_write(", ") 418 | end 419 | 420 | print_primitive(k) 421 | io_write(":") 422 | print_var(v) 423 | elmt_num = elmt_num + 1 424 | end 425 | io_write("}"); 426 | end 427 | print_var = function(var) 428 | if type(var) == "table" then 429 | print_table(var) 430 | else 431 | print_primitive(var) 432 | end 433 | end 434 | 435 | function _M.debug(luadata) 436 | print_var(luadata) 437 | print("") 438 | end 439 | 440 | return _M 441 | -------------------------------------------------------------------------------- /ljson_parser.h: 
-------------------------------------------------------------------------------- 1 | /* ************************************************************************** 2 | * 3 | * This file declares the json parser interface. All the exported functions are 4 | * self-descriptive except jp_parse(), which is a bit involved in its 5 | * return value. The return value of jp_parse() is a singly-linked list 6 | * of composite objects chained together in a reverse-nesting order. 7 | * 8 | * Use the same running example we use in parser.c. Suppose the input json 9 | * is: [1, 2, {"key": 3.4}]. Let object Obj2 be {"key": 3.4}, and Obj1 be 10 | * [1, 2, Obj2]. The return-value would be: 11 | * 12 | * Obj2 -> Obj1. (linked via obj_composite_t::reverse_nesting_order) 13 | * 14 | * Note that the out-most composite object, in this case, the array Obj1, 15 | * is at the end of the resulting list. The rationale for reverse-nesting 16 | * order is that re-constructing the nesting relationship can be done 17 | * simply by iterating the list only once. 18 | * 19 | * The elements of a composite object are pointed to by obj_composite_t::subobjs. 20 | * Again, the order of the elements is *reversed* as well.
In general, if 21 | * the original JSON object is an array [e1, e2, ..., en], the representation 22 | * of the array is the list: en->...->e2->e1; if the original JSON object is a 23 | * hash-table { k1:v1, k2:v2, ..., kn:vn}, the representation would be 24 | * vn->kn->....->v2->k2->v1->k1 25 | * 26 | * ************************************************************************** 27 | **/ 28 | #ifndef LUA_JSON_PASER_H 29 | #define LUA_JSON_PASER_H 30 | 31 | #ifdef __cplusplus 32 | extern "C" { 33 | #endif 34 | 35 | #include <stdint.h> 36 | #include <stdio.h> 37 | 38 | typedef enum { 39 | OT_INT64, 40 | OT_FP, 41 | OT_STR, 42 | OT_BOOL, 43 | OT_NULL, 44 | OT_LAST_PRIMITIVE = OT_NULL, 45 | OT_HASHTAB, 46 | OT_ARRAY, 47 | OT_ROOT /* type of dummy object introduced during parsing process */ 48 | } obj_ty_t; 49 | 50 | /* Data structure shared both by composite and primitive objects. 51 | * In our context, a JSON array (object in the form of [elmt1, ... ]) and 52 | * a hash-tab (in the form of {"key":value, ... }) are called composite objects, 53 | * while number/string/null/boolean are called primitive objects.
54 | */ 55 | struct obj_tag; 56 | typedef struct obj_tag obj_t; 57 | 58 | struct obj_tag { 59 | obj_t* next; /* next element in the enclosing composite's element list */ 60 | int32_t obj_ty; /* one of obj_ty_t */ 61 | union { 62 | int32_t str_len; 63 | int32_t elmt_num; /* # of element of array/hashtab */ 64 | }; 65 | }; 66 | 67 | /* primitive object */ 68 | typedef struct { 69 | obj_t common; /* Must be the 1st field */ 70 | union { 71 | char* str_val; 72 | int64_t int_val; 73 | double db_val; 74 | }; 75 | } obj_primitive_t; 76 | 77 | /* composite object */ 78 | struct obj_composite_tag; 79 | typedef struct obj_composite_tag obj_composite_t; 80 | struct obj_composite_tag { 81 | obj_t common; /* Must be the 1st field */ 82 | obj_t* subobjs; /* element list, in reversed order */ 83 | obj_composite_t* reverse_nesting_order; /* next composite in the list returned by jp_parse() */ 84 | uint32_t id; 85 | }; 86 | 87 | struct json_parser; 88 | 89 | #ifdef BUILDING_SO 90 | #ifndef __APPLE__ 91 | #define LJP_EXPORT __attribute__ ((visibility ("protected"))) 92 | #else 93 | /* OSX does not support protected visibility */ 94 | #define LJP_EXPORT __attribute__ ((visibility ("default"))) 95 | #endif 96 | #else 97 | #define LJP_EXPORT 98 | #endif 99 | 100 | /* ************************************************************************** 101 | * 102 | * Export Functions 103 | * 104 | * ************************************************************************** 105 | */ 106 | struct json_parser* jp_create(void) LJP_EXPORT; 107 | void jp_destroy(struct json_parser*) LJP_EXPORT; 108 | 109 | /* Parse the given json of "len" bytes, and return the resulting object 110 | * corresponding to the input json. In the event of error, NULL is returned. See the 111 | * above comment for details. 112 | */ 113 | obj_t* jp_parse(struct json_parser*, const char* json, uint32_t len) LJP_EXPORT; 114 | 115 | /* Get the error message. Do not call this function if jp_parse() returns a 116 | * non-NULL pointer.
117 | */ 118 | const char* jp_get_err(struct json_parser*) LJP_EXPORT; 119 | 120 | /* Dump the result returned from jp_parse() */ 121 | void dump_obj(FILE*, obj_t*) LJP_EXPORT; 122 | 123 | #ifdef __cplusplus 124 | } 125 | #endif 126 | 127 | #endif 128 | -------------------------------------------------------------------------------- /mempool.c: -------------------------------------------------------------------------------- 1 | #include <stdint.h> 2 | #include <stdlib.h> 3 | #include <unistd.h> 4 | 5 | #include "mempool.h" 6 | #include "util.h" 7 | 8 | #define MAX_ALIGN 32 9 | /* Default chunk size: the system page size minus 64 bytes, computed once and cached. */ 10 | static int 11 | default_chunk_sz() { 12 | static int page_sz; 13 | if (!page_sz) { 14 | page_sz = sysconf(_SC_PAGESIZE); 15 | /* Adjust by malloc overhead in an attempt to fit the chunk in a page */ 16 | page_sz -= 64; 17 | } 18 | return page_sz; 19 | } 20 | /* Round the chunk's free pointer UP to a multiple of "align" (a power of 2), so it never moves back into the header. */ 21 | static void 22 | align_free_pointer(chunk_hdr_t* chunk_hdr, int align) { 23 | char* p = chunk_hdr->free; 24 | p = (char*)(((uintptr_t)p + (uintptr_t)(align - 1)) & ~(uintptr_t)(align - 1)); 25 | chunk_hdr->free = p; 26 | } 27 | 28 | /* Allocate a chunk which can accommodate an object of given size. If size 29 | * is not specified (i.e. size = 0), default size is used. Return NULL if malloc() fails. 30 | */ 31 | static chunk_hdr_t* 32 | alloc_chunk(int size) { 33 | int s = default_chunk_sz(); 34 | if (!size) 35 | size = s; 36 | else { 37 | size += sizeof(mempool_t) + MAX_ALIGN; 38 | if (size < s) 39 | size = s; 40 | } 41 | 42 | char* blk = (char*) malloc(size); if (!blk) return NULL; /* callers check for NULL */ 43 | chunk_hdr_t* chunk_hdr = (chunk_hdr_t*)blk; 44 | chunk_hdr->next = NULL; 45 | chunk_hdr->chunk_end = blk + size; 46 | chunk_hdr->free = blk + sizeof(chunk_hdr_t); 47 | align_free_pointer(chunk_hdr, DEFAULT_ALIGN); 48 | 49 | return chunk_hdr; 50 | } 51 | 52 | /* Allocate a new chunk and add it to the mempool, return 1 on success, 53 | * 0 otherwise.
54 | */ 55 | static int 56 | add_a_chunk(mempool_t* mp, int size) { 57 | chunk_hdr_t* new_chunk = alloc_chunk(size); 58 | if (!new_chunk) 59 | return 0; 60 | 61 | if (!mp->chunk_hdr.next) { 62 | ASSERT(mp->last == &mp->chunk_hdr); 63 | mp->chunk_hdr.next = new_chunk; 64 | } 65 | 66 | mp->last->next = new_chunk; 67 | mp->last = new_chunk; 68 | ASSERT(new_chunk->chunk_end - new_chunk->free >= size); 69 | 70 | return 1; 71 | } 72 | /* Create a mempool; the mempool_t lives at the start of its first chunk. Return NULL if that chunk cannot be allocated. */ 73 | mempool_t* 74 | mp_create() { 75 | chunk_hdr_t* chunk_hdr = alloc_chunk(0); 76 | if (!chunk_hdr) 77 | return NULL; 78 | 79 | chunk_hdr->free = sizeof(struct mempool) + (char*)(void*)chunk_hdr; 80 | align_free_pointer(chunk_hdr, DEFAULT_ALIGN); 81 | 82 | mempool_t* mp = (mempool_t*)(void*)chunk_hdr; 83 | mp->last = chunk_hdr; 84 | 85 | return mp; 86 | } 87 | 88 | /* The slow-path of mp_alloc(): add a chunk for "size" bytes, then retry; NULL if no chunk can be added. */ 89 | void* 90 | mp_alloc_slow(mempool_t* mp, int size) { 91 | if (unlikely(add_a_chunk(mp, size) == 0)) 92 | return NULL; 93 | return mp_alloc(mp, size); 94 | } 95 | /* Free every chunk, including the first one, which holds the mempool_t itself. */ 96 | void 97 | mp_destroy(mempool_t* mp) { 98 | chunk_hdr_t* iter = mp->chunk_hdr.next; 99 | while (iter) { 100 | chunk_hdr_t* next = iter->next; 101 | free((void*)iter); 102 | iter = next; 103 | }; 104 | 105 | free((void*)mp); 106 | } 107 | 108 | /* Free all blocks allocated so far: release every chunk but the first, and reset the first chunk's free pointer. */ 109 | void 110 | mp_free_all(mempool_t* mp) { 111 | chunk_hdr_t* iter; 112 | 113 | for (iter = mp->chunk_hdr.next; iter != 0;) { 114 | chunk_hdr_t* next = iter->next; 115 | free((void*)iter); 116 | iter = next; 117 | } 118 | 119 | chunk_hdr_t* chunk = &mp->chunk_hdr; 120 | chunk->next = 0; 121 | mp->last = chunk; 122 | 123 | chunk->free = sizeof(mempool_t) + (char*)(void*)chunk; 124 | align_free_pointer(chunk, DEFAULT_ALIGN); 125 | } 126 | -------------------------------------------------------------------------------- /mempool.h: -------------------------------------------------------------------------------- 1 | /* **************************************************************************** 2 | * 3
| * The mempool is used to speed up memory allocation. Mempool allocates page-size 4 | * "chunks" by calling malloc(), and the subsequent memory allocation is done by 5 | * carving blocks from these chunks. A block does not have management overhead, 6 | * and mempool does not try to reclaim individual blocks, but it can free all the 7 | * blocks in one stroke. 8 | * 9 | * The interface functions are: 10 | * ============================= 11 | * o. mp_create : create a memory pool instance. 12 | * o. mp_destroy: destroy the memory pool instance. 13 | * o. mp_alloc(mempool, size) : allocate a block having at least "size"-byte. 14 | * block is 8-byte aligned. 15 | * o. mp_free_all() : free all blocks allocated so far. 16 | * 17 | * **************************************************************************** 18 | */ 19 | #ifndef MEM_POOL_H 20 | #define MEM_POOL_H 21 | 22 | /* A chunk is typically 4k-byte in size; the management structure resides at 23 | * the beginning of the chunk. 24 | */ 25 | typedef struct chunk_hdr chunk_hdr_t; 26 | struct chunk_hdr { 27 | chunk_hdr_t* next; /* next chunk of the pool, or NULL */ 28 | char* chunk_end; /* one past the last byte of the chunk */ 29 | char* free; /* next unallocated byte; blocks are carved from [free, chunk_end) */ 30 | }; 31 | 32 | struct mempool; 33 | typedef struct mempool mempool_t; 34 | struct mempool { 35 | chunk_hdr_t chunk_hdr; /* header of the first chunk, which also hosts this struct */ 36 | chunk_hdr_t* last; /* last chunk of the list; mp_alloc() carves blocks from it */ 37 | }; 38 | 39 | #define DEFAULT_ALIGN 8 40 | 41 | /* create a mempool */ 42 | mempool_t* mp_create(); 43 | 44 | /* destroy the mempool */ 45 | void mp_destroy(mempool_t*); 46 | 47 | /* Free all blocks allocated by the mempool */ 48 | void mp_free_all(mempool_t*); 49 | 50 | /* Allocate a block of "size" bytes. Default alignment is 8-byte.
*/ 51 | static inline void* 52 | mp_alloc(mempool_t* mp, int size) { 53 | size = (size + DEFAULT_ALIGN - 1) & ~(DEFAULT_ALIGN - 1); 54 | 55 | chunk_hdr_t* chunk = mp->last; 56 | char* free_addr = chunk->free; 57 | char* free_end = chunk->chunk_end; 58 | 59 | if (free_addr + size <= free_end) { 60 | char* t = free_addr; 61 | chunk->free += size; 62 | return (void*)t; 63 | } 64 | 65 | void* mp_alloc_slow(mempool_t* mp, int size); 66 | return mp_alloc_slow(mp, size); 67 | } 68 | 69 | /* To allocate a block of type "t" */ 70 | #define MEMPOOL_ALLOC_TYPE(mp, t) ((t*)mp_alloc((mp), sizeof(t))) 71 | 72 | /* To allocate an array with "n" elements. Elements are of type "t".*/ 73 | #define MEMPOOL_ALLOC_TYPE_N(mp, t, n) ((t*)mp_alloc((mp), sizeof(t) * (n))) 74 | 75 | #endif 76 | -------------------------------------------------------------------------------- /parse_array.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "util.h" 3 | #include "parser.h" 4 | 5 | static const char* syntax_err = "Array syntax error, expect ',' or ']'"; 6 | 7 | /* Emit this array by adding it to the nesting composite data structure. 
*/ 8 | static void 9 | emit_array(parser_t* parser) { 10 | composite_state_t* top = pstack_top(parser); 11 | ASSERT(top->obj.common.obj_ty == OT_ARRAY); 12 | 13 | obj_t* array_obj = &top->obj.common; 14 | composite_state_t* new_top = pstack_pop(parser); 15 | insert_subobj(&new_top->obj, array_obj); 16 | } 17 | 18 | /* state returned from parse_array_elmt */ 19 | typedef enum { 20 | /* An element (must be primitive object) was successfully parsed */ 21 | PAE_DONE, 22 | 23 | /* Parsing the nesting composite element */ 24 | PAE_COMPOSITE, 25 | 26 | /* See ']' */ 27 | PAE_CLOSE, 28 | 29 | PAE_ERR 30 | } PAE_STATE; 31 | 32 | static PAE_STATE 33 | parse_array_elmt(parser_t* parser, obj_composite_t* array_obj, token_t* tk) { 34 | /* case 1: The token contains a primitive object */ 35 | if (tk_is_primitive(tk)) { 36 | if (emit_primitive_tk(parser->mempool, tk, array_obj)) 37 | return PAE_DONE; 38 | return PAE_ERR; 39 | } 40 | 41 | if (tk->type == TT_CHAR) { 42 | char c = tk->char_val; 43 | 44 | /* case 2: The token is the starting delimiter of composite objects. */ 45 | if (c == '{') { 46 | if (!start_parsing_hashtab(parser)) 47 | return PAE_ERR; 48 | return PAE_COMPOSITE; 49 | } 50 | 51 | if (c == '[') { 52 | if (!start_parsing_array(parser)) 53 | return PAE_ERR; 54 | return PAE_COMPOSITE; 55 | } 56 | 57 | /* case 3: see the array closing delimiter */ 58 | if (c == ']') 59 | return PAE_CLOSE; 60 | } 61 | 62 | set_parser_err(parser, syntax_err); 63 | return PAE_ERR; 64 | } 65 | 66 | typedef enum { 67 | /* The scaner just saw '[', and moving on the parsing the 1st element */ 68 | PA_JUST_BEGUN, 69 | 70 | /* at least one element is parsed, and now parsing "{',' }" */ 71 | PA_PARSING_MORE_ELMT, 72 | 73 | /* Parsing the 1st *composite* element */ 74 | PA_PARSING_1st_ELMT 75 | } PA_STATE; 76 | 77 | /* Parse an array object, return 0 if something wrong take places, or 0 implying 78 | * so-far-so-good. 
79 | * 80 | * Array syntax : '[' [ ELMT {',' ELMT } * ] ']' 81 | */ 82 | int 83 | parse_array(parser_t* parser) { 84 | scaner_t* scaner = &parser->scaner; 85 | const char* json_end = scaner->json_end; 86 | composite_state_t* state = pstack_top(parser); 87 | PA_STATE parse_state = state->parse_state; 88 | 89 | while (1) { 90 | /* case 1: So far we have successfully parsed at least one element, 91 | * and now move on parsing remaining elements. 92 | * 93 | * At this moment we are expecting to see: 94 | * o. "',' ELEMENT ....", or 95 | * o. closing delimiter of an array, i.e. the ']'. 96 | */ 97 | if (parse_state == PA_PARSING_MORE_ELMT) { 98 | token_t* delimiter = sc_get_token(scaner, json_end); 99 | if (delimiter->type == TT_CHAR) { 100 | char c = delimiter->char_val; 101 | if (c == ',') { 102 | token_t* tk = sc_get_token(scaner, json_end); 103 | PAE_STATE ret = parse_array_elmt(parser, &state->obj, tk); 104 | if (ret == PAE_DONE) 105 | continue; 106 | else if (ret == PAE_COMPOSITE) { 107 | /* remember where we leave off */ 108 | state->parse_state = PA_PARSING_MORE_ELMT; 109 | return 1; 110 | } else { 111 | goto err_out; 112 | } 113 | } 114 | 115 | if (c == ']') { 116 | emit_array(parser); 117 | return 1; 118 | } 119 | } 120 | goto err_out; 121 | } 122 | 123 | /* case 2: Just saw '[', and try to parse the first element. 
*/ 124 | if (parse_state == PA_JUST_BEGUN) { 125 | token_t* tk = sc_get_token(scaner, json_end); 126 | PAE_STATE ret = parse_array_elmt(parser, &state->obj, tk); 127 | switch (ret) { 128 | case PAE_DONE: 129 | parse_state = PA_PARSING_MORE_ELMT; 130 | continue; 131 | 132 | case PAE_COMPOSITE: 133 | /* The 1st element is an composite object */ 134 | state->parse_state = PA_PARSING_1st_ELMT; 135 | return 1; 136 | 137 | case PAE_CLOSE: 138 | /* This is an empty array */ 139 | emit_array(parser); 140 | return 1; 141 | 142 | default: 143 | goto err_out; 144 | } 145 | } 146 | 147 | /* case 3: The first element is a composite object, and it is just 148 | * successfully parsed. 149 | */ 150 | ASSERT(parse_state == PA_PARSING_1st_ELMT); 151 | parse_state = PA_PARSING_MORE_ELMT; 152 | } 153 | 154 | err_out: 155 | set_parser_err(parser, syntax_err); 156 | return 0; 157 | } 158 | 159 | int 160 | start_parsing_array(parser_t* parser) { 161 | if (!pstack_push(parser, OT_ARRAY, PA_JUST_BEGUN)) 162 | return 0; 163 | 164 | return parse_array(parser); 165 | } 166 | -------------------------------------------------------------------------------- /parse_hashtab.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "parser.h" 3 | 4 | static void 5 | emit_hashtab(parser_t* parser) { 6 | composite_state_t* top = pstack_top(parser); 7 | ASSERT(top->obj.common.obj_ty == OT_HASHTAB); 8 | 9 | obj_t* array_obj = &top->obj.common; 10 | composite_state_t* new_top = pstack_pop(parser); 11 | insert_subobj(&new_top->obj, array_obj); 12 | } 13 | 14 | typedef enum { 15 | PKVP_DONE, 16 | PKVP_COMPOSITE, 17 | PKVP_CLOSE, 18 | PKVP_ERR, 19 | } PKVP_STATE; 20 | 21 | static PKVP_STATE 22 | parse_keyval_pair(parser_t* parser, obj_composite_t* htab_obj) { 23 | scaner_t* scaner = &parser->scaner; 24 | const char* json_end = scaner->json_end; 25 | 26 | token_t* tk = sc_get_token(scaner, json_end); 27 | 28 | /* step 1: Parse the key string */ 29 
| if (tk->type == TT_STR) { 30 | if (unlikely(!emit_primitive_tk(parser->mempool, tk, htab_obj))) { 31 | return PKVP_ERR; 32 | } 33 | } else if (tk->type == TT_CHAR && tk->char_val == '}') { 34 | return PKVP_CLOSE; 35 | } else { 36 | if (tk->type != TT_ERR) { 37 | sc_rewind(scaner); 38 | set_parser_err(parser, "Key must be a string"); 39 | } 40 | 41 | return PKVP_ERR; 42 | } 43 | 44 | /* step 2: Expect ':' delimiter */ 45 | tk = sc_get_token(scaner, json_end); 46 | if (tk->type != TT_CHAR || tk->char_val != ':') { 47 | set_parser_err(parser, "expect ':'"); 48 | return PKVP_ERR; 49 | } 50 | 51 | /* step 3: parse the 'value' part */ 52 | tk = sc_get_token(scaner, json_end); 53 | if (tk_is_primitive(tk)) { 54 | if (unlikely(!emit_primitive_tk(parser->mempool, tk, htab_obj))) 55 | return PKVP_ERR; 56 | return PKVP_DONE; 57 | } 58 | 59 | if (tk->type == TT_CHAR) { 60 | char c = tk->char_val; 61 | if (c == '{') { 62 | start_parsing_hashtab(parser); 63 | return PKVP_COMPOSITE; 64 | } 65 | 66 | if (c == '[') { 67 | start_parsing_array(parser); 68 | return PKVP_COMPOSITE; 69 | } 70 | } 71 | 72 | set_parser_err(parser, "value object syntax error"); 73 | return PKVP_ERR; 74 | } 75 | 76 | typedef enum { 77 | PHT_JUST_BEGUN, 78 | PHT_PARSING_ELMT, 79 | } PHT_STATE; 80 | 81 | int 82 | parse_hashtab(parser_t* parser) { 83 | scaner_t* scaner = &parser->scaner; 84 | const char* json_end = scaner->json_end; 85 | 86 | composite_state_t* state = pstack_top(parser); 87 | PHT_STATE parse_state = state->parse_state; 88 | 89 | if (parse_state == PHT_JUST_BEGUN) { 90 | state->parse_state = PHT_PARSING_ELMT; 91 | PKVP_STATE ret = parse_keyval_pair(parser, &state->obj); 92 | switch (ret) { 93 | case PKVP_DONE: 94 | parse_state = PHT_PARSING_ELMT; 95 | break; 96 | 97 | case PKVP_COMPOSITE: 98 | return 1; 99 | 100 | case PKVP_CLOSE: 101 | emit_hashtab(parser); 102 | return 1; 103 | 104 | default: 105 | goto err_out; 106 | } 107 | } 108 | 109 | while (1) { 110 | token_t* tk = 
sc_get_token(scaner, json_end); 111 | if (tk->type == TT_CHAR) { 112 | char c = tk->char_val; 113 | if (c == ',') { 114 | PKVP_STATE ret = parse_keyval_pair(parser, &state->obj); 115 | if (ret == PKVP_DONE) 116 | continue; 117 | else if (ret == PKVP_COMPOSITE) { 118 | return 1; 119 | } 120 | 121 | goto err_out; 122 | } 123 | 124 | if (c == '}') { 125 | emit_hashtab(parser); 126 | return 1; 127 | } 128 | } 129 | 130 | goto err_out; 131 | } 132 | 133 | err_out: 134 | set_parser_err(parser, "hashtab syntax error"); 135 | return 0; 136 | } 137 | 138 | int 139 | start_parsing_hashtab(parser_t* parser) { 140 | if (!pstack_push(parser, OT_HASHTAB, PHT_JUST_BEGUN)) 141 | return 0; 142 | 143 | return parse_hashtab(parser); 144 | } 145 | -------------------------------------------------------------------------------- /parser.c: -------------------------------------------------------------------------------- 1 | /* ************************************************************************ 2 | * 3 | * This file implements the json parser. 4 | * 5 | * Working example and terminology 6 | * ================================ 7 | * We use following json to depict 8 | * how it works: 9 | * [1, 2, {"key": 3.4}] 10 | * 11 | * We call [...] as *array*, and {...} as *hashtab*. Array and hashtab are 12 | * *composite objects*, and number/string/boolean/null are *primitive objects*. 13 | * 14 | * This json snippet has two composite objects: 15 | * - O2: is a hash-table having only one element with key being "key", and 16 | * value being 3.4. 17 | * - O1: is a array containing three elements, i.e. 1, 2 and O2. 18 | * 19 | * O2 is *nested* in O1, and O1 is O2's *immediate nesting* composite object. 20 | * 21 | * How it works 22 | * ============= 23 | * The parser walks the input json from left to right, calling scaner to get a 24 | * token at a time. 
The scaner recognizes the following tokens in order: 25 | * 26 | * token type value 27 | * --------------------------- 28 | * char '[' 29 | * number 1 30 | * char ',' 31 | * number 2 32 | * char ',' 33 | * char '{' 34 | * string "key" 35 | * .... 36 | * 37 | * At the heart of the parser is a *parsing-stack*, which pushes a level when 38 | * seeing the starting delimiter of a composite object (e.g. seeing '[' of 39 | * an array), and pops it once the closing delimiter of the same composite object 40 | * is seen. So, the parse-stack is in essence mimicking the nesting 41 | * relationship. Actually in our implementation, the stack element contains 42 | * a data structure keeping track of the current composite object being 43 | * processed. 44 | * 45 | * The result of the parser is organized in reverse-nesting order linked 46 | * in a singly-linked list. See the comment to jp_parse() in ljson_parser.h 47 | * for details. 48 | * 49 | * ************************************************************************ 50 | */ 51 | #include 52 | #include 53 | #include 54 | #include 55 | #include 56 | 57 | #include "util.h" 58 | #include "mempool.h" 59 | #include "scaner.h" 60 | #include "parser.h" 61 | 62 | #ifdef DEBUG 63 | static int verfiy_reverse_nesting_order(obj_t* parse_result); 64 | #endif 65 | /* ************************************************************************** 66 | * 67 | * About parse-stack.
68 | * 69 | * ************************************************************************** 70 | */ 71 | static inline void 72 | init_obj(obj_t* obj, obj_ty_t ty) { 73 | obj->next = 0; 74 | obj->obj_ty = ty; 75 | obj->elmt_num = 0; 76 | } 77 | 78 | static inline void 79 | init_composite_obj(obj_composite_t* obj, obj_ty_t ty, uint32_t id) { 80 | init_obj(&obj->common, ty); 81 | obj->subobjs = 0; 82 | obj->id = id; 83 | } 84 | 85 | static inline composite_state_t* 86 | alloc_composite_state(parser_t* parser) { 87 | composite_state_t* cs; 88 | cs = MEMPOOL_ALLOC_TYPE(parser->mempool, composite_state_t); 89 | return cs; 90 | } 91 | 92 | static void 93 | pstack_init(parser_t* parser) { 94 | composite_state_t* cs = &parser->parse_stack; 95 | init_composite_obj(&cs->obj, OT_ROOT, 0); 96 | 97 | cs->next = 0; 98 | cs->prev = cs; /* this is *top* */ 99 | } 100 | 101 | int 102 | pstack_push(parser_t* parser, obj_ty_t obj_ty, int init_state) { 103 | /* Step 1: Allocate an stack element */ 104 | composite_state_t* cs = alloc_composite_state(parser); 105 | if (unlikely(!cs)) 106 | return 0; 107 | 108 | /* Step 2: Initialize the corresponding composite object. 
*/ 109 | obj_composite_t* cobj = &cs->obj; 110 | init_composite_obj(cobj, obj_ty, parser->next_cobj_id++); 111 | 112 | /* link the composite objects in reverse-nesting order */ 113 | cobj->reverse_nesting_order = (obj_composite_t*)(void*)parser->result; 114 | parser->result = &cobj->common; 115 | 116 | /* Step 3: Push one level */ 117 | cs->parse_state = init_state; 118 | cs->next = 0; 119 | 120 | composite_state_t* root = &parser->parse_stack; 121 | composite_state_t* top = root->prev; 122 | cs->prev = top; 123 | root->prev = cs; /* update the "top" */ 124 | 125 | return 1; 126 | } 127 | 128 | composite_state_t* 129 | pstack_pop(parser_t* parser) { 130 | composite_state_t* ps = &parser->parse_stack; 131 | composite_state_t* top = ps->prev; 132 | 133 | composite_state_t* new_top = top->prev; 134 | new_top->next = 0; 135 | ps->prev = new_top; 136 | 137 | return new_top; 138 | } 139 | 140 | /*************************************************************************** 141 | * 142 | * Emit Objects 143 | * 144 | *************************************************************************** 145 | */ 146 | 147 | /* Convert the primitive token to primitive object */ 148 | static inline obj_t* 149 | cvt_primitive_tk(mempool_t* mp, token_t* tk) { 150 | ASSERT(tk_is_primitive(tk)); 151 | obj_primitive_t* obj = MEMPOOL_ALLOC_TYPE(mp, obj_primitive_t); 152 | if (unlikely(!obj)) 153 | return 0; 154 | 155 | ASSERT((((int)TT_INT64 == (int)OT_INT64) && 156 | ((int)TT_FP == (int)OT_FP) && 157 | ((int)TT_STR == (int)OT_STR) && 158 | ((int)TT_BOOL == (int)OT_BOOL) && 159 | ((int)TT_NULL == (int)OT_NULL))); 160 | 161 | obj->common.obj_ty = tk->type; 162 | obj->common.str_len = tk->str_len; 163 | obj->int_val = tk->int_val; 164 | 165 | return &obj->common; 166 | } 167 | 168 | void 169 | insert_subobj(obj_composite_t* nesting, obj_t* nested) { 170 | nested->next = nesting->subobjs; 171 | nesting->subobjs = nested; 172 | nesting->common.elmt_num ++; 173 | } 174 | 175 | int 176 | 
emit_primitive_tk(mempool_t* mp, token_t* tk, 177 | obj_composite_t* nesting_cobj) { 178 | obj_t* obj = cvt_primitive_tk(mp, tk); 179 | if (obj) { 180 | insert_subobj(nesting_cobj, obj); 181 | return 1; 182 | } 183 | 184 | return 0; 185 | } 186 | 187 | /*************************************************************************** 188 | * 189 | * Parser driver 190 | * 191 | *************************************************************************** 192 | */ 193 | obj_t* 194 | parse(parser_t* parser, const char* json, uint32_t json_len) { 195 | scaner_t* scaner = &parser->scaner; 196 | const char* json_end = scaner->json_end; 197 | pstack_init(parser); 198 | 199 | token_t* tk = sc_get_token(scaner, json_end); 200 | token_ty_t tk_ty = tk->type; 201 | 202 | /* case 1: The input json starts with delimiter of composite objects 203 | * (i.e. array/hashtab). 204 | */ 205 | if (tk_ty == TT_CHAR) { 206 | int succ = 0; 207 | char c = tk->char_val; 208 | if (c == '{') { 209 | succ = start_parsing_hashtab(parser); 210 | } else if (c == '[') { 211 | succ = start_parsing_array(parser); 212 | } else { 213 | set_parser_err_fmt(parser, "Unknow object starting with '%c'", c); 214 | return 0; 215 | } 216 | 217 | while (succ) { 218 | composite_state_t* top = pstack_top(parser); 219 | obj_ty_t ot = top->obj.common.obj_ty; 220 | if (ot == OT_HASHTAB) { 221 | succ = parse_hashtab(parser); 222 | } else if (ot == OT_ARRAY) { 223 | succ = parse_array(parser); 224 | } else { 225 | ASSERT(ot == OT_ROOT); 226 | break; 227 | } 228 | } 229 | 230 | if (unlikely(!succ)) 231 | return 0; 232 | 233 | token_t* end_tk = sc_get_token(scaner, json_end); 234 | if (end_tk->type != TT_END) { 235 | goto trailing_junk; 236 | } 237 | 238 | return parser->result; 239 | } 240 | 241 | /* case 2: The input jason is empty */ 242 | if (unlikely(tk_ty == TT_END)) { 243 | parser->err_msg = "Input json is empty"; 244 | return 0; 245 | } 246 | 247 | /* case 3: The input starts with a primitive object. 
I don't know if it 248 | * conforms to spec or not. 249 | */ 250 | if (tk_is_primitive(tk)) { 251 | parser->result = cvt_primitive_tk(parser->mempool, tk); 252 | if (sc_get_token(scaner, json_end)->type == TT_END) { 253 | return parser->result; 254 | } 255 | } 256 | 257 | trailing_junk: 258 | parser->result = 0; 259 | set_parser_err(parser, "Extraneous stuff"); 260 | return 0; 261 | } 262 | 263 | static void 264 | reset_parser(parser_t* parser, const char* json, uint32_t json_len) { 265 | mempool_t* mp = parser->mempool; 266 | mp_free_all(mp); 267 | 268 | pstack_init(parser); 269 | sc_init_scaner(&parser->scaner, mp, json, json_len); 270 | parser->result = 0; 271 | 272 | parser->err_msg = 0; 273 | parser->next_cobj_id = 1; 274 | } 275 | 276 | /**************************************************************************** 277 | * 278 | * Implementation of the exported functions 279 | * 280 | *************************************************************************** 281 | */ 282 | struct json_parser* 283 | jp_create(void) { 284 | parser_t* p = (parser_t*)malloc(sizeof(parser_t)); 285 | if (unlikely(!p)) 286 | return 0; 287 | 288 | mempool_t* mp = mp_create(); 289 | if (unlikely(!mp)) 290 | return 0; 291 | 292 | p->mempool = mp; 293 | p->result = 0; 294 | p->err_msg = "Out of Memory"; /* default error message :-)*/ 295 | 296 | pstack_init(p); 297 | return (struct json_parser*)(void*)p; 298 | } 299 | 300 | obj_t* 301 | jp_parse(struct json_parser* jp, const char* json, uint32_t len) { 302 | parser_t* parser = (parser_t*)(void*)jp; 303 | reset_parser(parser, json, len); 304 | 305 | obj_t* obj = parse(parser, json, len); 306 | ASSERT(verfiy_reverse_nesting_order(obj)); 307 | return obj; 308 | } 309 | 310 | void 311 | jp_destroy(struct json_parser* p) { 312 | parser_t* parser = (parser_t*)(void*)p; 313 | mp_destroy(parser->mempool); 314 | free((void*)p); 315 | } 316 | 317 | /* ***************************************************************************** 318 | * 319 | * 
Debugging, error handling and other cold code 320 | * 321 | * ***************************************************************************** 322 | */ 323 | void __attribute__((format(printf, 2, 3), cold)) 324 | set_parser_err_fmt(parser_t* parser, const char* fmt, ...) { 325 | if (parser->err_msg) 326 | return; 327 | 328 | int buf_len = 250; 329 | char* buf = MEMPOOL_ALLOC_TYPE_N(parser->mempool, char, buf_len); 330 | if (!buf) { 331 | parser->err_msg = "OOM"; 332 | return; 333 | } 334 | parser->err_msg = buf; 335 | 336 | scaner_t* scaner = &parser->scaner; 337 | /* In case error take place in scaner, we should go for scaner's 338 | * error message. 339 | */ 340 | 341 | if (scaner->err_msg) { 342 | snprintf(buf, buf_len, "%s", scaner->err_msg); 343 | return; 344 | } 345 | 346 | int loc_info_len = snprintf(buf, buf_len, "(line:%d,col:%d) ", 347 | scaner->line_num, scaner->col_num); 348 | buf += loc_info_len; 349 | buf_len -= loc_info_len; 350 | 351 | va_list vl; 352 | va_start(vl, fmt); 353 | vsnprintf(buf, buf_len, fmt, vl); 354 | va_end(vl); 355 | } 356 | 357 | void __attribute__((cold)) 358 | set_parser_err(parser_t* parser, const char* str) { 359 | if (!parser->err_msg) 360 | set_parser_err_fmt(parser, "%s", str); 361 | } 362 | 363 | static void __attribute__((cold)) 364 | dump_primitive_obj (FILE* f, obj_t* the_obj) { 365 | obj_primitive_t* obj = (obj_primitive_t*)(void*)the_obj; 366 | 367 | switch (the_obj->obj_ty) { 368 | case OT_INT64: 369 | fprintf(f, "%" PRIi64, obj->int_val); 370 | break; 371 | 372 | case OT_FP: 373 | fprintf(f, "%.16f", obj->db_val); 374 | break; 375 | 376 | case OT_STR: 377 | { 378 | int idx = 0; 379 | int len = the_obj->str_len; 380 | fputc('"', f); 381 | for (; idx < len; idx++) { 382 | char c = obj->str_val[idx]; 383 | if (isprint(c)) { 384 | fputc(c, f); 385 | } else { 386 | fprintf(f, "\\%#02x", c); 387 | } 388 | } 389 | fputc('"', f); 390 | } 391 | break; 392 | 393 | case OT_BOOL: 394 | fputs(obj->int_val ? 
"true" : "false", f); 395 | break; 396 | 397 | case OT_NULL: 398 | fputs("null", f); 399 | break; 400 | 401 | default: 402 | ASSERT(0 && "NOT Primitive"); 403 | break; 404 | } 405 | } 406 | 407 | void __attribute__((cold)) 408 | dump_composite_obj(FILE* f, obj_composite_t* cobj) { 409 | obj_ty_t type = cobj->common.obj_ty; 410 | if (type != OT_ARRAY && type != OT_HASHTAB) { 411 | fprintf(f, "unknown composite type %d\n", (int)type); 412 | return; 413 | } 414 | 415 | obj_t* elmt_slist = cobj->subobjs; 416 | int elmt_num = cobj->common.elmt_num; 417 | 418 | obj_t** elmt_vect = (obj_t**)malloc(sizeof(obj_t*) * elmt_num); 419 | int i = elmt_num - 1; 420 | while (elmt_slist) { 421 | elmt_vect[i] = elmt_slist; 422 | elmt_slist = elmt_slist->next; 423 | i--; 424 | } 425 | 426 | if (i != -1) { 427 | free(elmt_vect); 428 | fprintf(f, "the numbers of elements disagree\n"); 429 | return; 430 | } 431 | 432 | if (type == OT_ARRAY) { 433 | fprintf (f, "[ (id:%d) ", cobj->id); 434 | int i; 435 | for(i = 0; i < elmt_num; i++) { 436 | obj_t* elmt = elmt_vect[i]; 437 | if (elmt->obj_ty <= OT_LAST_PRIMITIVE) { 438 | dump_primitive_obj(f, elmt); 439 | } else { 440 | int id = ((obj_composite_t*)(void*)elmt)->id; 441 | fprintf(f, "obj-%d", id); 442 | } 443 | 444 | if (i != elmt_num - 1) 445 | fputs(", ", f); 446 | } 447 | fputs("]\n", f); 448 | } else { 449 | ASSERT(type == OT_HASHTAB); 450 | ASSERT((elmt_num & 1) == 0); 451 | 452 | fprintf(f, "{ (id:%d) ", cobj->id); 453 | int i; 454 | for(i = 0; i < elmt_num; i+=2) { 455 | obj_t* key = elmt_vect[i]; 456 | obj_t* val = elmt_vect[i+1]; 457 | dump_primitive_obj(f, key); 458 | 459 | fputc(':', f); 460 | 461 | if (val->obj_ty <= OT_LAST_PRIMITIVE) { 462 | dump_primitive_obj(f, val); 463 | } else { 464 | int id = ((obj_composite_t*)(void*)val)->id; 465 | fprintf(f, "obj-%d", id); 466 | } 467 | 468 | if (i != elmt_num - 2) 469 | fputs(", ", f); 470 | } 471 | fputs("}\n", f); 472 | } 473 | 474 | free(elmt_vect); 475 | } 476 | 477 | void 
__attribute__((cold)) 478 | dump_obj(FILE* f, obj_t* obj) { 479 | if (!obj) { 480 | fprintf(f, "null\n"); 481 | return; 482 | } 483 | 484 | obj_ty_t type = obj->obj_ty; 485 | if (type <= OT_LAST_PRIMITIVE) { 486 | dump_primitive_obj(f, obj); 487 | fputc('\n', f); 488 | } else { 489 | obj_composite_t* cobj = (obj_composite_t*)(void*)obj; 490 | for (; cobj; cobj = cobj->reverse_nesting_order) { 491 | dump_composite_obj(f, cobj); 492 | } 493 | } 494 | } 495 | 496 | const char* __attribute__((cold)) 497 | jp_get_err(struct json_parser* p) { 498 | parser_t* parser = (parser_t*)(void*)p; 499 | return parser->err_msg; 500 | } 501 | 502 | #ifdef DEBUG 503 | static int 504 | verfiy_reverse_nesting_order(obj_t* parse_result) { 505 | if (!parse_result) 506 | return 1; 507 | 508 | obj_ty_t type = parse_result->obj_ty; 509 | if (type <= OT_LAST_PRIMITIVE) 510 | return 0; 511 | 512 | obj_composite_t* cobj = (obj_composite_t*)(void*)parse_result; 513 | 514 | int obj_cnt = 1; 515 | 516 | int first_id, last_id; 517 | first_id = last_id = cobj->id; 518 | 519 | /* loop over all composite-object in the the reverse-nesting order */ 520 | for (cobj = cobj->reverse_nesting_order; 521 | cobj != 0; 522 | cobj = cobj->reverse_nesting_order) { 523 | if (cobj->id != last_id - 1) 524 | return 0; 525 | 526 | last_id = cobj->id; 527 | obj_cnt++; 528 | } 529 | 530 | if (last_id != 1 || obj_cnt != first_id) 531 | return 0; 532 | 533 | return 1; 534 | } 535 | #endif 536 | -------------------------------------------------------------------------------- /parser.h: -------------------------------------------------------------------------------- 1 | #ifndef PARSER_H 2 | #define PARSER_H 3 | 4 | //#include "adt.h" 5 | #include "mempool.h" 6 | #include "ljson_parser.h" 7 | #include "scaner.h" 8 | 9 | /**************************************************************************** 10 | * 11 | * Data structures. 
12 | * 13 | **************************************************************************** 14 | */ 15 | 16 | /* state of parsing composite object */ 17 | typedef struct composite_state_tag composite_state_t; 18 | struct composite_state_tag { 19 | obj_composite_t obj; 20 | int parse_state; 21 | composite_state_t* prev; 22 | composite_state_t* next; 23 | }; 24 | 25 | typedef struct { 26 | composite_state_t parse_stack; 27 | scaner_t scaner; 28 | const char* err_msg; 29 | mempool_t* mempool; 30 | /* link the composite objects in a reverse nesting order. e.g 31 | * Suppose Json is: [1, {"key":val}], the result is the linked list with 32 | * 1st element being the hashtab, and the second one being its enclosing 33 | * array. 34 | */ 35 | obj_t* result; 36 | int next_cobj_id; /* next composite object id */ 37 | } parser_t; 38 | 39 | /**************************************************************************** 40 | * 41 | * Implementation of pstack_t 42 | * 43 | **************************************************************************** 44 | */ 45 | static inline composite_state_t* 46 | pstack_top(parser_t* parser) { 47 | composite_state_t* ps = &parser->parse_stack; 48 | return ps->prev; 49 | } 50 | 51 | int pstack_push(parser_t*, obj_ty_t, int init_state); 52 | composite_state_t* pstack_pop(parser_t*); 53 | 54 | /**************************************************************************** 55 | * 56 | * Utilities 57 | * 58 | **************************************************************************** 59 | */ 60 | int emit_primitive_tk(mempool_t* mp, token_t* tk, obj_composite_t* nesting_obj); 61 | 62 | void insert_subobj(obj_composite_t* nesting, obj_t* nested); 63 | 64 | void __attribute__((format(printf, 2, 3), cold)) 65 | set_parser_err_fmt(parser_t* parser, const char* fmt, ...); 66 | 67 | void __attribute__((cold)) set_parser_err(parser_t*, const char* str); 68 | 69 | int start_parsing_array(parser_t*); 70 | int start_parsing_hashtab(parser_t*); 71 | 72 | int 
parse_hashtab(parser_t* parser); 73 | int parse_array(parser_t* parser); 74 | 75 | #endif /* PARSER_H */ 76 | -------------------------------------------------------------------------------- /scan_fp.h: -------------------------------------------------------------------------------- 1 | #ifndef SCAN_FP_H 2 | #define SCAN_FP_H 3 | 4 | typedef union { 5 | int64_t int_val; 6 | double db_val; 7 | } int_db_union_t; 8 | 9 | /* return 0 on error, 1 if the result contains integer value, and 2 if the 10 | * result contains floating point value. 11 | */ 12 | int scan_fp(const char** scan_str, const char* str_e, int_db_union_t* result); 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /scan_fp_relax.c: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * 3 | * This file tries to parse floating point literal quickly. There are two 4 | * implementations in this files: 5 | * 6 | * 1). The one very relaxed floating point mode (FP_RELAX >= 2), and 7 | * 2). The one with *almost* restrict mode. (FP_RELAX == 1). 8 | * 9 | * In variant 1), we evaluate a floating point number, say 12.345E12 as following: 10 | * a) let d1 = 12 11 | * b) let d2 = 345 * (10**-2 * 10 **-1) (NOTE: reciprocal is very imprecise) 12 | * c) let d3 = d1 + d2 13 | * d) let result = d3 * (10**3 + 10**2) 14 | * 15 | * The reason and the only reason to keep the toy-grade variant-1 is to set 16 | * a bar (in terms of parsing speed) for the future work). 17 | * 18 | * Variant-2) is *almost* restrict. It can efficiently parse a floating point 19 | * literal if it's in the form of nnnn.mmm, and the integer part contains no 20 | * more than 20 digits, fraction part contains than 16 digits (it the liternal 21 | * does not satisfy this restrct, it would resort to expensive strtod() libc 22 | * function call). 
#if FP_RELAX >= 2

/* HINT: max double = 1.797693E+308, min-double = 2.225074E-308 */
#define MAX_EXP_ABS 308

/* pos_pow10[i] == 10**i and neg_pow10[i] == 10**-i, for i in [0, 21]. */
static const double pos_pow10[22] = {
    1,    1e1,  1e2,  1e3,  1e4,  1e5,  1e6,  1e7,  1e8,  1e9,  1e10,
    1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21
};

static const double neg_pow10[22] = {
    1,     1e-1,  1e-2,  1e-3,  1e-4,  1e-5,  1e-6,  1e-7,  1e-8,  1e-9,
    1e-10, 1e-11, 1e-12, 1e-13, 1e-14, 1e-15, 1e-16, 1e-17, 1e-18, 1e-19,
    1e-20, 1e-21
};

/* Return 10**exp, or 10**-exp if "negative" is non-zero.
 *
 * Small exponents are looked up directly. Larger ones are composed by
 * multiplying the factors 10**(2**k) selected by the set bits of "exp",
 * which is imprecise (this is the relaxed mode).
 */
static double
mypow10(int exp, int negative) {
    ASSERT(exp >= 0 && exp <= MAX_EXP_ABS);
    if (exp < (int)(sizeof(pos_pow10)/sizeof(pos_pow10[0]))) {
        return negative ? neg_pow10[exp] : pos_pow10[exp];
    }

    static const double exp_fact1[9] = {
        1e1, 1e2, 1e4, 1e8, 1e16, 1e32, 1e64, 1e128, 1e256
    };

    static const double exp_fact2[9] = {
        1e-1, 1e-2, 1e-4, 1e-8, 1e-16, 1e-32, 1e-64, 1e-128, 1e-256
    };

    double val = 1;
    const double* dbl_fact = negative ? exp_fact2 : exp_fact1;

    int idx = 0;
    while (exp) {
        if (exp & 1) {
            val *= dbl_fact[idx];
        }
        exp = exp >> 1;
        idx++;
    }

    return val;
}

/* Scan a number literal starting at *scan_ptr and ending before "str_end".
 *
 * Return 1 if the literal is an integer (stored in result->int_val), 2 if it
 * is a floating point number (stored in result->db_val), or 0 on error. On
 * success *scan_ptr is advanced past the literal.
 *
 * Literals the fast path does not handle (input ending right after the
 * integer digits, integer part not fitting in int64_t, too many fraction
 * digits, exponent with more than one character before it, huge exponent)
 * are handed to strtod().
 * NOTE(review): strtod() needs NUL-terminated input -- presumably the caller
 * guarantees that; confirm against the scaner.
 */
int
scan_fp(const char** scan_ptr, const char* str_end, int_db_union_t* result) {
    const char* str, *p;
    str = p = *scan_ptr;
    int negative = 0;

    if (p < str_end && *p == '-') {
        negative = 1;
        p++;
    }

    int int_len = 0;
    int64_t int_val = 0;

    /* step 1: Calculate the integer part */
    while (p < str_end) {
        char c = *p;
        if (c < '0' || c > '9')
            break;

        int digit = c - '0';
        /* Give up before the accumulation overflows int64_t; signed
         * overflow is undefined behavior.
         */
        if (unlikely(int_val > (INT64_MAX - digit) / 10))
            goto too_nasty;

        int_val = int_val * 10 + digit;
        p++;
    }

    /* NOTE: the length includes the leading '-' sign, if any. */
    int_len = p - str;
    if (unlikely(p >= str_end) || unlikely(int_len >= 20)) {
        goto too_nasty;
    }

    char c = *p;
    if (c != '.' && ((c | 0x20) != 'e')) {
        result->int_val = negative ? - int_val : int_val;
        *scan_ptr = p;
        return 1;
    }

    /* step 2: Calculate the fraction part */
    double frac = 0.0;
    int frac_len = 0;
    if (c == '.') {
        const char* frac_start = ++p;
        while (p < str_end) {
            char c = *p;
            if (c >= '0' && c <= '9') {
                frac = c - '0' + frac * 10;
                p++;
            } else {
                break;
            }
        }

        frac_len = p - frac_start;
        if (frac_len > 20) {
            goto too_nasty;
        }
        frac = frac * mypow10(frac_len, 1);
    }

    if (unlikely(p >= str_end)) {
        /* The floating-point literal per se is nothing wrong. However, this
         * condition implies that the literal is the last token of the json
         * being processed, which is not correct.
         */
        return 0;
    }

    /* step 3: Calculate the exponent part */
    double dbl_result = (double)int_val + frac;
    if (negative)
        dbl_result = - dbl_result;

    c = *p;
    int exp = 0;
    if ((c | 0x20) == 'e') {
        if (int_len != 1)
            goto too_nasty;

        p++;
        int neg_exp = 0;
        /* The exponent may carry an explicit sign, '+' as well as '-'. */
        if (p < str_end && (*p == '-' || *p == '+')) {
            neg_exp = (*p == '-');
            p++;
        }

        while (p < str_end) {
            char c = *p;
            if (c >= '0' && c <= '9') {
                exp = c - '0' + exp * 10;
                /* HINT: max double = 1.797693E+308,
                 *       min-double = 2.225074E-308
                 */
                if (exp >= MAX_EXP_ABS)
                    goto too_nasty;
                p++;
            } else {
                break;
            }
        }

        dbl_result *= mypow10(exp, neg_exp);
    }
    result->db_val = dbl_result;
    *scan_ptr = p;
    return 2;

too_nasty:
    {
        char* fp_end;
        double d = strtod(str, &fp_end);
        if (fp_end != str) {
            result->db_val = d;
            *scan_ptr = fp_end;
            return 2;
        }
    }
    return 0;
}
#endif
53-bit, it can be represented by a 217 | * "double"-typed value exactly. The "((long long)1 << 53) - 1" evaluates to 218 | * 9007199254740991 which has 16 digits. If the faction part has more than 16 219 | * digit, we simply give up. 220 | */ 221 | #define MAX_FRAC_LEN 16 222 | 223 | static double 224 | mypow10(int exp) { 225 | ASSERT(exp <= MAX_FRAC_LEN); 226 | static double pos_pow10[16] = { 1, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 227 | 1e10, 1e11, 1e12, 1e13, 1e14, 1e15 }; 228 | 229 | return pos_pow10[exp]; 230 | } 231 | 232 | int 233 | scan_fp(const char** scan_ptr, const char* str_end, int_db_union_t* result) { 234 | const char* str, *p; 235 | str = p = *scan_ptr; 236 | int negative = 0; 237 | 238 | if (*p == '-') { 239 | negative = 1; 240 | p++; 241 | } 242 | 243 | int int_len = 0; 244 | int64_t int_val = 0; 245 | 246 | /* step 1: Calculate the integer part */ 247 | while (p < str_end) { 248 | char c = *p; 249 | if (c >= '0' && c <= '9') { 250 | int_val = c - '0' + int_val * 10; 251 | p++; 252 | } else { 253 | break; 254 | } 255 | } 256 | 257 | int_len = p - str; 258 | if (unlikely(p >= str_end) || unlikely(int_len >= 20)) { 259 | /*The "len < 20" condition is to guaranteed the value fit in int64_t.*/ 260 | goto too_nasty; 261 | } 262 | 263 | char c = *p; 264 | if (c != '.' && ((c | 0x20) != 'e')) { 265 | result->int_val = negative ? - int_val : int_val; 266 | *scan_ptr = p; 267 | return 1; 268 | } 269 | 270 | /* step 2: Calculate the fraction part */ 271 | int64_t frac_int = 0; 272 | int frac_len = 0; 273 | if (c == '.') { 274 | const char* frac_start = ++p; 275 | while (p < str_end) { 276 | char c = *p; 277 | if (c >= '0' && c <= '9') { 278 | frac_int = c - '0' + frac_int * 10; 279 | p++; 280 | } else { 281 | break; 282 | } 283 | } 284 | 285 | frac_len = p - frac_start; 286 | if (frac_len >= (((int64_t)1) << 53) - 1) { 287 | /* make sure frac_len can fit in 53 bit, such that it can be 288 | * represented exactly by a double. 
289 | */ 290 | goto too_nasty; 291 | } 292 | } 293 | 294 | if (unlikely(p >= str_end)) { 295 | /* The floating-point literal per se is nothing wrong. However, this 296 | * condition implies that the literal is the last token of the json 297 | * being processed, which is not correct. 298 | */ 299 | return 0; 300 | } 301 | 302 | /* step 3: give up if it's in scientific notation */ 303 | if ((*p | 0x20) == 'e') { 304 | goto too_nasty; 305 | } 306 | 307 | result->db_val = int_val + (double)frac_int / mypow10(frac_len); 308 | *scan_ptr = p; 309 | return 2; 310 | 311 | too_nasty: 312 | { 313 | char* fp_end; 314 | double d = strtod(str, &fp_end); 315 | if (fp_end != str) { 316 | result->db_val = d; 317 | *scan_ptr = fp_end; 318 | return 2; 319 | } 320 | } 321 | return 0; 322 | } 323 | #endif 324 | -------------------------------------------------------------------------------- /scan_fp_strict.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include /* for str2od() */ 3 | #include "util.h" 4 | #include "scan_fp.h" 5 | 6 | #if FP_RELAX == 0 7 | /* i.e strict floating point mode */ 8 | 9 | int 10 | scan_fp(const char** scan_str, const char* str_e, int_db_union_t* result) { 11 | const char* str_save = *scan_str; 12 | const char* str = *scan_str; 13 | 14 | int is_negative = (*str == '-') ? 1 : 0; 15 | str += is_negative; 16 | 17 | /* More often than not, the number is of interger type that can fit in 18 | * int64_t. So, we speculatively try to convert input string into 19 | * an int64_t as we go along. In case it turns out to be a floating 20 | * point number, or the interger is too big to fit in int64_t, we start 21 | * over converting the string to "double"-typed value. 22 | */ 23 | int64_t int_val = 0; 24 | 25 | while (str < str_e) { 26 | char c = *str; 27 | if (c >= '0' && c <= '9') { 28 | int_val = int_val * 10 + (c - '0'); 29 | str++; 30 | } else { 31 | if (c != '.' 
&& (c | 0x20) != 'e') { 32 | if (str - str_save < 20) { 33 | /* It's guaranteed to fit in int64_t */ 34 | if (!is_negative) { 35 | result->int_val = int_val; 36 | } else { 37 | result->int_val = - int_val; 38 | } 39 | *scan_str = str; 40 | return 1; 41 | } 42 | } 43 | 44 | double d = strtod(str_save, (char**)scan_str); 45 | if (*scan_str != str_save) { 46 | result->db_val = d; 47 | return 2; 48 | } 49 | return 0; 50 | } 51 | } 52 | return 0; 53 | } 54 | 55 | #endif /* FP_RELAX == 0 */ 56 | -------------------------------------------------------------------------------- /scaner.c: -------------------------------------------------------------------------------- 1 | #include /* for isdigit */ 2 | #include /* for memchr() */ 3 | #include 4 | #include /* for strtod() */ 5 | #include 6 | #include /* for the time being */ 7 | #include "util.h" 8 | #include "scaner.h" 9 | #include "scan_fp.h" 10 | 11 | static const char* unrecog_token = "Unrecognizable token"; 12 | 13 | /* Forward decl */ 14 | static void __attribute__((format(printf, 3, 4), cold)) 15 | set_scan_err_fmt(scaner_t* scaner, const char* loc, const char* fmt, ...); 16 | 17 | static void __attribute__((cold)) 18 | set_scan_err(scaner_t* scaner, const char* loc, const char* str); 19 | 20 | static char token_predict[256]; 21 | static char esc_char[256]; 22 | 23 | #define TT_IS_SPACE (TT_LAST + 1) 24 | 25 | static void 26 | init_token_predict() { 27 | memset(token_predict, TT_ERR, sizeof(token_predict)); 28 | 29 | /* Seperator */ 30 | { 31 | int i, e; 32 | const char* sep = "{}[],:"; 33 | for (i = 0, e = (int)strlen(sep); i < e; i++) { 34 | uint8_t c = (uint8_t)sep[i]; 35 | token_predict[c] = TT_CHAR; 36 | } 37 | } 38 | 39 | /* Null predictor */ 40 | token_predict['n'] = TT_NULL; 41 | token_predict['N'] = TT_NULL; 42 | 43 | /* Number(int/fp) predictor. NOTE: unlike C, numbers 44 | * like +1.2 .5, -.4 are illegal. 
45 | */ 46 | const char* np = "-0123456789"; 47 | int idx, idx_e; 48 | for (idx = 0, idx_e = strlen(np); idx < idx_e; idx++) { 49 | token_predict[(uint8_t)np[idx]] = TT_FP; 50 | } 51 | 52 | /* Boolean predictor */ 53 | const char* bp = "tTfF"; 54 | for (idx = 0, idx_e = strlen(bp); idx < idx_e; idx++) { 55 | token_predict[(uint8_t)bp[idx]] = TT_BOOL; 56 | } 57 | 58 | /* string predictor */ 59 | token_predict['"'] = TT_STR; 60 | 61 | token_predict[' '] = TT_IS_SPACE; 62 | token_predict['\t'] = TT_IS_SPACE; 63 | token_predict['\r'] = TT_IS_SPACE; 64 | token_predict['\n'] = TT_IS_SPACE; 65 | token_predict['\f'] = TT_IS_SPACE; 66 | token_predict['\v'] = TT_IS_SPACE; 67 | } 68 | 69 | static void 70 | init_esc_table() { 71 | memset(esc_char, 0, sizeof(esc_char)); 72 | esc_char['"'] = '"'; 73 | esc_char['/'] = '/'; 74 | esc_char['\\'] = '\\'; 75 | esc_char['b'] = '\b'; 76 | esc_char['f'] = '\f'; 77 | esc_char['n'] = '\n'; 78 | esc_char['r'] = '\r'; 79 | esc_char['t'] = '\t'; 80 | } 81 | 82 | static void __attribute__((constructor)) 83 | init_const_table() { 84 | init_token_predict(); 85 | init_esc_table(); 86 | } 87 | 88 | /* On success, scaner advance the pointer right after the token just 89 | * recognized, and the token_t::span records span of the token in 90 | * input string. 91 | */ 92 | static inline void 93 | update_ptr_on_succ(scaner_t* scaner, const char* scan_starts, 94 | int32_t span) { 95 | scaner->scan_ptr = scan_starts + span; 96 | scaner->token.span = span; 97 | scaner->col_num += span; 98 | } 99 | 100 | /* On failure, scaner's pointer is not advanced, and token_t::span points 101 | * to the locations where lexical error takes place. 
102 | */ 103 | static inline void 104 | update_ptr_on_failure(scaner_t* scaner, const char* scan_starts, 105 | int32_t span) { 106 | scaner->scan_ptr = scan_starts; 107 | scaner->token.span = span; 108 | scaner->token.type = TT_ERR; 109 | } 110 | 111 | static token_t* 112 | char_handler(scaner_t* scaner, const char* str, const char* str_e) { 113 | update_ptr_on_succ(scaner, str, 1); 114 | 115 | token_t* tk = &scaner->token; 116 | tk->type = TT_CHAR; 117 | tk->char_val = *str; 118 | 119 | return tk; 120 | } 121 | 122 | static token_t* 123 | null_handler(scaner_t* scaner, const char* str, const char* str_e) { 124 | token_t* tk = &scaner->token; 125 | 126 | if (str + 4 < str_e && !strncmp(str, "null", 4)) { 127 | update_ptr_on_succ(scaner, str, 4); 128 | tk->type = TT_NULL; 129 | return tk; 130 | } 131 | 132 | update_ptr_on_failure(scaner, str, 0); 133 | if (str + 4 < str_e && !strncasecmp(str, "null", 4)) { 134 | set_scan_err(scaner, str, "'null' must be in lower case"); 135 | } else { 136 | set_scan_err(scaner, str, 0); 137 | } 138 | return tk; 139 | } 140 | 141 | static token_t* 142 | fp_handler(scaner_t* scaner, const char* str, const char* str_e) { 143 | const char* advance = str; 144 | int_db_union_t val; 145 | int res = scan_fp(&advance, str_e, &val); 146 | 147 | token_t* tk = &scaner->token; 148 | if (res == 1) { 149 | update_ptr_on_succ(scaner, str, advance - str); 150 | tk->type = TT_INT64, 151 | tk->int_val = val.int_val; 152 | } else if (res == 2) { 153 | update_ptr_on_succ(scaner, str, advance - str); 154 | tk->type = TT_FP, 155 | tk->db_val = val.db_val; 156 | } else { 157 | update_ptr_on_failure(scaner, str, advance - str); 158 | } 159 | 160 | return tk; 161 | } 162 | 163 | static token_t* 164 | bool_handler(scaner_t* scaner, const char* str, const char* str_e) { 165 | int len = str_e - str; 166 | token_t* tk = &scaner->token; 167 | tk->type = TT_BOOL; 168 | if (len >= 5) { 169 | if (!strncmp(str, "true", 4)) { 170 | tk->int_val = 1; 171 | 
update_ptr_on_succ(scaner, str, 4); 172 | return tk; 173 | } 174 | 175 | if (!strncmp(str, "false", 5)) { 176 | tk->int_val = 0; 177 | update_ptr_on_succ(scaner, str, 5); 178 | return tk; 179 | } 180 | } 181 | 182 | update_ptr_on_failure(scaner, str, 0); 183 | 184 | /* Emit eror-message if true/false is not in lower case, or the token 185 | * starts with [tTfF], but is not boolean value at all. 186 | */ 187 | if ((len >= 4 && strncasecmp(str, "true", 4)) || 188 | (len >= 5 && strncasecmp(str, "false", 5))) { 189 | set_scan_err(scaner, str, "boolean value must be in lower case"); 190 | } else { 191 | set_scan_err(scaner, str, 0); 192 | } 193 | 194 | return tk; 195 | } 196 | 197 | static token_t* 198 | unknown_tk_handler(scaner_t* scaner, const char* str, const char* str_e) { 199 | token_t* tk = &scaner->token; 200 | update_ptr_on_failure(scaner, str, 0); 201 | set_scan_err(scaner, str, 0); 202 | return tk; 203 | } 204 | 205 | /* *********************************************************************** 206 | * 207 | * Handle String 208 | * 209 | * *********************************************************************** 210 | */ 211 | 212 | /* The input "hex4" is a string with *four* leading hex-digits, 213 | * this function is to convert them into an positive integer. 214 | * 215 | * For instance, given input hex4 being "aBc9...", the return value would 216 | * be 0xabc9. Hexadecimal digits are case insensitive. If the leading 217 | * four character include non-hex-digit, -1 is returned. 218 | * 219 | * NOTE: It's up to the caller to ensure the length of "hex4" is no less 220 | * than 4. 
221 | */ 222 | static int32_t 223 | hex4_to_int(const char* hex4) { 224 | unsigned char c = *hex4++; 225 | int hval = 0, value; 226 | 227 | if (c >= '0' && c <= '9') 228 | hval = c - '0'; 229 | else if ((c | 0x20) >= 'a' && (c | 0x20) <= 'f') { 230 | hval = (c | 0x20) - 'a' + 10; 231 | } else { 232 | return -1; 233 | } 234 | value = hval; 235 | 236 | c = *hex4++; 237 | if (c >= '0' && c <= '9') 238 | hval = c - '0'; 239 | else if ((c | 0x20) >= 'a' && (c | 0x20) <= 'f') { 240 | hval = (c | 0x20) - 'a' + 10; 241 | } else { 242 | return -1; 243 | } 244 | value = (value << 4) | hval; 245 | 246 | c = *hex4++; 247 | if (c >= '0' && c <= '9') 248 | hval = c - '0'; 249 | else if ((c | 0x20) >= 'a' && (c | 0x20) <= 'f') { 250 | hval = (c | 0x20) - 'a' + 10; 251 | } else { 252 | return -1; 253 | } 254 | value = (value << 4) | hval; 255 | 256 | c = *hex4++; 257 | if (c >= '0' && c <= '9') 258 | hval = c - '0'; 259 | else if ((c | 0x20) >= 'a' && (c | 0x20) <= 'f') { 260 | hval = (c | 0x20) - 'a' + 10; 261 | } else { 262 | return -1; 263 | } 264 | 265 | value = (value << 4) | hval; 266 | return value; 267 | } 268 | 269 | /* determine the number of bytes needed to encode the given codepoint */ 270 | static int 271 | utf8_encode_len(int codepoint) { 272 | if (codepoint < 0x80) 273 | return 1; 274 | 275 | if (codepoint < 0x800) 276 | return 2; 277 | 278 | if (codepoint < 0x10000) 279 | return 3; 280 | 281 | return 4; 282 | } 283 | 284 | /* Encode the given codepoint in a sequence of UTF-8s */ 285 | static void 286 | utf8_encode(char* buf, int codepoint, int len) { 287 | static unsigned char len_mark[] = {0, 0xc0, 0xe0, 0xf0 }; 288 | switch (len) { 289 | case 4: *(buf + 3) = ((codepoint | 0x80) & 0xbf); 290 | codepoint >>= 6; 291 | /* fall through */ 292 | 293 | case 3: *(buf + 2) = ((codepoint | 0x80) & 0xbf); 294 | codepoint >>= 6; 295 | /* fall through */ 296 | 297 | case 2: *(buf + 1) = ((codepoint | 0x80) & 0xbf); 298 | codepoint >>= 6; 299 | /* fall through */ 300 | 301 | 
default: break; 302 | } 303 | 304 | *buf = codepoint | len_mark[len - 1]; 305 | } 306 | 307 | /* Process \u escape. 308 | * 309 | * The legal input string falls in one of the following two cases: 310 | * 1. "\uzzzz", where the zzzz in (0, 0xD800] (Note zzzz > 0) 311 | * 2. "\uxxxx\uyyyy", where the xxxx in [0xD800, 0xDBFF] yyyy in [DC00,DFFF]. 312 | * i.e. the input string is a UTF-16 surrogate pair. 313 | * 314 | * This function is to convert the input string to up to four UTF-8s and 315 | * save them to "dest". 316 | * 317 | * On success, return 1, and the "src_advance" and "dest_advance" is set to 318 | * the amount of byte the source and the destination string need to advance, 319 | * respectively. Otherwise, 0 is returned. 320 | */ 321 | static const char* illegal_u_esc = "Illegal \\u escape"; 322 | static int 323 | process_u_esc(scaner_t* scaner, const char* src, const char* src_end, 324 | char* dest, int* src_advance, int* dest_advance) { 325 | int32_t codepoint; 326 | 327 | /* Step 1: get the codepoint */ 328 | if (unlikely(src + 6 > src_end)) 329 | return 0; 330 | 331 | codepoint = hex4_to_int(src + 2); 332 | if (unlikely(codepoint < 0)) { 333 | set_scan_err(scaner, src, illegal_u_esc); 334 | return 0; 335 | } 336 | 337 | *src_advance = 6; /* skip the \\uxxxx, hence 6 */ 338 | 339 | /* Detect UTF-16 surrogate pair. The codepoint be in this form : 110110x... 
340 | */ 341 | if (codepoint >= 0xd800) { 342 | int32_t codepoint_low; 343 | const char* lower = src + 6; 344 | 345 | if (codepoint & 0x400) { 346 | set_scan_err(scaner, src, "Higher part of UTF-16 surrogate must " 347 | "be in the range of [0xd800, 0xdbff]"); 348 | return 0; 349 | } 350 | 351 | if (unlikely(src + 12 > src_end) || 352 | unlikely(*(src + 6) != '\\') || unlikely(*(src + 7) != 'u')) { 353 | set_scan_err(scaner, src + 6, 354 | "Expect \\u escape for lower part " 355 | "of UTF-16 surrogate"); 356 | return 0; 357 | } 358 | 359 | codepoint_low = hex4_to_int(lower + 2); 360 | if (codepoint_low < 0) { 361 | set_scan_err(scaner, lower, illegal_u_esc); 362 | return 0; 363 | } 364 | 365 | if (unlikely(codepoint_low < 0xdc00) || 366 | unlikely(codepoint_low > 0xdfff)) { 367 | set_scan_err(scaner, lower, "Lower part of UTF-16 surrogate must " 368 | "be in the range of [0xdc00, 0xdfff]"); 369 | return 0; 370 | } 371 | 372 | /* Extract the lower 10-bit from surrogate pairs, and concatenate 373 | * them together. 
374 | */ 375 | codepoint = (codepoint_low & 0x3ff) | ((codepoint & 0x3ff) << 10); 376 | codepoint |= 0x10000; 377 | 378 | *src_advance = 12; /* skip the "\\uxxxx\\uyyyy", hence 12 */ 379 | } 380 | 381 | /* Step 2: Encode the codepoint with UTF-8 sequence */ 382 | utf8_encode(dest, codepoint, *dest_advance = utf8_encode_len(codepoint)); 383 | return 1; 384 | } 385 | 386 | static token_t* 387 | str_handler(scaner_t* scaner, const char* str, const char* str_e) { 388 | /* step 1: determine the end of string */ 389 | const char* str_quote = str; 390 | token_t* tk = &scaner->token; 391 | 392 | do { 393 | str_quote = memchr(str_quote + 1, '"', str_e - str_quote); 394 | if (unlikely(!str_quote)) { 395 | /* The string dose not end with quote*/ 396 | set_scan_err(scaner, str, "String does not end with quote"); 397 | return tk; 398 | } 399 | 400 | if (likely(*(str_quote - 1) != '\\')) { 401 | break; 402 | } else { 403 | /* Consider the cases like "Junk\\" */ 404 | const char* t = str_quote - 2; 405 | int cnt = 1; 406 | for (; *t == '\\'; t--, cnt++) {} 407 | if ((cnt & 1) == 0) { 408 | break; 409 | } 410 | } 411 | } while(1); 412 | 413 | /* step 2: allocate space for the string. The new string has trailing 414 | * '\0' for easing purpose. 
415 | */ 416 | char* new_str = 417 | MEMPOOL_ALLOC_TYPE_N(scaner->mempool, char, str_quote - str); 418 | if (unlikely(!new_str)) { 419 | set_scan_err(scaner, str, "OOM"); 420 | return tk; 421 | } 422 | 423 | /* step 3: copy the string */ 424 | { 425 | char* dest = new_str; 426 | const char* src = str + 1; 427 | do { 428 | int len = str_quote - src; 429 | char* esc = (char*)memchr(src, '\\', len); 430 | if (!esc) { 431 | memcpy(dest, src, len); 432 | src += len; 433 | dest += len; 434 | *dest = '\0'; /* to ease debugging*/ 435 | 436 | tk->str_val = new_str; 437 | tk->str_len = dest - new_str; 438 | tk->type = TT_STR; 439 | update_ptr_on_succ(scaner, str, str_quote - str + 1); 440 | return tk; 441 | } 442 | 443 | /* Handle escape */ 444 | len = esc - src; 445 | memcpy(dest, src, len); 446 | src = esc; 447 | dest += len; 448 | 449 | char esc_key = esc[1]; 450 | char esc_val = esc_char[(unsigned char)esc_key]; 451 | 452 | /* successfully processed non-unicode (\u) escape */ 453 | if (esc_val) { 454 | *dest++ = esc_val; 455 | src += sizeof("\\n") - 1; 456 | continue; 457 | } 458 | 459 | /* process unicode escape */ 460 | if (esc_key == 'u') { 461 | int src_adv, dest_adv; 462 | if (process_u_esc(scaner, src, str_quote, dest, 463 | &src_adv, &dest_adv)) { 464 | src += src_adv; 465 | dest += dest_adv; 466 | continue; 467 | } 468 | } 469 | 470 | /* illegal escape */ 471 | set_scan_err_fmt(scaner, esc, "illegal escape \\%c", esc[1]); 472 | return tk; 473 | } while(1); 474 | } 475 | 476 | /* Should not reach here. The return-statement is just to make 477 | * stupid compilers happy. 
478 | */ 479 | ASSERT(0); 480 | return NULL; 481 | } 482 | 483 | static token_t* space_handler(scaner_t*, const char*, const char*); 484 | 485 | typedef token_t* (*tk_hd_func)(scaner_t*, const char*, const char*); 486 | tk_hd_func token_handler[] = { 487 | [TT_INT64] = 0, 488 | [TT_FP] = fp_handler, 489 | [TT_STR] = str_handler, 490 | [TT_BOOL] = bool_handler, 491 | [TT_NULL] = null_handler, 492 | [TT_CHAR] = char_handler, 493 | [TT_ERR] = unknown_tk_handler, 494 | [TT_IS_SPACE] = space_handler, 495 | }; 496 | 497 | static token_t* 498 | space_handler(scaner_t* scaner, const char* str_ptr, const char* str_end) { 499 | int32_t ln = 0; 500 | int32_t col = 0; 501 | 502 | char lookahead = *str_ptr; 503 | token_ty_t tt; 504 | do { 505 | col = (lookahead == '\n') ? 1 : col + 1; 506 | ln += ((lookahead == '\n') ? 1 : 0); 507 | 508 | if (unlikely(str_end <= ++str_ptr)) { 509 | scaner->token.type = TT_END; 510 | return &scaner->token; 511 | } 512 | 513 | lookahead = *str_ptr; 514 | tt = (token_ty_t)token_predict[(uint32_t)(uint8_t)lookahead]; 515 | if (tt != TT_IS_SPACE) 516 | break; 517 | } while (1); 518 | 519 | scaner->line_num += ln; 520 | scaner->col_num += col; 521 | /* It is not necessary to set scan_ptr as token-handler will update it.*/ 522 | /*scaner->scan_ptr = str_ptr; */ 523 | 524 | return token_handler[tt](scaner, str_ptr, str_end); 525 | } 526 | 527 | token_t* 528 | sc_get_token(scaner_t* scaner, const char* str_end) { 529 | const char* str_ptr = scaner->scan_ptr; 530 | ASSERT(str_end == scaner->json_end); 531 | 532 | if (unlikely(str_ptr >= str_end)) { 533 | scaner->token.type = TT_END; 534 | return &scaner->token; 535 | } 536 | 537 | char lookahead = *str_ptr; 538 | token_ty_t tt = (token_ty_t)token_predict[(uint32_t)(uint8_t)lookahead]; 539 | return token_handler[tt](scaner, str_ptr, str_end); 540 | } 541 | 542 | void 543 | sc_init_scaner(scaner_t* scaner, mempool_t* mp, 544 | const char* json, uint32_t json_len) { 545 | scaner->mempool = mp; 546 | 
scaner->json_begin = json; 547 | scaner->json_end = json + json_len; 548 | scaner->scan_ptr = json; 549 | scaner->line_num = 1; 550 | scaner->col_num = 1; 551 | scaner->err_msg = NULL; 552 | } 553 | 554 | void 555 | sc_rewind (scaner_t* scaner) { 556 | int span = scaner->token.span; 557 | scaner->scan_ptr -= span; 558 | scaner->col_num -= span; 559 | } 560 | 561 | /**************************************************************** 562 | * 563 | * Error handling and other cold code cluster here 564 | * 565 | ***************************************************************** 566 | */ 567 | static void __attribute__((format(printf, 3, 4))) 568 | set_scan_err_fmt(scaner_t* scaner, const char* loc, const char* fmt, ...) { 569 | if (scaner->err_msg) 570 | return; 571 | 572 | token_t* tk = &scaner->token; 573 | tk->type = TT_ERR; 574 | 575 | int buf_len = 250; 576 | char* buf = MEMPOOL_ALLOC_TYPE_N(scaner->mempool, char, buf_len); 577 | if (!buf) { 578 | scaner->err_msg = "OOM"; 579 | return; 580 | } 581 | 582 | scaner->err_msg = buf; 583 | int span = loc - scaner->scan_ptr; 584 | int loc_info_len = snprintf(buf, buf_len, "(line:%d,col:%d) ", 585 | scaner->line_num, scaner->col_num + span); 586 | 587 | buf += loc_info_len; 588 | buf_len -= loc_info_len; 589 | 590 | va_list vl; 591 | va_start(vl, fmt); 592 | vsnprintf(buf, buf_len, fmt, vl); 593 | va_end(vl); 594 | } 595 | 596 | static void __attribute__((cold)) 597 | set_scan_err(scaner_t* scaner, const char* loc, const char* str) { 598 | if (scaner->err_msg) 599 | return; 600 | 601 | if (!str) { str = unrecog_token; } 602 | set_scan_err_fmt(scaner, loc, "%s", str); 603 | } 604 | -------------------------------------------------------------------------------- /scaner.h: -------------------------------------------------------------------------------- 1 | /* **************************************************************************** 2 | * 3 | * The scaner is to decompose input json into tokens. 
A json 4 | * number/string/bool/null will be recognized as a single "primitive" token, 5 | * and a delimiter (i.e. one of ":,[]{}") is recognized as a token of TT_char 6 | * type. 7 | * 8 | * The scaner is driven by the parser (parser*.c), recognizing one token at 9 | * a time. The last token is TT_END token indicating the end of input json. 10 | * 11 | * Scaner keeps track of the location of input json as it moves on. If a token 12 | * was successfully recognized, scaner_t::line_num/col_num refers to the point 13 | * right after the token; if it comes across any lexical error, TT_ERR token is 14 | * returned, and the scaner_t::line_num/col_num points to starting location where 15 | * the problem take place. 16 | * 17 | * The major interface functions include: 18 | * 19 | * o. sc_init_scaner: 20 | * Initiaize the scaner (write down the beginning and ending location of 21 | * input json etc). 22 | * 23 | * o. sc_get_token: 24 | * Return next token. 25 | * 26 | * o. sc_rewind: 27 | * The retreat points back to the starting point of the token just 28 | * sucessfully recognized. This function is called when scaner 29 | * successfuly recognize the token, which is not what the parser expects. 30 | * 31 | * **************************************************************************** 32 | */ 33 | #ifndef SCANER_H 34 | #define SCANER_H 35 | 36 | #include 37 | #include "ljson_parser.h" 38 | #include "mempool.h" 39 | 40 | typedef enum { 41 | /* Integer that can fit in int64_t. Otherwise, it would be represented with 42 | * a double-precision floating-point number. 43 | */ 44 | TT_INT64 = OT_INT64, 45 | 46 | /* double-precision number */ 47 | TT_FP = OT_FP, 48 | 49 | TT_STR = OT_STR, 50 | TT_BOOL = OT_BOOL, 51 | TT_NULL = OT_NULL, 52 | TT_LAST_PRIMITIVE = TT_NULL, 53 | 54 | /* If the scanner fails to recognize a primitive at current position, it just 55 | * returns the character at current position. 
Since the scanner skips 56 | * whitespaces as it moves forward, the character at "current position" is 57 | * guaranteed to be a non-whitespace. 58 | */ 59 | TT_CHAR, 60 | 61 | TT_ERR, 62 | 63 | /* Meet the end of input json */ 64 | TT_END, 65 | TT_LAST = TT_END + 1 66 | } token_ty_t; 67 | 68 | typedef struct { 69 | union { 70 | int64_t int_val; 71 | char* str_val; 72 | char char_val; 73 | double db_val; 74 | }; 75 | token_ty_t type; 76 | 77 | /* valid iff the token is a string */ 78 | int32_t str_len; 79 | 80 | /* How many chars in the input json string representing this token. In 81 | * the event of a lexical problem, the span points to starting location where 82 | * the problem occurs. 83 | */ 84 | int32_t span; 85 | } token_t; 86 | 87 | typedef struct { 88 | /* The last token we get. NOTE: token does not live across get_token() */ 89 | token_t token; 90 | 91 | /* The half-open interval "[json_begin, json_end)" is the input json 92 | * in memory. 93 | */ 94 | const char* json_begin; 95 | const char* json_end; 96 | 97 | /* pointer moving forward from json_begin toward json_end */ 98 | const char* scan_ptr; 99 | mempool_t* mempool; 100 | 101 | /* The location of current pointer */ 102 | int32_t line_num; 103 | int32_t col_num; 104 | /* Message of the first lexical error recorded, or NULL if there is none */ 105 | const char* err_msg; 106 | } scaner_t; 107 | 108 | /* Return 1 iff the "tk" is a primitive token, i.e. its type is no greater than TT_LAST_PRIMITIVE */ 109 | static inline int 110 | tk_is_primitive(const token_t* tk) { 111 | return ((uint32_t)tk->type) <= TT_LAST_PRIMITIVE; 112 | } 113 | 114 | void sc_init_scaner(scaner_t*, mempool_t*, const char* json, uint32_t json_len); 115 | 116 | /* NOTE: The str_end is equal to scaner_t::json_end.*/ 117 | token_t* sc_get_token(scaner_t*, const char* str_end); 118 | 119 | /* Rewind the pointer back to beginning of token just successfully scaned. 120 | * It's called by parser when it detects syntax error. 
121 | */ 122 | void sc_rewind(scaner_t*); 123 | 124 | #endif 125 | -------------------------------------------------------------------------------- /tests/Makefile: -------------------------------------------------------------------------------- 1 | SRC := unit_test.cxx test_util.cxx 2 | OBJ := ${SRC:.cxx=.o} 3 | 4 | OS := $(shell uname) 5 | ifeq ($(OS), Darwin) 6 | LIBLJSON := libljson.dylib 7 | else 8 | LIBLJSON := libljson.so 9 | endif 10 | 11 | CXXFLAGS := -Wall -O0 -g -MMD 12 | LDFLAGS := -Wl,-rpath,.. -L.. -lljson 13 | 14 | PROGRAM = unit_test 15 | 16 | .PHONY = all test 17 | 18 | all : $(PROGRAM) 19 | ifeq ($(OS), Darwin) 20 | # The symbolic link command is a dirty work-around for the fact that 21 | # json_decoder.lua is looking for the "libljson.so" (hard-coded) in 22 | # package.cpath. 23 | # 24 | test -L ../libljson.so || ln -s libljson.dylib ../libljson.so 25 | DYLD_LIBRARY_PATH=$$(DYLD_LIBRARY_PATH):.. ./$(PROGRAM) ./$(PROGRAM) 26 | else 27 | ./$(PROGRAM) 28 | endif 29 | @echo 30 | @echo "Testing Lua wrapper..." 
31 | luajit ./test.lua 32 | 33 | $(PROGRAM) : $(OBJ) ../$(LIBLJSON) 34 | $(CXX) $(OBJ) $(LDFLAGS) -o $@ 35 | @-cat *.d > dep.txt 36 | 37 | ${OBJ} : %.o : %.cxx 38 | $(CXX) $(CXXFLAGS) -c $< 39 | 40 | -include dep.txt 41 | 42 | clean: 43 | rm -f *.txt *.d *.o $(PROGRAM) 44 | -------------------------------------------------------------------------------- /tests/test.lua: -------------------------------------------------------------------------------- 1 | package.cpath = package.cpath..";../?.so" 2 | package.path = package.cpath..";../?.lua" 3 | 4 | local ljson_decoder = require 'json_decoder' 5 | local decoder = ljson_decoder.new() 6 | 7 | local function cmp_lua_var(obj1, obj2) 8 | if type(obj1) ~= type(obj2) then 9 | return 10 | end 11 | 12 | if type(obj1) == "string" or 13 | type(obj1) == "number" or 14 | type(obj1) == "nil" or 15 | type(obj1) == "boolean" then 16 | return obj1 == obj2 and true or nil 17 | end 18 | 19 | if (type(obj1) ~= "table") then 20 | print("unknown type", type(obj1)); 21 | return 22 | end 23 | 24 | -- compare table 25 | for k, v in pairs(obj1) do 26 | if not cmp_lua_var(v, obj2[k]) then 27 | -- print(v," vs ", obj2[k]) 28 | return 29 | end 30 | end 31 | 32 | for k, v in pairs(obj2) do 33 | if not cmp_lua_var(v, obj1[k]) then 34 | -- print(v," vs ", obj1[k]) 35 | return 36 | end 37 | end 38 | 39 | return true 40 | end 41 | 42 | local test_fail_num = 0; 43 | local test_total = 0; 44 | 45 | local function ljson_test(test_id, parser, input, expect) 46 | test_total = test_total + 1 47 | io.write(string.format("Testing %s ...", test_id)) 48 | local result = decoder:decode(input) 49 | if cmp_lua_var(result, expect) then 50 | print("succ!") 51 | else 52 | test_fail_num = test_fail_num + 1 53 | print("failed!") 54 | --ljson_decoder.debug(result) 55 | end 56 | end 57 | 58 | local json_parser = ljson_decoder.create() 59 | 60 | -- Test 1 61 | local input = [=[[1, 2, 3, {"key1":"value1", "key2":"value2"}, "lol"]]=] 62 | local output = {1, 2, 3, {["key1"] 
= "value1", ["key2"] = "value2" }, "lol"} 63 | ljson_test("test1", json_parser, input, output); 64 | 65 | -- Test 2 66 | input = [=[[]]=] 67 | output = {} 68 | ljson_test("test2", json_parser, input, output); 69 | 70 | -- Test 3 71 | input = [=[[{}]]=] 72 | output = {{}} 73 | ljson_test("test3", json_parser, input, output); 74 | 75 | input = [=[[null]]=] 76 | output = {nil} 77 | ljson_test("test4", json_parser, input, output); 78 | 79 | input = [=[[true, false]]=] 80 | output = {true, false} 81 | ljson_test("test5", json_parser, input, output); 82 | 83 | input = "-" -- invalid input 84 | output = nil 85 | ljson_test("test6", json_parser, input, output); 86 | 87 | -- The input string is "[\0265]", where char \0265 is illegal. The json decoder 88 | -- once crash on this case. 89 | input = string.format("[%c]", 181) 90 | output = nil 91 | ljson_test("test7", json_parser, input, output); 92 | 93 | input = string.format("[ %c]", 181) 94 | output = nil 95 | ljson_test("test8", json_parser, input, output); 96 | 97 | io.write(string.format( 98 | "\n============================\nTotal test count %d, fail %d\n", 99 | test_total, test_fail_num)) 100 | 101 | if test_fail_num == 0 then 102 | os.exit(0) 103 | else 104 | os.exit(1) 105 | end 106 | -------------------------------------------------------------------------------- /tests/test_cmp.lua: -------------------------------------------------------------------------------- 1 | package.cpath = package.cpath .. ";../?.so" 2 | package.path = package.path .. 
";../?.lua" 3 | 4 | local cjson = require "cjson" 5 | local ljson_decoder = require 'json_decoder' 6 | 7 | local f, err = io.open("test_cmp.json", "r") 8 | 9 | local function cmp_lua_var(obj1, obj2) 10 | if type(obj1) ~= type(obj2) then 11 | return 12 | end 13 | 14 | if type(obj1) == "string" or 15 | type(obj1) == "number" or 16 | type(obj1) == "nil" or 17 | type(obj1) == "boolean" then 18 | if obj1 == obj2 then 19 | return true 20 | end 21 | 22 | print(obj1, "of tyep", type(obj1), "vs", obj2, "of type", obj2) 23 | return 24 | end 25 | 26 | if (type(obj1) ~= "table") then 27 | print("unknown type", type(obj1)); 28 | return 29 | end 30 | 31 | -- compare table 32 | for k, v in pairs(obj1) do 33 | if not cmp_lua_var(v, obj2[k]) then 34 | print("key =", k, "value:", v," vs ", obj2[k]) 35 | return 36 | end 37 | end 38 | 39 | for k, v in pairs(obj2) do 40 | if not cmp_lua_var(v, obj1[k]) then 41 | print("key =", k, "value:", v," vs ", obj1[k]) 42 | return 43 | end 44 | end 45 | 46 | return true 47 | end 48 | 49 | local instance, err = ljson_decoder.new() 50 | if not instance then 51 | print("fail to create decoder instance") 52 | end 53 | 54 | local linenum = 0 55 | local fail_num = 0; 56 | for line in f:lines() do 57 | local result1 = instance:decode(line) 58 | local result2 = cjson.decode(line) 59 | 60 | linenum = linenum + 1 61 | 62 | if not cmp_lua_var(result1, result2) then 63 | print("Fail with JSON at line", linenum) 64 | fail_num = fail_num + 1 65 | end 66 | end 67 | 68 | if fail_num == 0 then 69 | print("pass!") 70 | os.exit(0) 71 | else 72 | print("Fail!") 73 | os.exit(1) 74 | end 75 | -------------------------------------------------------------------------------- /tests/test_spec/test_composite.txt: -------------------------------------------------------------------------------- 1 | input: [1 , 2, "lol"] 2 | output: [1,2,"lol"] 3 | 4 | input: [1, 2, {"key":34}] 5 | output: [1,2,{"key":34}] 6 | 7 | input: [1, 2, {"key":[3, 4, "5"]}] 8 | output: 
[1,2,{"key":[3,4,"5"]}] 9 | 10 | input: {"key":[3, 4, "5"], "key2":67} 11 | output:{"key":[3,4,"5"],"key2":67} 12 | 13 | input: {"key":[3, 4, "5"], "key2":{"key3":"value3"}} 14 | output: {"key":[3,4,"5"],"key2":{"key3":"value3"}} 15 | 16 | # test empty composite object 17 | input : [] 18 | output: [] 19 | 20 | input: {} 21 | output: {} 22 | 23 | input: [ {}] 24 | output: [{}] 25 | -------------------------------------------------------------------------------- /tests/test_spec/test_diagnostic.txt: -------------------------------------------------------------------------------- 1 | input: [Null] 2 | output: (line:1,col:9) 'null' must be in lower case 3 | 4 | input: [NUll] 5 | output: (line:1,col:9) 'null' must be in lower case 6 | 7 | input: [nUll] 8 | output: (line:1,col:9) 'null' must be in lower case 9 | 10 | input: [ lol] 11 | output: (line:1,col:10) Unrecognizable token 12 | 13 | input: [True] 14 | output: (line:1,col:9) boolean value must be in lower case 15 | 16 | input: { 123:456} 17 | output: (line:1,col:10) Key must be a string 18 | 19 | # for escape 20 | input: ["\ud800"] 21 | output: (line:1,col:16) Expect \u escape for lower part of UTF-16 surrogate 22 | 23 | input: ["\udc00"] 24 | output: (line:1,col:10) Higher part of UTF-16 surrogate must be in the range of [0xd800, 0xdbff] 25 | 26 | input: ["\u",""] 27 | output: (line:1,col:10) illegal escape \u 28 | 29 | input: {"\uE330\uE330": [" 30 | output: (line:1,col:16) Lower part of UTF-16 surrogate must be in the range of [0xdc00, 0xdfff] 31 | -------------------------------------------------------------------------------- /tests/test_spec/test_misc.txt: -------------------------------------------------------------------------------- 1 | # This testing case is to make sure the mempool is working properly for 2 | # allocate blocks bigger than a page. 
3 | # 4 | input: ["aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"] 5 | output : 
["aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"] 6 | -------------------------------------------------------------------------------- /tests/test_spec/test_token.txt: -------------------------------------------------------------------------------- 1 | ########################################################### 2 | # 3 | # This file include testing cases for scaner. The purose 4 | # is to see if scaner get the token correctly. 5 | # 6 | ########################################################### 7 | # 8 | 9 | #!!!!!!!!!!! test boolean token !!!!!!!!!!!!!!!!!! 10 | input: [ true ] 11 | output: [true] 12 | 13 | input: [false ] 14 | output: [false] 15 | 16 | # !!!!!!!!!!! test null token !!!!!!!!!!!!!!! 17 | input: [null] 18 | output: [null] 19 | 20 | input: [ null] 21 | output: [null] 22 | 23 | input: [ null ] 24 | output: [null] 25 | 26 | #------------------------------------------------------------- 27 | # 28 | # !!!!!!!!!!! string token !!!!!!!!!!!!!!! 
29 | # 30 | #------------------------------------------------------------- 31 | # 32 | input: [ "lol"] 33 | output: ["lol"] 34 | 35 | input: [ "l\"ol"] 36 | output: ["l\"ol"] 37 | 38 | # test string escape 39 | input: [ "\"\\\/\b\f\n\r\t\"" ] 40 | output: ["\"\\\/\b\f\n\r\t\""] 41 | 42 | # \u escape 43 | input: [ "lo\u0026l"] 44 | output: ["lo&l"] 45 | 46 | input: ["\ud7ff"] 47 | output:["\ud7ff"] 48 | 49 | input: ["\ud800\udc12"] 50 | output:["\ud800\udc12"] 51 | 52 | input: ["leading-junk \ud800\udc12 trailing-junk"] 53 | output:["leading-junk \ud800\udc12 trailing-junk"] 54 | 55 | # Scaner once mistakenly consider the closing quote in "xx\\" as a escaped quote 56 | input: ["xx\\", "xx\\\"yy", "xx\\\\\\"] 57 | output: ["xx\\","xx\\\"yy","xx\\\\\\"] 58 | 59 | #------------------------------------------------------------- 60 | # 61 | # !!!!!!!!! integer and floating point number !!!!!!!! 62 | # 63 | #------------------------------------------------------------- 64 | # 65 | input: [ 1230] 66 | output: [1230] 67 | 68 | # note: we print fp with 8 digit in precision 69 | input: [ 12.3] 70 | output: [12.30000000] 71 | 72 | # 18446744073709551615 == (uint64_t)-1. 73 | # For integer with 20 or more digit, we will convert it to floating point. 
74 | # 75 | input: [18446744073709551615] 76 | output: [18446744073709551616.00000000] 77 | 78 | input: [1844674407370955161] 79 | output: [1844674407370955161] 80 | 81 | # 18446744073709551616 is too big to fit in int64_t 82 | input: [18446744073709551616] 83 | output: [18446744073709551616.00000000] 84 | 85 | input: [-2] 86 | output: [-2] 87 | -------------------------------------------------------------------------------- /tests/test_util.cxx: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "test_util.h" 10 | 11 | using namespace std; 12 | 13 | ////////////////////////////////////////////////////////////////////// 14 | // 15 | // Implementation of JsonDumper 16 | // 17 | ////////////////////////////////////////////////////////////////////// 18 | // 19 | void 20 | JsonDumper::dump_primitive(const obj_t* the_obj) { 21 | obj_primitive_t* obj = (obj_primitive_t*)(void*)the_obj; 22 | char buf[128]; 23 | int buf_size = sizeof(buf)/sizeof(buf[0]); 24 | 25 | obj_ty_t ot = (obj_ty_t)the_obj->obj_ty; 26 | int dump_len = 0; 27 | switch (ot) { 28 | case OT_INT64: 29 | dump_len = snprintf(buf, buf_size, "%" PRIi64, obj->int_val); 30 | break; 31 | 32 | case OT_FP: 33 | dump_len = snprintf(buf, buf_size, "%.8f", obj->db_val); 34 | break; 35 | 36 | case OT_BOOL: 37 | dump_len = snprintf(buf, buf_size, "%s", 38 | obj->int_val ? 
"true" : "false"); 39 | break; 40 | 41 | case OT_NULL: 42 | dump_len = snprintf(buf, buf_size, "null"); 43 | break; 44 | 45 | case OT_STR: 46 | dump_str(the_obj); 47 | return; 48 | 49 | default: 50 | dump_len = snprintf(buf, buf_size, "(unkonwn obj of ty:%d)", (int)ot); 51 | } 52 | 53 | append_str(buf, dump_len); 54 | } 55 | 56 | int 57 | JsonDumper::get_utf8_codepoint(const char* utf8_seq, int len, int& seq_real_len) { 58 | /* step 1: determine the length of the sequence */ 59 | unsigned char c = *utf8_seq; 60 | int codepoint = 0; 61 | seq_real_len = 0; 62 | if ((c & 0xf8) == 0xf0) { 63 | seq_real_len = 4; 64 | codepoint = c & 7; 65 | } else if ((c & 0xf0) == 0xe0) { 66 | seq_real_len = 3; 67 | codepoint = c & 0xf; 68 | } else if ((c & 0xe0) == 0xc0) { 69 | seq_real_len = 2; 70 | codepoint = c & 0x1f; 71 | } else { 72 | return -1; 73 | } 74 | 75 | /* Concatenate lower-6-bits of the following UTF8s */ 76 | for (int i = 1; i < seq_real_len; i++) { 77 | codepoint = ((codepoint << 6) | (utf8_seq[i] & 0x3f)); 78 | } 79 | 80 | return codepoint; 81 | } 82 | 83 | int 84 | JsonDumper::dump_str(const obj_t* str_obj, bool dryrun) { 85 | obj_primitive_t* obj = (obj_primitive_t*)(void*)str_obj; 86 | 87 | ASSERT(obj->obj_ty == OT_STR); 88 | 89 | if (!dryrun) { 90 | int len = dump_str(str_obj, true); 91 | resize( _content_len + len); 92 | } 93 | 94 | char *dest = _buf + _content_len; 95 | int len = 0; 96 | 97 | /* print the opening quotion */ 98 | len++; 99 | if (!dryrun) { *dest++ = '"'; } 100 | 101 | const char* str = obj->str_val; 102 | for (int i = 0, e = str_obj->str_len; i < e;) { 103 | char c = str[i]; 104 | /* case 1: Print regular ASCII */ 105 | if ((c & 0x80) == 0) { 106 | char esc = 0; 107 | switch(c) { 108 | case '/': esc = '/'; break; 109 | case '\\': esc = '\\'; break; 110 | case '"': esc = '"'; break; 111 | case '\b': esc = 'b'; break; 112 | case '\f': esc = 'f'; break; 113 | case '\r': esc = 'r'; break; 114 | case '\n': esc = 'n'; break; 115 | case '\t': esc = 
't'; break; 116 | default: ; 117 | }; 118 | 119 | len++; 120 | if (esc) { 121 | len++; 122 | if (!dryrun) { *dest++ = '\\'; *dest++ = esc; } 123 | } else if (!dryrun) { 124 | *dest++ = c; 125 | } 126 | 127 | i++; 128 | continue; 129 | } 130 | 131 | int seq_len; 132 | int codepoint = get_utf8_codepoint(str + i, e - i, seq_len); 133 | if (codepoint < 0) { 134 | /* case 2: Something wrong with the UTF8 sequence. In this 135 | * case, we print a question-mark and move on. 136 | */ 137 | len++; 138 | if (!dryrun) { *dest++ = '?'; } 139 | 140 | i++; 141 | continue; 142 | } 143 | 144 | i += seq_len; 145 | 146 | /* case 3: print utf16 surrogate pair */ 147 | if (codepoint >= 0x10000) { 148 | int low = (codepoint & 0xffff) & 0x3ff; 149 | int high = ((codepoint & 0xffff) >> 10) & 0x3ff; 150 | 151 | low |= 0xdc00; 152 | high |= 0xd800; 153 | 154 | len += 12; 155 | if (!dryrun) { 156 | sprintf(dest, "\\u%04x\\u%04x", high, low); 157 | dest += 12; 158 | } 159 | continue; 160 | } 161 | 162 | len += 6; 163 | if (!dryrun) { 164 | sprintf(dest, "\\u%04x", codepoint); 165 | dest += 6; 166 | } 167 | } 168 | 169 | /* print the closing quotion */ 170 | len ++; 171 | if (!dryrun) { 172 | *dest++ = '"'; 173 | _content_len = dest - _buf; 174 | } 175 | 176 | return len; 177 | } 178 | 179 | void 180 | JsonDumper::dump_array(const obj_t* obj) { 181 | ASSERT(obj->obj_ty == OT_ARRAY); 182 | obj_composite_t* array_obj = (obj_composite_t*)(void*)obj; 183 | obj_t* elmt_slist = array_obj->subobjs; 184 | 185 | int elmt_num = obj->elmt_num; 186 | obj_t** elmt_vect = new obj_t*[obj->elmt_num]; 187 | 188 | int i = elmt_num - 1; 189 | for (; elmt_slist != 0 && i >= 0; elmt_slist = elmt_slist->next, i --) { 190 | elmt_vect[i] = elmt_slist; 191 | } 192 | 193 | if (elmt_slist) { 194 | fprintf(stderr, "array elements list seems to be corrupted\n"); 195 | return; 196 | } 197 | 198 | output_char('['); 199 | for (int i = 0; i < elmt_num; i++) { 200 | obj_t* elmt = elmt_vect[i]; 201 | dump_obj(elmt); 202 | if 
(i + 1 != elmt_num) { 203 | output_char(','); 204 | } 205 | } 206 | output_char(']'); 207 | 208 | delete[] elmt_vect; 209 | } 210 | 211 | void 212 | JsonDumper::dump_hashtab(const obj_t* obj) { 213 | ASSERT(obj->obj_ty == OT_HASHTAB); 214 | obj_composite_t* htab_obj = (obj_composite_t*)(void*)obj; 215 | obj_t* elmt_slist = htab_obj->subobjs; 216 | 217 | int elmt_num = obj->elmt_num; 218 | obj_t** elmt_vect = new obj_t*[obj->elmt_num]; 219 | 220 | int i = elmt_num - 1; 221 | for (; elmt_slist != 0 && i >= 0; elmt_slist = elmt_slist->next, i --) { 222 | elmt_vect[i] = elmt_slist; 223 | } 224 | 225 | if (elmt_slist) { 226 | fprintf(stderr, "array elements list seems to be corrupted\n"); 227 | return; 228 | } 229 | 230 | output_char('{'); 231 | 232 | for (int i = 0; i < elmt_num; i+=2) { 233 | obj_t* key = elmt_vect[i]; 234 | dump_obj(key); 235 | 236 | output_char(':'); 237 | 238 | obj_t* val= elmt_vect[i+1]; 239 | dump_obj(val); 240 | 241 | if (i + 2 != elmt_num) { 242 | output_char(','); 243 | } 244 | } 245 | output_char('}'); 246 | 247 | delete[] elmt_vect; 248 | } 249 | 250 | void 251 | JsonDumper::dump_obj(const obj_t* obj) { 252 | obj_ty_t ot = (obj_ty_t) obj->obj_ty; 253 | if (ot <= OT_LAST_PRIMITIVE) { 254 | dump_primitive(obj); 255 | return; 256 | } 257 | 258 | if (ot == OT_ARRAY) { 259 | dump_array(obj); 260 | return; 261 | } 262 | 263 | if (ot == OT_HASHTAB) { 264 | dump_hashtab(obj); 265 | return; 266 | } 267 | 268 | resize(128); 269 | int remain_sz = _buf_len - _content_len; 270 | snprintf(_buf + _content_len, remain_sz, "unknown obj type %d", ot); 271 | } 272 | 273 | void 274 | JsonDumper::dump(const obj_t* obj) { 275 | if (!_buf) { 276 | _buf_len = 128; 277 | _content_len = 0; 278 | _buf = (char*)malloc(_buf_len); 279 | } 280 | 281 | const obj_t* outmost = obj; 282 | obj_ty_t ot = (obj_ty_t) obj->obj_ty; 283 | if (ot > OT_LAST_PRIMITIVE) { 284 | obj_composite_t* cobj = (obj_composite_t*)(void*)obj; 285 | do { 286 | if ((cobj = 
cobj->reverse_nesting_order)) { 287 | outmost = (obj_t*)(void*)cobj; 288 | } else { 289 | break; 290 | } 291 | } while (true); 292 | } 293 | 294 | dump_obj(outmost); 295 | _buf[_content_len] = '\0'; 296 | } 297 | 298 | void 299 | JsonDumper::output_char(char c) { 300 | resize(1); 301 | _buf[_content_len++] = c; 302 | } 303 | 304 | void 305 | JsonDumper::resize(uint32_t remain_sz) { 306 | uint32_t min_sz = remain_sz + _content_len + 1; 307 | if (_buf_len < min_sz) { 308 | _buf_len = min_sz * 2; 309 | _buf = (char*) realloc(_buf, _buf_len); 310 | } 311 | } 312 | 313 | void 314 | JsonDumper::append_str(const char* str, uint32_t str_len) { 315 | resize(_content_len + str_len); 316 | 317 | memcpy(_buf + _content_len, str, str_len + 1); 318 | _content_len += str_len; 319 | } 320 | 321 | void 322 | JsonDumper::free_buf() { 323 | if (_buf) 324 | free((void*)_buf); 325 | 326 | _buf_len = _content_len = 0; 327 | } 328 | 329 | JsonDumper::JsonDumper() { 330 | _buf = 0; 331 | _buf_len = 0; 332 | _content_len = 0; 333 | } 334 | 335 | ////////////////////////////////////////////////////////////////////// 336 | // 337 | // Implementation of TestSpecIter 338 | // 339 | ////////////////////////////////////////////////////////////////////// 340 | // 341 | const char* TestSpecIter::_input_banner = "input"; 342 | const char* TestSpecIter::_output_banner = "output"; 343 | const char TestSpecIter::_banner_delimiter = ':'; 344 | 345 | TestSpecIter::TestSpecIter(const char* filename): 346 | _err_occ(false), _cur_linenum(0) { 347 | 348 | _input_file.open(filename); 349 | if (_input_file.fail()) { 350 | _err_occ = true; 351 | _err_msg = "fail to open "; 352 | _err_msg += filename; 353 | } 354 | } 355 | 356 | TestSpecIter::~TestSpecIter() { 357 | if (_input_file.is_open()) 358 | _input_file.close(); 359 | } 360 | 361 | bool 362 | TestSpecIter::get_spec(string& input, string& output, int& linenum) { 363 | if (_err_occ) 364 | return false; 365 | 366 | if (!get_line(input, _input_banner, 
_banner_delimiter)) 367 | return false; 368 | 369 | linenum = _cur_linenum; 370 | if (!get_line(output, _output_banner, _banner_delimiter)) 371 | return false; 372 | 373 | return true; 374 | } 375 | 376 | 377 | string::iterator 378 | TestSpecIter::first_non_space(string::iterator start, 379 | string::iterator end) const { 380 | for (; start != end; ++start) { 381 | char c = *start; 382 | if (c != ' ' && c != '\t') { 383 | return start; 384 | } 385 | } 386 | return end; 387 | } 388 | 389 | // Get the next line of the input stream, skipping empty and comment 390 | // lines. The next line is expected to be in the format of 391 | // " ...". If it is not in this format, or something 392 | // wrong happens, false is returned; otherwise, true is retuened, and 393 | // the "result" is set to be the next line. 394 | // 395 | // if "banner_2_space" is set, the banner and delimitor are replaced 396 | // with space. 397 | // 398 | bool 399 | TestSpecIter::get_line(string& result, const char* leading_banner, 400 | char banner_delimiter, bool banner_2_space) { 401 | if (_err_occ) 402 | return false; 403 | 404 | int banner_len = strlen(leading_banner); 405 | 406 | do { 407 | if (!getline(_input_file, result)) 408 | return false; 409 | 410 | _cur_linenum++; 411 | string::iterator iter , str_end = result.end(); 412 | iter = first_non_space(result.begin(), str_end); 413 | 414 | if (iter == str_end || *iter == '#') { 415 | // it is either empty line or a comment line. 
416 | continue; 417 | } 418 | 419 | // find the leading-banner 420 | if (result.compare(iter - result.begin(), 421 | banner_len, leading_banner) != 0) { 422 | format_err_msg("line:%d expect to start with '%s * %c'", 423 | _cur_linenum, leading_banner, banner_delimiter); 424 | return false; 425 | } 426 | 427 | iter = first_non_space(iter + banner_len, str_end); 428 | if (iter == str_end || *iter != banner_delimiter) { 429 | format_err_msg("line:%d expect to deliminter '%c' after '%s'", 430 | _cur_linenum, banner_delimiter, leading_banner); 431 | return false; 432 | } 433 | 434 | iter = iter + 1; 435 | if (banner_2_space) { 436 | result.replace(result.begin(), iter, iter - result.begin(), ' '); 437 | } 438 | 439 | return true; 440 | } while(true); 441 | 442 | return false; 443 | } 444 | 445 | bool 446 | TestSpecIter::err_occur(std::string& errmsg) { 447 | if (_err_occ) { 448 | errmsg = _err_msg; 449 | return true; 450 | } 451 | 452 | return false; 453 | } 454 | 455 | void __attribute__((format(printf, 2, 3))) 456 | TestSpecIter::format_err_msg(const char* fmt, ...) 
//////////////////////////////////////////////////////////////////////
//
//  JsonDumper dumps an obj_t into a human-readable JSON text that is
//  kept in an internal, heap-allocated buffer.
//
//////////////////////////////////////////////////////////////////////
//
class JsonDumper {
public:
    // Starts with no buffer; the buffer is allocated by the first dump().
    JsonDumper();
    ~JsonDumper() { free_buf(); }

    // Serialize "obj" into the internal buffer and NUL-terminate it. For a
    // composite object, dumping starts from the object reached by following
    // the reverse_nesting_order links to their end.
    void dump(const obj_t* obj);

    // The internal buffer; null until dump() has been called.
    const char* get_buf() { return _buf; }

    // Free the internal buffer and reset both length counters to 0.
    void free_buf();

private:
    // One dumper per object kind; dump_obj() dispatches on obj_ty.
    void dump_primitive(const obj_t*);
    // Returns the length of the quoted/escaped string. With dryrun set,
    // only the length is computed and nothing is written.
    int dump_str(const obj_t*, bool dryrun=false);
    void dump_hashtab(const obj_t*);
    void dump_array(const obj_t*);
    void dump_obj(const obj_t*);

    // Make sure at least "min_remain_sz" more bytes, plus one for the
    // terminating NUL, fit after the current content.
    void resize(uint32_t min_remain_sz);
    // Append a single character.
    void output_char(char);
    // Append "str_len" bytes of "str" (plus its terminating NUL).
    void append_str(const char* str, uint32_t str_len);

    // Decode the multi-byte UTF-8 sequence starting at "utf8_seq". Returns
    // the code point and sets "seq_len" to the sequence length, or returns
    // -1 if the first byte is not a valid multi-byte lead byte.
    int get_utf8_codepoint(const char* utf8_seq, int len, int& seq_len);

private:
    char* _buf;             // output buffer (malloc/realloc'ed)
    uint32_t _buf_len;      // allocated size of _buf
    uint32_t _content_len;  // number of bytes of _buf in use
};
//////////////////////////////////////////////////////////////////////
//
//  TestSpecIter iterates over the test cases of a test-spec file. A
//  test-spec file is a sequence of line pairs:
//
//      input  : text handed to the parser
//      output : text the test expects back
//      ...
//
//  Empty lines and lines whose first non-blank character is '#' are
//  skipped.
//
//////////////////////////////////////////////////////////////////////
//
class TestSpecIter {
public:
    // Opens "file"; if that fails the error flag is set and an error
    // message naming the file is recorded.
    TestSpecIter(const char* file);
    ~TestSpecIter();

    // Fetch the next input/output pair; "linenum" is set to the line number
    // of the input line. Returns false if an error occurs or the end of the
    // input file is hit.
    bool get_spec(std::string& input, std::string& output, int& linenum);

    // Returns true iff the error flag is set, in which case the recorded
    // message is copied to "errmsg".
    bool err_occur(std::string& errmsg);

private:
    // Read the next non-empty, non-comment line, which must start with
    // "leading_banner" followed by "banner_delimiter". With
    // "banner_2_space" set, the banner and the delimiter are replaced with
    // spaces in "result".
    bool get_line(std::string& result, const char* leading_banner,
                  char banner_delimiter, bool banner_2_space = true);

    // First position in [start, end) that holds neither ' ' nor '\t', or
    // "end" if there is none.
    std::string::iterator first_non_space(std::string::iterator start,
                                          std::string::iterator end) const;

    // printf-style formatting into _err_msg. The format attribute uses
    // indexes 2 and 3 because the implicit "this" counts as argument 1.
    void format_err_msg(const char* fmt, ...)
        __attribute__((format(printf, 2, 3))) ;

private:
    static const char* _input_banner;       // "input"
    static const char* _output_banner;      // "output"
    static const char _banner_delimiter;    // ':'

    std::ifstream _input_file;
    std::string _err_msg;
    bool _err_occ;          // the error flag reported by err_occur()
    int _cur_linenum;       // number of lines read so far
};
// Strip leading and trailing blanks (spaces and tabs) from "str" in place.
// A string made up of blanks only, or an empty one, becomes empty.
static void
trim_space_both_ends(string& str) {
    static const char* const blanks = " \t";

    string::size_type head = str.find_first_not_of(blanks);
    if (head == string::npos) {
        // nothing but blanks
        str.clear();
        return;
    }

    // head is valid, hence at least one non-blank exists and tail >= head.
    string::size_type tail = str.find_last_not_of(blanks);
    str = str.substr(head, tail - head + 1);
}
%s\n", jp_get_err(parser)); 72 | fail_num++; 73 | continue; 74 | } 75 | } else { 76 | JsonDumper dumper; 77 | dumper.dump(result); 78 | real_output = dumper.get_buf(); 79 | } 80 | 81 | if (expect_output.compare(real_output) != 0) { 82 | fprintf(stdout, "fail!\n >>>expect:%s\n >>>got:%s\n", 83 | expect_output.c_str(), real_output.c_str()); 84 | 85 | fail_num++; 86 | continue; 87 | } else { 88 | fprintf(stdout, "succ\n"); 89 | } 90 | } 91 | 92 | string err_msg; 93 | if (test_iter.err_occur(err_msg)) { 94 | fprintf(stdout, "fail: %s\n", err_msg.c_str()); 95 | test_spec_wrong = true; 96 | } 97 | 98 | jp_destroy(parser); 99 | } 100 | 101 | int 102 | main(int argc, char** argv) { 103 | test_driver("test_spec/test_token.txt", "Scaner testing cases"); 104 | test_driver("test_spec/test_composite.txt", "Test array/hashtab"); 105 | test_driver("test_spec/test_misc.txt", "Misc testing cases"); 106 | test_driver("test_spec/test_diagnostic.txt", "Test diagnoistic information", true); 107 | 108 | fprintf(stdout, 109 | "\nSummary\n=====================================\n Test: %d, fail :%d\n", 110 | test_num, fail_num); 111 | 112 | return (fail_num != 0 || test_spec_wrong) ? 1 : 0; 113 | } 114 | -------------------------------------------------------------------------------- /util.h: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_H 2 | #define UTIL_H 3 | 4 | #ifdef DEBUG 5 | #include /* for abort */ 6 | #endif 7 | 8 | #include "ljson_parser.h" 9 | 10 | #define likely(x) __builtin_expect((x),1) 11 | #define unlikely(x) __builtin_expect((x),0) 12 | 13 | #define offsetof(t, m) __builtin_offsetof(t, m) 14 | 15 | #ifdef DEBUG 16 | #define ASSERT(c) if (!(c))\ 17 | { fprintf(stderr, "%s:%d Assert: %s\n", __FILE__, __LINE__, #c); abort(); } 18 | #else 19 | #define ASSERT(c) ((void)0) 20 | #endif 21 | 22 | #endif 23 | --------------------------------------------------------------------------------