├── LICENSE.md
├── README.md
├── dataset
├── bestbuy_sample_large_record.json
├── bestbuy_sample_small_records.json
├── twitter_sample_large_record.json
└── twitter_sample_small_records.json
├── doc
├── compare_large_server1.png
├── compare_large_server2.png
├── compare_large_server3.png
├── compare_small_server1.png
└── compare_small_server2.png
├── example
├── example1.cpp
├── example2.cpp
├── example3.cpp
└── example4.cpp
├── makefile
└── src
├── Bitmap.h
├── BitmapConstructor.cpp
├── BitmapConstructor.h
├── BitmapIterator.h
├── LocalBitmap.cpp
├── LocalBitmap.h
├── ParallelBitmap.cpp
├── ParallelBitmap.h
├── ParallelBitmapConstructor.cpp
├── ParallelBitmapConstructor.h
├── ParallelBitmapIterator.cpp
├── ParallelBitmapIterator.h
├── RecordLoader.cpp
├── RecordLoader.h
├── Records.h
├── SerialBitmap.cpp
├── SerialBitmap.h
├── SerialBitmapConstructor.cpp
├── SerialBitmapConstructor.h
├── SerialBitmapIterator.cpp
├── SerialBitmapIterator.h
├── Tokenizer.cpp
└── Tokenizer.h
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 AutomataLab
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Pison
2 | Pison builds a structural index (bitmaps for colons and commas at different levels) for JSON records to accelerate JSON analytics.
3 | It leverages both coarse-grained (multicore) parallelism and fine-grained (bitwise and SIMD) parallelism to make index construction efficient.
4 | For more details about Pison, please refer to our paper [1].
5 |
6 | The original idea of JSON structural index construction was proposed in Mison [2]. The major improvement of Pison over Mison is the capability of building a structural index for **a single large JSON record** in parallel. In addition, it optimizes the index construction steps, including adopting some bitwise operations used in [simdjson](https://github.com/simdjson/simdjson) [3], to further enhance the performance.
7 |
8 | ## Publications
9 | [1] Lin Jiang, Junqiao Qiu, Zhijia Zhao. [Scalable Structural Index Construction for JSON Analytics](https://vldb.org/pvldb/vol14/p694-zhao.pdf). PVLDB, 14(4):694-707, 2021.
10 |
11 | [2] Yinan Li, Nikos R. Katsipoulakis, Badrish Chandramouli, Jonathan Goldstein, D. Kossmann. Mison: A Fast JSON Parser for Data Analytics. PVLDB, 10(10): 2017.
12 |
13 | [3] Langdale, Geoff, and Daniel Lemire. "Parsing gigabytes of JSON per second." The VLDB Journal 28, no. 6 (2019): 941-960.
14 |
15 | ## Getting Started
16 | ### Prerequisites
17 | - **Hardware**: CPU processors should support `64-bit ALU instructions`, `256-bit SIMD instruction set`, and the `carry-less multiplication instruction (pclmulqdq)`
18 | - **Operating System**: `Linux`
19 | - **C++ Compiler**: `g++` (7.4.0 or higher)
20 |
21 | ### Dataset
22 | Four sample datasets are included in `dataset` folder. Large datasets (used in performance evaluation) can be downloaded from https://drive.google.com/drive/folders/1157Uho73N3b4e2a7ZI7CUx9gpdG_0pmM?usp=drive_link and placed into the `dataset` folder.
23 |
24 | ### Examples
25 | A few examples (in `cpp` files) are provided in the `example` folder. They demonstrate how to use our APIs to implement JSON queries. To create and test your examples, please update the `makefile` accordingly.
26 |
27 | ### Build
28 | ```
29 | make clean
30 | make all
31 | ```
32 | ### Run
33 | Assume executable example file is `example1`.
34 | ```
35 | cd bin
36 | ./example1
37 | ```
38 |
39 | ## Performance Results
40 | We compared Pison with [RapidJSON](https://github.com/Tencent/rapidjson) and [simdjson](https://github.com/simdjson/simdjson) for processing (i) a single bulky JSON record and (ii) a sequence of small JSON records. These datasets include Best Buy (BB) product dataset, tweets (TT) from Twitter developer API, Google Maps Directions (GMD) dataset, National Statistics Post-code Lookup (NSPL) dataset for United Kingdom, Walmart (WM) product dataset, and Wikipedia (WP) entity dataset. Each dataset is a single large JSON record of approximately 1GB. Small records are extracted from the dominating array (a large array consisting of sub-records) in each dataset, and are delimited by newlines. For each dataset, we created a JSONPath query, listed in the following table (for bulky records made of small records, an additional prefix `[*]` is added):
41 |
42 | | ID | JSONPath Query | Number of Matches |
43 | | :-----------------: |:---------------------------| :---------------------:|
44 | | TT | `{$.user.lang, $.lang}` | 300,270 |
45 | | BB | `{$.categoryPath[1:3].id}` | 459,332 |
46 | | GMD | `{$.routes[*].legs[*].steps[*].distance.text}` | 1,716,752 |
47 | | NSPL | `{$.meta.view.columns[*].name}` | 44 |
48 | | WM | `{$.bestMarketplacePrice.price, $.name}` | 288,391 |
49 | | WP | `{$.claims.P150[*].mainsnak.property}` | 15,603 |
50 |
51 |
52 | All experiments were conducted on two Xeon servers:
53 | - **[Server 1]**: a 16-core machine equipped with two Intel 2.1GHz Xeon E5-2620 v4 CPUs and 64GB RAM.
54 | - **[Server 2]**: a 4-core machine equipped with an Intel 3.5GHz Xeon E3-1240 v5 CPU and 16GB RAM.
55 |
56 | The following two figures report the execution time (including both the index construction and the query evaluation) for bulky JSON record processing. Overall, both Pison and simdjson outperform RapidJSON, thanks to the use of SIMD and bitwise parallelism. The performance of serial Pison is comparable to simdjson, while parallel Pison achieves 5.4X and 3.1X speedups (on average) over simdjson on Server 1 (with 8 threads) and Server 2 (with 4 threads), respectively.
57 |
58 | Fig.1 - Execution Time of Processing A Single Large Record (Server 1).
59 |
60 |
61 |
62 | Fig.2 - Execution Time of Processing A Single Large Record (Server 2).
63 |
64 |
65 |
66 |
67 | In the scenario of small records processing, parallelism can be easily achieved at the task level (i.e., processing different records in parallel), so we only report the serial performance of Pison. Overall, performance results are consistent with those in large record processing.
68 |
69 | Fig.3 - Execution Time of Processing A Sequence of Small Records (Server 1).
70 |
71 |
72 |
73 | Fig.4 - Execution Time of Processing A Sequence of Small Records (Server 2).
74 |
75 |
76 |
77 | More detailed evaluation can be found in our VLDB'21 paper (see reference above).
78 |
79 | ## APIs
80 | ### Records Loading (Class: RecordLoader)
81 | - `static Record* loadSingleRecord(char* file_path)`: loads the whole input file as one single record (allow newlines in strings and other legal places).
82 | - `static RecordSet* loadRecords(char* file_path)`: loads multiple records from the input file (all newlines are treated as delimiters; no newlines (except for `\n` and `\r` in JSON strings) are allowed within a record); `RecordSet` can be accessed in array style (see `example3.cpp` and `example4.cpp` in `example` folder).
83 | ### Generating Leveled Bitmap Indices (Class: BitmapConstructor)
84 | - `static Bitmap* construct(Record* record, int thread_num = 1, int level_num = MAX_LEVEL)`: constructs leveled bitmaps for a single record in parallel (indicated by `thread_num`); bitmap indices can be created based on the maximum level of given queries or the JSON record (indicated by `level_num`).
85 | - `static BitmapIterator* getIterator(Bitmap* bi)`: creates iterator for bitmap indices.
86 | ### Bitmap Indices Iterator (Class: BitmapIterator)
87 | - `BitmapIterator* getCopy()`: gets a copy of an iterator (used for parallel accessing).
88 | - `bool down()`: moves to the lower level of the leveled bitmaps.
89 | - `bool up()`: moves to the upper level of the leveled bitmaps.
90 | - `bool isObject()`: checks if the iterator points to an object.
91 | - `bool isArray()`: checks if the iterator points to an array.
92 | - `bool moveToKey(char* key)`: moves to the corresponding key field inside the current object.
93 | - `char* moveToKey(unordered_set<char*>& key_set)`: moves to one of the corresponding key fields inside the current object and returns the matched key (`NULL` if no such key is found).
94 | - `bool moveToIndex(index) `: moves to a specific element in the current array.
95 | - `bool moveNext()`: moves to the next element in the current array.
96 | - `char* getValue()`: gets the value/element of the current key/array index.
97 | - `int numArrayElements()`: gets the number of elements inside the current array.
98 |
--------------------------------------------------------------------------------
/doc/compare_large_server1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutomataLab/Pison/8a01ce3354742365321b39ae1ed44edc2a91f7ad/doc/compare_large_server1.png
--------------------------------------------------------------------------------
/doc/compare_large_server2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutomataLab/Pison/8a01ce3354742365321b39ae1ed44edc2a91f7ad/doc/compare_large_server2.png
--------------------------------------------------------------------------------
/doc/compare_large_server3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutomataLab/Pison/8a01ce3354742365321b39ae1ed44edc2a91f7ad/doc/compare_large_server3.png
--------------------------------------------------------------------------------
/doc/compare_small_server1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutomataLab/Pison/8a01ce3354742365321b39ae1ed44edc2a91f7ad/doc/compare_small_server1.png
--------------------------------------------------------------------------------
/doc/compare_small_server2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutomataLab/Pison/8a01ce3354742365321b39ae1ed44edc2a91f7ad/doc/compare_small_server2.png
--------------------------------------------------------------------------------
/example/example1.cpp:
--------------------------------------------------------------------------------
1 | #include "../src/RecordLoader.h"
2 | #include "../src/BitmapIterator.h"
3 | #include "../src/BitmapConstructor.h"
4 |
5 | // $[*].user.id
6 | string query(BitmapIterator* iter) { /* evaluates $[*].user.id; returns every matched "id" value, each followed by ';' (empty string if iter is not on an array) */
7 |     string output = "";
8 |     while (iter->isArray() && iter->moveNext() == true) { /* visit the elements of the current array one by one */
9 |         if (iter->down() == false) continue; /* array element on the top level */
10 |         if (iter->isObject() && iter->moveToKey("user")) {
11 |             if (iter->down() == false) continue; /* value of "user"; NOTE(review): this path skips the iter->up() that pairs with the first down() - confirm the iterator level stays correct */
12 |             if (iter->isObject() && iter->moveToKey("id")) {
13 |                 // value of "id"; it is released with free() below once it has been copied into output
14 |                 char* value = iter->getValue();
15 |                 if (value) output.append(value).append(";"); /* guard first: std::string::append(const char*) is undefined for a null pointer */
16 |                 if (value) free(value);
17 |             }
18 |             iter->up(); /* leave the "user" object */
19 |         }
20 |         iter->up(); /* leave the array element */
21 |     }
22 |     return output;
23 | }
24 |
25 | int main() {
26 | char* file_path = "../dataset/twitter_sample_large_record.json";
27 | Record* rec = RecordLoader::loadSingleRecord(file_path);
28 | if (rec == NULL) {
29 | cout<<"record loading fails."<isObject() && iter->moveToKey("products")) {
9 | if (iter->down() == false) return output; /* value of "products" */
10 | while (iter->isArray() && iter->moveNext() == true) {
11 | if (iter->down() == false) continue;
12 | if (iter->isObject() && iter->moveToKey("categoryPath")) {
13 | if (iter->down() == false) continue; /* value of "categoryPath" */
14 | if (iter->isArray()) {
15 | for (int idx = 1; idx <= 2; ++idx) {
16 | // 2nd and 3rd elements inside "categoryPath" array
17 | if (iter->moveToIndex(idx)) {
18 | if (iter->down() == false) continue;
19 | if (iter->isObject() && iter->moveToKey("id")) {
20 | // value of "id"
21 | char* value = iter->getValue();
22 | output.append(value).append(";");
23 | if (value) free(value);
24 | }
25 | iter->up();
26 | }
27 | }
28 | }
29 | iter->up();
30 | }
31 | iter->up();
32 | }
33 | iter->up();
34 | }
35 | return output;
36 | }
37 |
38 | int main() {
39 | char* file_path = "../dataset/bestbuy_sample_large_record.json";
40 | Record* rec = RecordLoader::loadSingleRecord(file_path);
41 | if (rec == NULL) {
42 | cout<<"record loading fails."<isObject()) {
9 | unordered_set set;
10 | set.insert("user");
11 | set.insert("retweet_count");
12 | char* key = NULL;
13 | while ((key = iter->moveToKey(set)) != NULL) {
14 | if (strcmp(key, "retweet_count") == 0) {
15 | // value of "retweet_count"
16 | char* value = iter->getValue();
17 | output.append(value).append(";");
18 | if (value) free(value);
19 | } else {
20 | if (iter->down() == false) continue; /* value of "user" */
21 | if (iter->isObject() && iter->moveToKey("id")) {
22 | // value of "id"
23 | char* value = iter->getValue();
24 | output.append(value).append(";");
25 | if (value) free(value);
26 | }
27 | iter->up();
28 | }
29 | }
30 | }
31 | return output;
32 | }
33 |
34 | int main() {
35 | char* file_path = "../dataset/twitter_sample_small_records.json";
36 | RecordSet* record_set = RecordLoader::loadRecords(file_path);
37 | if (record_set->size() == 0) {
38 | cout<<"record loading fails."<size();
56 | Bitmap* bm = NULL;
57 | for (int i = 0; i < num_recs; i++) {
58 | bm = BitmapConstructor::construct((*record_set)[i], thread_num, level_num);
59 | BitmapIterator* iter = BitmapConstructor::getIterator(bm);
60 | output.append(query(iter));
61 | delete iter;
62 | delete bm;
63 | }
64 | delete record_set;
65 |
66 | cout<<"matches are: "<