├── LICENSE.md
├── README.md
├── dataset
    ├── bestbuy_sample_large_record.json
    ├── bestbuy_sample_small_records.json
    ├── twitter_sample_large_record.json
    └── twitter_sample_small_records.json
├── doc
    ├── compare_large_server1.png
    ├── compare_large_server2.png
    ├── compare_large_server3.png
    ├── compare_small_server1.png
    └── compare_small_server2.png
├── example
    ├── example1.cpp
    ├── example2.cpp
    ├── example3.cpp
    └── example4.cpp
├── makefile
└── src
    ├── Bitmap.h
    ├── BitmapConstructor.cpp
    ├── BitmapConstructor.h
    ├── BitmapIterator.h
    ├── LocalBitmap.cpp
    ├── LocalBitmap.h
    ├── ParallelBitmap.cpp
    ├── ParallelBitmap.h
    ├── ParallelBitmapConstructor.cpp
    ├── ParallelBitmapConstructor.h
    ├── ParallelBitmapIterator.cpp
    ├── ParallelBitmapIterator.h
    ├── RecordLoader.cpp
    ├── RecordLoader.h
    ├── Records.h
    ├── SerialBitmap.cpp
    ├── SerialBitmap.h
    ├── SerialBitmapConstructor.cpp
    ├── SerialBitmapConstructor.h
    ├── SerialBitmapIterator.cpp
    ├── SerialBitmapIterator.h
    ├── Tokenizer.cpp
    └── Tokenizer.h


/LICENSE.md:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 AutomataLab
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Pison
 2 | Pison builds a structural index (bitmaps for colons and commas at different levels) for JSON records to accelerate JSON analytics. 
 3 | It leverages both coarse-grained (multicore) parallelism and fine-grained (bitwise and SIMD) parallelism to make index construction efficient.
 4 | For more details about Pison, please refer to our paper [1].
 5 | 
 6 | The original idea of JSON structural index construction was proposed in Mison [2]. The major improvement of Pison over Mison is the capability of building a structural index for **a single large JSON record** in parallel. In addition, it optimizes the index construction steps, including adopting some bitwise operations used in [simdjson](https://github.com/simdjson/simdjson) [3], to further enhance the performance. 
 7 | 
 8 | ## Publications
 9 | [1] Lin Jiang, Junqiao Qiu, Zhijia Zhao. [Scalable Structural Index Construction for JSON Analytics](https://vldb.org/pvldb/vol14/p694-zhao.pdf). PVLDB, 14(4):694-707, 2021.
10 | 
11 | [2] Yinan Li, Nikos R. Katsipoulakis, Badrish Chandramouli, Jonathan  Goldstein, D. Kossmann. Mison: A Fast JSON Parser for Data Analytics. PVLDB, 10(10): 2017.
12 | 
13 | [3] Langdale, Geoff, and Daniel Lemire. "Parsing gigabytes of JSON per second." The VLDB Journal 28, no. 6 (2019): 941-960.
14 | 
15 | ## Getting Started
16 | ### Prerequisites
17 | - **Hardware**: CPU processors should support `64-bit ALU instructions`, `256-bit SIMD instruction set`, and the `carry-less multiplication instruction (pclmulqdq)`
18 | - **Operating System**: `Linux`
19 | - **C++ Compiler**: `g++` (7.4.0 or higher)
20 | 
21 | ### Dataset
22 | Four sample datasets are included in `dataset` folder. Large datasets (used in performance evaluation) can be downloaded from https://drive.google.com/drive/folders/1157Uho73N3b4e2a7ZI7CUx9gpdG_0pmM?usp=drive_link and placed into the `dataset` folder. 
23 | 
24 | ### Examples
25 | A few examples (in `cpp` files) are provided in the `example` folder. They demonstrate how to use our APIs to implement JSON queries. To create and test your examples, please update the `makefile` accordingly.
26 | 
27 | ### Build
28 |   ```
29 |   make clean
30 |   make all
31 |   ```
32 | ### Run
33 | Assume executable example file is `example1`.
34 |   ```
35 |   cd bin
36 |   ./example1
37 |   ```
38 | 
39 | ## Performance Results
40 | We compared Pison with [RapidJSON](https://github.com/Tencent/rapidjson) and [simdjson](https://github.com/simdjson/simdjson) for processing (i) a single bulky JSON record and (ii) a sequence of small JSON records. These datasets include Best Buy (BB) product dataset, tweets (TT) from Twitter developer API, Google Maps Directions (GMD) dataset, National Statistics Post-code Lookup (NSPL) dataset for United Kingdom, Walmart (WM) product dataset, and Wikipedia (WP) entity dataset. Each dataset is a single large JSON record of approximately 1GB. Small records are extracted from the dominating array (a large array consisting of sub-records) in each dataset, and are delimited by newlines. For each dataset, we created a JSONPath query, listed in the following table (for bulky records made of small records, an additional prefix `[*]` is added):
41 | 
42 | | ID                  |   JSONPath Query                     |    Number of Matches   |
43 | | :-----------------: |:---------------------------| :---------------------:|
44 | | TT                  |   `{$.user.lang, $.lang}`     |    300,270            |
45 | | BB                  |   `{$.categoryPath[1:3].id}`  |    459,332            |
46 | | GMD                 |   `{$.routes[*].legs[*].steps[*].distance.text}`  |    1,716,752            |
47 | | NSPL                       | `{$.meta.view.columns[*].name}`     |    44     |
48 | | WM                    | `{$.bestMarketplacePrice.price, $.name}`      |   288,391  |
49 | | WP                       | `{$.claims.P150[*].mainsnak.property}`          |  15,603  |
50 | 
51 | 
52 | All experiments were conducted on two Xeon servers: 
53 | - **[Server 1]**: a 16-core machine equipped with two Intel 2.1GHz Xeon E5-2620 v4 CPUs and 64GB RAM. 
54 | - **[Server 2]**: a 4-core machine equipped with an Intel 3.5GHz Xeon E3-1240 v5 CPU and 16GB RAM. 
55 | 
56 | The following two figures report the execution time (including both the index construction and the query evaluation) for bulky JSON record processing. Overall, both Pison and simdjson outperform RapidJSON, thanks to the use of SIMD and bitwise parallelism. The performance of serial Pison is comparable to simdjson, while parallel Pison achieves 5.4X and 3.1X speedups (on average) over simdjson on Server 1 (with 8 threads) and Server 2 (with 4 threads), respectively. 
57 | 
58 | <figcaption style="text-align:center"><b>Fig.1 - Execution Time of Processing A Single Large Record (Server 1).</b></figcaption>
59 | <br/>
60 | <img src="doc/compare_large_server1.png" width="70%"></img>
61 | 
62 | <figcaption style="text-align:center"><b>Fig.2 - Execution Time of Processing A Single Large Record (Server 2).</b></figcaption>
63 | <br/>
64 | <img src="doc/compare_large_server2.png" width="70%"></img>
65 | 
66 | 
67 | In the scenario of small records processing, parallelism can be easily achieved at the task level (i.e., processing different records in parallel), so we only report the serial performance of Pison. Overall, performance results are consistent with those in large record processing.
68 | 
69 | <figcaption style="text-align:center"><b>Fig.3 - Execution Time of Processing A Sequence of Small Records (Server 1).</b></figcaption>
70 | <br/>
71 | <img src="doc/compare_small_server1.png" width="70%"></img>
72 | 
73 | <figcaption style="text-align:center"><b>Fig.4 - Execution Time of Processing A Sequence of Small Records (Server 2).</b></figcaption>
74 | <br/>
75 | <img src="doc/compare_small_server2.png" width="70%"></img>
76 | 
77 | More detailed evaluation can be found in our VLDB'21 paper (see reference above).
78 | 
79 | ## APIs
80 | ### Records Loading (Class: RecordLoader)
81 | - `static Record* loadSingleRecord(char* file_path)`: loads the whole input file as one single record (allow newlines in strings and other legal places). 
82 | - `static RecordSet* loadRecords(char* file_path)`: loads multiple records from the input file (all newlines are treated as delimiters; no newlines (except for `\n` and `\r` in JSON strings) are allowed within a record); `RecordSet` can be accessed in array style (see `example3.cpp` and `example4.cpp` in `example` folder).
83 | ### Generating Leveled Bitmap Indices (Class: BitmapConstructor)
84 | - `static Bitmap* construct(Record* record, int thread_num = 1, int level_num = MAX_LEVEL)`: constructs leveled bitmaps for a single record in parallel (indicated by `thread_num`); bitmap indices can be created based on the maximum level of given queries or the JSON record (indicated by `level_num`).
85 | - `static BitmapIterator* getIterator(Bitmap* bi)`: creates iterator for bitmap indices.
86 | ### Bitmap Indices Iterator (Class: BitmapIterator)
87 | - `BitmapIterator* getCopy()`: gets a copy of an iterator (used for parallel accessing).
88 | - `bool down()`: moves to the lower level of the leveled bitmaps.
89 | - `bool up()`: moves to the upper level of the leveled bitmaps.
90 | - `bool isObject()`: checks if the iterator points to an object.
91 | - `bool isArray()`: checks if the iterator points to an array.
92 | - `bool moveToKey(char* key)`: moves to the corresponding key field inside the current object.
93 | - `char* moveToKey(unordered_set<char*>& key_set)`: moves to one of the corresponding key fields inside the current object and returns the name of the matched key; the matched key is removed from `key_set`.
94 | - `bool moveToIndex(int index)`: moves to a specific element in the current array.
95 | - `bool moveNext()`: moves to the next element in the current array.
96 | - `char* getValue()`: gets the value/element of the current key/array index.
97 | - `int numArrayElements()`: gets the number of elements inside the current array.
98 | 


--------------------------------------------------------------------------------
/doc/compare_large_server1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutomataLab/Pison/8a01ce3354742365321b39ae1ed44edc2a91f7ad/doc/compare_large_server1.png


--------------------------------------------------------------------------------
/doc/compare_large_server2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutomataLab/Pison/8a01ce3354742365321b39ae1ed44edc2a91f7ad/doc/compare_large_server2.png


--------------------------------------------------------------------------------
/doc/compare_large_server3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutomataLab/Pison/8a01ce3354742365321b39ae1ed44edc2a91f7ad/doc/compare_large_server3.png


--------------------------------------------------------------------------------
/doc/compare_small_server1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutomataLab/Pison/8a01ce3354742365321b39ae1ed44edc2a91f7ad/doc/compare_small_server1.png


--------------------------------------------------------------------------------
/doc/compare_small_server2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutomataLab/Pison/8a01ce3354742365321b39ae1ed44edc2a91f7ad/doc/compare_small_server2.png


--------------------------------------------------------------------------------
/example/example1.cpp:
--------------------------------------------------------------------------------
 1 | #include "../src/RecordLoader.h"
 2 | #include "../src/BitmapIterator.h"
 3 | #include "../src/BitmapConstructor.h"
 4 | 
 5 | // $[*].user.id -- returns every matched "id" value, each followed by ';'
 6 | string query(BitmapIterator* iter) {
 7 |     string output = "";
 8 |     while (iter->isArray() && iter->moveNext() == true) {
 9 |         if (iter->down() == false) continue;  /* array element on the top level */
10 |         if (iter->isObject() && iter->moveToKey("user")) {
11 |             if (iter->down() == false) { iter->up(); continue; }  /* "user" is not nested: leave the element before moving on (assumes a failed down() does not descend) */
12 |             if (iter->isObject() && iter->moveToKey("id")) {
13 |                 // value of "id"; the returned buffer is released with free() below
14 |                 char* value = iter->getValue();
15 |                 if (value) output.append(value).append(";");  // never hand a NULL pointer to string::append
16 |                 free(value);  // free(NULL) is a no-op
17 |             }
18 |             iter->up();  // leave the value of "user"
19 |         }
20 |         iter->up();  // leave the array element
21 |     }
22 |     return output;
23 | }
24 | 
25 | int main() {  // loads one large record, builds its leveled bitmap, runs query() and prints the matches
26 |     char file_path[] = "../dataset/twitter_sample_large_record.json";  // writable array: C++11 forbids binding a string literal to char*
27 |     Record* rec = RecordLoader::loadSingleRecord(file_path);
28 |     if (rec == NULL) {
29 |         cout<<"record loading fails."<<endl;
30 |         return -1;
31 |     }
32 | 
33 |     // set the number of threads for parallel bitmap construction
34 |     int thread_num = 16;
35 | 
36 |     /* set the number of levels of bitmaps to create, either based on the
37 |      * query or the JSON records. E.g., query $[*].user.id needs three levels
38 |      * (level 0, 1, 2), but the record may be of more than three levels
39 |      */
40 |     int level_num = 3;
41 | 
42 |     /* process the input record: first build bitmap, then perform
43 |      * the query with a bitmap iterator
44 |      */
45 |     Bitmap* bm = BitmapConstructor::construct(rec, thread_num, level_num);
46 |     BitmapIterator* iter = BitmapConstructor::getIterator(bm);
47 |     string output = query(iter);
48 |     delete iter;
49 |     delete bm;
50 |     delete rec;
51 | 
52 |     cout<<"matches are: "<<output<<endl;
53 |     return 0;
54 | }
55 | 


--------------------------------------------------------------------------------
/example/example2.cpp:
--------------------------------------------------------------------------------
 1 | #include "../src/RecordLoader.h"
 2 | #include "../src/BitmapIterator.h"
 3 | #include "../src/BitmapConstructor.h"
 4 | 
 5 | // $.products[*].categoryPath[1:3].id -- returns every matched "id" value, each followed by ';'
 6 | string query(BitmapIterator* iter) {
 7 |     string output = "";
 8 |     if (iter->isObject() && iter->moveToKey("products")) {
 9 |         if (iter->down() == false) return output;  /* value of "products" */
10 |         while (iter->isArray() && iter->moveNext() == true) {
11 |             if (iter->down() == false) continue;  /* element of the "products" array */
12 |             if (iter->isObject() && iter->moveToKey("categoryPath")) {
13 |                 if (iter->down() == false) { iter->up(); continue; }  /* "categoryPath" is not nested: leave the element before moving on (assumes a failed down() does not descend) */
14 |                 if (iter->isArray()) {
15 |                     for (int idx = 1; idx <= 2; ++idx) {
16 |                         // 2nd and 3rd elements inside "categoryPath" array
17 |                         if (iter->moveToIndex(idx)) {
18 |                             if (iter->down() == false) continue;
19 |                             if (iter->isObject() && iter->moveToKey("id")) {
20 |                                 // value of "id"; the returned buffer is released with free() below
21 |                                 char* value = iter->getValue();
22 |                                 if (value) output.append(value).append(";");  // never hand a NULL pointer to string::append
23 |                                 free(value);  // free(NULL) is a no-op
24 |                             }
25 |                             iter->up();  // leave the "categoryPath" element
26 |                         }
27 |                     }
28 |                 }
29 |                 iter->up();  // leave the value of "categoryPath"
30 |             }
31 |             iter->up();  // leave the "products" element
32 |         }
33 |         iter->up();  // leave the value of "products"
34 |     }
35 |     return output;
36 | }
37 | 
38 | int main() {  // loads one large record, builds its bitmap serially, runs query() and prints the matches
39 |     char file_path[] = "../dataset/bestbuy_sample_large_record.json";  // writable array: C++11 forbids binding a string literal to char*
40 |     Record* rec = RecordLoader::loadSingleRecord(file_path);
41 |     if (rec == NULL) {
42 |         cout<<"record loading fails."<<endl;
43 |         return -1;
44 |     }
45 | 
46 |     /* process the input record in serial order: first build bitmap,
47 |      * then perform the query with a bitmap iterator
48 |      */
49 |     Bitmap* bm = BitmapConstructor::construct(rec);  // defaults: thread_num = 1, level_num = MAX_LEVEL
50 |     BitmapIterator* iter = BitmapConstructor::getIterator(bm);
51 |     string output = query(iter);
52 |     delete iter;
53 |     delete bm;
54 |     delete rec;
55 | 
56 |     cout<<"matches are: "<<output<<endl;
57 |     return 0;
58 | }
59 | 


--------------------------------------------------------------------------------
/example/example3.cpp:
--------------------------------------------------------------------------------
 1 | #include "../src/RecordLoader.h"
 2 | #include "../src/BitmapIterator.h"
 3 | #include "../src/BitmapConstructor.h"
 4 | 
 5 | // {$.user.id, $.retweet_count} -- returns every matched value, each followed by ';'
 6 | string query(BitmapIterator* iter) {
 7 |     string output = "";
 8 |     if (iter->isObject()) {
 9 |         unordered_set<char*> set;  // keys still to be matched; moveToKey(set) removes each key it finds
10 |         set.insert("user");
11 |         set.insert("retweet_count");
12 |         char* key = NULL;
13 |         while ((key = iter->moveToKey(set)) != NULL) {
14 |             if (strcmp(key, "retweet_count") == 0) {
15 |                 // value of "retweet_count"; the returned buffer is released with free() below
16 |                 char* value = iter->getValue();
17 |                 if (value) output.append(value).append(";");  // never hand a NULL pointer to string::append
18 |                 free(value);  // free(NULL) is a no-op
19 |             } else {
20 |                 if (iter->down() == false) continue;  /* value of "user" */
21 |                 if (iter->isObject() && iter->moveToKey("id")) {
22 |                     // value of "id"
23 |                     char* value = iter->getValue();
24 |                     if (value) output.append(value).append(";");
25 |                     free(value);
26 |                 }
27 |                 iter->up();  // leave the value of "user"
28 |             }
29 |         }
30 |     }
31 |     return output;
32 | }
33 | 
34 | int main() {  // loads newline-delimited records, queries each one in turn and prints all matches
35 |     char file_path[] = "../dataset/twitter_sample_small_records.json";  // writable array: C++11 forbids binding a string literal to char*
36 |     RecordSet* record_set = RecordLoader::loadRecords(file_path);
37 |     if (record_set == NULL || record_set->size() == 0) {  // guard the pointer before asking for its size
38 |         cout<<"record loading fails."<<endl;
39 |         return -1;
40 |     }
41 |     string output = "";
42 | 
43 |     // fix the number of threads to 1 for small records scenario; parallel bitmap construction is TBD.
44 |     int thread_num = 1;
45 | 
46 |     /* set the number of levels of bitmaps to create, either based on the
47 |      * query or the JSON records. E.g., query {$.user.id, $.retweet_count} needs
48 |      * two levels (level 0, 1), but the record may be of more than two levels
49 |      */
50 |     int level_num = 2;
51 | 
52 |     /* process the records one by one: for each one, first build bitmap, then perform
53 |      * the query with a bitmap iterator
54 |      */
55 |     int num_recs = record_set->size();
56 |     Bitmap* bm = NULL;
57 |     for (int i = 0; i < num_recs; i++) {
58 |         bm = BitmapConstructor::construct((*record_set)[i], thread_num, level_num);
59 |         BitmapIterator* iter = BitmapConstructor::getIterator(bm);
60 |         output.append(query(iter));
61 |         delete iter;
62 |         delete bm;
63 |     }
64 |     delete record_set;
65 | 
66 |     cout<<"matches are: "<<output<<endl;
67 |     return 0;
68 | }
69 | 


--------------------------------------------------------------------------------
/example/example4.cpp:
--------------------------------------------------------------------------------
 1 | #include "../src/RecordLoader.h"
 2 | #include "../src/BitmapIterator.h"
 3 | #include "../src/BitmapConstructor.h"
 4 | 
 5 | // $.categoryPath[1:3].id -- returns every matched "id" value, each followed by ';'
 6 | string query(BitmapIterator* iter) {
 7 |     string output = "";
 8 |     if (iter->isObject() && iter->moveToKey("categoryPath")) {
 9 |         if (iter->down() == false) return output; /* value of "categoryPath" */
10 |         if (iter->isArray()) {
11 |             for (int idx = 1; idx <= 2; ++idx) {
12 |                 // 2nd and 3rd elements inside "categoryPath" array
13 |                 if (iter->moveToIndex(idx)) {
14 |                     if (iter->down() == false) continue;
15 |                     if (iter->isObject() && iter->moveToKey("id")) {
16 |                         // value of "id"; the returned buffer is released with free() below
17 |                         char* value = iter->getValue();
18 |                         if (value) output.append(value).append(";");  // never hand a NULL pointer to string::append
19 |                         free(value);  // free(NULL) is a no-op
20 |                     }
21 |                     iter->up();  // leave the "categoryPath" element
22 |                 }
23 |             }
24 |         }
25 |         iter->up();  // leave the value of "categoryPath"
26 |     }
27 |     return output;
28 | }
29 | 
30 | int main() {  // loads newline-delimited records, queries each one in turn and prints all matches
31 |     char file_path[] = "../dataset/bestbuy_sample_small_records.json";  // writable array: C++11 forbids binding a string literal to char*
32 |     RecordSet* record_set = RecordLoader::loadRecords(file_path);
33 |     if (record_set == NULL || record_set->size() == 0) {  // guard the pointer before asking for its size
34 |         cout<<"record loading fails."<<endl;
35 |         return -1;
36 |     }
37 |     string output = "";
38 | 
39 |     // fix the number of threads to 1 for small records scenario; parallel bitmap construction is TBD.
40 |     int thread_num = 1;
41 | 
42 |     /* set the number of levels of bitmaps to create, either based on the
43 |      * query or the JSON records. E.g., query $.categoryPath[1:3].id needs three
44 |      * levels (level 0, 1, 2), but the record may be of more than three levels
45 |      */
46 |     int level_num = 3;
47 | 
48 |     /* process the records one by one: for each one, first build bitmap, then perform
49 |      * the query with a bitmap iterator
50 |      */
51 |     int num_recs = record_set->size();
52 |     Bitmap* bm = NULL;
53 |     for (int i = 0; i < num_recs; i++) {
54 |         bm = BitmapConstructor::construct((*record_set)[i], thread_num, level_num);
55 |         BitmapIterator* iter = BitmapConstructor::getIterator(bm);
56 |         output.append(query(iter));
57 |         delete iter;
58 |         delete bm;
59 |     }
60 |     delete record_set;
61 | 
62 |     cout<<"matches are: "<<output<<endl;
63 |     return 0;
64 | }
65 | 


--------------------------------------------------------------------------------
/makefile:
--------------------------------------------------------------------------------
 1 | DIR = bin
 2 | EXEC1 = $(DIR)/example1
 3 | EXEC2 = $(DIR)/example2
 4 | EXEC3 = $(DIR)/example3
 5 | EXEC4 = $(DIR)/example4
 6 | TARGET = $(EXEC1) ${EXEC2} ${EXEC3} ${EXEC4}
 7 | all: $(TARGET)
 8 | .PHONY: all clean # neither names a file; keeps a stray file called "all" or "clean" from disabling the rule
 9 | CC = g++
10 | CC_FLAGS = -O3 -std=c++11 -mavx -mavx2 -msse -msse2 -msse4 -msse4.2 -mpclmul
11 | POST_FLAGS = -lpthread -mcmodel=medium -static-libstdc++
12 | # Each example below is compiled together with every library source in src/.
13 | SOURCE1 = src/*.cpp example/example1.cpp
14 | $(EXEC1): $(SOURCE1)
15 | 	mkdir -p $(DIR)
16 | 	$(CC) $(CC_FLAGS) -o $(EXEC1) $(SOURCE1) $(POST_FLAGS)
17 | 
18 | SOURCE2 = src/*.cpp example/example2.cpp
19 | $(EXEC2): $(SOURCE2)
20 | 	mkdir -p $(DIR)
21 | 	$(CC) $(CC_FLAGS) -o $(EXEC2) $(SOURCE2) $(POST_FLAGS)
22 | 
23 | SOURCE3 = src/*.cpp example/example3.cpp
24 | $(EXEC3): $(SOURCE3)
25 | 	mkdir -p $(DIR)
26 | 	$(CC) $(CC_FLAGS) -o $(EXEC3) $(SOURCE3) $(POST_FLAGS)
27 | 
28 | SOURCE4 = src/*.cpp example/example4.cpp
29 | $(EXEC4): $(SOURCE4)
30 | 	mkdir -p $(DIR)
31 | 	$(CC) $(CC_FLAGS) -o $(EXEC4) $(SOURCE4) $(POST_FLAGS)
32 | # Removes the built example binaries; the leading '-' ignores a failing rm.
33 | clean:
34 | 	-$(RM) $(TARGET)
35 | 


--------------------------------------------------------------------------------
/src/Bitmap.h:
--------------------------------------------------------------------------------
 1 | #ifndef BITMAP_H
 2 | #define BITMAP_H
 3 | 
 4 | #include <iostream>
 5 | using namespace std;
 6 | 
 7 | #define MAX_LEVEL 22
 8 | #define SEQUENTIAL 1
 9 | #define PARALLEL 2
10 | 
11 | class Bitmap {  // common base of the bitmap index classes; carries the SEQUENTIAL/PARALLEL tag
12 |     friend class BitmapConstructor;  // the factory writes the private tag below
13 |   private:
14 |     int type;  // SEQUENTIAL or PARALLEL
15 |   public:
16 |     // A new bitmap is tagged SEQUENTIAL.
17 |     Bitmap() : type(SEQUENTIAL) {}
18 |     // The three hooks below do nothing in this base class.
19 |     virtual void setRecordLength(long length) {}
20 |     virtual void indexConstruction() {}
21 |     virtual void setStreamFlag(bool flag) {}
22 |     virtual ~Bitmap() {}
23 | };
24 | #endif
25 | 


--------------------------------------------------------------------------------
/src/BitmapConstructor.cpp:
--------------------------------------------------------------------------------
 1 | #include "BitmapConstructor.h"
 2 | 
 3 | Bitmap* BitmapConstructor::construct(Record* record, int thread_num, int level_num) {
 4 |     // Exactly one thread selects the serial builder; every other count goes to the parallel one.
 5 |     if (thread_num == 1) {
 6 |         Bitmap* serial_bm = SerialBitmapConstructor::construct(record, level_num);
 7 |         serial_bm->type = SEQUENTIAL;  // tag read back by getIterator()
 8 |         return serial_bm;
 9 |     }
10 |     Bitmap* parallel_bm = ParallelBitmapConstructor::construct(record, thread_num, level_num);
11 |     parallel_bm->type = PARALLEL;
12 |     return parallel_bm;
13 | }
14 | 
15 | BitmapIterator* BitmapConstructor::getIterator(Bitmap* bm) {
16 |     // Choose the iterator class from the tag stored on the bitmap, and tag the iterator the same way.
17 |     if (bm->type == SEQUENTIAL) {
18 |         BitmapIterator* serial_it = new SerialBitmapIterator((SerialBitmap*)bm);
19 |         serial_it->type = SEQUENTIAL;
20 |         return serial_it;
21 |     }
22 |     BitmapIterator* parallel_it = new ParallelBitmapIterator((ParallelBitmap*)bm);
23 |     parallel_it->type = PARALLEL;
24 |     return parallel_it;
25 | }
26 | 


--------------------------------------------------------------------------------
/src/BitmapConstructor.h:
--------------------------------------------------------------------------------
 1 | #ifndef BITMAPCONSTRUCTOR_H
 2 | #define BITMAPCONSTRUCTOR_H
 3 | 
 4 | #include <string>
 5 | #include "Bitmap.h"
 6 | #include "BitmapIterator.h"
 7 | #include "SerialBitmapIterator.h"
 8 | #include "ParallelBitmapIterator.h"
 9 | #include "SerialBitmapConstructor.h"
10 | #include "ParallelBitmapConstructor.h"
11 | #include "Records.h"
12 | using namespace std;
13 | 
14 | class BitmapConstructor {  // static factory for leveled bitmaps and their iterators
15 |   public:
16 |     // construct leveled bitmaps for a JSON record; thread_num == 1 uses the serial builder, any other value the parallel one
17 |     static Bitmap* construct(Record* record, int thread_num = 1, int level_num = MAX_LEVEL);
18 |     // get bitmap iterator for given bitmap index (serial or parallel, matching how the bitmap was constructed)
19 |     static BitmapIterator* getIterator(Bitmap* bi);
20 | };
21 | 
22 | #endif
23 | 


--------------------------------------------------------------------------------
/src/BitmapIterator.h:
--------------------------------------------------------------------------------
 1 | #ifndef BITMAPITERATOR_H
 2 | #define BITMAPITERATOR_H
 3 | 
 4 | #include "Bitmap.h"
 5 | #include <unordered_set>
 6 | using namespace std;
 7 | 
 8 | class Bitmap;
 9 | 
10 | #define OBJECT 1
11 | #define ARRAY 2
12 | #define PRIMITIVE 3
13 | 
14 | #define ERR -1
15 | #define MAX_FIELD_SIZE 1000
16 | 
17 | // Metadata for parsing and querying; saves the context information of one level while iterating the leveled bitmap.
18 | struct IterCtxInfo {
19 |     // current thread id for parsing and querying. Only used during leveled bitmap iteration.
20 |     int thread_id;
21 |     // kind of container at this level: OBJECT or ARRAY (macros defined above)
22 |     int type;
23 |     // position array for colon and comma (NOTE(review): who allocates and frees it is not visible in this header)
24 |     long* positions;
25 |     // start index of the record position array at the current level
26 |     long start_idx;
27 |     // end index of the record position array at the current level
28 |     long end_idx;
29 |     // current index of the record position array at the current level
30 |     long cur_idx;
31 |     // the current level
32 |     int level;
33 | };
34 | 
35 | struct KeyPos {  // presumably the span of a key inside the record text -- TODO confirm against the iterator sources
36 |     long start;  // first position of the span (inclusive/exclusive not visible here)
37 |     long end;    // last position of the span (inclusive/exclusive not visible here)
38 | };
39 | 
40 | class BitmapIterator {  // abstract cursor over a leveled bitmap index; obtained from BitmapConstructor::getIterator()
41 |     friend class BitmapConstructor;  // writes the private tag below
42 |     friend class BitmapConstructor1;  // NOTE(review): no class of this name is visible in the sources shown -- confirm it is still needed
43 |   private:
44 |     int type;  // SEQUENTIAL or PARALLEL, set by BitmapConstructor::getIterator()
45 |   public:
46 |     int mVisitedFields;  // NOTE(review): not initialized by this class; presumably maintained by the concrete iterators
47 |   public:
48 |     // Creates a copy of iterator. Often used for parallel querying.
49 |     virtual BitmapIterator* getCopy() = 0;
50 |     // Moves back to the object or array which contains the current nested record. 
51 |     // Often used when the current nested record has been processed. 
52 |     // Valid except for the first level of the record. 
53 |     virtual bool up() = 0; 
54 |     // Moves to the start of the nested object or array. 
55 |     // Gets all colon or comma positions from leveled bitmap indexes for current nested record. 
56 |     // Valid if we are at { or [. The examples treat a false return as "could not descend".
57 |     virtual bool down() = 0;
58 |     // Whether the iterator points to an object.
59 |     virtual bool isObject() = 0;
60 |     // Whether the iterator points to an array.
61 |     virtual bool isArray() = 0;
62 |     // Moves iterator to the next array item.
63 |     virtual bool moveNext() = 0; 
64 |     // Moves to the corresponding key field inside the current object.
65 |     virtual bool moveToKey(char* key) = 0;
66 |     // Moves to the corresponding key fields inside the current object, returns the current key name.
67 |     // After this operation, the current key field will be removed from key_set.
68 |     virtual char* moveToKey(unordered_set<char*>& key_set) = 0;
69 |     // Returns the number of elements inside current array.
70 |     virtual int numArrayElements() = 0;
71 |     // If the current record is an array, moves to an item based on index.
72 |     // Returns false if the index is out of the boundary.
73 |     virtual bool moveToIndex(int index) = 0;
74 |     // Gets the content of the current value inside an object or array. 
75 |     // The examples release the returned buffer with free().
76 |     virtual char* getValue() = 0;
77 |     virtual ~BitmapIterator() {
78 | 
79 |     }
80 | };
80 | #endif
81 | 


--------------------------------------------------------------------------------
/src/LocalBitmap.cpp:
--------------------------------------------------------------------------------
  1 | #include "LocalBitmap.h"
  2 | #include <immintrin.h>
  3 | #include <emmintrin.h>
  4 | #include <string.h>
  5 | #include <sys/time.h>
  6 | #include <stdio.h>
  7 | #include <string.h>
  8 | #include <stdlib.h>
  9 | #include <stdio.h>
 10 | #include <stdlib.h>
 11 | #include <string.h>
 12 | #include <ctype.h>
 13 | #include <pthread.h>
 14 | #include <malloc.h>
 15 | #include <sys/time.h>
 16 | #include <sys/file.h>
 17 | #include <unistd.h>
 18 | #include <sched.h>
 19 | #include <unordered_map>
 20 | using namespace std;
 21 | 
 22 | // Default constructor: delegates to the (record, level_num) constructor so every bitmap pointer starts as NULL;
 23 | // an empty body left them indeterminate, and the destructor's freeMemory() would then free() garbage pointers.
 24 | LocalBitmap::LocalBitmap() : LocalBitmap(static_cast<char*>(NULL), MAX_LEVEL) {}
 25 | 
 26 | LocalBitmap::LocalBitmap(char* record, int level_num) {
 27 |     // Record pointer, thread id, depth and word-range bookkeeping.
 28 |     mRecord = record;
 29 |     mThreadId = 0;
 30 |     mDepth = level_num - 1;  // presumably the deepest level index, counting levels from 0
 31 |     mStartWordId = 0;
 32 |     mEndWordId = 0;
 33 |     mQuoteBitmap = NULL;  // no bitmap buffer exists yet; freeMemory() skips NULL pointers
 34 |     mEscapeBitmap = NULL;
 35 |     mStrBitmap = NULL;
 36 |     mColonBitmap = NULL;
 37 |     mCommaBitmap = NULL;
 38 |     mLbraceBitmap = NULL;
 39 |     mRbraceBitmap = NULL;
 40 |     mLbracketBitmap = NULL;
 41 |     mRbracketBitmap = NULL;
 42 |     for (int lev = 0; lev < MAX_LEVEL; ++lev) {
 43 |         mLevColonBitmap[lev] = NULL;
 44 |         mNegLevColonBitmap[lev] = NULL;
 45 |         mFinalLevColonBitmap[lev] = NULL;
 46 |         mLevCommaBitmap[lev] = NULL;
 47 |         mNegLevCommaBitmap[lev] = NULL;
 48 |         mFinalLevCommaBitmap[lev] = NULL;
 49 |     }
 50 |     mStartInStrBitmap = 0ULL;
 51 |     mEndInStrBitmap = 0ULL;
 52 |     mMaxPositiveLevel = 0;
 53 |     mMinNegativeLevel = -1;
 54 |     mNumTknErr = 0;  // both counters start at zero
 55 |     mNumTrial = 0;
 56 | }
 57 | 
 58 | void LocalBitmap::freeMemory()  // frees every bitmap buffer listed below and resets each pointer to NULL, so a second call is harmless
 59 | {
 60 |     for(int m = 0; m < MAX_LEVEL; ++m){  // per-level colon/comma bitmaps, both the regular and the "Neg" arrays
 61 |         if (mLevColonBitmap[m]) {
 62 |             free(mLevColonBitmap[m]);
 63 |             mLevColonBitmap[m] = NULL;
 64 |         }
 65 |         if (mLevCommaBitmap[m]) {
 66 |             free(mLevCommaBitmap[m]);
 67 |             mLevCommaBitmap[m] = NULL;
 68 |         }
 69 |         if (mNegLevColonBitmap[m]) {
 70 |             free(mNegLevColonBitmap[m]);
 71 |             mNegLevColonBitmap[m] = NULL;
 72 |         }
 73 |         if (mNegLevCommaBitmap[m]) {
 74 |             free(mNegLevCommaBitmap[m]);
 75 |             mNegLevCommaBitmap[m] = NULL;
 76 |         }
 77 |     }  // NOTE(review): mFinalLevColonBitmap / mFinalLevCommaBitmap are not released here -- presumably owned or aliased elsewhere; confirm
 78 |     if (mQuoteBitmap) {  // allocated in setRecordLength()
 79 |         free(mQuoteBitmap);
 80 |         mQuoteBitmap = NULL;
 81 |     }
 82 |     if (mEscapeBitmap) {
 83 |         free(mEscapeBitmap);
 84 |         mEscapeBitmap = NULL;
 85 |     }
 86 |     if (mStrBitmap) {
 87 |         free(mStrBitmap);
 88 |         mStrBitmap = NULL;
 89 |     }
 90 |     if (mColonBitmap) {
 91 |         free(mColonBitmap);
 92 |         mColonBitmap = NULL;
 93 |     }
 94 |     if (mCommaBitmap) {
 95 |         free(mCommaBitmap);
 96 |         mCommaBitmap = NULL;
 97 |     }
 98 |     if (mLbraceBitmap) {
 99 |         free(mLbraceBitmap);
100 |         mLbraceBitmap = NULL;
101 |     }
102 |     if (mRbraceBitmap) {
103 |         free(mRbraceBitmap);
104 |         mRbraceBitmap = NULL;
105 |     }
106 |     if (mLbracketBitmap) {
107 |         free(mLbracketBitmap);
108 |         mLbracketBitmap = NULL;
109 |     }
110 |     if (mRbracketBitmap) {
111 |         free(mRbracketBitmap);
112 |         mRbracketBitmap = NULL;
113 |     }
114 | }
115 | 
116 | LocalBitmap::~LocalBitmap()
117 | {
118 |     freeMemory();
119 | }
120 | 
121 | void LocalBitmap::setRecordLength(long length) {
122 |     this->mRecordLength = length;
123 |     this->mNumTmpWords = length / 32;
124 |     this->mNumWords = length / 64;
125 |     this->mQuoteBitmap = (unsigned long*)malloc((mNumWords) * sizeof(unsigned long));
126 | }
127 | 
128 | int LocalBitmap::contextInference() {
129 |     Tokenizer tkn;
130 |     int start_states[2] = {OUT, IN};
131 |     bool getStartState = false;
132 |     int start_state = OUT;
133 |     for (int j = 0; j < 2; ++j) {
134 |         ++mNumTrial;
135 |         int state = start_states[j];
136 |         tkn.createIterator(mRecord, state);
137 |         while (true) {
138 |             int tkn_status = tkn.hasNextToken();
139 |             if (tkn_status == END)
140 |                 break;
141 |             if (tkn_status == ERROR) {
142 |                 ++mNumTknErr;
143 |                 start_state = tkn.oppositeState(state);
144 |                 getStartState = true;
145 |                 break;
146 |             }
147 |             tkn.nextToken();
148 |         }
149 |         if (getStartState == true) break;
150 |     }
151 |     if (start_state == IN) {
152 |         mStartInStrBitmap = 0xffffffffffffffffULL;
153 |     } else {
154 |         mStartInStrBitmap = 0ULL;
155 |     }
156 |     //cout<<"inference result num of trails: "<<mNumTrial<<" num of token error "<<mNumTknErr<<endl;
157 |     //cout<<"inference result "<<start_state<<" "<<getStartState<<endl;
158 |     if (getStartState == true) return start_state;
159 |     return UNKNOWN;
160 | }
161 | 
162 | void LocalBitmap::nonSpecIndexConstruction() {
163 |     // vectors for structural characters
164 |     __m256i v_quote = _mm256_set1_epi8(0x22);
165 |     __m256i v_colon = _mm256_set1_epi8(0x3a);
166 |     __m256i v_escape = _mm256_set1_epi8(0x5c);
167 |     __m256i v_lbrace = _mm256_set1_epi8(0x7b);
168 |     __m256i v_rbrace = _mm256_set1_epi8(0x7d);
169 |     __m256i v_comma = _mm256_set1_epi8(0x2c); 
170 |     __m256i v_lbracket = _mm256_set1_epi8(0x5b);
171 |     __m256i v_rbracket = _mm256_set1_epi8(0x5d);
172 | 	
173 |     // variables for saving temporary results in the first four steps
174 |     unsigned long colonbit0, quotebit0, escapebit0, stringbit0, lbracebit0, rbracebit0, commabit0, lbracketbit0, rbracketbit0;
175 |     unsigned long colonbit, quotebit, escapebit, stringbit, lbracebit, rbracebit, commabit, lbracketbit, rbracketbit;
176 |     unsigned long str_mask;
177 | 	
178 |     // variables for saving temporary results in the last step
179 |     unsigned long lb_mask, rb_mask, cb_mask;
180 |     unsigned long lb_bit, rb_bit, cb_bit;
181 |     unsigned long first, second;
182 |     int cur_level = -1;
183 | 	
184 |     // variables for saving context information among different words
185 |     int top_word = -1;
186 |     uint64_t prev_iter_ends_odd_backslash = 0ULL;
187 |     uint64_t prev_iter_inside_quote = mStartInStrBitmap;
188 |     const uint64_t even_bits = 0x5555555555555555ULL;
189 |     const uint64_t odd_bits = ~even_bits; 
190 | 
191 |     for (int j = 0; j < mNumTmpWords; ++j) {
192 |         colonbit = 0, quotebit = 0, escapebit = 0, stringbit = 0, lbracebit = 0, rbracebit = 0, commabit = 0, lbracketbit = 0, rbracketbit = 0;
193 |         unsigned long i = j * 32; 
194 |         // step 1: build structural character bitmaps
195 |         __m256i v_text = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mRecord + i));
196 |         colonbit = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v_text, v_colon));
197 |         quotebit = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v_text, v_quote)); 
198 |         escapebit = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v_text, v_escape)); 
199 |         lbracebit  = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v_text, v_lbrace));
200 |         rbracebit  = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v_text, v_rbrace));
201 |         commabit = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v_text, v_comma));
202 | 	lbracketbit = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v_text, v_lbracket));
203 | 	rbracketbit = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v_text, v_rbracket));
204 |         // first half of the word (lowest 32 bits)
205 |         if(j % 2 == 0) {
206 |             colonbit0 = colonbit;
207 |             quotebit0 = quotebit;
208 |             escapebit0 = escapebit;
209 |             lbracebit0 = lbracebit;
210 |             rbracebit0 = rbracebit;
211 |             commabit0 = commabit;
212 |             lbracketbit0 = lbracketbit;
213 |             rbracketbit0 = rbracketbit;
214 |             continue;
215 |         } else {
216 |             // highest 32 bits inside a word
217 |             colonbit = (colonbit << 32) | colonbit0;
218 |             quotebit = (quotebit << 32) | quotebit0;
219 |             escapebit = (escapebit << 32) | escapebit0;
220 |             lbracebit = (lbracebit << 32) | lbracebit0;
221 |             rbracebit = (rbracebit << 32) | rbracebit0;
222 |             commabit = (commabit << 32) | commabit0;
223 |             lbracketbit = (lbracketbit << 32) | lbracketbit0;
224 |             rbracketbit = (rbracketbit << 32) | rbracketbit0;
225 | 
226 |             // step 2: update structural quote bitmaps
227 |             uint64_t bs_bits = escapebit;
228 |             uint64_t start_edges = bs_bits & ~(bs_bits << 1);
229 |             int64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
230 |             uint64_t even_starts = start_edges & even_start_mask;
231 |             uint64_t odd_starts = start_edges & ~even_start_mask;
232 |             uint64_t even_carries = bs_bits + even_starts;
233 |             int64_t odd_carries;
234 |             bool iter_ends_odd_backslash = __builtin_uaddll_overflow(bs_bits, odd_starts,
235 |                 (unsigned long long *)(&odd_carries));
236 |             odd_carries |= prev_iter_ends_odd_backslash;
237 |             prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
238 |             uint64_t even_carry_ends = even_carries & ~bs_bits;
239 |             uint64_t odd_carry_ends = odd_carries & ~bs_bits;
240 |             uint64_t even_start_odd_end = even_carry_ends & odd_bits;
241 |             uint64_t odd_start_even_end = odd_carry_ends & even_bits;
242 |             uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
243 |             int64_t quote_bits = quotebit & ~odd_ends;
244 |             mQuoteBitmap[++top_word] = quote_bits;
245 |         
246 |             str_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
247 |                 _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFFu), 0));
248 |             str_mask ^= prev_iter_inside_quote;
249 |             prev_iter_inside_quote = static_cast<uint64_t>(static_cast<int64_t>(str_mask) >> 63);
250 | 	
251 |             // step 4: update structural character bitmaps
252 |             unsigned long tmp = (~str_mask);
253 |             colonbit = colonbit & tmp;
254 |             lbracebit = lbracebit & tmp;
255 |             rbracebit = rbracebit & tmp;
256 |             commabit = commabit & tmp;
257 |             lbracketbit = lbracketbit & tmp;
258 |             rbracketbit = rbracketbit & tmp;
259 | 	
260 |             // step 5: generate leveled bitmaps
261 |             lb_mask = lbracebit | lbracketbit;
262 |             rb_mask = rbracebit | rbracketbit;
263 |             cb_mask = lb_mask | rb_mask;
264 |             lb_bit = lb_mask & (-lb_mask);
265 |             rb_bit = rb_mask & (-rb_mask);
266 |             if (!cb_mask) {
267 |                 if (cur_level >= 0 && cur_level <= mDepth) {
268 |                     if (!mLevColonBitmap[cur_level]) {
269 |                         mLevColonBitmap[cur_level] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
270 |                     }
271 |                     if (!mLevCommaBitmap[cur_level]) {
272 |                         mLevCommaBitmap[cur_level] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
273 |                     }
274 |                     if (colonbit) {
275 |                         mLevColonBitmap[cur_level][top_word] = colonbit;
276 |                     } else {
277 |                         mLevCommaBitmap[cur_level][top_word] = commabit;
278 | 	            }
279 | 	        } else if (cur_level < 0) {
280 |                     if (!mNegLevColonBitmap[-cur_level]) {
281 |                         mNegLevColonBitmap[-cur_level] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
282 |                         // before finding the first bracket, update minimum negative level
283 |                         if (cur_level < mMinNegativeLevel) {
284 |                             mMinNegativeLevel = cur_level;
285 |                         }
286 |                     }
287 |                     if (!mNegLevCommaBitmap[-cur_level]) {
288 |                         mNegLevCommaBitmap[-cur_level] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
289 |                     }
290 |                     if (colonbit) {
291 |                         mNegLevColonBitmap[-cur_level][top_word] = colonbit;
292 |                     } else {
293 |                         mNegLevCommaBitmap[-cur_level][top_word] = commabit;
294 |                     }
295 |                 }
296 |             } else {
297 |                 first = 1;
298 |                 while (cb_mask || first) {
299 |                     if (!cb_mask) {
300 |                         second = 1UL<<63;
301 |                     } else {
302 |                         cb_bit = cb_mask & (-cb_mask);
303 |                         second = cb_bit;
304 |                     }
305 |                     if (cur_level >= 0 && cur_level <= mDepth) {
306 |                         if (!mLevColonBitmap[cur_level]) {
307 |                             mLevColonBitmap[cur_level] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
308 |                         }
309 |                         if (!mLevCommaBitmap[cur_level]) {
310 |                             mLevCommaBitmap[cur_level] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
311 |                         }
312 |                         unsigned long mask = second - first;
313 |                         if (!cb_mask) mask = mask | second;
314 |                         unsigned long colon_mask = mask & colonbit;
315 |                         if (colon_mask) {
316 |                             mLevColonBitmap[cur_level][top_word] |= colon_mask;
317 |                         } else {
318 |                             mLevCommaBitmap[cur_level][top_word] |= (commabit & mask);
319 |                         }
320 |                         if (cb_mask) {
321 |                             if (cb_bit == rb_bit) {
322 |                                 mLevColonBitmap[cur_level][top_word] |= cb_bit;
323 |                                 mLevCommaBitmap[cur_level][top_word] |= cb_bit;
324 |                             }
325 |                             else if (cb_bit == lb_bit && cur_level + 1 <= mDepth) {
326 |                                 if (!mLevCommaBitmap[cur_level + 1]) {
327 |                                      mLevCommaBitmap[cur_level + 1] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
328 |                                 }
329 |                                 mLevCommaBitmap[cur_level + 1][top_word] |= cb_bit;
330 |                             }
331 |                         }
332 |                     } else if (cur_level < 0) {
333 |                         if (!mNegLevColonBitmap[-cur_level]) { 
334 |                             mNegLevColonBitmap[-cur_level] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
335 |                         }
336 |                         if (!mNegLevCommaBitmap[-cur_level]) {
337 |                             mNegLevCommaBitmap[-cur_level] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
338 |                         }
339 |                         unsigned long mask = second - first;
340 |                         if (!cb_mask) mask = mask | second;
341 |                         unsigned long colon_mask = mask & colonbit;
342 |                         if (colon_mask) {
343 |                             mNegLevColonBitmap[-cur_level][top_word] |= colon_mask;
344 |                         } else {
345 |                             mNegLevCommaBitmap[-cur_level][top_word] |= (commabit & mask);
346 |                         }
347 |                         if (cb_mask) {
348 |                             if (cb_bit == rb_bit) {
349 |                                 mNegLevColonBitmap[-cur_level][top_word] |= cb_bit;
350 |                                 mNegLevCommaBitmap[-cur_level][top_word] |= cb_bit;
351 |                             }
352 |                             else if (cb_bit == lb_bit) {
353 |                                 if (cur_level + 1 == 0) {
354 |                                     if (!mLevCommaBitmap[0]) {
355 |                                          mLevCommaBitmap[0] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
356 |                                     }
357 |                                     mLevCommaBitmap[0][top_word] |= cb_bit;
358 |                                 } else {
359 |                                     if (!mNegLevCommaBitmap[-(cur_level + 1)]) {
360 |                                         mNegLevCommaBitmap[-(cur_level + 1)] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
361 |                                     }
362 |                                     mNegLevCommaBitmap[-(cur_level + 1)][top_word] |= cb_bit;
363 |                                 }
364 |                             }
365 |                         } 
366 |                     }
367 |                     if (cb_mask) {
368 |                         if (cb_bit == lb_bit) {
369 |                             lb_mask = lb_mask & (lb_mask - 1);
370 |                             lb_bit = lb_mask & (-lb_mask);
371 |                             ++cur_level;
372 |                             if (mThreadId == 0 && cur_level == 0) {
373 |                                 // JSON record at the top level could be an array
374 |                                 if (!mLevCommaBitmap[cur_level]) {
375 |                                      mLevCommaBitmap[cur_level] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
376 |                                 }
377 |                                 mLevCommaBitmap[cur_level][top_word] |= cb_bit;
378 |                             }
379 |                         } else if (cb_bit == rb_bit) {
380 |                             rb_mask = rb_mask & (rb_mask - 1);
381 |                             rb_bit = rb_mask & (-rb_mask);
382 |                             --cur_level;
383 |                         }
384 |                         first = second;
385 |                         cb_mask = cb_mask & (cb_mask - 1);
386 |                         if (cur_level > mMaxPositiveLevel) {
387 |                             mMaxPositiveLevel = cur_level;
388 |                         } else if (cur_level < mMinNegativeLevel) {
389 |                             mMinNegativeLevel = cur_level;
390 |                         }
391 |                     } else {
392 |                         first = 0;
393 |                     }
394 |                 }
395 | 	    }
396 |         }
397 |     }
398 |     if (mDepth == MAX_LEVEL - 1) mDepth = mMaxPositiveLevel;
399 |     mEndLevel = cur_level;
400 | }
401 | 
402 | void LocalBitmap::buildStringMaskBitmap() {
403 |     // allocate memory space for saving results
404 |     if (!mQuoteBitmap) {
405 |         mQuoteBitmap = (unsigned long*)malloc((mNumWords) * sizeof(unsigned long));
406 |     }
407 |     if (!mColonBitmap) {
408 |         mColonBitmap = (unsigned long*)malloc((mNumWords) * sizeof(unsigned long));
409 |     }
410 |     if (!mCommaBitmap) {
411 |         mCommaBitmap = (unsigned long*)malloc((mNumWords) * sizeof(unsigned long));
412 |     }
413 |     if (!mStrBitmap) {
414 |         mStrBitmap = (unsigned long*)malloc((mNumWords) * sizeof(unsigned long));
415 |     }
416 |     if (!mLbraceBitmap) {
417 |         mLbraceBitmap = (unsigned long*)malloc((mNumWords) * sizeof(unsigned long));
418 |     }
419 |     if (!mRbraceBitmap) {
420 |         mRbraceBitmap = (unsigned long*)malloc((mNumWords) * sizeof(unsigned long));
421 |     }
422 |     if (!mLbracketBitmap) {
423 |         mLbracketBitmap = (unsigned long*)malloc((mNumWords) * sizeof(unsigned long));
424 |     }
425 |     if (!mRbracketBitmap) {
426 |         mRbracketBitmap = (unsigned long*)malloc((mNumWords) * sizeof(unsigned long));
427 |     }
428 |  
429 |     // vectors for structural characters
430 |     __m256i v_quote = _mm256_set1_epi8(0x22);
431 |     __m256i v_colon = _mm256_set1_epi8(0x3a);
432 |     __m256i v_escape = _mm256_set1_epi8(0x5c);
433 |     __m256i v_lbrace = _mm256_set1_epi8(0x7b);
434 |     __m256i v_rbrace = _mm256_set1_epi8(0x7d);
435 |     __m256i v_comma = _mm256_set1_epi8(0x2c); 
436 |     __m256i v_lbracket = _mm256_set1_epi8(0x5b);
437 |     __m256i v_rbracket = _mm256_set1_epi8(0x5d);
438 | 	
439 |     // variables for saving temporary results
440 |     unsigned long colonbit0, quotebit0, escapebit0, stringbit0, lbracebit0, rbracebit0, commabit0, lbracketbit0, rbracketbit0;
441 |     unsigned long colonbit, quotebit, escapebit, stringbit, lbracebit, rbracebit, commabit, lbracketbit, rbracketbit;
442 |     unsigned long str_mask;
443 | 
444 |     // variables for saving context information among different words
445 |     int top_word = -1;
446 |     uint64_t prev_iter_ends_odd_backslash = 0ULL;
447 |     uint64_t prev_iter_inside_quote = mStartInStrBitmap;
448 |     const uint64_t even_bits = 0x5555555555555555ULL;
449 |     const uint64_t odd_bits = ~even_bits; 
450 | 
451 |     for (int j = 0; j < mNumTmpWords; ++j) {
452 |         colonbit = 0, quotebit = 0, escapebit = 0, stringbit = 0, lbracebit = 0, rbracebit = 0, commabit = 0, lbracketbit = 0, rbracketbit = 0;
453 |         unsigned long i = j * 32; 
454 |         // step 1: build structural character bitmaps
455 |         __m256i v_text = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mRecord + i));
456 |         colonbit = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v_text, v_colon));
457 |         quotebit = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v_text, v_quote)); 
458 |         escapebit = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v_text, v_escape)); 
459 |         lbracebit  = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v_text, v_lbrace));
460 |         rbracebit  = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v_text, v_rbrace));
461 |         commabit = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v_text, v_comma));
462 | 	lbracketbit = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v_text, v_lbracket));
463 | 	rbracketbit = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v_text, v_rbracket));
464 |         // first half of the word (lowest 32 bits)
465 |         if(j % 2 == 0) {
466 |             colonbit0 = colonbit;
467 |             quotebit0 = quotebit;
468 |             escapebit0 = escapebit;
469 |             lbracebit0 = lbracebit;
470 |             rbracebit0 = rbracebit;
471 |             commabit0 = commabit;
472 |             lbracketbit0 = lbracketbit;
473 |             rbracketbit0 = rbracketbit;
474 |             continue;
475 |         } else {
476 |             // highest 32 bits inside a word
477 |             colonbit = (colonbit << 32) | colonbit0;
478 |             quotebit = (quotebit << 32) | quotebit0;
479 |             escapebit = (escapebit << 32) | escapebit0;
480 |             lbracebit = (lbracebit << 32) | lbracebit0;
481 |             rbracebit = (rbracebit << 32) | rbracebit0;
482 |             commabit = (commabit << 32) | commabit0;
483 |             lbracketbit = (lbracketbit << 32) | lbracketbit0;
484 |             rbracketbit = (rbracketbit << 32) | rbracketbit0;
485 |             mColonBitmap[++top_word] = colonbit;
486 |             mCommaBitmap[top_word] = commabit;
487 |             mLbraceBitmap[top_word] = lbracebit;
488 |             mRbraceBitmap[top_word] = rbracebit;
489 |             mLbracketBitmap[top_word] = lbracketbit;
490 |             mRbracketBitmap[top_word] = rbracketbit;
491 | 
492 |             // step 2: update structural quote bitmaps
493 |             uint64_t bs_bits = escapebit;
494 |             uint64_t start_edges = bs_bits & ~(bs_bits << 1);
495 |             int64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
496 |             uint64_t even_starts = start_edges & even_start_mask;
497 |             uint64_t odd_starts = start_edges & ~even_start_mask;
498 |             uint64_t even_carries = bs_bits + even_starts;
499 |             int64_t odd_carries;
500 |             bool iter_ends_odd_backslash = __builtin_uaddll_overflow(bs_bits, odd_starts,
501 |                 (unsigned long long *)(&odd_carries));
502 |             odd_carries |= prev_iter_ends_odd_backslash;
503 |             prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
504 |             uint64_t even_carry_ends = even_carries & ~bs_bits;
505 |             uint64_t odd_carry_ends = odd_carries & ~bs_bits;
506 |             uint64_t even_start_odd_end = even_carry_ends & odd_bits;
507 |             uint64_t odd_start_even_end = odd_carry_ends & even_bits;
508 |             uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
509 |             int64_t quote_bits = quotebit & ~odd_ends;
510 |             mQuoteBitmap[top_word] = quote_bits;
511 |         
512 |             // step 3: build string mask bitmaps
513 |             str_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
514 |                 _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFFu), 0));
515 |             str_mask ^= prev_iter_inside_quote;
516 |             mStrBitmap[top_word] = str_mask;
517 |             prev_iter_inside_quote = static_cast<uint64_t>(static_cast<int64_t>(str_mask) >> 63);
518 |         }
519 |     }
520 |     mEndInStrBitmap = prev_iter_inside_quote;
521 | }
522 | 
// Second phase of the two-phase construction: reads the per-word bitmaps
// mColonBitmap, mCommaBitmap, mLbraceBitmap, mRbraceBitmap, mLbracketBitmap,
// mRbracketBitmap and mStrBitmap, masks out everything that lies inside a
// string, and distributes colons/commas into the per-level bitmaps while
// tracking the nesting level through the brackets of each word.
//
// Levels start at -1. Levels 0..mDepth go to mLevColonBitmap/mLevCommaBitmap;
// negative levels go to mNegLevColonBitmap/mNegLevCommaBitmap at index
// -level. Levels above mDepth are not recorded. Leveled bitmaps are allocated
// lazily with calloc() on first use.
//
// On return mEndLevel holds the level after the last word, and
// mMaxPositiveLevel / mMinNegativeLevel hold the extremes seen. If mDepth was
// MAX_LEVEL - 1 it is lowered to mMaxPositiveLevel.
void LocalBitmap::buildLeveledBitmap() {
    // variables for saving temporary results in the first four steps
    // (quotebit, escapebit and stringbit are declared but not used here)
    unsigned long colonbit, quotebit, escapebit, stringbit, lbracebit, rbracebit, commabit, lbracketbit, rbracketbit;
    unsigned long str_mask;

    // variables for saving temporary results in the last step
    unsigned long lb_mask, rb_mask, cb_mask;
    unsigned long lb_bit, rb_bit, cb_bit;
    unsigned long first, second;
    int cur_level = -1;

    for (int j = 0; j < mNumWords; ++j) {
        // get input info
        colonbit = mColonBitmap[j];
        commabit = mCommaBitmap[j];
        lbracebit = mLbraceBitmap[j];
        rbracebit = mRbraceBitmap[j];
        lbracketbit = mLbracketBitmap[j];
        rbracketbit = mRbracketBitmap[j];
        str_mask = mStrBitmap[j];
        
        // step 4: update structural character bitmaps
        // (clear every structural character that lies inside a string)
        unsigned long tmp = (~str_mask);
        colonbit = colonbit & tmp;
        lbracebit = lbracebit & tmp;
        rbracebit = rbracebit & tmp;
        commabit = commabit & tmp;
        lbracketbit = lbracketbit & tmp;
        rbracketbit = rbracketbit & tmp;
        
        // step 5: generate leveled bitmaps
        lb_mask = lbracebit | lbracketbit;
        rb_mask = rbracebit | rbracketbit;
        cb_mask = lb_mask | rb_mask;
        // lowest set bit of each mask = next opening / closing bracket
        lb_bit = lb_mask & (-lb_mask);
        rb_bit = rb_mask & (-rb_mask);
        int top_word = j;
        if (!cb_mask) {
            // no bracket in this word: the whole word lies at cur_level
            if (cur_level >= 0 && cur_level <= mDepth) {
                if (!mLevColonBitmap[cur_level]) {
                    mLevColonBitmap[cur_level] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
                }
                if (!mLevCommaBitmap[cur_level]) {
                    mLevCommaBitmap[cur_level] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
                }
                // commas are stored only when the word holds no colon
                if (colonbit) {
                    mLevColonBitmap[cur_level][top_word] = colonbit;
                } else {
                    mLevCommaBitmap[cur_level][top_word] = commabit;
                }
            } else if (cur_level < 0) {
                // negative levels are stored at index -cur_level
                if (!mNegLevColonBitmap[-cur_level]) {
                    mNegLevColonBitmap[-cur_level] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
                }
                if (!mNegLevCommaBitmap[-cur_level]) {
                    mNegLevCommaBitmap[-cur_level] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
                }
                if (colonbit) {
                    mNegLevColonBitmap[-cur_level][top_word] = colonbit;
                } else {
                    mNegLevCommaBitmap[-cur_level][top_word] = commabit;
                }
            }
        } else {
            // Walk the brackets from the lowest bit upwards. Each pass handles
            // the interval [first, second): `first` is the previous bracket
            // bit (1 at the start) and `second` the current one. After the
            // last bracket one extra pass covers the rest of the word up to
            // and including bit 63.
            first = 1;
            while (cb_mask || first) {
                if (!cb_mask) {
                    second = 1UL<<63;
                } else {
                    cb_bit = cb_mask & (-cb_mask);
                    second = cb_bit;
                }
                if (cur_level >= 0 && cur_level <= mDepth) {
                    if (!mLevColonBitmap[cur_level]) {
                        mLevColonBitmap[cur_level] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
                    }
                    if (!mLevCommaBitmap[cur_level]) {
                        mLevCommaBitmap[cur_level] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
                    }
                    // bits of the current interval; the tail pass also includes bit 63
                    unsigned long mask = second - first;
                    if (!cb_mask) mask = mask | second;
                    unsigned long colon_mask = mask & colonbit;
                    if (colon_mask) {
                        mLevColonBitmap[cur_level][top_word] |= colon_mask;
                    } else {
                        mLevCommaBitmap[cur_level][top_word] |= (commabit & mask);
                    }
                    if (cb_mask) {
                        if (cb_bit == rb_bit) {
                            // a closing bracket is recorded in both bitmaps of its level
                            mLevColonBitmap[cur_level][top_word] |= cb_bit;
                            mLevCommaBitmap[cur_level][top_word] |= cb_bit;
                        }
                        else if (cb_bit == lb_bit && cur_level + 1 <= mDepth) {
                            // an opening bracket is recorded in the comma bitmap of the level it opens
                            if (!mLevCommaBitmap[cur_level + 1]) {
                                mLevCommaBitmap[cur_level + 1] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
                            }
                            mLevCommaBitmap[cur_level + 1][top_word] |= cb_bit;
                        }
                    }
                } else if (cur_level < 0) {
                    if (!mNegLevColonBitmap[-cur_level]) {
                        mNegLevColonBitmap[-cur_level] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
                    }
                    if (!mNegLevCommaBitmap[-cur_level]) {
                        mNegLevCommaBitmap[-cur_level] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
                    }
                    unsigned long mask = second - first;
                    if (!cb_mask) mask = mask | second;
                    unsigned long colon_mask = mask & colonbit;
                    if (colon_mask) {
                        mNegLevColonBitmap[-cur_level][top_word] |= colon_mask;
                    } else {
                        mNegLevCommaBitmap[-cur_level][top_word] |= (commabit & mask);
                    }
                    if (cb_mask) {
                        if (cb_bit == rb_bit) {
                            mNegLevColonBitmap[-cur_level][top_word] |= cb_bit;
                            mNegLevCommaBitmap[-cur_level][top_word] |= cb_bit;
                        }
                        else if (cb_bit == lb_bit) {
                            if (cur_level + 1 == 0) {
                                // the bracket opens level 0, which lives in the positive arrays
                                if (!mLevCommaBitmap[0]) {
                                    mLevCommaBitmap[0] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
                                }
                                mLevCommaBitmap[0][top_word] |= cb_bit;
                            } else {
                                if (!mNegLevCommaBitmap[-(cur_level + 1)]) {
                                    mNegLevCommaBitmap[-(cur_level + 1)] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
                                }
                                mNegLevCommaBitmap[-(cur_level + 1)][top_word] |= cb_bit;
                            }
                        }
                    }
                }
                if (cb_mask) {
                    // consume the bracket and adjust the nesting level
                    if (cb_bit == lb_bit) {
                        lb_mask = lb_mask & (lb_mask - 1);
                        lb_bit = lb_mask & (-lb_mask);
                        ++cur_level;
                        if (mThreadId == 0 && cur_level == 0) {
                            // JSON record at the top level could be an array
                            if (!mLevCommaBitmap[cur_level]) {
                                mLevCommaBitmap[cur_level] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
                            }
                            mLevCommaBitmap[cur_level][top_word] |= cb_bit;
                        }
                    } else if (cb_bit == rb_bit) {
                        rb_mask = rb_mask & (rb_mask - 1);
                        rb_bit = rb_mask & (-rb_mask);
                        --cur_level;
                    }
                    first = second;
                    cb_mask = cb_mask & (cb_mask - 1);
                    // keep track of the deepest and shallowest level reached
                    if (cur_level > mMaxPositiveLevel) {
                        mMaxPositiveLevel = cur_level;
                    } else if (cur_level < mMinNegativeLevel) {
                        mMinNegativeLevel = cur_level;
                    }
                } else {
                    // the tail pass is done: terminate the loop
                    first = 0;
                }
            }
        }
    }
    // when mDepth equals MAX_LEVEL - 1, shrink it to the deepest level actually seen
    if (mDepth == MAX_LEVEL - 1) mDepth = mMaxPositiveLevel; 
    mEndLevel = cur_level;
}
690 | 


--------------------------------------------------------------------------------
/src/LocalBitmap.h:
--------------------------------------------------------------------------------
 1 | #ifndef LOCALBITMAP_H
 2 | #define LOCALBITMAP_H
 3 | #include <string>
 4 | #include <iostream>
 5 | #include <vector>
 6 | #include <bitset>
 7 | #include <cassert>
 8 | #include <stack>
 9 | #include <algorithm>
10 | #include <unordered_map>
11 | #include <functional>
12 | #include <math.h>
13 | #include <immintrin.h>
14 | #include "Bitmap.h"
15 | #include "Tokenizer.h"
16 | using namespace std;
17 | 
18 | class LocalBitmap : public Bitmap {  // bitmap index of one chunk of a record; ParallelBitmap creates one instance per thread
19 |     friend class ParallelBitmap;
20 |     friend class ParallelBitmapIterator;
21 |   private:
22 |     int mNumTknErr;
23 |     int mNumTrial;
24 |     int mThreadId;  // assigned through setThreadId()
25 |     char* mRecord;
26 |     // for a single large record, stream length equals the record length
27 |     long mRecordLength;
28 |     // each temp word has 32 bytes
29 |     long mNumTmpWords;
30 |     // each word covers 64 bytes of input (one bit per byte)
31 |     long mNumWords;
32 |     // the deepest level of leveled bitmap indexes (starting from 0)
33 |     int mDepth;
34 |     // structural character bitmaps
35 |     unsigned long *mEscapeBitmap, *mStrBitmap, *mColonBitmap, *mCommaBitmap, *mLbracketBitmap, *mRbracketBitmap, *mLbraceBitmap, *mRbraceBitmap;
36 | 
37 |     // following two variables are used for validating inference results of Step 3 (build string mask bitmap)
38 |     // marks whether current chunk starts inside string or not
39 |     unsigned long mStartInStrBitmap;
40 |     // marks whether current chunk ends inside string or not
41 |     unsigned long mEndInStrBitmap;
42 | 
43 |     // following variables are used for the merging phase (after Step 5, merge leveled bitmap)
44 |     // each thread starts with level 0, following two arrays save bitmaps for levels 0 and above, indexed by level (temporary result)
45 |     unsigned long *mLevColonBitmap[MAX_LEVEL];
46 |     unsigned long *mLevCommaBitmap[MAX_LEVEL];
47 |     // each thread starts with level 0, following two arrays save bitmaps for levels less than 0, indexed by -level (temporary result)
48 |     unsigned long *mNegLevColonBitmap[MAX_LEVEL];
49 |     unsigned long *mNegLevCommaBitmap[MAX_LEVEL];
50 |     // each thread starts with level 0
51 |     // mMaxPositiveLevel saves the maximum positive level in current thread
52 |     int mMaxPositiveLevel;
53 |     // mMinNegativeLevel saves the minimum negative level in current thread
54 |     int mMinNegativeLevel;
55 |     // saves the level after processing the whole chunk, used for parallel index construction
56 |     int mEndLevel;
57 | 
58 |     // following variables are used by ParallelBitmapIterator
59 |     // temporary leveled bitmaps mapped to their final level; filled by ParallelBitmap::mergeBitmaps() during the merging phase
60 |     unsigned long *mFinalLevColonBitmap[MAX_LEVEL];
61 |     unsigned long *mFinalLevCommaBitmap[MAX_LEVEL];
62 |     // structural quote bitmap, used for getting the key field when iterating bitmaps
63 |     unsigned long *mQuoteBitmap; 
64 |     // ids of the first word and of the word one past the last word of this chunk, counted over the whole record
65 |     long mStartWordId;
66 |     long mEndWordId;
67 |   
68 |   public:
69 |     LocalBitmap();	
70 |     LocalBitmap(char* record, int level_num);
71 |     ~LocalBitmap();
72 |     // context inference for parallel index construction (step 3).
73 |     // returns UNKNOWN when the context could not be inferred; ParallelBitmap then switches to SPECULATIVE mode
74 |     int contextInference();
75 |     // function for non-speculative parallel index construction
76 |     void nonSpecIndexConstruction();
77 |     // following two functions are used for speculative parallel index construction
78 |     void buildStringMaskBitmap();
79 |     void buildLeveledBitmap();
80 |     void setRecordLength(long length);
81 |     void setThreadId(int thread_id) {mThreadId = thread_id;}
82 |  
83 |   private:
84 |     void freeMemory();
85 | };
86 | #endif
87 | 


--------------------------------------------------------------------------------
/src/ParallelBitmap.cpp:
--------------------------------------------------------------------------------
  1 | #include "ParallelBitmap.h"
  2 | 
  3 | // Convenience overload for NUL-terminated records: measures the text with strlen()
  4 | // and delegates to the length-taking constructor, so both entry points share a
  5 | // single chunking implementation.
  6 | ParallelBitmap::ParallelBitmap(char* record, int thread_num, int depth)
  7 |     : ParallelBitmap(record, (long)strlen(record), thread_num, depth) {
  8 | }
  9 | 
 10 | // Splits `record` into thread_num consecutive chunks, creates one LocalBitmap per
 11 | // chunk and runs context inference on each of them.
 12 | //   record     - start of the JSON text; the pointer is stored, the text is not copied
 13 | //   rec_len    - number of bytes of `record` to index (negative values count as 0)
 14 | //   thread_num - requested number of chunks; clamped to [1, MAX_THREAD]
 15 | //   depth      - forwarded unchanged to every LocalBitmap
 16 | // mParallelMode ends up SPECULATIVE if contextInference() returns UNKNOWN for any
 17 | // chunk, otherwise NONSPECULATIVE.
 18 | ParallelBitmap::ParallelBitmap(char* record, long rec_len, int thread_num, int depth) {
 19 |     // mBitmaps has MAX_THREAD slots and rec_len is divided by thread_num below
 20 |     if (thread_num < 1) thread_num = 1;
 21 |     if (thread_num > MAX_THREAD) thread_num = MAX_THREAD;
 22 |     if (rec_len < 0) rec_len = 0;
 23 |     mRecord = record;
 24 |     mDepth = depth;
 25 |     mThreadNum = thread_num;
 26 |     mRecordLength = rec_len;
 27 |     mParallelMode = NONSPECULATIVE;
 28 |     // nominal chunk length, rounded up to a multiple of 64 bytes; all lengths are kept
 29 |     // in long so that records larger than INT_MAX bytes are not truncated
 30 |     long chunk_len = rec_len / thread_num;
 31 |     if (chunk_len % 64 > 0) {
 32 |         chunk_len = chunk_len + 64 - chunk_len % 64;
 33 |     }
 34 |     char* start_chunk = record;
 35 |     long cur_len = 0;
 36 |     for (int i = 0; i < thread_num; ++i) {
 37 |         mBitmaps[i] = new LocalBitmap(start_chunk, depth);
 38 |         mBitmaps[i]->setThreadId(i);
 39 |         const long remaining = rec_len - cur_len;
 40 |         // the last chunk takes whatever is left
 41 |         long len = remaining;
 42 |         if (i < thread_num - 1) {
 43 |             len = chunk_len;
 44 |             // escaped backslashes are not separated into different chunks; only bytes
 45 |             // that lie inside the record are inspected
 46 |             while (len > 0 && len < remaining && start_chunk[len - 1] == '\\') {
 47 |                 len += 64;
 48 |             }
 49 |             // rounding and padding can overshoot a short record: never run past its end
 50 |             // NOTE(review): later chunks then get length 0 - confirm LocalBitmap accepts that
 51 |             if (len > remaining) len = remaining;
 52 |         }
 53 |         mBitmaps[i]->setRecordLength(len);
 54 |         start_chunk += len;
 55 |         cur_len += len;
 56 |         // perform context inference and decide whether the program runs in speculative mode
 57 |         if (mBitmaps[i]->contextInference() == UNKNOWN) {
 58 |             mParallelMode = SPECULATIVE;
 59 |         }
 60 |     }
 61 | }
 72 | 
 73 | ParallelBitmap::~ParallelBitmap() {
 74 |     // release the per-chunk bitmaps stored in slots [0, mThreadNum)
 75 |     int idx = 0;
 76 |     while (idx < mThreadNum) delete mBitmaps[idx++];
 77 | }
 78 | 
 79 | int ParallelBitmap::parallelMode() {  // SPECULATIVE or NONSPECULATIVE, as decided by the constructor
 80 |     return this->mParallelMode;
 81 | }
 82 | 
 83 | void ParallelBitmap::setRecordLength(long length) {  // only overwrites the stored length; chunks are not re-split
 84 |     this->mRecordLength = length;
 85 | }
 86 | 
 87 | void ParallelBitmap::rectifyStringMaskBitmaps() {
 88 |     // Validation after Step 3: walk the chunks in order and make each chunk's
 89 |     // start-in-string state equal to the end-in-string state of its predecessor,
 90 |     // inverting the chunk's whole string mask whenever the two disagree.
 91 |     unsigned long carried_state = mBitmaps[0]->mEndInStrBitmap;
 92 |     for (int chunk = 1; chunk < mThreadNum; ++chunk) {
 93 |         LocalBitmap* bm = mBitmaps[chunk];
 94 |         if (bm->mStartInStrBitmap != carried_state) {
 95 |             bm->mStartInStrBitmap = carried_state;
 96 |             // flip string mask bitmaps
 97 |             unsigned long* str_mask = bm->mStrBitmap;
 98 |             for (int word = 0; word < bm->mNumWords; ++word) {
 99 |                 str_mask[word] = ~str_mask[word];
100 |             }
101 |             // the end state flips with the mask: zero becomes all ones, anything else zero
102 |             bm->mEndInStrBitmap = (bm->mEndInStrBitmap == 0) ? 0xffffffffffffffffULL : 0ULL;
103 |         }
104 |         // the (possibly corrected) end state is what the next chunk has to start with
105 |         carried_state = bm->mEndInStrBitmap;
106 |     }
107 | }
108 | 
109 | void ParallelBitmap::mergeBitmaps() {  // validation after step 5: links per-chunk leveled bitmaps and assigns word-id ranges
110 |     // local level L of chunk i maps to cur_level + L + 1 (cur_level = level reached after chunks 0..i-1); all indices stay in [0, MAX_LEVEL)
111 |     int cur_level = mBitmaps[0]->mEndLevel;
112 |     for (int i = 0; i <= mBitmaps[0]->mMaxPositiveLevel && i < MAX_LEVEL; ++i) {
113 |         mBitmaps[0]->mFinalLevColonBitmap[i] = mBitmaps[0]->mLevColonBitmap[i];
114 |         mBitmaps[0]->mFinalLevCommaBitmap[i] = mBitmaps[0]->mLevCommaBitmap[i];
115 |     }
116 |     long offset = mBitmaps[0]->mNumWords;
117 |     mBitmaps[0]->mStartWordId = 0;
118 |     mBitmaps[0]->mEndWordId = offset;
119 |     for (int i = 1; i < mThreadNum; ++i) {  // link leveled colon and comma bitmaps generated from different threads
120 |         LocalBitmap* bm = mBitmaps[i];
121 |         bm->mStartWordId = offset;
122 |         bm->mEndWordId = offset + bm->mNumWords;
123 |         for (int j = 1; j <= -bm->mMinNegativeLevel && j < MAX_LEVEL && (cur_level - j + 1) >= 0; ++j) {  // local level -j is stored at index j
124 |             if (cur_level - j + 1 >= MAX_LEVEL) continue;  // no slot for this level in the final arrays
125 |             bm->mFinalLevColonBitmap[cur_level - j + 1] = bm->mNegLevColonBitmap[j];
126 |             bm->mFinalLevCommaBitmap[cur_level - j + 1] = bm->mNegLevCommaBitmap[j];
127 |         }
128 |         for (int j = 0; j <= bm->mMaxPositiveLevel && j < MAX_LEVEL && (cur_level + j + 1) < MAX_LEVEL; ++j) {
129 |             if (cur_level + j + 1 < 0) continue;  // skip (rather than stop at) targets below level 0
130 |             bm->mFinalLevColonBitmap[cur_level + j + 1] = bm->mLevColonBitmap[j];
131 |             bm->mFinalLevCommaBitmap[cur_level + j + 1] = bm->mLevCommaBitmap[j];
132 |         }
133 |         cur_level += (bm->mEndLevel + 1);
134 |         offset += bm->mNumWords;
135 |     }
136 | }
137 | 


--------------------------------------------------------------------------------
/src/ParallelBitmap.h:
--------------------------------------------------------------------------------
 1 | #ifndef PARALLELBITMAP_H
 2 | #define PARALLELBITMAP_H
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include <string.h>
 6 | #include <ctype.h>
 7 | #include <pthread.h>
 8 | #include <malloc.h>
 9 | #include <sys/time.h>
10 | #include <sys/file.h>
11 | #include <unistd.h>
12 | #include <sched.h>
13 | #include "Bitmap.h"
14 | #include "LocalBitmap.h"
15 | 
16 | #define MAX_THREAD 70  // capacity of the per-thread arrays such as ParallelBitmap::mBitmaps
17 | #define SPECULATIVE 10  // SPECULATIVE / NONSPECULATIVE are the values returned by ParallelBitmap::parallelMode()
18 | #define NONSPECULATIVE 11
19 | 
20 | class ParallelBitmap : public Bitmap {  // bitmap index of one record, split into per-thread chunks (one LocalBitmap each)
21 |     friend class ParallelBitmapConstructor;
22 |     friend class ParallelBitmapIterator;
23 |   private:
24 |     LocalBitmap* mBitmaps[MAX_THREAD];  // slots [0, mThreadNum) are allocated by the constructor and deleted by the destructor
25 |     int mThreadNum;  // number of chunks
26 |     char* mRecord;  // start of the indexed text (pointer only, not a copy)
27 |     long mRecordLength;
28 |     int mDepth;  // depth passed to the constructor and forwarded to every LocalBitmap
29 |     int mParallelMode;  // SPECULATIVE or NONSPECULATIVE
30 | 
31 |   public:
32 |     ParallelBitmap(char* record, int thread_num, int depth);  // record length is taken with strlen(), so record must be NUL-terminated
33 |     ParallelBitmap(char* record, long rec_len, int thread_num, int depth);  // record length given explicitly
34 |     ~ParallelBitmap();
35 |     void setRecordLength(long length);
36 |     // SPECULATIVE or NONSPECULATIVE
37 |     int parallelMode();
38 |     // validation after step 3: corrects string mask bitmaps of chunks whose start state does not match the previous chunk's end state
39 |     void rectifyStringMaskBitmaps();
40 |     // validation after step 5: maps every chunk's leveled bitmaps to their final levels and assigns word-id ranges
41 |     void mergeBitmaps();
42 | };
43 | #endif
44 | 


--------------------------------------------------------------------------------
/src/ParallelBitmapConstructor.cpp:
--------------------------------------------------------------------------------
 1 | #include "ParallelBitmapConstructor.h"
 2 | 
 3 | ParallelBitmap* ParallelBitmapConstructor::mParallelBitmap = NULL;  // bitmap currently being built by construct(); read by the worker threads
 4 | pthread_t ParallelBitmapConstructor::mThreads[MAX_THREAD];  // handles of the worker threads started by construct()
 5 | int ParallelBitmapConstructor::mThreadArgs[MAX_THREAD];  // mThreadArgs[i] is set to i and handed to worker i
 6 | 
 7 | // Starts one thread per chunk running `worker` (thread i receives &args[i], which
 8 | // holds i) and waits for every thread that was started. Returns false if any
 9 | // pthread call failed; threads that are already running are still joined in that
10 | // case, so none of them keeps using the bitmap after the caller gives up on it.
11 | static bool runOnAllThreads(pthread_t* threads, int* args, int thread_num, void* (*worker)(void*)) {
12 |     bool ok = true;
13 |     int started = 0;
14 |     for (; started < thread_num; ++started) {
15 |         args[started] = started;
16 |         int rc = pthread_create(&threads[started], NULL, worker, &args[started]);
17 |         if (rc) {
18 |             cout<<"Thread Error; return code is "<<rc<<endl;
19 |             ok = false;
20 |             break;
21 |         }
22 |     }
23 |     for (int i = 0; i < started; ++i) {
24 |         int rc = pthread_join(threads[i], NULL);
25 |         if (rc) {
26 |             cout<<"Thread Error; return code is "<<rc<<endl;
27 |             ok = false;
28 |         }
29 |     }
30 |     return ok;
31 | }
32 | 
33 | // Builds the parallel bitmap index of `record` with thread_num worker threads.
34 | //   record     - text plus optional rec_start_pos / rec_length window; when rec_length
35 | //                is not positive the window extends to the terminating NUL
36 | //   thread_num - number of chunks/threads, must be within [1, MAX_THREAD]
37 | //   level_num  - forwarded to ParallelBitmap as its depth
38 | // Returns the new ParallelBitmap (it is not freed here on success), or NULL when the
39 | // arguments are invalid or a thread could not be created/joined.
40 | // All scratch state lives in static members, so calls must not overlap.
41 | ParallelBitmap* ParallelBitmapConstructor::construct(Record* record, int thread_num, int level_num) {
42 |     // mThreads and mThreadArgs (like ParallelBitmap::mBitmaps) only have MAX_THREAD slots
43 |     if (record == NULL || record->text == NULL || thread_num < 1 || thread_num > MAX_THREAD) {
44 |         return NULL;
45 |     }
46 |     char* record_text = record->text;
47 |     if (record->rec_start_pos > 0) record_text += record->rec_start_pos;
48 |     // measure from record_text, not record->text: the window may start past offset 0
49 |     long length = (record->rec_length > 0) ? (long)record->rec_length : (long)strlen(record_text);
50 |     mParallelBitmap = new ParallelBitmap(record_text, length, thread_num, level_num);
51 |     bool ok;
52 |     if (mParallelBitmap->parallelMode() == NONSPECULATIVE) {
53 |         ok = runOnAllThreads(mThreads, mThreadArgs, thread_num, &ParallelBitmapConstructor::nonSpecIndexConstruction);
54 |     } else {
55 |         // speculative mode: string masks first, then rectify them, then the leveled bitmaps
56 |         ok = runOnAllThreads(mThreads, mThreadArgs, thread_num, &ParallelBitmapConstructor::buildStringMaskBitmap);
57 |         if (ok) {
58 |             mParallelBitmap->rectifyStringMaskBitmaps();
59 |             ok = runOnAllThreads(mThreads, mThreadArgs, thread_num, &ParallelBitmapConstructor::buildLeveledBitmap);
60 |         }
61 |     }
62 |     if (!ok) {
63 |         // do not leak the half-built index
64 |         delete mParallelBitmap;
65 |         mParallelBitmap = NULL;
66 |         return NULL;
67 |     }
68 |     mParallelBitmap->mergeBitmaps();
69 |     return mParallelBitmap;
70 | }
78 | 


--------------------------------------------------------------------------------
/src/ParallelBitmapConstructor.h:
--------------------------------------------------------------------------------
 1 | #ifndef PARALLELBITMAPCONSTRUCTOR_H
 2 | #define PARALLELBITMAPCONSTRUCTOR_H
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include <string.h>
 6 | #include <ctype.h>
 7 | #include <pthread.h>
 8 | #include <malloc.h>
 9 | #include <sys/time.h>
10 | #include <sys/file.h>
11 | #include <unistd.h>
12 | #include <sched.h>
13 | #include "ParallelBitmap.h"
14 | #include "Records.h"
15 | 
16 | // ParallelBitmapConstructor is a static class: construct() is its only public entry
17 | // point, the private functions are the thread entry points it hands to pthread_create.
18 | class ParallelBitmapConstructor {
19 |   private:
20 |     // bitmap currently being built; read by the worker threads
21 |     static ParallelBitmap* mParallelBitmap;
22 |     static pthread_t mThreads[MAX_THREAD];
23 |     // mThreadArgs[i] holds i and is passed to worker i as its argument
24 |     static int mThreadArgs[MAX_THREAD];
25 |   public:
26 |     static ParallelBitmap* construct(Record* record, int thread_num, int level_num = MAX_LEVEL);
27 |   private:
28 |     // Pins the calling thread to CPU number thread_id; a failure is only reported.
29 |     static void bindToCpu(int thread_id) {
30 |         cpu_set_t mask;
31 |         CPU_ZERO(&mask);
32 |         CPU_SET(thread_id, &mask);
33 |         // pthread_setaffinity_np returns 0 on success and a positive error number on
34 |         // failure, so the result is compared against 0 (it is never negative)
35 |         if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) != 0)
36 |             cout<<"CPU binding failed for thread "<<thread_id<<endl;
37 |     }
38 |     // builds bitmap index in non-speculative mode; arg points to the thread/chunk id
39 |     static void* nonSpecIndexConstruction(void* arg) {
40 |         int thread_id = *((int*)arg);
41 |         bindToCpu(thread_id);
42 |         mParallelBitmap->mBitmaps[thread_id]->nonSpecIndexConstruction();
43 |         return NULL;
44 |     }
45 |     // builds bitmap index in speculative mode (Step 1 - 3)
46 |     static void* buildStringMaskBitmap(void* arg) {
47 |         int thread_id = *((int*)arg);
48 |         bindToCpu(thread_id);
49 |         mParallelBitmap->mBitmaps[thread_id]->buildStringMaskBitmap();
50 |         return NULL;
51 |     }
52 |     // finishes the last two steps of structural index construction
53 |     static void* buildLeveledBitmap(void* arg) {
54 |         int thread_id = *((int*)arg);
55 |         bindToCpu(thread_id);
56 |         mParallelBitmap->mBitmaps[thread_id]->buildLeveledBitmap();
57 |         return NULL;
58 |     }
59 | };
76 | #endif
77 | 


--------------------------------------------------------------------------------
/src/ParallelBitmapIterator.cpp:
--------------------------------------------------------------------------------
  1 | #include "ParallelBitmapIterator.h"
  2 | #include <sys/time.h>
  3 | #include <pthread.h>
  4 | 
  5 | // Per-chunk view of the linked leveled bitmaps, filled by gatherParallelBitmapInfo().
  6 | struct ParallelBitmapMetadata {
  7 |     long start_word_id;  // first word of the chunk; long like the LocalBitmap field it is copied from
  8 |     long end_word_id;    // one past the last word of the chunk
  9 |     unsigned long* quote_bitmap;
 10 |     unsigned long* lev_colon_bitmap[MAX_LEVEL + 1];
 11 |     unsigned long* lev_comma_bitmap[MAX_LEVEL + 1];
 12 | };
 13 | ParallelBitmapMetadata pb_metadata[MAX_THREAD];
 14 | 
 15 | // Arguments and results of one generateCommaPositionsInThread() worker.
 16 | struct CommaPosInfo {
 17 |     int thread_id;
 18 |     int level;
 19 |     long start_pos;
 20 |     long end_pos;
 21 |     long* comma_positions;     // result buffer, allocated by the worker
 22 |     long top_comma_positions;  // index of the last stored position, -1 when none
 23 | };
 24 | CommaPosInfo comma_pos_info[MAX_THREAD];
 25 | 
 26 | int num_of_threads = 1;
 27 | 
 28 | // Thread entry point: `arg` points to a chunk id. Reads the request from comma_pos_info[id]
 29 | // and stores the offsets of that chunk's commas at the requested level that fall within
 30 | // [start_pos, end_pos] in comma_positions[0 .. top_comma_positions]. Always returns NULL.
 31 | void* generateCommaPositionsInThread(void* arg) {
 32 |     const int thread_id = *((int*)arg);
 33 |     CommaPosInfo& info = comma_pos_info[thread_id];
 34 |     const long start_pos = info.start_pos;
 35 |     const long end_pos = info.end_pos;
 36 |     const long capacity = MAX_NUM_ELE / num_of_threads + 1;
 37 |     info.comma_positions = new long[capacity];
 38 |     info.top_comma_positions = -1;
 39 |     // bind CPU; pthread functions report failure with a non-zero (positive) error number
 40 |     cpu_set_t mask;
 41 |     CPU_ZERO(&mask);
 42 |     CPU_SET(thread_id, &mask);
 43 |     if(pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) != 0)
 44 |         cout<<"CPU binding failed for thread "<<thread_id<<endl;
 45 |     unsigned long* levels = pb_metadata[thread_id].lev_comma_bitmap[info.level];
 46 |     if (levels == NULL) {
 47 |         return NULL;
 48 |     }
 49 |     const long cur_start_pos = pb_metadata[thread_id].start_word_id;
 50 |     const long cur_end_pos = pb_metadata[thread_id].end_word_id;
 51 |     const long last_word = (long)ceil(double(end_pos) / 64);
 52 |     const long st = cur_start_pos > (start_pos / 64) ? cur_start_pos : (start_pos / 64);
 53 |     const long ed = cur_end_pos < last_word ? cur_end_pos : last_word;
 54 |     for (long i = st; i < ed; ++i) {
 55 |         unsigned long commabit = levels[i - cur_start_pos];  // chunk-relative index (chunk 0 starts at word 0)
 56 |         while (commabit) {
 57 |             long offset = i * 64 + __builtin_ctzll(commabit);
 58 |             if (start_pos <= offset && offset <= end_pos) {
 59 |                 if (info.top_comma_positions + 1 >= capacity) return NULL;  // buffer full: never write past it
 60 |                 info.comma_positions[++info.top_comma_positions] = offset;
 61 |             }
 62 |             commabit = commabit & (commabit - 1);
 63 |         }
 64 |     }
 65 |     return NULL;
 66 | }
 68 | 
 69 | void ParallelBitmapIterator::generateCommaPositionsParallel(long start_pos, long end_pos, int level, long* comma_positions, long& top_comma_positions) {
 70 |     // appends the offsets of the commas at `level` within [start_pos, end_pos] to comma_positions, one worker thread per involved chunk
 71 |     // find starting and ending chunks in linked leveled comma bitmaps
 72 |     int start_chunk = -1, end_chunk = -1;
 73 |     int chunk_num = mParallelBitmap->mThreadNum;
 74 |     for (int i = mCurChunkId; i < chunk_num; ++i) {
 75 |         if (pb_metadata[i].start_word_id <= (start_pos / 64)) {
 76 |             start_chunk = i;
 77 |         }
 78 |         if (pb_metadata[i].end_word_id >= (ceil(double(end_pos) / 64)) && end_chunk == -1) {
 79 |             end_chunk = i;
 80 |         }
 81 |         if (start_chunk > -1 && end_chunk > -1) break;
 82 |     }
 83 |     if(start_chunk == 0 && end_chunk == -1) end_chunk = 0;
 84 |     if (start_chunk < 0) return;  // no starting chunk found: -1 must never be used as an array index
 85 |     mCurChunkId = start_chunk;
 86 |     pthread_t thread[MAX_THREAD];
 87 |     int thread_args[MAX_THREAD];
 88 |     // launch one worker per involved chunk; `launched` ends one past the last started worker
 89 |     int launched = start_chunk;
 90 |     for (; launched <= end_chunk; ++launched) {
 91 |         thread_args[launched] = launched;
 92 |         comma_pos_info[launched].thread_id = launched;
 93 |         comma_pos_info[launched].level = level;
 94 |         comma_pos_info[launched].start_pos = start_pos;
 95 |         comma_pos_info[launched].end_pos = end_pos;
 96 |         int rc = pthread_create(&thread[launched], NULL, generateCommaPositionsInThread, &thread_args[launched]);
 97 |         if (rc) {
 98 |             cout<<"Thread Error; return code is "<<rc<<endl;
 99 |             break;
100 |         }
101 |     }
102 |     const bool create_failed = (launched <= end_chunk);  // the loop above was left early
103 |     for (int i = start_chunk; i < launched; ++i) {  // started workers are always joined
104 |         int rc = pthread_join(thread[i], NULL);
105 |         if (rc) {
106 |             cout<<"Thread Error; return code is "<<rc<<endl;
107 |             return;
108 |         }
109 |     }
110 |     for (int i = start_chunk; i < launched; ++i) {
111 |         for (long j = 0; !create_failed && j <= comma_pos_info[i].top_comma_positions; ++j) {
112 |             comma_positions[++top_comma_positions] = comma_pos_info[i].comma_positions[j];
113 |         }
114 |         delete[] comma_pos_info[i].comma_positions;  // allocated with new[] by the worker, so free() is not valid here
115 |     }
116 | }
116 | 
117 | // Saving metadata of linked leveled bitmap in consecutive order can further improve the performance.
118 | void ParallelBitmapIterator::gatherParallelBitmapInfo() {  // copies each chunk's word range, quote bitmap and linked leveled bitmaps into pb_metadata
119 |     int chunk_num = mParallelBitmap->mThreadNum;
120 |     int depth = mParallelBitmap->mDepth < MAX_LEVEL ? mParallelBitmap->mDepth : MAX_LEVEL - 1;  // mFinalLev*Bitmap hold MAX_LEVEL entries: index MAX_LEVEL must not be read
121 |     for (int chunk_id = 0; chunk_id < chunk_num; ++chunk_id) {
122 |         pb_metadata[chunk_id].start_word_id = mParallelBitmap->mBitmaps[chunk_id]->mStartWordId;
123 |         pb_metadata[chunk_id].end_word_id = mParallelBitmap->mBitmaps[chunk_id]->mEndWordId;
124 |         pb_metadata[chunk_id].quote_bitmap = mParallelBitmap->mBitmaps[chunk_id]->mQuoteBitmap;
125 |         for (int l = 0; l <= depth; ++l) {
126 |             pb_metadata[chunk_id].lev_colon_bitmap[l] = mParallelBitmap->mBitmaps[chunk_id]->mFinalLevColonBitmap[l];
127 |             pb_metadata[chunk_id].lev_comma_bitmap[l] = mParallelBitmap->mBitmaps[chunk_id]->mFinalLevCommaBitmap[l];
128 |         }
129 |     }
130 | }
131 | 
132 | void ParallelBitmapIterator::generateColonPositions(long start_pos, long end_pos, int level, long* colon_positions, long& top_colon_positions) {
133 |     // Appends to colon_positions (advancing top_colon_positions) the offsets of all colons
134 |     // recorded at `level` within [start_pos, end_pos], walking the chunks that cover the range.
135 |     // find starting and ending chunks in linked leveled colon bitmaps
136 |     int start_chunk = -1;
137 |     int end_chunk = -1;
138 |     int thread_num = mParallelBitmap->mThreadNum;
139 |     for (int i = mCurChunkId; i < thread_num; ++i) {
140 |         if (pb_metadata[i].start_word_id <= (start_pos / 64)) {
141 |             start_chunk = i;
142 |         }
143 |         if (pb_metadata[i].end_word_id >= (ceil(double(end_pos) / 64)) && end_chunk == -1) {
144 |             end_chunk = i;
145 |         }
146 |         if (start_chunk > -1 && end_chunk > -1) break;
147 |     }
148 |     if(start_chunk == 0 && end_chunk == -1) end_chunk = 0;
149 |     // no starting chunk found from mCurChunkId on: -1 must never be used as an array index
150 |     if (start_chunk < 0) return;
151 |     mCurChunkId = start_chunk;
152 |     // iterate through the corresponding linked leveled colon bitmaps
153 |     for (int cur_chunk = start_chunk; cur_chunk <= end_chunk; ++cur_chunk) {
154 |         unsigned long* levels = pb_metadata[cur_chunk].lev_colon_bitmap[level];
155 |         if (levels == NULL) continue;  // this chunk has no bitmap for the level
156 |         long cur_start_pos = pb_metadata[cur_chunk].start_word_id;
157 |         long cur_end_pos = pb_metadata[cur_chunk].end_word_id;
158 |         long st = cur_start_pos > (start_pos / 64) ? cur_start_pos : (start_pos / 64);
159 |         long ed = cur_end_pos < (ceil(double(end_pos) / 64)) ? cur_end_pos : (ceil(double(end_pos) / 64));
160 |         for (long i = st; i < ed; ++i) {
161 |             unsigned long colonbit = levels[i - cur_start_pos];  // chunk-relative index (chunk 0 starts at word 0)
162 |             while (colonbit) {
163 |                 long offset = i * 64 + __builtin_ctzll(colonbit);
164 |                 if (start_pos <= offset && offset <= end_pos) {
165 |                     colon_positions[++top_colon_positions] = offset;
166 |                 }
167 |                 colonbit = colonbit & (colonbit - 1);
168 |             }
169 |         }
170 |     }
171 | }
178 | 
179 | void ParallelBitmapIterator::generateCommaPositions(long start_pos, long end_pos, int level, long* comma_positions, long& top_comma_positions) {
180 |     // Appends to comma_positions (advancing top_comma_positions) the offsets of all commas
181 |     // recorded at `level` within [start_pos, end_pos], walking the chunks that cover the range.
182 |     // find starting and ending chunks in linked leveled comma bitmaps
183 |     int start_chunk = -1;
184 |     int end_chunk = -1;
185 |     int chunk_num = mParallelBitmap->mThreadNum;
186 |     for (int i = mCurChunkId; i < chunk_num; ++i) {
187 |         if (pb_metadata[i].start_word_id <= (start_pos / 64)) {
188 |             start_chunk = i;
189 |         }
190 |         if (pb_metadata[i].end_word_id >= (ceil(double(end_pos) / 64)) && end_chunk == -1) {
191 |             end_chunk = i;
192 |         }
193 |         if (start_chunk > -1 && end_chunk > -1) break;
194 |     }
195 |     if(start_chunk == 0 && end_chunk == -1) end_chunk = 0;
196 |     // no starting chunk found from mCurChunkId on: -1 must never be used as an array index
197 |     if (start_chunk < 0) return;
198 |     mCurChunkId = start_chunk;
199 |     // iterate through the corresponding linked leveled comma bitmaps
200 |     for (int cur_chunk = start_chunk; cur_chunk <= end_chunk; ++cur_chunk) {
201 |         unsigned long* levels = pb_metadata[cur_chunk].lev_comma_bitmap[level];
202 |         if (levels == NULL) continue;  // this chunk has no bitmap for the level
203 |         long cur_start_pos = pb_metadata[cur_chunk].start_word_id;
204 |         long cur_end_pos = pb_metadata[cur_chunk].end_word_id;
205 |         long st = cur_start_pos > (start_pos / 64) ? cur_start_pos : (start_pos / 64);
206 |         long ed = cur_end_pos < (ceil(double(end_pos) / 64)) ? cur_end_pos : (ceil(double(end_pos) / 64));
207 |         for (long i = st; i < ed; ++i) {
208 |             unsigned long commabit = levels[i - cur_start_pos];  // chunk-relative index (chunk 0 starts at word 0)
209 |             while (commabit) {
210 |                 long offset = i * 64 + __builtin_ctzll(commabit);
211 |                 if (start_pos <= offset && offset <= end_pos) {
212 |                     comma_positions[++top_comma_positions] = offset;
213 |                 }
214 |                 commabit = commabit & (commabit - 1);
215 |             }
216 |         }
217 |     }
218 | }
225 | 
226 | bool ParallelBitmapIterator::findFieldQuotePos(long colon_pos, long& start_pos, long& end_pos) {  // finds the last two quotes before colon_pos; true if both found, offsets returned in start_pos/end_pos
227 |     long w_id = colon_pos/64;  // word containing the colon; the scan moves backwards from here
228 |     long offset = colon_pos%64;  // not used below: the loop declares its own `offset`
229 |     long start_quote = 0;  // NOTE(review): 0 doubles as the "not found yet" marker, so a quote at record offset 0 cannot be told apart - confirm that case cannot matter
230 |     long end_quote = 0;
231 |     start_pos = 0; end_pos = 0;
232 |     int cur_chunk = -1;
233 |     int chunk_num = mParallelBitmap->mThreadNum;
234 |     // find the chunk where the current colon is in
235 |     for (int i = mCurChunkId; i < chunk_num; ++i) {
236 |         if (w_id >= pb_metadata[i].start_word_id && w_id < pb_metadata[i].end_word_id) {
237 |             cur_chunk = i;
238 |             break;
239 |         }
240 |     }
241 |     if (cur_chunk == -1) {
242 |         return false;
243 |     }
244 |     while (w_id >= 0)
245 |     {
246 |         // check whether the current chunk needs to be updated
247 |         if (w_id < pb_metadata[cur_chunk].start_word_id) {
248 |             // the scan moved below this chunk's first word: continue in the previous chunk
249 |             if ((--cur_chunk) == -1) {
250 |                 return false;
251 |             }
252 |         }
253 |         long quote_id = w_id - pb_metadata[cur_chunk].start_word_id;  // quote bitmaps are indexed relative to the chunk's first word
254 |         unsigned long quotebit = pb_metadata[cur_chunk].quote_bitmap[quote_id];
255 |         unsigned long offset = w_id * 64 + __builtin_ctzll(quotebit);  // undefined value when quotebit is 0, but the loop tests quotebit before using it
256 |         while (quotebit && offset < colon_pos)  // visits the quotes of this word that lie before the colon, lowest offset first
257 |         {
258 |             if (end_pos != 0)  // closing quote already found in a later word: the last quote seen here becomes the opening one
259 |             {
260 |                 start_quote = offset;
261 |             }
262 |             else if(start_quote == 0)
263 |             {
264 |                 start_quote = offset;
265 |             }
266 |             else if(end_quote == 0)
267 |             { 
268 |                 end_quote = offset;
269 |             }
270 |             else  // more than two quotes in the word: keep only the last two
271 |             {
272 |                 start_quote = end_quote;
273 |                 end_quote = offset;
274 |             }
275 |             quotebit = quotebit & (quotebit - 1);  // clear the lowest set bit
276 |             offset = w_id * 64 + __builtin_ctzll(quotebit); 
277 |         }
278 |         if(start_quote != 0 && end_quote == 0)  // only one quote so far: treat it as the closing quote and keep scanning earlier words
279 |         {
280 |             end_quote = start_quote;
281 |             start_quote = 0;
282 |             end_pos = end_quote;
283 |         }
284 |         else if(start_quote != 0 && end_quote != 0)  // both quotes known
285 |         {
286 |             start_pos = start_quote;
287 |             end_pos = end_quote;
288 |             return true;
289 |         }
290 |         --w_id;
291 |     }
292 |     return false;
293 | }
294 | 
295 | // Creates an iterator rooted at the current level; it shares that level's position array. The caller deletes it.
296 | ParallelBitmapIterator* ParallelBitmapIterator::getCopy() {
297 |     ParallelBitmapIterator* pbi = new ParallelBitmapIterator();
298 |     pbi->mParallelBitmap = mParallelBitmap;
299 |     pbi->mCurLevel = pbi->mTopLevel = mCurLevel;
300 |     pbi->mCurChunkId = mCurChunkId;
301 |     pbi->mFindDomArray = mFindDomArray;
302 |     pbi->mCopiedIterator = true;
303 |     // reset every level unconditionally: the destructor and down() read these flags and pointers
304 |     for (int i = 0; i < MAX_LEVEL; ++i) {
305 |         pbi->mPosArrAlloc[i] = false;
306 |         pbi->mCtxInfo[i].positions = NULL;
307 |     }
308 |     if (mCurLevel >= 0) {
309 |         pbi->mCtxInfo[mCurLevel].type = mCtxInfo[mCurLevel].type;
310 |         pbi->mCtxInfo[mCurLevel].positions = mCtxInfo[mCurLevel].positions;
311 |         pbi->mCtxInfo[mCurLevel].start_idx = mCtxInfo[mCurLevel].start_idx;
312 |         pbi->mCtxInfo[mCurLevel].end_idx = mCtxInfo[mCurLevel].end_idx;
313 |         pbi->mCtxInfo[mCurLevel].cur_idx = -1;  // the copy starts without a selected element
314 |         pbi->mPosArrAlloc[mCurLevel] = mPosArrAlloc[mCurLevel];
315 |     }
316 |     return pbi;
317 | }
318 | 
319 | bool ParallelBitmapIterator::up() {  // climbs one level; refused at the iterator's top level
320 |     const bool at_top = (mCurLevel == mTopLevel);
321 |     if (!at_top) --mCurLevel;
322 |     return !at_top;
323 | }
324 | 
325 | bool ParallelBitmapIterator::down() { // descends into the object/array at the current position; false if there is none
326 |     if (mCurLevel < mTopLevel || mCurLevel > mParallelBitmap->mDepth) return false; // level outside the range this iterator may visit
327 |     ++mCurLevel; // tentatively enter the child level; undone on every failure path below
328 |     long  start_pos = -1;
329 |     long end_pos = -1;
330 |     int thread_num = mParallelBitmap->mThreadNum;
331 |     if (mCurLevel == mTopLevel + 1) { // first level below this iterator's top level
332 |         if (mTopLevel == -1) { // iterator rooted at the record itself: scan the whole record
333 |             long text_length = mParallelBitmap->mRecordLength;
334 |             start_pos = 0;
335 |             end_pos = text_length;
336 |             mCtxInfo[mCurLevel].positions = (long*)malloc((text_length / thread_num + 1) * sizeof (long)); // NOTE(review): malloc result is not checked
337 |             mPosArrAlloc[mCurLevel] = true; // marks the buffer as owned so the destructor frees it
338 |         } else { // top level lies inside the record: range is bounded by the parent's current and next position
339 |             long cur_idx = mCtxInfo[mCurLevel - 1].cur_idx;
340 |             start_pos = mCtxInfo[mCurLevel - 1].positions[cur_idx];
341 |             end_pos = mCtxInfo[mCurLevel - 1].positions[cur_idx + 1];
342 |             if (mCtxInfo[mCurLevel].positions == NULL || mPosArrAlloc[mCurLevel] == false) { // no buffer owned for this level yet
343 |                 mCtxInfo[mCurLevel].positions = (long*)malloc((MAX_NUM_ELE / thread_num + 1) * sizeof (long));
344 |                 mPosArrAlloc[mCurLevel] = true;
345 |             }
346 |         }
347 |         mCtxInfo[mCurLevel].start_idx = 0; // a freshly used buffer is filled from index 0
348 |         mCtxInfo[mCurLevel].cur_idx = -1; // no element selected yet
349 |         mCtxInfo[mCurLevel].end_idx = -1; // nothing stored yet
350 |     } else { // deeper levels share the parent's position array
351 |         long cur_idx = mCtxInfo[mCurLevel - 1].cur_idx;
352 |         if (cur_idx > mCtxInfo[mCurLevel - 1].end_idx) { // parent position is past its last entry
353 |             --mCurLevel;
354 |             return false;
355 |         }
356 |         start_pos = mCtxInfo[mCurLevel - 1].positions[cur_idx];
357 |         end_pos = mCtxInfo[mCurLevel - 1].positions[cur_idx + 1];
358 |         mCtxInfo[mCurLevel].positions = mCtxInfo[mCurLevel - 1].positions;
359 |         mCtxInfo[mCurLevel].start_idx = mCtxInfo[mCurLevel - 1].end_idx + 1; // entries of this level begin right after the parent's
360 |         mCtxInfo[mCurLevel].cur_idx = mCtxInfo[mCurLevel - 1].end_idx; // i.e. start_idx - 1: no element selected yet
361 |         mCtxInfo[mCurLevel].end_idx = mCtxInfo[mCurLevel - 1].end_idx;
362 |     }
363 |     long i = start_pos;
364 |     if (start_pos > 0 || mCurLevel > 0) ++i; // step over the character at start_pos unless this is offset 0 at level 0
365 |     char ch = mParallelBitmap->mRecord[i];
366 |     while (i < end_pos && (ch == ' ' || ch == '\n')) { // skip blanks and newlines in front of the value
367 |         ch = mParallelBitmap->mRecord[++i];
368 |     }
369 |     if (mParallelBitmap->mRecord[i] == '{') {
370 |         mCtxInfo[mCurLevel].type = OBJECT;
371 |         generateColonPositions(i, end_pos, mCurLevel, mCtxInfo[mCurLevel].positions, mCtxInfo[mCurLevel].end_idx); // end_idx is passed by reference and updated by the generator
372 |         return true;
373 |     } else if (mParallelBitmap->mRecord[i] == '[') {
374 |         mCtxInfo[mCurLevel].type = ARRAY;
375 |         if (mFindDomArray == false && (end_pos - i + 1) > SINGLE_THREAD_MAX_ARRAY_SIZE) { // first array larger than the single-thread limit
376 |             generateCommaPositionsParallel(i, end_pos, mCurLevel, mCtxInfo[mCurLevel].positions, mCtxInfo[mCurLevel].end_idx);
377 |             mFindDomArray = true; // the parallel path is taken only once per iterator
378 |         } else {
379 |             generateCommaPositions(i, end_pos, mCurLevel, mCtxInfo[mCurLevel].positions, mCtxInfo[mCurLevel].end_idx);
380 |         }
381 |         return true;
382 |     }
383 |     --mCurLevel; // the value is neither an object nor an array: undo the level change
384 |     return false;
385 | }
386 | 
387 | // True when the current level lies within [0, mDepth] and its context is an object.
388 | bool ParallelBitmapIterator::isObject() {
389 |     const int lvl = mCurLevel;
390 |     const bool in_range = (lvl >= 0) && (lvl <= mParallelBitmap->mDepth);
391 |     return in_range && mCtxInfo[lvl].type == OBJECT;
392 | }
393 | 
394 | // True when the current level lies within [0, mDepth] and its context is an array.
395 | bool ParallelBitmapIterator::isArray() {
396 |     const int lvl = mCurLevel;
397 |     const bool in_range = (lvl >= 0) && (lvl <= mParallelBitmap->mDepth);
398 |     return in_range && mCtxInfo[lvl].type == ARRAY;
399 | }
400 | 
401 | bool ParallelBitmapIterator::moveNext() {  // advances to the next element of the current array
402 |     if (!isArray()) return false;
403 |     IterCtxInfo& ctx = mCtxInfo[mCurLevel];
404 |     if (ctx.cur_idx + 1 >= ctx.end_idx) return false;  // the slot at end_idx is not an element
405 |     ++ctx.cur_idx;
406 |     return true;
407 | }
408 | 
409 | // Moves to the field named `key` in the current object, searching forward from the field after
410 | // the current one. Returns false when not inside an object or when no remaining field matches.
411 | bool ParallelBitmapIterator::moveToKey(char* key) {
412 |     if (mCurLevel < 0 || mCurLevel > mParallelBitmap->mDepth || mCtxInfo[mCurLevel].type != OBJECT) return false;
413 |     const long wanted_size = (long)strlen(key);  // loop-invariant, so computed once
414 |     const long end_idx = mCtxInfo[mCurLevel].end_idx;
415 |     for (long cur_idx = mCtxInfo[mCurLevel].cur_idx + 1; cur_idx < end_idx; ++cur_idx) {
416 |         long start_pos = 0, end_pos = 0;
417 |         if (!findFieldQuotePos(mCtxInfo[mCurLevel].positions[cur_idx], start_pos, end_pos)) return false;
418 |         if (end_pos - start_pos - 1 != wanted_size) continue;
419 |         const char* field = mParallelBitmap->mRecord + start_pos + 1;
420 |         // mKey mirrors the inspected name only when it fits; longer names are compared in place
421 |         if (wanted_size < MAX_FIELD_SIZE) {
422 |             memcpy(mKey, field, wanted_size);
423 |             mKey[wanted_size] = '\0';
424 |         }
425 |         if (memcmp(field, key, wanted_size) == 0) {
426 |             mCtxInfo[mCurLevel].cur_idx = cur_idx;
427 |             return true;
428 |         }
429 |     }
430 |     return false;
431 | }
432 | 
433 | // Moves to the next field of the current object whose name is in key_set and returns that key
434 | // (the pointer stored in the set), which is then erased from key_set. Returns NULL when key_set
435 | // is empty, the iterator is not inside an object, or no remaining field matches.
436 | char* ParallelBitmapIterator::moveToKey(unordered_set<char*>& key_set) {
437 |     if (key_set.empty() == true || mCurLevel < 0 || mCurLevel > mParallelBitmap->mDepth || mCtxInfo[mCurLevel].type != OBJECT) return NULL;
438 |     long cur_idx = mCtxInfo[mCurLevel].cur_idx + 1;
439 |     const long end_idx = mCtxInfo[mCurLevel].end_idx;
440 |     for (; cur_idx < end_idx; ++cur_idx) {
441 |         long start_pos = 0, end_pos = 0;
442 |         if (!findFieldQuotePos(mCtxInfo[mCurLevel].positions[cur_idx], start_pos, end_pos)) return NULL;
443 |         // name and length of the inspected field are the same for every candidate key
444 |         const long key_size = end_pos - start_pos - 1;
445 |         const char* field = mParallelBitmap->mRecord + start_pos + 1;
446 |         bool has_m_key = false;
447 |         for (unordered_set<char*>::iterator iter = key_set.begin(); iter != key_set.end(); ++iter) {
448 |             char* key = (*iter);
449 |             if ((size_t)key_size != strlen(key)) continue;
450 |             // mKey mirrors the name only when it fits; longer names are compared in place
451 |             if (has_m_key == false && key_size < MAX_FIELD_SIZE) {
452 |                 memcpy(mKey, field, key_size);
453 |                 mKey[key_size] = '\0';
454 |                 has_m_key = true;
455 |             }
456 |             if (memcmp(field, key, key_size) == 0) {
457 |                 mCtxInfo[mCurLevel].cur_idx = cur_idx;
458 |                 key_set.erase(iter);
459 |                 return key;
460 |             }
461 |         }
462 |     }
463 |     mCtxInfo[mCurLevel].cur_idx = cur_idx;  // nothing matched: position is left at the end of the object
464 |     return NULL;
465 | }
466 | 
467 | // Size of the current array computed as end_idx - start_idx; 0 when not positioned on an array.
468 | int ParallelBitmapIterator::numArrayElements() {
469 |     if (!isArray()) return 0;
470 |     const IterCtxInfo& ctx = mCtxInfo[mCurLevel];
471 |     return ctx.end_idx - ctx.start_idx;
472 | }
473 | 
474 | bool ParallelBitmapIterator::moveToIndex(int index) {  // selects element `index` (0-based) of the current array
475 |     if (mCurLevel < 0 || mCurLevel > mParallelBitmap->mDepth || mCtxInfo[mCurLevel].type != ARRAY || index < 0) return false;
476 |     long next_idx = mCtxInfo[mCurLevel].start_idx + index;
477 |     if (next_idx >= mCtxInfo[mCurLevel].end_idx) return false;  // same bound as moveNext(): the slot at end_idx is not an element
478 |     mCtxInfo[mCurLevel].cur_idx = next_idx;
479 |     return true;
480 | }
481 | 
482 | // Returns a newly malloc'ed, NUL-terminated copy of the text between the current ':' or ','
483 | // and the next one (inside objects: up to the opening quote of the next field name).
484 | // Every non-NULL result is owned by the caller and can be passed to free(). Returns NULL when
485 | // the iterator has no current element or the allocation fails.
486 | char* ParallelBitmapIterator::getValue() {
487 |     if (mCurLevel < 0 || mCurLevel > mParallelBitmap->mDepth) return NULL;
488 |     long cur_idx = mCtxInfo[mCurLevel].cur_idx;
489 |     long next_idx = cur_idx + 1;
490 |     if (cur_idx < mCtxInfo[mCurLevel].start_idx || next_idx > mCtxInfo[mCurLevel].end_idx) return NULL;
491 |     long cur_pos = mCtxInfo[mCurLevel].positions[cur_idx];
492 |     long next_pos = mCtxInfo[mCurLevel].positions[next_idx];
493 |     long text_length = next_pos - cur_pos - 1;
494 |     if (mCtxInfo[mCurLevel].type == OBJECT && next_idx < mCtxInfo[mCurLevel].end_idx) {
495 |         long start_pos = 0, end_pos = 0;
496 |         // the value ends at the opening quote of the next field name; without one it is empty
497 |         text_length = findFieldQuotePos(next_pos, start_pos, end_pos) ? start_pos - cur_pos - 1 : 0;
498 |     }
499 |     if (text_length < 0) text_length = 0;
500 |     // empty values are heap-allocated as well, so the caller can free() whatever it receives
501 |     char* ret = (char*)malloc(text_length + 1);
502 |     if (ret == NULL) return NULL;
503 |     memcpy(ret, mParallelBitmap->mRecord + cur_pos + 1, text_length);
504 |     ret[text_length] = '\0';
505 |     return ret;
506 | }
507 | 


--------------------------------------------------------------------------------
/src/ParallelBitmapIterator.h:
--------------------------------------------------------------------------------
  1 | #ifndef PARALLELBITMAPITERATOR_H
  2 | #define PARALLELBITMAPITERATOR_H
  3 | 
  4 | #include "ParallelBitmapConstructor.h"
  5 | #include "BitmapIterator.h"
  6 | #include <string.h>
  7 | #include <unordered_set>
  8 | using namespace std;
  9 | 
 10 | #define MAX_NUM_ELE 1000000
 11 | #define SINGLE_THREAD_MAX_ARRAY_SIZE 100000
 12 | 
 13 | class ParallelBitmapIterator : public BitmapIterator {  // iterator over a record indexed by a ParallelBitmap
 14 |   private:
 15 |     ParallelBitmap* mParallelBitmap;
 16 |     IterCtxInfo mCtxInfo[MAX_LEVEL];  // per-level context (type, positions, start/cur/end index)
 17 |     bool mPosArrAlloc[MAX_LEVEL];     // true when the positions array of that level was malloc'ed by this iterator
 18 |     int mCurLevel;
 19 |     int mTopLevel;                    // up() never moves above this level; -1 for an iterator rooted at the record
 20 |     char mKey[MAX_FIELD_SIZE];
 21 |     // improve the performance when iterating leveled bitmap generated by multiple threads
 22 |     // mCurChunkId: pointer to the chunk which generates the current leveled bitmaps this iterator visits
 23 |     int mCurChunkId;
 24 |     bool mFindDomArray;
 25 |     bool mCopiedIterator;
 26 |   public:
 27 |     // Unattached iterator (getCopy() fills it in); every member read by the destructor starts out defined.
 28 |     ParallelBitmapIterator() : mParallelBitmap(NULL), mCtxInfo(), mPosArrAlloc(), mCurLevel(-1), mTopLevel(-1), mCurChunkId(0), mFindDomArray(false), mCopiedIterator(false) {}
 29 |     
 30 |     ParallelBitmapIterator(ParallelBitmap* pbm) {
 31 |         mParallelBitmap = pbm;
 32 |         mCurLevel = -1;
 33 |         mTopLevel = -1;
 34 |         mCurChunkId = 0;
 35 |         mFindDomArray = false;
 36 |         mCopiedIterator = false;
 37 |         for (int i = 0; i < MAX_LEVEL; ++i) {
 38 |             mPosArrAlloc[i] = false;
 39 |         }
 40 |         gatherParallelBitmapInfo();
 41 |         // initially, iterator points to the first level of the record
 42 |         down(); 
 43 |     }
 44 | 
 45 |     ~ParallelBitmapIterator() {
 46 |         for (int i = mTopLevel + 1; i < MAX_LEVEL; ++i) {
 47 |             if (mPosArrAlloc[i] == true) {
 48 |                 free(mCtxInfo[i].positions);
 49 |                 mCtxInfo[i].positions = NULL;
 50 |             } else {
 51 |                 break;
 52 |             }
 53 |         }
 54 |     }
 55 | 
 56 |     // Creates a copy of iterator. Often used for parallel querying.
 57 |     ParallelBitmapIterator* getCopy();
 58 |     // Moves back to the object or array which contains the current nested record.
 59 |     // Often used when the current nested record has been processed.
 60 |     // Valid except for the first level of the record.
 61 |     bool up();
 62 |     // Moves to the start of the nested object or array.
 63 |     // Gets all colon or comma positions from leveled bitmap indexes for current nested record.
 64 |     // Valid if we are at { or [.
 65 |     bool down();
 66 |     // Whether the iterator points to an object.
 67 |     bool isObject();
 68 |     // Whether the iterator points to an array.
 69 |     bool isArray();
 70 |     // Moves iterator to the next array item.
 71 |     bool moveNext();
 72 |     // Returns the size of the current key field.
 73 |     int keySize();
 74 |     // Gets the content of the current key field.
 75 |     char* getKey();
 76 |     // Moves to the corresponding key field inside the current object.
 77 |     bool moveToKey(char* key);
 78 |     // Moves to the corresponding key fields inside the current object, returns the current key name.
 79 |     // After this operation, the current key field will be removed from key_set.
 80 |     char* moveToKey(unordered_set<char*>& key_set);
 81 |     // Returns the number of elements inside current array.
 82 |     int numArrayElements();
 83 |     // If the current record is an array, moves to an item based on index.
 84 |     // Returns false if the index is out of the boundary.
 85 |     bool moveToIndex(int index);
 86 |     // Gets the content of the current value inside an object or array.
 87 |     char* getValue();
 88 |   
 89 |   private:
 90 |     // this operation can further improve the performance.
 91 |     void gatherParallelBitmapInfo();
 92 |     // get positions of all colons between start_pos and end_pos of the input stream at the given level;
 93 |     // results go into colon_positions, top_colon_positions is passed by reference and updated
 94 |     void generateColonPositions(long start_pos, long end_pos, int level, long* colon_positions, long& top_colon_positions);
 95 |     // get positions of all commas between start_pos and end_pos of the input stream at the given level;
 96 |     // results go into comma_positions, top_comma_positions is passed by reference and updated
 97 |     void generateCommaPositions(long start_pos, long end_pos, int level, long* comma_positions, long& top_comma_positions);
 98 |     // generate comma positions in parallel (used for generating positions of dominating array)
 99 |     void generateCommaPositionsParallel(long start_pos, long end_pos, int level, long* comma_positions, long& top_comma_positions);
100 |     // find the two quotes enclosing the field name in front of the colon at colon_pos; false if none is found
101 |     bool findFieldQuotePos(long colon_pos, long& start_pos, long& end_pos);
102 | };
103 | #endif
104 | 


--------------------------------------------------------------------------------
/src/RecordLoader.cpp:
--------------------------------------------------------------------------------
 1 | #include <sys/time.h>
 2 | #include "RecordLoader.h"
 3 | using namespace std;
 4 | 
 5 | #define MAX_PAD 64
 6 | 
 7 | // Loads a whole file as one record into a 64-byte aligned buffer, padded with 'd' up to the next
 8 | // multiple of 64. Returns NULL if the file cannot be opened, measured, buffered or read in full.
 9 | Record* RecordLoader::loadSingleRecord(char* file_path) {
10 |     FILE* fp = fopen (file_path,"rb");
11 |     if (fp == NULL) return NULL;
12 |     fseek (fp, 0, SEEK_END);
13 |     long file_size = ftell(fp);
14 |     rewind(fp);
15 |     if (file_size < 0) { fclose(fp); return NULL; }
16 |     unsigned long size = file_size;
17 |     void* p = NULL;
18 |     // + 1: the padding can take all MAX_PAD bytes and the terminator is written behind it
19 |     if (posix_memalign(&p, 64, (size + MAX_PAD + 1)*sizeof(char)) != 0) {
20 |         cout<<"Fail to allocate memory space for input record."<<endl;
21 |         fclose(fp);
22 |         return NULL;
23 |     }
24 |     char* record_text = (char*) p;
25 |     size_t load_size = fread (record_text, 1, size, fp);
26 |     fclose(fp);
27 |     if (load_size != size) {
28 |         cout<<"Fail to load the input record into memory"<<endl;
29 |         free(p);
30 |         return NULL;
31 |     }
32 |     // pad the input data so that its size can be divided by 64
33 |     int remain = 64 - (size % 64);
34 |     memset(record_text + size, 'd', remain);
35 |     record_text[size + remain] = '\0';
36 |     Record* record = new Record();
37 |     record->text = record_text;
38 |     record->rec_start_pos = 0;
39 |     record->rec_length = strlen(record_text);
40 |     return record;
41 | }
42 | 
43 | // Loads a line-delimited file: each line longer than MIN_RECORD_SIZE becomes a record padded with
44 | // 'd' to a multiple of 64 bytes. All records share one buffer; an unreadable file gives an empty set.
45 | RecordSet* RecordLoader::loadRecords(char* file_path) {
46 |     FILE *fp = fopen(file_path, "r");
47 |     RecordSet* rs = new RecordSet();
48 |     if (fp == NULL) {
49 |         cout<<"Fail open the file."<<endl;
50 |         return rs;
51 |     }
52 |     vector<char> line_buf(MAX_RECORD_SIZE + MAX_PAD + 1);  // heap buffer with room for the padding
53 |     char* line = &line_buf[0];
54 |     string str;
55 |     long start_pos = 0;
56 |     while (fgets(line, MAX_RECORD_SIZE, fp) != NULL) {
57 |         size_t len = strlen(line);
58 |         if (len <= MIN_RECORD_SIZE) continue;
59 |         size_t padded = len + (64 - len % 64);
60 |         memset(line + len, 'd', padded - len);
61 |         line[padded] = '\0';
62 |         // concatenating the record texts into one single string gives the best performance
63 |         str.append(line, padded);
64 |         Record* record = new Record();
65 |         record->rec_start_pos = start_pos;
66 |         record->rec_length = padded;
67 |         start_pos += padded;
68 |         rs->recs.push_back(record);
69 |         ++rs->num_recs;
70 |     }
71 |     fclose(fp);
72 |     if (rs->recs.empty()) return rs;
73 |     void* p = NULL;
74 |     // + 1 for the terminating NUL copied along with the text
75 |     if (posix_memalign(&p, 64, (str.size() + 1)*sizeof(char)) != 0) {
76 |         cout<<"Fail to allocate memory space for records from input file."<<endl;
77 |         delete rs;
78 |         return new RecordSet();
79 |     }
80 |     memcpy(p, str.c_str(), str.size() + 1);
81 |     for (size_t i = 0; i < rs->recs.size(); ++i) {
82 |         rs->recs[i]->text = (char*) p;  // every record points at the same shared text
83 |         rs->recs[i]->can_delete_text = (i + 1 == rs->recs.size());  // only the last record frees it
84 |     }
85 |     return rs;
86 | }
87 | 


--------------------------------------------------------------------------------
/src/RecordLoader.h:
--------------------------------------------------------------------------------
 1 | #ifndef _RECORDLOADER_H
 2 | #define _RECORDLOADER_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string.h>
 7 | #include <ctype.h>
 8 | #include <pthread.h>
 9 | #include <malloc.h>
10 | #include <sys/time.h>
11 | #include <sys/file.h>
12 | #include <unistd.h>
13 | #include <sched.h>
14 | #include <iostream>
15 | #include <string>
16 | #include <vector>
17 | #include "Records.h"
18 | using namespace std;
19 | 
20 | class RecordLoader{ // only static loader functions are declared here
21 |   public:
22 |     static Record* loadSingleRecord(char* file_path); // loads the file at file_path as one single record
23 |     static RecordSet* loadRecords(char* file_path); // loads the file at file_path as a set of records
24 | };
25 | #endif
26 | 


--------------------------------------------------------------------------------
/src/Records.h:
--------------------------------------------------------------------------------
 1 | #ifndef RECORDS_H
 2 | #define RECORDS_H
 3 | 
 4 | #include <stdlib.h>
 5 | #include <vector>
 6 | using namespace std;
 7 | 
 8 | #define MIN_RECORD_SIZE 5
 9 | #define MAX_RECORD_SIZE 1000000
10 | 
11 | // information for a single JSON record
12 | struct Record {
13 |     // Text holding the record. For a line-delimited JSON stream all records
14 |     // are concatenated into one single string, which gives the best
15 |     // performance for indexing and querying.
16 |     char* text;
17 |     long rec_start_pos;
18 |     long rec_length;
19 |     // text may be shared among several Record objects (e.g. the records of a
20 |     // line-delimited JSON stream); only an object with this flag set frees it
21 |     bool can_delete_text;
22 | 
23 |     // an empty record that would free its own text
24 |     Record()
25 |         : text(NULL),
26 |           rec_start_pos(0),
27 |           rec_length(0),
28 |           can_delete_text(true) {}
29 | 
30 |     // frees text only when this object is allowed to and there is something to free
31 |     ~Record() {
32 |         if (!can_delete_text || text == NULL) return;
33 |         free(text);
34 |         text = NULL;
35 |         can_delete_text = false;
36 |     }
37 | };
38 | 
39 | // information for a sequence of JSON records
40 | class RecordSet {  // sequence of heap-allocated Record objects, filled in by RecordLoader
41 |     friend class RecordLoader;
42 |   private:
43 |     vector<Record*> recs;
44 |     long num_recs;  // number of records reachable through operator[] and deleted by the destructor
45 | 
46 |   public:
47 |     RecordSet() {
48 |         num_recs = 0;
49 |     }
50 | 
51 |     // record can be accessed in array style; an out-of-range index terminates the program.
52 |     Record*& operator[] (long idx) {
53 |         if (idx >= 0 && idx < num_recs)
54 |             return recs[idx];
55 |         cout << "Array index in RecordSet out of bound."<<endl; 
56 |         exit(EXIT_FAILURE);  // a failed lookup must not report the success status
57 |     }
58 | 
59 |     long size() {
60 |         return num_recs;
61 |     }
62 | 
63 |     ~RecordSet() {
64 |         for (long i = 0; i < num_recs; ++i) {
65 |             if (recs[i] != NULL)
66 |                 delete recs[i];
67 |         }
68 |     }
69 | };
70 | #endif
71 | 


--------------------------------------------------------------------------------
/src/SerialBitmap.cpp:
--------------------------------------------------------------------------------
  1 | #include "SerialBitmap.h"
  2 | #include <immintrin.h>
  3 | 
  4 | #include <emmintrin.h>
  5 | #include <string.h>
  6 | 
  7 | #include <sys/time.h>
  8 | #include <stdio.h>
  9 | #include <string.h>
 10 | #include <stdlib.h>
 11 | #include <stdio.h>
 12 | #include <stdlib.h>
 13 | #include <string.h>
 14 | #include <ctype.h>
 15 | #include <pthread.h>
 16 | #include <malloc.h>
 17 | #include <sys/time.h>
 18 | #include <sys/file.h>
 19 | #include <unistd.h>
 20 | #include <sched.h>
 21 | #include <unordered_map>
 22 | using namespace std;
 23 | 
 24 | SerialBitmap::SerialBitmap() {
 25 |     mRecord = NULL; mDepth = -1; mQuoteBitmap = NULL;  // defined state: freeMemory() in the destructor then has nothing to release
 26 | }
 27 | 
 28 | // Binds the bitmap to `record`; the bitmaps of levels 0 .. level_num - 1 start out unallocated.
 29 | SerialBitmap::SerialBitmap(char* record, int level_num) {
 30 |     mRecord = record;
 31 |     mDepth = level_num - 1;
 32 |     mQuoteBitmap = NULL;
 33 |     for (int lvl = 0; lvl <= mDepth; ++lvl) {
 34 |         mLevColonBitmap[lvl] = mLevCommaBitmap[lvl] = NULL;
 35 |     }
 36 | }
 37 | 
 38 | // Releases the quote bitmap and the colon/comma bitmaps of every level and resets the
 39 | // pointers to NULL, so a later call finds nothing left to free.
 40 | void SerialBitmap::freeMemory()
 41 | {
 42 |     // free(NULL) is a no-op, so levels that were never allocated need no special casing
 43 |     int lvl = 0;
 44 |     while (lvl <= mDepth) {
 45 |         free(mLevColonBitmap[lvl]);
 46 |         mLevColonBitmap[lvl] = NULL;
 47 |         free(mLevCommaBitmap[lvl]);
 48 |         mLevCommaBitmap[lvl] = NULL;
 49 |         ++lvl;
 50 |     }
 51 |     // the quote bitmap is a single array, not one per level
 52 |     free(mQuoteBitmap);
 53 |     mQuoteBitmap = NULL;
 54 | }
 55 | 
 56 | // Frees every bitmap held by this object.
 57 | SerialBitmap::~SerialBitmap() {
 58 |     this->freeMemory();
 59 | }
 60 | 
 61 | // Stores the record length and the derived word counts (length / 32 and length / 64);
 62 | // the quote bitmap (one word per 64 bytes) is allocated unless one is already present.
 63 | void SerialBitmap::setRecordLength(long length) {
 64 |     mRecordLength = length;
 65 |     mNumTmpWords = length / 32;
 66 |     mNumWords = length / 64;
 67 |     if (mQuoteBitmap) return;  // keep the buffer allocated earlier
 68 |     mQuoteBitmap = (unsigned long*)malloc((mNumWords) * sizeof(unsigned long));
 69 | }
 70 | 
 71 | void SerialBitmap::indexConstruction() {
 72 |     // vectors for structural characters
 73 |     __m256i v_quote = _mm256_set1_epi8(0x22);
 74 |     __m256i v_colon = _mm256_set1_epi8(0x3a);
 75 |     __m256i v_escape = _mm256_set1_epi8(0x5c);
 76 |     __m256i v_lbrace = _mm256_set1_epi8(0x7b);
 77 |     __m256i v_rbrace = _mm256_set1_epi8(0x7d);
 78 |     __m256i v_comma = _mm256_set1_epi8(0x2c); 
 79 |     __m256i v_lbracket = _mm256_set1_epi8(0x5b);
 80 |     __m256i v_rbracket = _mm256_set1_epi8(0x5d);
 81 | 	
 82 |     // variables for saving temporary results in the first four steps
 83 |     unsigned long colonbit0, quotebit0, escapebit0, stringbit0, lbracebit0, rbracebit0, commabit0, lbracketbit0, rbracketbit0;
 84 |     unsigned long colonbit, quotebit, escapebit, stringbit, lbracebit, rbracebit, commabit, lbracketbit, rbracketbit;
 85 |     unsigned long str_mask;
 86 | 	
 87 |     // variables for saving temporary results in the last step
 88 |     unsigned long lb_mask, rb_mask, cb_mask;
 89 |     unsigned long lb_bit, rb_bit, cb_bit;
 90 |     unsigned long first, second;
 91 |     int cur_level = -1;
 92 |     int max_positive_level = -1;
 93 | 	
 94 |     // variables for saving context information among different words
 95 |     int top_word = -1;
 96 |     uint64_t prev_iter_ends_odd_backslash = 0ULL;
 97 |     uint64_t prev_iter_inside_quote = 0ULL;
 98 |     const uint64_t even_bits = 0x5555555555555555ULL;
 99 |     const uint64_t odd_bits = ~even_bits; 
100 | 
101 |     for (int j = 0; j < mNumTmpWords; ++j) {
102 |         colonbit = 0, quotebit = 0, escapebit = 0, stringbit = 0, lbracebit = 0, rbracebit = 0, commabit = 0, lbracketbit = 0, rbracketbit = 0;
103 |         unsigned long i = j * 32; 
104 |         // step 1: build structural character bitmaps
105 |         __m256i v_text = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mRecord + i));
106 |         colonbit = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v_text, v_colon));
107 |         quotebit = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v_text, v_quote)); 
108 |         escapebit = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v_text, v_escape)); 
109 |         lbracebit  = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v_text, v_lbrace));
110 |         rbracebit  = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v_text, v_rbrace));
111 |         commabit = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v_text, v_comma));
112 | 	lbracketbit = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v_text, v_lbracket));
113 | 	rbracketbit = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v_text, v_rbracket));
114 |         // first half of the word (lowest 32 bits)
115 |         if(j % 2 == 0) {
116 |             colonbit0 = colonbit;
117 |             quotebit0 = quotebit;
118 |             escapebit0 = escapebit;
119 |             lbracebit0 = lbracebit;
120 |             rbracebit0 = rbracebit;
121 |             commabit0 = commabit;
122 |             lbracketbit0 = lbracketbit;
123 |             rbracketbit0 = rbracketbit;
124 |             continue;
125 |         } else {
126 |             // highest 32 bits inside a word
127 |             colonbit = (colonbit << 32) | colonbit0;
128 |             quotebit = (quotebit << 32) | quotebit0;
129 |             escapebit = (escapebit << 32) | escapebit0;
130 |             lbracebit = (lbracebit << 32) | lbracebit0;
131 |             rbracebit = (rbracebit << 32) | rbracebit0;
132 |             commabit = (commabit << 32) | commabit0;
133 |             lbracketbit = (lbracketbit << 32) | lbracketbit0;
134 |             rbracketbit = (rbracketbit << 32) | rbracketbit0;
135 | 
136 |             // step 2: update structural quote bitmaps
137 |             uint64_t bs_bits = escapebit;
138 |             uint64_t start_edges = bs_bits & ~(bs_bits << 1);
139 |             int64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
140 |             uint64_t even_starts = start_edges & even_start_mask;
141 |             uint64_t odd_starts = start_edges & ~even_start_mask;
142 |             uint64_t even_carries = bs_bits + even_starts;
143 |             int64_t odd_carries;
144 |             bool iter_ends_odd_backslash = __builtin_uaddll_overflow(bs_bits, odd_starts,
145 |                 (unsigned long long *)(&odd_carries));
146 |             odd_carries |= prev_iter_ends_odd_backslash;
147 |             prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
148 |             uint64_t even_carry_ends = even_carries & ~bs_bits;
149 |             uint64_t odd_carry_ends = odd_carries & ~bs_bits;
150 |             uint64_t even_start_odd_end = even_carry_ends & odd_bits;
151 |             uint64_t odd_start_even_end = odd_carry_ends & even_bits;
152 |             uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
153 |             int64_t quote_bits = quotebit & ~odd_ends;
154 |             mQuoteBitmap[++top_word] = quote_bits;
155 |         
156 |             // step 3: build string mask bitmaps
157 |             str_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
158 |                 _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFFu), 0));
159 |             str_mask ^= prev_iter_inside_quote;
160 |             prev_iter_inside_quote = static_cast<uint64_t>(static_cast<int64_t>(str_mask) >> 63);
161 | 	
162 |             // step 4: update structural character bitmaps
163 |             unsigned long tmp = (~str_mask);
164 |             colonbit = colonbit & tmp;
165 |             lbracebit = lbracebit & tmp;
166 |             rbracebit = rbracebit & tmp;
167 |             commabit = commabit & tmp;
168 |             lbracketbit = lbracketbit & tmp;
169 |             rbracketbit = rbracketbit & tmp;
170 | 	
171 |             // step 5: generate leveled bitmaps
172 |             lb_mask = lbracebit | lbracketbit;
173 |             rb_mask = rbracebit | rbracketbit;
174 |             cb_mask = lb_mask | rb_mask;
175 |             lb_bit = lb_mask & (-lb_mask);
176 |             rb_bit = rb_mask & (-rb_mask);
177 |             if (!cb_mask) {
178 |                 if (cur_level >= 0 && cur_level <= mDepth) {
179 |                     if (!mLevColonBitmap[cur_level]) {
180 |                         mLevColonBitmap[cur_level] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
181 |                     }
182 |                     if (!mLevCommaBitmap[cur_level]) {
183 |                         mLevCommaBitmap[cur_level] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
184 |                     }
185 |                     if (colonbit) {
186 |                         mLevColonBitmap[cur_level][top_word] = colonbit;
187 |                     } else {
188 |                         mLevCommaBitmap[cur_level][top_word] = commabit;
189 | 	            }
190 | 	        }
191 |             } else {
192 |                 first = 1;
193 |                 while (cb_mask || first) {
194 |                     if (!cb_mask) {
195 |                         second = 1UL<<63;
196 |                     } else {
197 |                         cb_bit = cb_mask & (-cb_mask);
198 |                         second = cb_bit;
199 |                     }
200 |                     if (cur_level >= 0 && cur_level <= mDepth) {
201 |                         if (!mLevColonBitmap[cur_level]) {
202 |                             mLevColonBitmap[cur_level] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
203 |                         }
204 |                         if (!mLevCommaBitmap[cur_level]) {
205 |                             mLevCommaBitmap[cur_level] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
206 |                         }
207 |                         unsigned long mask = second - first;
208 |                         if (!cb_mask) mask = mask | second;
209 |                         unsigned long colon_mask = mask & colonbit;
210 |                         if (colon_mask) {
211 |                             mLevColonBitmap[cur_level][top_word] |= colon_mask;
212 |                         } else {
213 |                             mLevCommaBitmap[cur_level][top_word] |= (commabit & mask);
214 |                         }
215 |                         if (cb_mask) {
216 |                             if (cb_bit == rb_bit) {
217 |                                 mLevColonBitmap[cur_level][top_word] |= cb_bit;
218 |                                 mLevCommaBitmap[cur_level][top_word] |= cb_bit;
219 |                             }
220 |                             else if (cb_bit == lb_bit && cur_level + 1 <= mDepth) {
221 |                                 if (!mLevCommaBitmap[cur_level + 1]) {
222 |                                      mLevCommaBitmap[cur_level + 1] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
223 |                                 }
224 |                                 mLevCommaBitmap[cur_level + 1][top_word] |= cb_bit;
225 |                             }
226 |                         }
227 |                     }
228 |                     if (cb_mask) {
229 |                         if (cb_bit == lb_bit) {
230 |                             lb_mask = lb_mask & (lb_mask - 1);
231 |                             lb_bit = lb_mask & (-lb_mask);
232 |                             ++cur_level;
233 |                             if (cur_level == 0) {
234 |                                 // JSON record could be an array
235 |                                 if (!mLevCommaBitmap[cur_level]) {
236 |                                      mLevCommaBitmap[cur_level] = (unsigned long*)calloc(mNumWords, sizeof(unsigned long));
237 |                                 }
238 |                                 mLevCommaBitmap[cur_level][top_word] |= cb_bit;
239 |                             }
240 |                         } else if (cb_bit == rb_bit) {
241 |                             rb_mask = rb_mask & (rb_mask - 1);
242 |                             rb_bit = rb_mask & (-rb_mask);
243 |                             --cur_level;
244 |                         }
245 |                         first = second;
246 |                         cb_mask = cb_mask & (cb_mask - 1);
247 |                         if (cur_level > max_positive_level) {
248 |                             max_positive_level = cur_level;
249 |                         }
250 |                     } else {
251 |                         first = 0;
252 |                     }
253 |                 }
254 | 	    }
255 |         }
256 |     }
257 |     if (mDepth == MAX_LEVEL - 1) mDepth = max_positive_level;
258 |     //cout<<"cur level "<<cur_level<<endl;
259 | }
260 | 


--------------------------------------------------------------------------------
/src/SerialBitmap.h:
--------------------------------------------------------------------------------
 1 | #ifndef SERIALBITMAP_H
 2 | #define SERIALBITMAP_H
 3 | #include <string>
 4 | #include <iostream>
 5 | #include <vector>
 6 | #include <bitset>
 7 | #include <cassert>
 8 | #include <stack>
 9 | #include <algorithm>
10 | #include <unordered_map>
11 | #include <functional>
12 | #include <math.h>
13 | #include <immintrin.h>
14 | #include "Bitmap.h"
15 | using namespace std;
16 | 
17 | class SerialBitmap : public Bitmap {  // leveled colon/comma bitmap index over a single record
18 |     friend class SerialBitmapIterator;  // the iterator reads the private bitmaps and record directly
19 |   private:
20 |     char* mRecord;  // record text; bit offsets taken from the bitmaps are used as indexes into it
21 |     // record length; for a single large record, the stream length equals the record length
22 |     long mRecordLength;
23 |     // number of temp words; each temp word has 32 bytes
24 |     long mNumTmpWords;
25 |     // number of bitmap words; each 64-bit word covers 64 bytes of the record (one bit per byte)
26 |     long mNumWords;
27 |     // structural quote bitmap, used for key field parsing
28 |     unsigned long *mQuoteBitmap;
29 |     // leveled colon bitmaps (one array per level, allocated on first use); closing brackets are marked too
30 |     unsigned long *mLevColonBitmap[MAX_LEVEL];
31 |     // leveled comma bitmaps (one array per level, allocated on first use); opening and closing brackets are marked too
32 |     unsigned long *mLevCommaBitmap[MAX_LEVEL];
33 |     // the deepest level of leveled bitmap indexes (starting from 0)
34 |     int mDepth;
35 |     
36 |   public:
37 |     SerialBitmap();
38 |     SerialBitmap(char* record, int level_num);  // presumably level_num is the number of levels to index — confirm in SerialBitmap.cpp
39 |     ~SerialBitmap();
40 |     void indexConstruction();  // fills the leveled bitmaps; lowers mDepth to the deepest level seen when it was MAX_LEVEL - 1
41 |     void setRecordLength(long length);  // SerialBitmapConstructor::construct calls this before indexConstruction()
42 |  
43 |   private:
44 |     void freeMemory();
45 | };
46 | #endif
47 | 


--------------------------------------------------------------------------------
/src/SerialBitmapConstructor.cpp:
--------------------------------------------------------------------------------
 1 | #include "SerialBitmapConstructor.h"
 2 | #include <string.h>
 3 | #include <sys/time.h>
 4 | #include <sys/file.h>
 5 | 
 6 | SerialBitmap* SerialBitmapConstructor::construct(Record* record, int level_num) {
 7 |     // Builds the leveled bitmap index for one record; the returned bitmap is allocated with new.
 8 |     char* record_text = record->text;
 9 |     if (record->rec_start_pos > 0) record_text += record->rec_start_pos;
10 |     // No explicit length: measure from the actual start of the record. strlen(record->text)
11 |     // would overshoot the terminator by rec_start_pos bytes.
12 |     long length = (record->rec_length > 0) ? (long)record->rec_length : (long)strlen(record_text);
13 |     SerialBitmap* bitmap = new SerialBitmap(record_text, level_num);
14 |     bitmap->setRecordLength(length);
15 |     bitmap->indexConstruction();
16 |     return bitmap;
17 | }
18 | 


--------------------------------------------------------------------------------
/src/SerialBitmapConstructor.h:
--------------------------------------------------------------------------------
 1 | #ifndef SERIALBITMAPCONSTRUCTOR_H
 2 | #define SERIALBITMAPCONSTRUCTOR_H
 3 | 
 4 | #include "SerialBitmap.h"
 5 | #include "Records.h"
 6 | 
 7 | class SerialBitmapConstructor {  // static factory for SerialBitmap
 8 |   public:
 9 |     static SerialBitmap* construct(Record* record, int level_num = MAX_LEVEL);  // allocates with new, sets the record length, then builds the index
10 | };
11 | #endif
12 | 


--------------------------------------------------------------------------------
/src/SerialBitmapIterator.cpp:
--------------------------------------------------------------------------------
  1 | #include "SerialBitmapIterator.h"
  2 | 
  3 | void SerialBitmapIterator::generateColonPositions(long start_pos, long end_pos, int level, long* colon_positions, long& top_colon_positions) {
  4 |     // Appends the byte offset of every bit set in the level's colon bitmap that falls within
  5 |     // [start_pos, end_pos]. top_colon_positions is the index of the last slot written so far.
  6 |     const long first_word = start_pos / 64;
  7 |     const long word_limit = ceil(double(end_pos) / 64);
  8 |     for (long w = first_word; w < word_limit; ++w) {
  9 |         unsigned long bits = mSerialBitmap->mLevColonBitmap[level][w];
 10 |         // clearing the lowest set bit each round visits the offsets in ascending order
 11 |         for (; bits != 0; bits &= (bits - 1)) {
 12 |             const long pos = w * 64 + __builtin_ctzll(bits);
 13 |             if (pos < start_pos || pos > end_pos) continue;
 14 |             top_colon_positions += 1;
 15 |             colon_positions[top_colon_positions] = pos;
 16 |         }
 17 |     }
 18 | }
 19 | 
 20 | void SerialBitmapIterator::generateCommaPositions(long start_pos, long end_pos, int level, long* comma_positions, long& top_comma_positions) {
 21 |     // Appends the byte offset of every bit set in the level's comma bitmap that falls within
 22 |     // [start_pos, end_pos]. top_comma_positions is the index of the last slot written so far.
 23 |     const long first_word = start_pos / 64;
 24 |     const long word_limit = ceil(double(end_pos) / 64);
 25 |     for (long w = first_word; w < word_limit; ++w) {
 26 |         unsigned long bits = mSerialBitmap->mLevCommaBitmap[level][w];
 27 |         // clearing the lowest set bit each round visits the offsets in ascending order
 28 |         for (; bits != 0; bits &= (bits - 1)) {
 29 |             const long pos = w * 64 + __builtin_ctzll(bits);
 30 |             if (pos < start_pos || pos > end_pos) continue;
 31 |             top_comma_positions += 1;
 32 |             comma_positions[top_comma_positions] = pos;
 33 |         }
 34 |     }
 35 | }
 36 | 
 37 | bool SerialBitmapIterator::findFieldQuotePos(long colon_pos, long& start_pos, long& end_pos) {  // finds the last two quotes before colon_pos
 38 |     long w_id = colon_pos/64;  // index of the bitmap word that holds the colon
 39 |     long offset = colon_pos%64;  // never read: shadowed by the declaration inside the loop
 40 |     long start_quote = 0;  // 0 doubles as "not found yet" for both quote trackers
 41 |     long end_quote = 0;
 42 |     start_pos = 0; end_pos = 0;
 43 |     while (w_id >= 0)  // walk backwards, one bitmap word at a time
 44 |     {
 45 |         unsigned long quotebit = mSerialBitmap->mQuoteBitmap[w_id];
 46 |         unsigned long offset = w_id * 64 + __builtin_ctzll(quotebit);  // only used when quotebit != 0 (loop condition tests quotebit first)
 47 |         while (quotebit && offset < colon_pos)  // quotes of this word in ascending order, up to the colon
 48 |         {
 49 |             if (end_pos != 0)  // closing quote was found in a later word: keep the last quote of this word as the opener
 50 |             {
 51 |                 start_quote = offset;
 52 |             }
 53 |             else if(start_quote == 0)  // first quote seen in this word
 54 |             {
 55 |                 start_quote = offset;
 56 |             }
 57 |             else if(end_quote == 0)  // second quote seen in this word
 58 |             {
 59 |                 end_quote = offset;
 60 |             }
 61 |             else  // more than two quotes: slide the window so it holds the two most recent ones
 62 |             {
 63 |                 start_quote = end_quote;
 64 |                 end_quote = offset;
 65 |             }
 66 |             quotebit = quotebit & (quotebit - 1);  // clear the lowest set bit
 67 |             offset = w_id * 64 + __builtin_ctzll(quotebit); 
 68 |         }
 69 |         if(start_quote != 0 && end_quote == 0)  // a single quote so far: it is the closing one, the opener is in an earlier word
 70 |         {
 71 |             end_quote = start_quote;
 72 |             start_quote = 0;
 73 |             end_pos = end_quote;
 74 |         }
 75 |         else if(start_quote != 0 && end_quote != 0)  // both quotes known
 76 |         {
 77 |             start_pos = start_quote;
 78 |             end_pos = end_quote;
 79 |             return true;
 80 |         }
 81 |         --w_id;
 82 |     }
 83 |     return false;  // reached the start of the record without finding a quote pair
 84 | }
 85 | 
 86 | SerialBitmapIterator* SerialBitmapIterator::getCopy() {
 87 |     SerialBitmapIterator* sbi = new SerialBitmapIterator();  // heap-allocated copy rooted at the current level
 88 |     sbi->mSerialBitmap = mSerialBitmap;
 89 |     sbi->mCurLevel = sbi->mTopLevel = mCurLevel;
 90 |     sbi->mVisitedFields = 0;
 91 |     for (int i = 0; i < MAX_LEVEL; ++i) { sbi->mPosArrAlloc[i] = false; sbi->mCtxInfo[i].positions = NULL; }  // down() and the destructor read every slot
 92 |     if (mCurLevel >= 0) {
 93 |         sbi->mCtxInfo[mCurLevel].type = mCtxInfo[mCurLevel].type;
 94 |         sbi->mCtxInfo[mCurLevel].positions = mCtxInfo[mCurLevel].positions;  // borrowed: the flag stays false, so the copy never frees it
 95 |         sbi->mCtxInfo[mCurLevel].start_idx = mCtxInfo[mCurLevel].start_idx;
 96 |         sbi->mCtxInfo[mCurLevel].end_idx = mCtxInfo[mCurLevel].end_idx;
 97 |         sbi->mCtxInfo[mCurLevel].cur_idx = mCtxInfo[mCurLevel].start_idx - 1;  // just before the first entry (-1 only when start_idx is 0)
 98 |     }
 99 |     return sbi;
100 | }
101 | 
102 | bool SerialBitmapIterator::up() {
103 |     const bool can_ascend = (mCurLevel != mTopLevel);  // the top level is as far up as this iterator goes
104 |     if (can_ascend) mCurLevel -= 1;
105 |     return can_ascend;
106 | }
107 | 
108 | bool SerialBitmapIterator::down() {
109 |     // Only levels 0..mDepth are indexed, so the level about to be entered (mCurLevel + 1) must not exceed mDepth.
110 |     if (mCurLevel < mTopLevel || mCurLevel >= mSerialBitmap->mDepth) return false;
111 |     ++mCurLevel;
112 |     long start_pos = -1;
113 |     long end_pos = -1;
114 |     if (mCurLevel == mTopLevel + 1) {
115 |         if (mTopLevel == -1) {
116 |             long record_length = mSerialBitmap->mRecordLength;  // entering the record itself: scan all of it
117 |             start_pos = 0;
118 |             end_pos = record_length;
119 |             mCtxInfo[mCurLevel].positions = new long[record_length / 8 + 1];
120 |             mPosArrAlloc[mCurLevel] = true;
121 |         } else {
122 |             long cur_idx = mCtxInfo[mCurLevel - 1].cur_idx;  // first level below the root of a copied iterator
123 |             start_pos = mCtxInfo[mCurLevel - 1].positions[cur_idx];
124 |             end_pos = mCtxInfo[mCurLevel - 1].positions[cur_idx + 1];
125 |             if (mCtxInfo[mCurLevel].positions == NULL || mPosArrAlloc[mCurLevel] == false) {
126 |                 mCtxInfo[mCurLevel].positions = new long[MAX_NUM_ELE / 8 + 1];  // own buffer; the root's buffer is borrowed
127 |                 mPosArrAlloc[mCurLevel] = true;
128 |             }
129 |         }
130 |         mCtxInfo[mCurLevel].start_idx = 0;
131 |         mCtxInfo[mCurLevel].cur_idx = -1;
132 |         mCtxInfo[mCurLevel].end_idx = -1;
133 |     } else {
134 |         long cur_idx = mCtxInfo[mCurLevel - 1].cur_idx;
135 |         // the entry at end_idx is the last one written for the parent; the slot after it is not the parent's
136 |         if (cur_idx >= mCtxInfo[mCurLevel - 1].end_idx) {
137 |             --mCurLevel;
138 |             return false;
139 |         }
140 |         start_pos = mCtxInfo[mCurLevel - 1].positions[cur_idx];
141 |         end_pos = mCtxInfo[mCurLevel - 1].positions[cur_idx + 1];
142 |         mCtxInfo[mCurLevel].positions = mCtxInfo[mCurLevel - 1].positions;  // share the parent's buffer, appending after its entries
143 |         mCtxInfo[mCurLevel].start_idx = mCtxInfo[mCurLevel - 1].end_idx + 1;
144 |         mCtxInfo[mCurLevel].cur_idx = mCtxInfo[mCurLevel - 1].end_idx;
145 |         mCtxInfo[mCurLevel].end_idx = mCtxInfo[mCurLevel - 1].end_idx;
146 |     }
147 |     long i = start_pos;
148 |     if (start_pos > 0 || mCurLevel > 0) ++i;  // step over the ':' / ',' / '[' that start_pos points at
149 |     char ch = mSerialBitmap->mRecord[i];
150 |     while (i < end_pos && (ch == ' ' || ch == '\n' || ch == '\t' || ch == '\r')) {  // all four JSON whitespace characters
151 |         ch = mSerialBitmap->mRecord[++i];
152 |     }
153 |     if (mSerialBitmap->mRecord[i] == '{') {
154 |         mCtxInfo[mCurLevel].type = OBJECT;
155 |         generateColonPositions(i, end_pos, mCurLevel, mCtxInfo[mCurLevel].positions, mCtxInfo[mCurLevel].end_idx);
156 |         return true;
157 |     } else if (mSerialBitmap->mRecord[i] == '[') {
158 |         mCtxInfo[mCurLevel].type = ARRAY;
159 |         generateCommaPositions(i, end_pos, mCurLevel, mCtxInfo[mCurLevel].positions, mCtxInfo[mCurLevel].end_idx);
160 |         return true;
161 |     }
162 |     --mCurLevel;  // the value is not an object or array: stay on the current level
163 |     return false;
164 | }
165 | 
166 | bool SerialBitmapIterator::isObject() {
167 |     // True only when the current level is one of the indexed levels and holds an object.
168 |     const bool level_valid = (mCurLevel >= 0) && (mCurLevel <= mSerialBitmap->mDepth);
169 |     // short-circuit keeps mCtxInfo from being indexed with an invalid level
170 |     return level_valid && mCtxInfo[mCurLevel].type == OBJECT;
171 | }
172 | 
173 | bool SerialBitmapIterator::isArray() {
174 |     // True only when the current level is one of the indexed levels and holds an array.
175 |     const bool level_valid = (mCurLevel >= 0) && (mCurLevel <= mSerialBitmap->mDepth);
176 |     // short-circuit keeps mCtxInfo from being indexed with an invalid level
177 |     return level_valid && mCtxInfo[mCurLevel].type == ARRAY;
178 | }
179 | 
180 | bool SerialBitmapIterator::moveNext() {
181 |     if (mCurLevel < 0 || mCurLevel > mSerialBitmap->mDepth || mCtxInfo[mCurLevel].type != ARRAY) return false;  // arrays only
182 |     const long candidate = mCtxInfo[mCurLevel].cur_idx + 1;
183 |     const bool has_next = (candidate < mCtxInfo[mCurLevel].end_idx);  // the cursor never moves onto end_idx
184 |     if (has_next) mCtxInfo[mCurLevel].cur_idx = candidate;
185 |     return has_next;
186 | }
187 | 
188 | bool SerialBitmapIterator::moveToKey(char* key) {
189 |     // Scans the fields after the cursor for one named `key`; on a miss the cursor is left at end_idx.
190 |     if (key == NULL || mCurLevel < 0 || mCurLevel > mSerialBitmap->mDepth || mCtxInfo[mCurLevel].type != OBJECT) return false;
191 |     const long key_len = (long)strlen(key);  // hoisted: the key does not change while scanning
192 |     long cur_idx = mCtxInfo[mCurLevel].cur_idx + 1;
193 |     long end_idx = mCtxInfo[mCurLevel].end_idx;
194 |     while (cur_idx < end_idx) {
195 |         long colon_pos = mCtxInfo[mCurLevel].positions[cur_idx];
196 |         long start_pos = 0, end_pos = 0;
197 |         if (!findFieldQuotePos(colon_pos, start_pos, end_pos)) {
198 |             return false;
199 |         }
200 |         ++mVisitedFields;
201 |         // The field name is the text strictly between its two quotes. Compare it in place: staging
202 |         // it in mKey first would overflow that buffer for names of MAX_FIELD_SIZE bytes or more.
203 |         long field_len = end_pos - start_pos - 1;
204 |         if (field_len == key_len && memcmp(mSerialBitmap->mRecord + start_pos + 1, key, key_len) == 0) {
205 |             mCtxInfo[mCurLevel].cur_idx = cur_idx;
206 |             return true;
207 |         }
208 |         ++cur_idx;
209 |     }
210 |     mCtxInfo[mCurLevel].cur_idx = cur_idx;
211 |     return false;
212 | }
213 | 
214 | char* SerialBitmapIterator::moveToKey(unordered_set<char*>& key_set) {
215 |     // Moves to the next field after the cursor whose name equals one of the keys in key_set.
216 |     // Returns the matching key, which is removed from key_set, or NULL when no field matches.
217 |     // On a miss the cursor is left at end_idx.
218 |     if (key_set.empty() == true|| mCurLevel < 0 || mCurLevel > mSerialBitmap->mDepth || mCtxInfo[mCurLevel].type != OBJECT) return NULL;
219 |     long cur_idx = mCtxInfo[mCurLevel].cur_idx + 1;
220 |     long end_idx = mCtxInfo[mCurLevel].end_idx;
221 |     while (cur_idx < end_idx) {
222 |         long colon_pos = mCtxInfo[mCurLevel].positions[cur_idx];
223 |         long start_pos = 0, end_pos = 0;
224 |         if (!findFieldQuotePos(colon_pos, start_pos, end_pos)) {
225 |             return NULL;
226 |         }
227 |         ++mVisitedFields;
228 |         // The field name is the text strictly between its two quotes. Compare it in place: staging
229 |         // it in mKey first would overflow that buffer for names of MAX_FIELD_SIZE bytes or more.
230 |         const char* field = mSerialBitmap->mRecord + start_pos + 1;
231 |         const long field_len = end_pos - start_pos - 1;
232 |         unordered_set<char*>::iterator iter;
233 |         for (iter = key_set.begin(); iter != key_set.end(); ++iter) {
234 |             char* key = (*iter);
235 |             if (key == NULL || (long)strlen(key) != field_len) continue;
236 |             if (memcmp(field, key, field_len) == 0) {
237 |                 mCtxInfo[mCurLevel].cur_idx = cur_idx;
238 |                 // key was copied out above, so erasing the element does not invalidate the return value
239 |                 key_set.erase(iter);
240 |                 return key;
241 |             }
242 |         }
243 |         ++cur_idx;
244 |     }
245 |     mCtxInfo[mCurLevel].cur_idx = cur_idx;
246 |     return NULL;
247 | }
248 | 
249 | int SerialBitmapIterator::numArrayElements() {
250 |     // Anything other than an array at an indexed level reports zero elements.
251 |     if (mCurLevel < 0 || mCurLevel > mSerialBitmap->mDepth) return 0;
252 |     if (mCtxInfo[mCurLevel].type != ARRAY) return 0;
253 |     return mCtxInfo[mCurLevel].end_idx - mCtxInfo[mCurLevel].start_idx;
254 | }
255 | 
256 | bool SerialBitmapIterator::moveToIndex(int index) {
257 |     if (index < 0 || mCurLevel < 0 || mCurLevel > mSerialBitmap->mDepth || mCtxInfo[mCurLevel].type != ARRAY) return false;
258 |     long next_idx = mCtxInfo[mCurLevel].start_idx + index;  // element k sits at start_idx + k
259 |     if (next_idx >= mCtxInfo[mCurLevel].end_idx) return false;  // numArrayElements() is end_idx - start_idx, so end_idx itself is out of range
260 |     mCtxInfo[mCurLevel].cur_idx = next_idx;
261 |     return true;
262 | }
263 | 
264 | char* SerialBitmapIterator::getValue() {
265 |     // Returns a malloc'ed, NUL-terminated copy of the current value's text (possibly empty), which
266 |     // the caller releases with free(). Returns NULL when the level is not indexed, when there is
267 |     // no entry after the cursor, or when the allocation fails.
268 |     if (mCurLevel < 0 || mCurLevel > mSerialBitmap->mDepth) return NULL;
269 |     long cur_idx = mCtxInfo[mCurLevel].cur_idx;
270 |     long next_idx = cur_idx + 1;
271 |     if (next_idx > mCtxInfo[mCurLevel].end_idx) return NULL;
272 |     // current ':' or ','
273 |     long cur_pos = mCtxInfo[mCurLevel].positions[cur_idx];
274 |     // next ':' or ','
275 |     long next_pos = mCtxInfo[mCurLevel].positions[next_idx];
276 |     long text_length = next_pos - cur_pos - 1;
277 |     if (mCtxInfo[mCurLevel].type == OBJECT && next_idx < mCtxInfo[mCurLevel].end_idx) {
278 |         // inside an object the value ends at the opening quote of the next field name
279 |         long start_pos = 0, end_pos = 0;
280 |         text_length = findFieldQuotePos(next_pos, start_pos, end_pos) ? (start_pos - cur_pos - 1) : 0;
281 |     }
282 |     if (text_length < 0) text_length = 0;
283 |     // Empty results are heap-allocated as well: a "" literal could not be passed to free().
284 |     char* ret = (char*)malloc(text_length + 1);
285 |     if (ret == NULL) return NULL;
286 |     memcpy(ret, mSerialBitmap->mRecord + cur_pos + 1, text_length);
287 |     ret[text_length] = '\0';
288 |     return ret;
289 | }
290 | 


--------------------------------------------------------------------------------
/src/SerialBitmapIterator.h:
--------------------------------------------------------------------------------
 1 | #ifndef SERIALBITMAPITERATOR_H
 2 | #define SERIALBITMAPITERATOR_H
 3 | 
 4 | #include "SerialBitmap.h"
 5 | #include "BitmapIterator.h"
 6 | #include <string.h>
 7 | #include <unordered_set>
 8 | using namespace std;
 9 | 
10 | #define MAX_NUM_ELE 1000000
11 | 
12 | class SerialBitmap;
13 | 
14 | class SerialBitmapIterator : public BitmapIterator {
15 |   private:
16 |     SerialBitmap* mSerialBitmap;
17 |     IterCtxInfo mCtxInfo[MAX_LEVEL];
18 |     bool mPosArrAlloc[MAX_LEVEL];  // true for the levels whose positions array was allocated by this iterator
19 |     int mCurLevel;
20 |     // the top level of the input record, or each sub-record (for parallel querying)
21 |     int mTopLevel;
22 |     // more efficient than allocating memory during runtime
23 |     char mKey[MAX_FIELD_SIZE];
24 |     
25 |   public:
26 |     SerialBitmapIterator() {
27 |         // Detached iterator above the first level; every slot is defined so the destructor can run safely.
28 |         mSerialBitmap = NULL;
29 |         mCurLevel = -1;
30 |         mTopLevel = -1;
31 |         mVisitedFields = 0;
32 |         for (int i = 0; i < MAX_LEVEL; ++i) {
33 |             mPosArrAlloc[i] = false;
34 |             mCtxInfo[i].positions = NULL;
35 |         }
36 |     }
37 |     // initially, iterator points to the first level of the record
38 |     SerialBitmapIterator(SerialBitmap* sbm) : SerialBitmapIterator() { mSerialBitmap = sbm; down(); }
39 | 
40 |     ~SerialBitmapIterator() {
41 |         for (int i = mTopLevel + 1; i < MAX_LEVEL; ++i) {
42 |             if (mPosArrAlloc[i] == true) {
43 |                 delete[] mCtxInfo[i].positions;  // allocated with new long[] in down(), so free() is not a valid release
44 |                 mCtxInfo[i].positions = NULL;
45 |             } else {
46 |                 break;
47 |             }
48 |         }
49 |     }
50 | 
51 |     // Creates a copy of iterator. Often used for parallel querying.
52 |     SerialBitmapIterator* getCopy();
53 |     // Moves back to the object or array which contains the current nested record.
54 |     // Often used when the current nested record has been processed.
55 |     // Valid except for the first level of the record.
56 |     bool up();
57 |     // Moves to the start of the nested object or array.
58 |     // Gets all colon or comma positions from leveled bitmap indexes for current nested record.
59 |     // Valid if we are at { or [.
60 |     bool down();
61 |     // Whether the iterator points to an object.
62 |     bool isObject();
63 |     // Whether the iterator points to an array.
64 |     bool isArray();
65 |     // Moves iterator to the next array item.
66 |     bool moveNext();
67 |     // Moves to the corresponding key field inside the current object.
68 |     bool moveToKey(char* key);
69 |     // Moves to the corresponding key fields inside the current object, returns the current key name.
70 |     // After this operation, the current key field will be removed from key_set.
71 |     char* moveToKey(unordered_set<char*>& key_set);
72 |     // Returns the number of elements inside current array.
73 |     int numArrayElements();
74 |     // If the current record is an array, moves to an item based on index.
75 |     // Returns false if the index is out of the boundary.
76 |     bool moveToIndex(int index);
77 |     // Gets the content of the current value inside an object or array.
78 |     char* getValue();
79 |    
80 |   private:
81 |     // appends the positions of all colons of the given level between start_pos and end_pos (inclusive)
82 |     void generateColonPositions(long start_pos, long end_pos, int level, long* colon_positions, long& top_colon_positions);
83 |     // appends the positions of all commas of the given level between start_pos and end_pos (inclusive)
84 |     void generateCommaPositions(long start_pos, long end_pos, int level, long* comma_positions, long& top_comma_positions);
85 |     // get the positions of the opening and closing quotes of the field name that precedes colon_pos
86 |     bool findFieldQuotePos(long colon_pos, long& start_pos, long& end_pos);
87 | };
88 | #endif
89 | 


--------------------------------------------------------------------------------
/src/Tokenizer.cpp:
--------------------------------------------------------------------------------
  1 | #include "Tokenizer.h"
  2 | #include <iostream>
  3 | using namespace std;
  4 | 
  5 | int Tokenizer::getStringToken(int& pos) {
  6 |     // Scans from pos for the unescaped quote that closes a string; returns TRUE when found, END otherwise.
  7 |     while (pos < 64) {
  8 |         int escape_cnt = 0;
  9 |         while (pos < 64 && mChunk[pos] == '\\') {
 10 |             ++escape_cnt;
 11 |             ++pos;
 12 |         }
 13 |         if (pos >= 64) break;  // the backslash run reached the end of the window: nothing left to inspect
 14 |         const bool is_quote = (mChunk[pos] == '"');
 15 |         ++pos;
 16 |         // a quote preceded by an even number of backslashes is unescaped and closes the string
 17 |         if (is_quote && escape_cnt % 2 == 0) {
 18 |             mCurTknType = STRING;
 19 |             mNextTknPos = pos;
 20 |             return TRUE;
 21 |         }
 22 |     }
 23 |     return END;
 24 | }
 25 | 
 26 | int Tokenizer::hasNextToken() {
 27 |     if (mCurPos >= 64) return END;
 28 |     // current chunk starts inside string
 29 |     if (mCurPos == 0 && mStartState == IN) {
 30 |         int pos = mCurPos;
 31 |         return getStringToken(pos);
 32 |     }
 33 |     int pos = mCurPos;
 34 |     while (pos < 64) {
 35 |         switch(mChunk[pos]) {
 36 |             case '\t':
 37 |             case '\n':
 38 |                 ++pos;
 39 |                 break;
 40 |             case '{':
 41 |                 ++pos;
 42 |                 mCurTknType = LCB;
 43 |                 mNextTknPos = pos;
 44 |                 return TRUE;
 45 |             case '}':
 46 |                 ++pos;
 47 |                 mCurTknType = RCB;
 48 |                 mNextTknPos = pos;
 49 |                 return TRUE;
 50 |             case '[':
 51 |                 ++pos;
 52 |                 mCurTknType = LB;
 53 |                 mNextTknPos = pos;
 54 |                 return TRUE;
 55 |             case ']':
 56 |                 ++pos;
 57 |                 mCurTknType = RB;
 58 |                 mNextTknPos = pos;
 59 |                 return TRUE;
 60 |             case ',':
 61 |                 ++pos;
 62 |                 mCurTknType = COM;
 63 |                 mNextTknPos = pos;
 64 |                 return TRUE;
 65 |             case ':':
 66 |                 ++pos;
 67 |                 mCurTknType = COLON;
 68 |                 mNextTknPos = pos;
 69 |                 return TRUE;
 70 |             case '"':
 71 |                 ++pos;
 72 |                 return getStringToken(pos);
 73 |             case 't':
 74 |             {
 75 |                 if (mChunk[pos + 1] == 'r' && mChunk[pos + 2] == 'u' && mChunk[pos + 3] == 'e') {
 76 |                     pos += 4;
 77 |                     mCurTknType = PRI;
 78 |                     mNextTknPos = pos; 
 79 |                     return TRUE;
 80 |                 } else {
 81 |                     return ERROR;
 82 |                 }
 83 |             }
 84 |             case 'r':
 85 |             {
 86 |                 if (mChunk[pos + 2] == 'u' && mChunk[pos + 3] == 'e') {
 87 |                     pos += 3;
 88 |                     mCurTknType = PRI;
 89 |                     mNextTknPos = pos;
 90 |                     return TRUE;
 91 |                 } else {
 92 |                     return ERROR;
 93 |                 }
 94 |             }
 95 |             case 'e':
 96 |             {
 97 |                if (mChunk[pos + 1] == ',' || mChunk[pos + 1] == ']' || mChunk[pos + 1] == '}') {
 98 |                    ++pos;
 99 |                    mCurTknType = PRI;
100 |                    mNextTknPos = pos;
101 |                    return TRUE;
102 |                } else {
103 |                    return ERROR;
104 |                }
105 |             } 
106 |             case 'f':
107 |             {
108 |                 if (mChunk[pos + 1] == 'a' && mChunk[pos + 2] == 'l' && mChunk[pos + 3] == 's' && mChunk[pos + 4] == 'e') {
109 |                     pos += 5;
110 |                     mCurTknType = PRI;
111 |                     mNextTknPos = pos;
112 |                     return TRUE;
113 |                 } else {
114 |                     return ERROR;
115 |                 }
116 |             }
117 |             case 'a':
118 |             {
119 |                 if (mChunk[pos + 1] == 'l' && mChunk[pos + 2] == 's' && mChunk[pos + 3] == 'e') {
120 |                     pos += 4;
121 |                     mCurTknType = PRI;
122 |                     mNextTknPos = pos;
123 |                     return TRUE;
124 |                 } else {
125 |                     return ERROR;
126 |                 }
127 |             }
128 |             case 's':
129 |             {
130 |                 if (mChunk[pos + 1] == 'e') {
131 |                     pos += 2;
132 |                     mCurTknType = PRI;
133 |                     mNextTknPos = pos;
134 |                     return TRUE;
135 |                 } else {
136 |                     return ERROR;
137 |                 }
138 |             } 
139 |             case 'n':
140 |             {
141 |                if (mChunk[pos + 1] == 'u' && mChunk[pos + 2] == 'l' && mChunk[pos + 3] == 'l') {
142 |                    pos += 4;
143 |                    mCurTknType = PRI;
144 |                    mNextTknPos = pos;
145 |                    return TRUE;
146 |                } else {
147 |                    return ERROR;
148 |                }
149 |             }
150 |             case 'u':
151 |             {
152 |                if (mChunk[pos + 1] == 'l' && mChunk[pos + 2] == 'l') {
153 |                    pos += 3;
154 |                    mCurTknType = PRI;
155 |                    mNextTknPos = pos;
156 |                    return TRUE;
157 |                } else if (mChunk[pos + 1] == 'e') {
158 |                    pos += 2;
159 |                    mCurTknType = PRI;
160 |                    mNextTknPos = pos;
161 |                    return TRUE;
162 |                } else {
163 |                    return ERROR;
164 |                }
165 |             }
166 |             case 'l':
167 |             {
168 |                if (mChunk[pos + 1] == 'l' || mChunk[pos + 1] == ',' || mChunk[pos + 1] == ']' || mChunk[pos + 1] == '}') {
169 |                    ++pos;
170 |                    if (mChunk[pos] == 'l') ++pos;
171 |                    mCurTknType = PRI;
172 |                    mNextTknPos = pos;
173 |                    return TRUE;
174 |                } else if (mChunk[pos + 1] == 's' && mChunk[pos + 2] == 'e') {
175 |                    pos += 3;
176 |                    mCurTknType = PRI;
177 |                    mNextTknPos = pos;
178 |                    return TRUE;
179 |                } else {
180 |                    return ERROR;
181 |                }
182 |             }
183 |             case '-':
184 |             case '0':
185 |             case '1':
186 |             case '2':
187 |             case '3':
188 |             case '4':
189 |             case '5':
190 |             case '6':
191 |             case '7':
192 |             case '8':
193 |             case '9':
194 |             case '.':
195 |             {
196 |                 ++pos;
197 |                 char ch = mChunk[pos];
198 |                 if (ch == '}' || ch == ']' || ch == ' ' || ch == '\t' || ch == ',') {
199 |                     mCurTknType = PRI;
200 |                     mNextTknPos = pos;
201 |                     return TRUE;
202 |                 }
203 |                 if (ch == '"') {
204 |                     return ERROR;
205 |                 }
206 |                 break;
207 |             }
208 |             case ' ':
209 |                 ++pos;
210 |                 break;
211 |             default:
212 |                 return ERROR;
213 |         } 
214 |     }
215 |     return END;
216 | }
217 | 
218 | int Tokenizer::nextToken() {
219 |     const int token_type = mCurTknType;  // type recorded by the most recent hasNextToken() scan
220 |     mCurPos = mNextTknPos;               // move the cursor to the recorded next-token position
221 |     return token_type;
222 | }
222 | 


--------------------------------------------------------------------------------
/src/Tokenizer.h:
--------------------------------------------------------------------------------
 1 | #ifndef TOKENIZER_H
 2 | #define TOKENIZER_H
 3 | 
 4 | #define IN 10
 5 | #define OUT 11
 6 | 
 7 | #define TRUE 100
 8 | #define ERROR 101
 9 | #define END 102
10 | 
11 | // token type
12 | #define LCB 1   //'{'
13 | #define RCB 2   //'}'
14 | #define LB 3    //'['
15 | #define RB 4    //']'
16 | #define COM 5   //','
17 | #define COLON 6 //':'
18 | #define PRI 7   //primitive
19 | #define STRING 8 // "abc"
20 | #define UNKNOWN 9 
21 | #define INVALID -1  //invalid
22 | 
23 | class Tokenizer {
24 |   private:
25 |     char* mChunk;
26 |     // starting state of the separated chunk
27 |     // IN -- inside string
28 |     // OUT -- outside string
29 |     int mStartState;
30 |     // current position inside mChunk
31 |     long mCurPos;
32 |     // current token type
33 |     int mCurTknType;
34 |     // starting position of the next token
35 |     long mNextTknPos;
36 |   public:
37 |     Tokenizer() : mChunk(0), mStartState(UNKNOWN), mCurPos(0), mCurTknType(UNKNOWN), mNextTknPos(0) {}
38 | 
39 |     Tokenizer(char* chunk, int state) {
40 |         mChunk = chunk;
41 |         mStartState = state;
42 |         mCurPos = 0;
43 |         mCurTknType = UNKNOWN;
44 |         mNextTknPos = 0;
45 |     }
46 | 
47 |     // re-targets the tokenizer at a new chunk with the given start state and rewinds it
48 |     void createIterator(char* chunk, int state) {
49 |         mChunk = chunk;
50 |         mStartState = state;
51 |         mCurPos = 0;
52 |         mCurTknType = UNKNOWN;
53 |         mNextTknPos = 0;
54 |     }
55 | 
56 |     int oppositeState(int state) {
57 |         if (state == IN) return OUT;
58 |         if (state == OUT) return IN;
59 |         return UNKNOWN;
60 |     }
61 | 
62 |     // whether the next valid token exists (TRUE, ERROR or END); only scans the first 64 characters of the chunk
63 |     int hasNextToken();
64 |     // returns the type of the token found by hasNextToken() and moves the current position past it
65 |     int nextToken();
66 | 
67 |   private:
68 |     int getStringToken(int& pos);
69 | };
70 | #endif
71 | 


--------------------------------------------------------------------------------