├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── docs
│   ├── benchmark.md
│   └── contributors-v1.2-draft.md
├── expected
│   └── zson.out
├── img
│   └── zson-logo.png
├── sql
│   └── zson.sql
├── zson--1.0--1.1.sql
├── zson--1.1.sql
├── zson.c
└── zson.control

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.o
*.so
results

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2016 Aleksander Alekseev

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
EXTENSION = zson
MODULES = zson
DATA = zson--1.1.sql zson--1.0--1.1.sql
REGRESS = zson

PG_CPPFLAGS = -g -O2
SHLIB_LINK = # -lz -llz4

ifndef PG_CONFIG
PG_CONFIG := pg_config
endif
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ZSON

![ZSON Logo](img/zson-logo.png)

## About

ZSON is a PostgreSQL extension for transparent JSONB compression. Compression
is based on a shared dictionary of strings most frequently used in specific
JSONB documents (not only keys, but also values, array elements, etc.).

In some cases ZSON can save half of your disk space and give you about 10% more
TPS. Memory is saved as well. See [docs/benchmark.md](docs/benchmark.md).
Everything depends on your data and workload, though. Don't believe any
benchmarks, re-check everything on your data, configuration, hardware, workload
and PostgreSQL version.

ZSON was originally created in 2016 by the [Postgres Professional][pgpro] team:
researched and coded by [Aleksander Alekseev][me]; ideas, code review, testing,
etc. by [Alexander Korotkov][ak] and [Teodor Sigaev][ts].

[me]: https://eax.me/
[ak]: https://akorotkov.github.io/
[ts]: http://www.sigaev.ru/
[pgpro]: https://postgrespro.com/

See also discussions on [pgsql-general@][gen], [Hacker News][hn], [Reddit][rd]
and [HabraHabr][habr].

[gen]: https://www.postgresql.org/message-id/flat/20160930185801.38654a1c%40e754
[hn]: https://news.ycombinator.com/item?id=12633486
[rd]: https://www.reddit.com/r/PostgreSQL/comments/55mr4r/zson_postgresql_extension_for_transparent_jsonb/
[habr]: https://habr.com/ru/company/postgrespro/blog/312006/

## Install

Build and install ZSON:

```
cd /path/to/zson/source/code
make
sudo make install
```

Run tests:

```
make installcheck
```

Connect to PostgreSQL:

```
psql my_database
```

Enable the extension:

```
create extension zson;
```

## Uninstall

Disable the extension:

```
drop extension zson;
```

Uninstall ZSON:

```
cd /path/to/zson/source/code
sudo make uninstall
```

## Usage

First, ZSON should be *trained* on common data using the zson\_learn procedure:

```
zson_learn(
    tables_and_columns text[][],
    max_examples int default 10000,
    min_length int default 2,
    max_length int default 128,
    min_count int default 2
)
```

Example:

```
select zson_learn('{{"table1", "col1"}, {"table2", "col2"}}');
```

You can create a temporary table and write some common JSONB documents into it
manually, or use the existing tables. The idea is to provide a subset of real
data. Let's say some document *type* is twice as frequent as another document
type. ZSON expects that there will be twice as many documents of the first
type as those of the second one in the learning set.
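For instance, a minimal hand-made learning set could look like this (the table
and documents are hypothetical, chosen only to illustrate the 2:1 proportion
of two document types):

```
create temp table zson_train(x jsonb);

insert into zson_train values
  ('{"type": "user", "name": "alice", "active": true}'),
  ('{"type": "user", "name": "bob", "active": false}'),
  ('{"type": "group", "members": ["alice", "bob"]}');

select zson_learn('{{"zson_train", "x"}}');
```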
The resulting dictionary can be examined using this query:

```
select * from zson_dict;
```

Now the ZSON type can be used as a complete and transparent replacement of the
JSONB type:

```
zson_test=# create table zson_example(x zson);
CREATE TABLE

zson_test=# insert into zson_example values ('{"aaa": 123}');
INSERT 0 1

zson_test=# select x -> 'aaa' from zson_example;
-[ RECORD 1 ]-
?column? | 123
```

## Migrating to a new dictionary

When the schema of JSONB documents evolves, ZSON can be *re-learned*:

```
select zson_learn('{{"table1", "col1"}, {"table2", "col2"}}');
```

This time a *second* dictionary will be created. Dictionaries are cached in
memory, so it will take about a minute before ZSON realizes that there is a
new dictionary. After that, old documents will be decompressed using the old
dictionary, and new documents will be compressed and decompressed using the
new one.

To find out which dictionary is used for a given ZSON document, use the
zson\_info procedure:

```
zson_test=# select zson_info(x) from test_compress where id = 1;
-[ RECORD 1 ]---------------------------------------------------
zson_info | zson version = 0, dict version = 1, ...

zson_test=# select zson_info(x) from test_compress where id = 2;
-[ RECORD 1 ]---------------------------------------------------
zson_info | zson version = 0, dict version = 0, ...
```
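Before removing an old dictionary you may want to verify that no documents
reference it anymore. A possible check, assuming the zson\_info output format
shown above (the explicit cast is needed because zson\_info returns cstring):

```
select count(*) from test_compress
where zson_info(x) :: text like '%dict version = 0,%';
```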
If **all** ZSON documents are migrated to the new dictionary, the old one can
be safely removed:

```
delete from zson_dict where dict_id = 0;
```

In general, it's safer to keep old dictionaries just in case. Gaining a few KB
of disk space is not worth the risk of losing data.

## When is it time to re-learn?

Unfortunately, it's hard to recommend a general approach.

A good heuristic could be:

```
select pg_table_size('tt') / (select count(*) from tt)
```

... i.e. the average document size. When it suddenly starts to grow, it's time
to re-learn.

However, developers usually know when they change a schema significantly. It's
also easy to re-check whether the current schema differs a lot from the
original one using the zson\_dict table.

## Known limitations

Installing ZSON in a schema other than `public` is not supported (i.e. `CREATE
EXTENSION zson WITH SCHEMA ...`).

--------------------------------------------------------------------------------
/docs/benchmark.md:
--------------------------------------------------------------------------------
# Benchmark

**Disclaimer**: Synthetic benchmarks should not be trusted. Re-check everything
on your specific hardware, configuration, data and workload!

We used the following server:

* 12 cores (24 with HT)
* 24 GB of RAM
* HDD
* swap is off

To simulate the scenario when the database doesn't fit into memory, we used
`stress`:

```
sudo stress --vm-bytes 21500m --vm-keep -m 1 --vm-hang 0
```

OOM Killer configuration:

```
# allow up to (100 - 3)% of all memory to be used

echo 100 > /proc/sys/vm/overcommit_ratio
echo 2 > /proc/sys/vm/overcommit_memory # OVERCOMMIT_NEVER
```

Database configuration:

```
max_prepared_transactions = 100
shared_buffers = 1GB
wal_level = hot_standby
wal_keep_segments = 128
max_connections = 600
listen_addresses = '*'
autovacuum = off
```

Data:

```
\timing on

create extension zson;

create table jsonb_example(x jsonb);

create table test_nocompress(id SERIAL PRIMARY KEY, x jsonb);
create table test_compress(id SERIAL PRIMARY KEY, x zson);

-- "common" JSON document - example of Consul REST API response
insert into jsonb_example values ('
{
    "Member": {
        "DelegateCur": 4,
        "DelegateMax": 4,
        "DelegateMin": 2,
        "Name": "postgresql-master",
        "Addr": "10.0.3.245",
        "Port": 8301,
        "Tags": {
            "vsn_min": "1",
            "vsn_max": "3",
            "vsn": "2",
            "role": "consul",
            "port": "8300",
            "expect": "3",
            "dc": "dc1",
            "build": "0.6.1:68969ce5"
        },
        "Status": 1,
        "ProtocolMin": 1,
        "ProtocolMax": 3,
        "ProtocolCur": 2
    },
    "Coord": {
        "Height": 1.1121755379893715e-05,
        "Adjustment": -2.023610556800026e-05,
        "Error": 0.19443548095527025,
        "Vec": [
            -0.004713143383327869,
            -0.0032494905923075553,
            0.0007104109540813835,
            0.00472788328972092,
            -0.0015931587524407006,
            0.0014436856764788407,
            0.005688487740053884,
            -0.0039037697928507834
        ]
    },
    "Config": {
        "Reap": null,
        "SessionTTLMinRaw": "",
        "SessionTTLMin": 0,
        "UnixSockets": {
            "Perms": "",
            "Grp": "",
            "Usr": ""
        },
        "VersionPrerelease": "",
        "Version": "0.6.1",
        "Revision": "68969ce5f4499cbe3a4f946917be2e580f1b1936+CHANGES",
        "CAFile": "",
        "VerifyServerHostname": false,
        "VerifyOutgoing": false,
        "VerifyIncoming": false,
        "EnableDebug": false,
        "Protocol": 2,
        "DogStatsdTags": null,
        "DogStatsdAddr": "",
        "StatsdAddr": "",
        "StatsitePrefix": "consul",
        "StatsiteAddr": "",
        "SkipLeaveOnInt": false,
        "LeaveOnTerm": false,
        "Addresses": {
            "RPC": "",
            "HTTPS": "",
            "HTTP": "",
            "DNS": ""
        },
        "Ports": {
            "Server": 8300,
            "SerfWan": 8302,
            "SerfLan": 8301,
            "RPC": 8400,
            "HTTPS": -1,
            "HTTP": 8500,
            "DNS": 8600
        },
        "AdvertiseAddrWan": "10.0.3.245",
        "DNSRecursors": [],
        "DNSRecursor": "",
        "DataDir": "/var/lib/consul",
        "Datacenter": "dc1",
        "Server": true,
        "BootstrapExpect": 3,
        "Bootstrap": false,
        "DevMode": false,
        "DNSConfig": {
            "OnlyPassing": false,
            "MaxStale": 5e+09,
            "EnableTruncate": false,
            "AllowStale": false,
            "ServiceTTL": null,
            "NodeTTL": 0
        },
        "Domain": "consul.",
        "LogLevel": "INFO",
        "NodeName": "postgresql-master",
        "ClientAddr": "0.0.0.0",
        "BindAddr": "10.0.3.245",
        "AdvertiseAddr": "10.0.3.245",
        "AdvertiseAddrs": {
            "RPCRaw": "",
            "RPC": null,
            "SerfWanRaw": "",
            "SerfWan": null,
            "SerfLanRaw": "",
            "SerfLan": null
        },
        "CertFile": "",
        "KeyFile": "",
        "ServerName": "",
        "StartJoin": [],
        "StartJoinWan": [],
        "RetryJoin": [],
        "RetryMaxAttempts": 0,
        "RetryIntervalRaw": "",
        "RetryJoinWan": [],
        "RetryMaxAttemptsWan": 0,
        "RetryIntervalWanRaw": "",
        "EnableUi": false,
        "UiDir": "/usr/share/consul/web-ui",
        "PidFile": "",
        "EnableSyslog": false,
        "SyslogFacility": "LOCAL0",
        "RejoinAfterLeave": false,
        "CheckUpdateInterval": 3e+11,
        "ACLDatacenter": "",
        "ACLTTL": 3e+10,
        "ACLTTLRaw": "",
        "ACLDefaultPolicy": "allow",
        "ACLDownPolicy": "extend-cache",
        "Watches": null,
        "DisableRemoteExec": false,
        "DisableUpdateCheck": false,
        "DisableAnonymousSignature": false,
        "HTTPAPIResponseHeaders": null,
        "AtlasInfrastructure": "",
        "AtlasJoin": false,
        "AtlasEndpoint": "",
        "DisableCoordinates": false
    }
}
');

-- step 1

insert into test_nocompress select * from generate_series(1, 10000),
jsonb_example;

select zson_learn('{{"test_nocompress", "x"}}');

insert into test_compress select * from generate_series(1, 10000),
jsonb_example;


select before, after, after :: float / before as ratio
from (select pg_table_size('test_nocompress') as before,
pg_table_size('test_compress') as after) as temp;

-- step 2

insert into test_nocompress
select * from generate_series(10001, 1000000), jsonb_example;

insert into test_compress
select * from generate_series(10001, 1000000), jsonb_example;

select before, after, after :: float / before as ratio
from (select pg_table_size('test_nocompress') as before,
pg_table_size('test_compress') as after) as temp;

-- step 3

insert into test_nocompress
select * from generate_series(1000001, 3000000), jsonb_example;

insert into test_compress
select * from generate_series(1000001, 3000000), jsonb_example;

select before, after, after :: float / before as ratio
from (select pg_table_size('test_nocompress') as before,
pg_table_size('test_compress') as after) as temp;

-- VACUUM prevents PostgreSQL from *writing* data to disk during the benchmark
-- when only SELECT queries are executed.
-- For more details see https://wiki.postgresql.org/wiki/Hint_Bits

vacuum full verbose;

select before, after, after :: float / before as ratio
from (select pg_table_size('test_nocompress') as before,
pg_table_size('test_compress') as after) as temp;

```

File nocompress.pgbench:

```
\set maxid 3000000
\setrandom from_id 1 :maxid
select x -> 'time' from test_nocompress where id=:from_id;
```

File compress.pgbench:

```
\set maxid 3000000
\setrandom from_id 1 :maxid
select x -> 'time' from test_compress where id=:from_id;
```
On the PostgreSQL server:

```
# restart is required to clean shared buffers

sudo service postgresql restart
```

On the second server (same hardware, same rack):

```
pgbench -h 10.110.0.10 -f nocompress.pgbench -T 600 -P 1 -c 40 -j 12 zson_test
```

On the PostgreSQL server:

```
sudo service postgresql restart
```

On the second server:

```
pgbench -h 10.110.0.10 -f compress.pgbench -T 600 -P 1 -c 40 -j 12 zson_test
```

Benchmark results:

```
transaction type: nocompress.pgbench
scaling factor: 1
query mode: simple
number of clients: 40
number of threads: 12
duration: 600 s
number of transactions actually processed: 583183
latency average = 41.153 ms
latency stddev = 75.002 ms
tps = 971.802963 (including connections establishing)
tps = 971.810536 (excluding connections establishing)


transaction type: compress.pgbench
scaling factor: 1
query mode: simple
number of clients: 40
number of threads: 12
duration: 600 s
number of transactions actually processed: 651910
latency average = 36.814 ms
latency stddev = 52.750 ms
tps = 1086.386943 (including connections establishing)
tps = 1086.396431 (excluding connections establishing)
```

In this case ZSON gives about 11.8% more TPS.
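The figure follows directly from the numbers above:

```
651910 / 583183 ≈ 1.118      -- ~11.8% more transactions in the same 600 s
1086.39 / 971.80 ≈ 1.118     -- the same ratio in terms of TPS
```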
We can modify compress.pgbench and nocompress.pgbench so that only the
documents with id between 1 and 3000 are requested. This simulates the case
when all the data *does* fit into memory. In this case we see 141K TPS (JSONB)
vs 134K TPS (ZSON), i.e. ZSON is about 5% slower.

The compression ratio could be different depending on the documents, the
database schema, the number of rows, etc. But in general ZSON compression is
much better than the built-in PostgreSQL compression (PGLZ):

```
   before   |   after    |      ratio
------------+------------+------------------
 3961880576 | 1638834176 | 0.41365057440843
(1 row)

   before   |   after    |       ratio
------------+------------+-------------------
 8058904576 | 4916436992 | 0.610062688500061
(1 row)

   before    |   after    |       ratio
-------------+------------+-------------------
 14204420096 | 9832841216 | 0.692238130775149
```
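In terms of saved disk space, the same measurements read as:

```
step 1:  1 - 0.414 ≈ 59% saved   (3.96 GB -> 1.64 GB)
step 2:  1 - 0.610 ≈ 39% saved   (8.06 GB -> 4.92 GB)
step 3:  1 - 0.692 ≈ 31% saved   (14.20 GB -> 9.83 GB)
```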
Not only is disk space saved, but the data loaded into shared buffers is not
decompressed either. It means that memory is also saved, and more data can be
accessed without loading it from the disk.

--------------------------------------------------------------------------------
/docs/contributors-v1.2-draft.md:
--------------------------------------------------------------------------------
# Draft of contributors to v1.2

* René Leonhardt
* https://github.com/arssher
* https://github.com/zheka-13

--------------------------------------------------------------------------------
/expected/zson.out:
--------------------------------------------------------------------------------
CREATE EXTENSION zson;
SELECT zson_extract_strings('true');
 zson_extract_strings
----------------------
 {}
(1 row)

SELECT zson_extract_strings('"aaa"');
 zson_extract_strings
----------------------
 {aaa}
(1 row)

SELECT zson_extract_strings('["some", ["dummy", "array"], 456, false]');
 zson_extract_strings
----------------------
 {some,dummy,array}
(1 row)

SELECT zson_extract_strings('{"IP":"10.0.0.3","Roles":["master", "slave"]}');
       zson_extract_strings
----------------------------------
 {IP,Roles,10.0.0.3,master,slave}
(1 row)

CREATE TABLE nocompress(x jsonb);
INSERT INTO nocompress VALUES
('true'),
('123'),
('"aaa"'),
('["some", ["dummy", "array"], 456, false]'),
('
[
    {"ModifyIndex":1,"IP":"10.0.0.1","Roles":["master"]},
    {"ModifyIndex":2,"IP":"10.0.0.2","Roles":["slave"]},
    {"ModifyIndex":3,"IP":"10.0.0.3","Roles":["master", "slave"]}
]
');
SELECT zson_learn('{{"nocompress", "x"}}');
                                  zson_learn
-------------------------------------------------------------------------------
 Done! Run " select * from zson_dict where dict_id = 0; " to see a dictionary.
(1 row)

SELECT dict_id, word FROM zson_dict ORDER BY dict_id, word COLLATE "C";
 dict_id |    word
---------+-------------
       0 | IP
       0 | ModifyIndex
       0 | Roles
       0 | master
       0 | slave
(5 rows)

SELECT zson_learn('{{"nocompress", "x"}}', 10000, 1, 128, 1);
                                  zson_learn
-------------------------------------------------------------------------------
 Done! Run " select * from zson_dict where dict_id = 1; " to see a dictionary.
(1 row)

SELECT dict_id, word FROM zson_dict ORDER BY dict_id, word COLLATE "C";
 dict_id |    word
---------+-------------
       0 | IP
       0 | ModifyIndex
       0 | Roles
       0 | master
       0 | slave
       1 | 10.0.0.1
       1 | 10.0.0.2
       1 | 10.0.0.3
       1 | IP
       1 | ModifyIndex
       1 | Roles
       1 | aaa
       1 | array
       1 | dummy
       1 | master
       1 | slave
       1 | some
(17 rows)

SELECT '{"aaa": "ololo"}'::zson;
       zson
------------------
 {"aaa": "ololo"}
(1 row)

SELECT '{"aaa": "ololo"}'::zson -> 'aaa';
 ?column?
----------
 "ololo"
(1 row)

CREATE TABLE zson_test(x zson);
INSERT INTO zson_test VALUES('{"aaa":123}' :: jsonb);
SELECT dict_id, word FROM zson_dict ORDER BY dict_id, word COLLATE "C";
 dict_id |    word
---------+-------------
       0 | IP
       0 | ModifyIndex
       0 | Roles
       0 | master
       0 | slave
       1 | 10.0.0.1
       1 | 10.0.0.2
       1 | 10.0.0.3
       1 | IP
       1 | ModifyIndex
       1 | Roles
       1 | aaa
       1 | array
       1 | dummy
       1 | master
       1 | slave
       1 | some
(17 rows)

SELECT x :: jsonb FROM zson_test;
      x
--------------
 {"aaa": 123}
(1 row)

DROP TABLE zson_test;
DROP TABLE nocompress;
DROP EXTENSION zson;

--------------------------------------------------------------------------------
/img/zson-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/postgrespro/zson/e214c79711b9eb92786280756a549b9e4cb2a215/img/zson-logo.png
--------------------------------------------------------------------------------
/sql/zson.sql:
--------------------------------------------------------------------------------
CREATE EXTENSION zson;

SELECT zson_extract_strings('true');

SELECT zson_extract_strings('"aaa"');

SELECT zson_extract_strings('["some", ["dummy", "array"], 456, false]');

SELECT zson_extract_strings('{"IP":"10.0.0.3","Roles":["master", "slave"]}');

CREATE TABLE nocompress(x jsonb);

INSERT INTO nocompress VALUES
('true'),
('123'),
('"aaa"'),
('["some", ["dummy", "array"], 456, false]'),
('
[
    {"ModifyIndex":1,"IP":"10.0.0.1","Roles":["master"]},
    {"ModifyIndex":2,"IP":"10.0.0.2","Roles":["slave"]},
    {"ModifyIndex":3,"IP":"10.0.0.3","Roles":["master", "slave"]}
]
');

SELECT zson_learn('{{"nocompress", "x"}}');

SELECT dict_id, word FROM zson_dict ORDER BY dict_id, word COLLATE "C";

SELECT zson_learn('{{"nocompress", "x"}}', 10000, 1, 128, 1);

SELECT dict_id, word FROM zson_dict ORDER BY dict_id, word COLLATE "C";

SELECT '{"aaa": "ololo"}'::zson;

SELECT '{"aaa": "ololo"}'::zson -> 'aaa';

CREATE TABLE zson_test(x zson);

INSERT INTO zson_test VALUES('{"aaa":123}' :: jsonb);

SELECT dict_id, word FROM zson_dict ORDER BY dict_id, word COLLATE "C";

SELECT x :: jsonb FROM zson_test;

DROP TABLE zson_test;

DROP TABLE nocompress;

DROP EXTENSION zson;

--------------------------------------------------------------------------------
/zson--1.0--1.1.sql:
--------------------------------------------------------------------------------
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION zson" to load this file. \quit

SELECT pg_catalog.pg_extension_config_dump('zson_dict', '');
--------------------------------------------------------------------------------
/zson--1.1.sql:
--------------------------------------------------------------------------------
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION zson" to load this file. \quit

CREATE TYPE zson;

CREATE TABLE zson_dict (
    dict_id SERIAL NOT NULL,
    word_id INTEGER NOT NULL,
    word text NOT NULL,
    PRIMARY KEY(dict_id, word_id)
);

SELECT pg_catalog.pg_extension_config_dump('zson_dict', '');

-- Usage: select zson_learn('{{"table1", "col1"}, {"table2", "col2"}, ... }');
CREATE FUNCTION zson_learn(
    tables_and_columns text[][],
    max_examples int default 10000,
    min_length int default 2,
    max_length int default 128,
    min_count int default 2)
RETURNS text AS $$
DECLARE
    tabname text;
    colname text;
    query text := '';
    i int;
    next_dict_id int;
BEGIN
    IF cardinality(tables_and_columns) = 0 THEN
        RAISE NOTICE 'First argument should not be an empty array!';
        RETURN '';
    END IF;

    FOR i IN array_lower(tables_and_columns, 1) ..
            array_upper(tables_and_columns, 1)
    LOOP
        tabname := tables_and_columns[i][1];
        colname := tables_and_columns[i][2];

        IF (tabname IS NULL) OR (colname IS NULL) THEN
            RAISE NOTICE 'Invalid list of tables and columns!';
            RETURN '';
        ELSIF position('"' in tabname) <> 0 THEN
            RAISE NOTICE 'Invalid table name %', tabname;
            RETURN '';
        ELSIF position('"' in colname) <> 0 THEN
            RAISE NOTICE 'Invalid column name %', colname;
            RETURN '';
        ELSIF position('.' in tabname) <> 0 THEN
            tabname := quote_ident(split_part(tabname, '.', 1)) ||
                '.' || quote_ident(split_part(tabname, '.', 2));
        END IF;

        IF query <> '' THEN
            query := query || ' union all ';
        END IF;

        query := query || '( select unnest(zson_extract_strings(' ||
            quote_ident(colname) || ')) as t from ' || tabname || ' limit ' ||
            max_examples || ')';

    END LOOP;

    select coalesce(max(dict_id), -1) + 1 INTO next_dict_id from zson_dict;

    query := 'select t from (select t, count(*) as sum from ( ' ||
        query || ' ) as tt group by t) as s where length(t) >= ' ||
        min_length || ' and length(t) <= ' || max_length ||
        ' and sum >= ' || min_count || ' order by sum desc limit 65534';

    query := 'insert into zson_dict select ' || next_dict_id ||
        ' as dict_id, row_number() over () as word_id, t as word from ( ' ||
        query || ' ) as top_words';

    EXECUTE query;

    RETURN 'Done! Run " select * from zson_dict where dict_id = ' ||
        next_dict_id || '; " to see a dictionary.';
END;
$$ LANGUAGE plpgsql;

CREATE FUNCTION zson_extract_strings(x jsonb)
RETURNS text[] AS $$
DECLARE
    jtype text;
    jitem jsonb;
BEGIN
    jtype := jsonb_typeof(x);
    IF jtype = 'object' THEN
        RETURN array(select unnest(z) from (
            select array(select jsonb_object_keys(x)) as z
            union all (
                select zson_extract_strings(x -> k) as z from (
                    select jsonb_object_keys(x) as k
                ) as kk
            )
        ) as zz);
    ELSIF jtype = 'array' THEN
        RETURN ARRAY(select unnest(zson_extract_strings(t)) from
            (select jsonb_array_elements(x) as t) as tt);
    ELSIF jtype = 'string' THEN
        RETURN array[ x #>> array[] :: text[] ];
    ELSE -- 'number', 'boolean', 'null'
        RETURN array[] :: text[];
    END IF;
END;
$$ LANGUAGE plpgsql;

CREATE FUNCTION zson_in(cstring)
    RETURNS zson
    AS 'MODULE_PATHNAME'
    LANGUAGE C STRICT IMMUTABLE;

CREATE FUNCTION zson_out(zson)
    RETURNS cstring
    AS 'MODULE_PATHNAME'
    LANGUAGE C STRICT IMMUTABLE;

CREATE TYPE zson (
    INTERNALLENGTH = -1,
    INPUT = zson_in,
    OUTPUT = zson_out,
    STORAGE = extended -- try to compress
);

CREATE FUNCTION jsonb_to_zson(jsonb)
    RETURNS zson
    AS 'MODULE_PATHNAME'
    LANGUAGE C STRICT IMMUTABLE;

CREATE FUNCTION zson_to_jsonb(zson)
    RETURNS jsonb
    AS 'MODULE_PATHNAME'
    LANGUAGE C STRICT IMMUTABLE;

CREATE CAST (jsonb AS zson) WITH FUNCTION jsonb_to_zson(jsonb) AS ASSIGNMENT;
CREATE CAST (zson AS jsonb) WITH FUNCTION zson_to_jsonb(zson) AS IMPLICIT;

CREATE FUNCTION zson_info(zson)
    RETURNS cstring
    AS 'MODULE_PATHNAME'
    LANGUAGE C STRICT IMMUTABLE;

--CREATE FUNCTION debug_dump_jsonb(jsonb)
--    RETURNS cstring
--    AS 'MODULE_PATHNAME'
--    LANGUAGE C STRICT IMMUTABLE;
--------------------------------------------------------------------------------
/zson.c:
--------------------------------------------------------------------------------
/* The header names below were lost during extraction; this list is
 * reconstructed from what the code actually uses (SPI, jsonb, gettimeofday,
 * malloc/free, string and stdio routines). */
#include <postgres.h>
#include <fmgr.h>
#include <executor/spi.h>
#include <utils/builtins.h>
#include <utils/jsonb.h>
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// zson compression:
//
// VARHDRSZ
// zson_version [uint8]
// dict_version [uint32]
// decoded_size [uint32]
// hint [uint8 x PGLZ_HINT_SIZE]
// {
//     skip_bytes [uint8]
//     ... skip_bytes bytes ...
//     string_code [uint16], 0 = no_string
// } *

PG_MODULE_MAGIC;

PG_FUNCTION_INFO_V1(zson_in);
PG_FUNCTION_INFO_V1(zson_out);
PG_FUNCTION_INFO_V1(zson_to_jsonb);
PG_FUNCTION_INFO_V1(jsonb_to_zson);
PG_FUNCTION_INFO_V1(zson_info);
PG_FUNCTION_INFO_V1(debug_dump_jsonb);

/* In version 11 these macros have been changed */
#if PG_VERSION_NUM < 110000
#define PG_GETARG_JSONB_P(v) PG_GETARG_JSONB(v)
#define PG_RETURN_JSONB_P(v) PG_RETURN_JSONB(v)
#endif

#define ZSON_CURRENT_VERSION 0

#define ZSON_HEADER_SIZE (sizeof(uint8) + sizeof(uint32)*2)
#define ZSON_HEADER_VERSION(hdrp) (*(uint8*)hdrp)
#define ZSON_HEADER_DICT_VERSION(hdrp) \
    (*(uint32*)((uint8*)hdrp + sizeof(uint8)))
#define ZSON_HEADER_DECODED_SIZE(hdrp) \
    (*(uint32*)((uint8*)hdrp + sizeof(uint8) + sizeof(uint32)))

#define PGLZ_HINT_SIZE 32 // corner case: 0

#define DICT_MAX_WORDS (1 << 16)

typedef struct {
    uint16 code;
    bool check_next; // next word starts with the same nbytes bytes
    size_t nbytes;   // number of bytes (not letters) except \0
    char* word;
} Word;

typedef struct {
    int32 dict_id;
    uint32 nwords;
    Word words[DICT_MAX_WORDS];          // sorted by .word, word -> code
    uint16 code_to_word[DICT_MAX_WORDS]; // code -> word index
} Dict;

typedef struct DictListItem {
    Dict* pdict;
    union {
        time_t last_clean_sec; // for first list item
        time_t last_used_sec;  // for rest list items
    };
    struct DictListItem* next;
} DictListItem;

// update current dict_id every N seconds
#define DICT_ID_CACHE_TIME_SEC 60

// clean dictList every N seconds
#define DICT_LIST_CLEAN_INTERVAL_SEC 60

// keep dict items not used for at most N seconds
#define DICT_LIST_TTL_SEC 120

// reserved code
#define DICT_INVALID_CODE 0

static int32 cachedDictId = -1;
static time_t cachedDictIdLastUpdatedSec = 0;

static SPIPlanPtr savedPlanGetDictId = NULL;
static SPIPlanPtr savedPlanLoadDict = NULL;

static DictListItem dictList = { 0 };
static Dict*
dict_load(int32 dict_id)
{
    int row;
    Datum qvalues[] = { Int32GetDatum(dict_id) };
    Dict* pdict = calloc(1, sizeof(Dict));
    if(pdict == NULL)
        return NULL;

    pdict->dict_id = dict_id;

    SPI_connect();

    if(savedPlanLoadDict == NULL)
    {
        Oid argtypes[] = { INT4OID };
        savedPlanLoadDict = SPI_prepare(
            "select word_id, word from public.zson_dict where dict_id = $1 "
            "order by word",
            1, argtypes);
        if(savedPlanLoadDict == NULL)
            elog(ERROR, "Error preparing query");
        if(SPI_keepplan(savedPlanLoadDict))
            elog(ERROR, "Error keeping plan");
    }

    if(SPI_execute_plan(savedPlanLoadDict, qvalues, NULL,
            true, DICT_MAX_WORDS) < 0)
    {
        elog(ERROR, "Failed to load dictionary");
    }

    for(row = 0; row < SPI_processed; row++)
    {
        bool isnull;
        char* wordcopy;
        uint32 word_id = DatumGetInt32(
            SPI_getbinval(SPI_tuptable->vals[row], SPI_tuptable->tupdesc,
                1, &isnull)
        );
        char* word = DatumGetCString(DirectFunctionCall1(textout,
            SPI_getbinval(SPI_tuptable->vals[row], SPI_tuptable->tupdesc,
                2, &isnull)
        ));

        size_t wordlen = strlen(word);
        // in case user filled zson_dict manually
        if(wordlen < 2)
            continue;

        wordcopy = malloc(wordlen+1);
        if(wordcopy == NULL)
            elog(ERROR, "Failed to allocate memory");

        strcpy(wordcopy, word);
        pdict->words[pdict->nwords].code = (uint16)word_id;
        pdict->words[pdict->nwords].word = wordcopy;
        pdict->words[pdict->nwords].nbytes = wordlen;

        pdict->code_to_word[ (uint16)word_id ] = pdict->nwords;

        if((pdict->nwords > 0) &&
            ( pdict->words[pdict->nwords-1].nbytes <
                pdict->words[pdict->nwords].nbytes) )
        {
            pdict->words[pdict->nwords-1].check_next =
                (memcmp(pdict->words[pdict->nwords].word,
                    pdict->words[pdict->nwords-1].word,
                    pdict->words[pdict->nwords-1].nbytes
                ) == 0);
        }

        pdict->nwords++;
    }

    SPI_finish();

    return pdict;
}

static void
dict_free(Dict* pdict)
{
    uint32 i;

    for(i = 0; i < pdict->nwords; i++)
        free(pdict->words[i].word);

    free(pdict);
}

// binary search
static uint16
dict_find_match(const Dict* pdict,
    const uint8* buff, size_t buff_size, size_t* pnbytes)
{
    int res;
    int32 left = 0;
    int32 right = pdict->nwords-1;
    size_t best_nbytes = 0;
    uint16 best_code = DICT_INVALID_CODE;

    while(left <= right)
    {
        uint32 current = (left + right) / 2;
        size_t nbytes = pdict->words[current].nbytes;

        if(nbytes > buff_size)
        {
            /* current can be less or greater depending on the prefix */
            res = memcmp(pdict->words[current].word, buff, buff_size);

            /* if prefixes match, current is greater */
            if(res == 0)
                res = 1;
        } else
            res = memcmp(pdict->words[current].word, buff, nbytes);

        if(res == 0) // match
        {
            best_nbytes = nbytes;
            best_code = pdict->words[current].code;

            if((!pdict->words[current].check_next) || (nbytes == buff_size))
                break;

            // maybe there is a longer match
            left = current + 1;
        }
        else if(res < 0) // current is less
            left = current + 1;
        else // current is greater
            right = current - 1;
    }

    *pnbytes = best_nbytes;
    return best_code;
}
static Dict*
dict_get(int32 dict_id)
{
    DictListItem* dict_list_item = &dictList;
    struct timeval tv;

    gettimeofday(&tv, NULL);

    // clean cache if necessary

    if(tv.tv_sec - dictList.last_clean_sec > DICT_LIST_CLEAN_INTERVAL_SEC)
    {
        DictListItem* prev_dict_list_item = NULL;

        while(dict_list_item)
        {
            if(dict_list_item->pdict &&
                (tv.tv_sec - dict_list_item->last_used_sec >
                    DICT_LIST_TTL_SEC))
            {
                DictListItem* temp = dict_list_item->next;

                dict_free(dict_list_item->pdict);
                free(dict_list_item);

                prev_dict_list_item->next = temp;
                dict_list_item = temp;
            }
            else
            {
                prev_dict_list_item = dict_list_item;
                dict_list_item = dict_list_item->next;
            }
        }

        dict_list_item = &dictList;
    }

    // find an item

    while(dict_list_item)
    {
        if(dict_list_item->pdict &&
            (dict_list_item->pdict->dict_id == dict_id))
        {
            dict_list_item->last_used_sec = tv.tv_sec;
            return dict_list_item->pdict;
        }

        dict_list_item = dict_list_item->next;
    }

    // load dictionary and add it to the list

    dict_list_item = calloc(1, sizeof(DictListItem));
    if(!dict_list_item)
        return NULL;

    dict_list_item->pdict = dict_load(dict_id);
    if(dict_list_item->pdict == NULL)
    {
        free(dict_list_item);
        return NULL;
    }

    dict_list_item->last_used_sec = tv.tv_sec;
    dict_list_item->next = dictList.next;
    dictList.next = dict_list_item;

    return dict_list_item->pdict;
}

// < 0: zson_dict not initialized
// >= 0: current dict id
static int32
get_current_dict_id()
{
    int32 id;
    bool isnull;
    struct timeval tv;
    gettimeofday(&tv, NULL);

    if(cachedDictId >= 0 &&
        tv.tv_sec - cachedDictIdLastUpdatedSec < DICT_ID_CACHE_TIME_SEC)
        return cachedDictId;

    SPI_connect();

    if(savedPlanGetDictId == NULL)
    {
        savedPlanGetDictId = SPI_prepare(
            "select max(dict_id) from public.zson_dict;", 0, NULL);
        if (savedPlanGetDictId == NULL)
            elog(ERROR, "Error preparing query");
        if (SPI_keepplan(savedPlanGetDictId))
            elog(ERROR, "Error keeping plan");
    }

    if (SPI_execute_plan(savedPlanGetDictId, NULL, NULL, true, 1) < 0 ||
        SPI_processed != 1)
        elog(ERROR, "Failed to get current dict_id");

    id = DatumGetInt32(SPI_getbinval(SPI_tuptable->vals[0],
        SPI_tuptable->tupdesc, 1, &isnull));

    SPI_finish();

    id = isnull ? -1 : id;
    cachedDictId = id;
    cachedDictIdLastUpdatedSec = tv.tv_sec;
    return id;
}
// cstring -> zson
Datum
zson_in(PG_FUNCTION_ARGS)
{
    Datum string_datum = PG_GETARG_DATUM(0);
    Datum jsonb_datum = DirectFunctionCall1(jsonb_in, string_datum);
    Datum zson_datum = DirectFunctionCall1(jsonb_to_zson, jsonb_datum);
    bytea* zson_bytea = DatumGetByteaP(zson_datum);
    PG_RETURN_BYTEA_P(zson_bytea);
}

// zson -> cstring
Datum
zson_out(PG_FUNCTION_ARGS)
{
    bytea *zson_bytea = PG_GETARG_BYTEA_P(0);
    Datum zson_datum = PointerGetDatum(zson_bytea);
    Datum jsonb_datum = DirectFunctionCall1(zson_to_jsonb, zson_datum);
    Datum string_datum = DirectFunctionCall1(jsonb_out, jsonb_datum);
    PG_RETURN_CSTRING(DatumGetCString(string_datum));
}

inline static Size
zson_fastcompress_bound(Size size)
{
    return PGLZ_HINT_SIZE + (size / 2 + 1)*3;
}

static bool
zson_fastcompress(const Dict* pdict,
    const void* src_data_, Size src_size,
    void* encoded_data_, Size* pencoded_size)
{
    size_t nbytes;
    Size inoffset;
    Size outskipoffset = 0;
    Size outoffset = 1;
    uint8 skipbytes = 0;
    const uint8* src_data = src_data_;
    uint8* encoded_data = ((uint8*)encoded_data_) + PGLZ_HINT_SIZE;

    memset(encoded_data_, 0, PGLZ_HINT_SIZE);

    for(inoffset = 0; inoffset < src_size; )
    {
        uint16 code = dict_find_match(pdict, &(src_data[inoffset]),
            src_size - inoffset, &nbytes);

        if(code == DICT_INVALID_CODE)
        {
            skipbytes++;
            encoded_data[outoffset] = src_data[inoffset];
            outoffset++;
            inoffset++;

            if(skipbytes == 255)
            {
                encoded_data[outskipoffset] = skipbytes;
                encoded_data[outoffset++] = 0; // DICT_INVALID_CODE
                encoded_data[outoffset++] = 0;
                outskipoffset = outoffset++;
                skipbytes = 0;
            }
        }
        else
        {
            encoded_data[outskipoffset] = skipbytes;
            encoded_data[outoffset++] = code >> 8;
            encoded_data[outoffset++] = code & 0xFF;
            outskipoffset = outoffset++;
            skipbytes = 0;
            inoffset += nbytes;
        }
    }

    encoded_data[outskipoffset] = skipbytes;
    *pencoded_size = outoffset + PGLZ_HINT_SIZE;

    return true;
}

static bool
zson_fastdecompress(const Dict* pdict,
    const void* encoded_data_, Size encoded_size,
    void* decoded_data_, Size decoded_size)
{
    Size inoffset = 0;
    Size outoffset = 0;
    uint16 code, idx;
    uint8 skipbytes;
    const uint8* encoded_data = ((uint8*)encoded_data_) + PGLZ_HINT_SIZE;
    uint8* decoded_data = decoded_data_;
    encoded_size -= PGLZ_HINT_SIZE;

    for(inoffset = 0; inoffset < encoded_size; )
    {
        skipbytes = encoded_data[inoffset++];

        if(skipbytes > decoded_size - outoffset)
            return false;

        if(skipbytes > encoded_size - inoffset)
            return false;

        memcpy(
            &(decoded_data[outoffset]),
            &(encoded_data[inoffset]),
            skipbytes
        );

        outoffset += skipbytes;
        inoffset += skipbytes;

        if(encoded_size == inoffset && decoded_size == outoffset)
            break; /* end of input - it's OK */

        if(2 > encoded_size - inoffset)
            return false;

        code = (uint16)encoded_data[inoffset++];
        code = (code << 8) | (uint16)encoded_data[inoffset++];

        if(code != DICT_INVALID_CODE)
        {
            idx = pdict->code_to_word[code];

            if(pdict->words[idx].nbytes > decoded_size - outoffset)
                return false;

            memcpy(
                &(decoded_data[outoffset]),
                pdict->words[idx].word,
                pdict->words[idx].nbytes
            );

            outoffset += pdict->words[idx].nbytes;
        }
    }

    return true;
}
// jsonb -> zson
Datum
jsonb_to_zson(PG_FUNCTION_ARGS)
{
    Jsonb *jsonb = PG_GETARG_JSONB_P(0);
    uint8* jsonb_data = (uint8*)VARDATA(jsonb);
    Size jsonb_data_size = VARSIZE(jsonb) - VARHDRSZ;
    uint8 *encoded_buff, *encoded_header, *encoded_data;
    Size encoded_size;
    bool res;
    Dict* pdict;
    Size encoded_buff_size = VARHDRSZ + ZSON_HEADER_SIZE +
        zson_fastcompress_bound(jsonb_data_size);
    int32 dict_id = get_current_dict_id();

    if(dict_id < 0)
        ereport(ERROR,
            (
                errcode(ERRCODE_INTERNAL_ERROR),
                errmsg("Unable to compress jsonb"),
                errdetail("zson_dict is not initialized"),
                errhint("You probably forgot to execute zson_learn(). "
                    "In case you are restoring from a backup made via "
                    "pg_dump, just move the content of zson_dict table "
                    "above the current line.")
            ));

    pdict = dict_get(dict_id);
    if(pdict == NULL)
        elog(ERROR, "Unable to load dictionary");

    encoded_buff = palloc(encoded_buff_size);
    encoded_header = (uint8*)VARDATA(encoded_buff);
    encoded_data = encoded_header + ZSON_HEADER_SIZE;

    ZSON_HEADER_VERSION(encoded_header) = ZSON_CURRENT_VERSION;
    ZSON_HEADER_DICT_VERSION(encoded_header) = dict_id;
    ZSON_HEADER_DECODED_SIZE(encoded_header) = jsonb_data_size;

    encoded_size = encoded_buff_size - VARHDRSZ - ZSON_HEADER_SIZE;

    res = zson_fastcompress(pdict, jsonb_data, jsonb_data_size,
        encoded_data, &encoded_size);
    if(!res)
        ereport(ERROR,
            (
                errcode(ERRCODE_INTERNAL_ERROR),
                errmsg("Unable to compress jsonb"),
                errdetail("Procedure fastcompress() returned %d", res),
                errhint("You probably should report this to pgsql-bugs@")
            ));

    encoded_size += VARHDRSZ + ZSON_HEADER_SIZE;

    encoded_buff = repalloc(encoded_buff, encoded_size);
    SET_VARSIZE(encoded_buff, encoded_size);
    PG_RETURN_BYTEA_P(encoded_buff);
}

// zson -> jsonb
Datum
zson_to_jsonb(PG_FUNCTION_ARGS)
{
    bytea *encoded_buff = PG_GETARG_BYTEA_P(0);
    uint8* encoded_header = (uint8*)VARDATA(encoded_buff);
    uint8* encoded_data = encoded_header + ZSON_HEADER_SIZE;
    Size encoded_size = VARSIZE(encoded_buff) - VARHDRSZ - ZSON_HEADER_SIZE;
    int zson_version = ZSON_HEADER_VERSION(encoded_header);
    uint32 dict_id;      // cannot read until zson version is checked
    uint32 decoded_size; // cannot read until zson version is checked
    Jsonb* jsonb;
    uint8* jsonb_data;
    Dict* pdict;
    bool res;

    if(zson_version > ZSON_CURRENT_VERSION)
        ereport(ERROR,
            (
                errcode(ERRCODE_INTERNAL_ERROR),
                errmsg("Unsupported zson version"),
                errdetail("Saved zson version is %d, extension version is %d",
                    zson_version, ZSON_CURRENT_VERSION),
                errhint("You probably should upgrade zson extension "
                    "or report a bug to pgsql-bugs@")
            ));

    dict_id = ZSON_HEADER_DICT_VERSION(encoded_header);
    decoded_size = ZSON_HEADER_DECODED_SIZE(encoded_header);
    pdict = dict_get(dict_id);
    if(pdict == NULL)
        elog(ERROR, "Unable to load dictionary");

    jsonb = palloc(decoded_size + VARHDRSZ);
    jsonb_data = (uint8*)VARDATA(jsonb);

    res = zson_fastdecompress(pdict,
        encoded_data, encoded_size,
        jsonb_data, decoded_size);

    if(!res)
        ereport(ERROR,
            (
                errcode(ERRCODE_INTERNAL_ERROR),
                errmsg("Unable to uncompress zson"),
                errdetail("Procedure fastdecompress() returned %d", res),
                errhint("You probably should report this to pgsql-bugs@")
            ));

    decoded_size += VARHDRSZ;
    SET_VARSIZE(jsonb, decoded_size);
    PG_RETURN_JSONB_P(jsonb);
}
// zson -> "size, encoded size, other info"
Datum
zson_info(PG_FUNCTION_ARGS)
{
    bytea* zson = PG_GETARG_BYTEA_P(0);

    Size zson_size = VARSIZE(zson);
    uint8* zson_header = (uint8*)VARDATA(zson);
    uint32 zson_version = ZSON_HEADER_VERSION(zson_header);
    uint32 dict_version = ZSON_HEADER_DICT_VERSION(zson_header);
    uint32 decoded_size = ZSON_HEADER_DECODED_SIZE(zson_header) + VARHDRSZ;

    size_t buff_size = 1024;
    char* string_buff = palloc(buff_size);
    snprintf(string_buff, buff_size,
        "zson version = %u, dict version = %u, jsonb size = %u, "
        "zson size (without pglz compression) = %u (%.2f%%)",

        zson_version, dict_version, decoded_size,
        (uint32)zson_size, (float)zson_size*100/(float)decoded_size
    );

    PG_RETURN_CSTRING(string_buff);
}

/*
// jsonb -> "\xAA\xBB\xCC..." cstring
Datum
debug_dump_jsonb(PG_FUNCTION_ARGS)
{
    Jsonb *jsonb = PG_GETARG_JSONB(0);
    uint8* jsonb_data = (uint8*)VARDATA(jsonb);
    Size jsonb_data_size = VARSIZE(jsonb) - VARHDRSZ;
    Size string_buff_size = jsonb_data_size*4 + 1;
    char* string_buff = palloc(string_buff_size);
    int i;

    for(i = 0; i < jsonb_data_size; i++)
        sprintf(&string_buff[i*4], "\\x%02X", jsonb_data[i]);

    PG_RETURN_CSTRING((Datum)string_buff);
}
*/

--------------------------------------------------------------------------------
/zson.control:
--------------------------------------------------------------------------------
comment = 'ZSON: an extension for transparent JSONB compression'
default_version = '1.1'
module_pathname = '$libdir/zson'
relocatable = false

--------------------------------------------------------------------------------