├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── docs
│   ├── benchmark.md
│   └── contributors-v1.2-draft.md
├── expected
│   └── zson.out
├── img
│   └── zson-logo.png
├── sql
│   └── zson.sql
├── zson--1.0--1.1.sql
├── zson--1.1.sql
├── zson.c
└── zson.control

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.o
*.so
results

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2016 Aleksander Alekseev

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
EXTENSION = zson
MODULES = zson
DATA = zson--1.1.sql zson--1.0--1.1.sql
REGRESS = zson

PG_CPPFLAGS = -g -O2
SHLIB_LINK = # -lz -llz4

ifndef PG_CONFIG
PG_CONFIG := pg_config
endif
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ZSON

![ZSON Logo](img/zson-logo.png)

## About

ZSON is a PostgreSQL extension for transparent JSONB compression. Compression
is based on a shared dictionary of strings most frequently used in specific
JSONB documents (not only keys, but also values, array elements, etc.).

In some cases ZSON can save half of your disk space and give you about 10% more
TPS. Memory is saved as well. See [docs/benchmark.md](docs/benchmark.md).
Everything depends on your data and workload, though. Don't believe any
benchmarks, re-check everything on your data, configuration, hardware, workload
and PostgreSQL version.

ZSON was originally created in 2016 by the [Postgres Professional][pgpro] team:
researched and coded by [Aleksander Alekseev][me]; ideas, code review, testing,
etc. by [Alexander Korotkov][ak] and [Teodor Sigaev][ts].

[me]: https://eax.me/
[ak]: https://akorotkov.github.io/
[ts]: http://www.sigaev.ru/
[pgpro]: https://postgrespro.com/

See also discussions on [pgsql-general@][gen], [Hacker News][hn], [Reddit][rd]
and [HabraHabr][habr].

[gen]: https://www.postgresql.org/message-id/flat/20160930185801.38654a1c%40e754
[hn]: https://news.ycombinator.com/item?id=12633486
[rd]: https://www.reddit.com/r/PostgreSQL/comments/55mr4r/zson_postgresql_extension_for_transparent_jsonb/
[habr]: https://habr.com/ru/company/postgrespro/blog/312006/

## Install

Build and install ZSON:

```
cd /path/to/zson/source/code
make
sudo make install
```

Run tests:

```
make installcheck
```

Connect to PostgreSQL:

```
psql my_database
```

Enable the extension:

```
create extension zson;
```

## Uninstall

Disable the extension:

```
drop extension zson;
```

Uninstall ZSON:

```
cd /path/to/zson/source/code
sudo make uninstall
```

## Usage

First, ZSON should be *trained* on common data using the zson\_learn procedure:

```
zson_learn(
    tables_and_columns text[][],
    max_examples int default 10000,
    min_length int default 2,
    max_length int default 128,
    min_count int default 2
)
```

Example:

```
select zson_learn('{{"table1", "col1"}, {"table2", "col2"}}');
```

You can create a temporary table and write some common JSONB documents into it
manually, or use the existing tables. The idea is to provide a subset of real
data. Let's say some document *type* is twice as frequent as another document
type. ZSON expects that there will be twice as many documents of the first
type as those of the second one in the learning set.
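For instance, a minimal hand-made learning set could look like this (the table
and documents are hypothetical, chosen only to illustrate the 2:1 proportion
of two document types):

```
create temp table zson_train(x jsonb);

insert into zson_train values
  ('{"type": "user", "name": "alice", "active": true}'),
  ('{"type": "user", "name": "bob", "active": false}'),
  ('{"type": "group", "members": ["alice", "bob"]}');

select zson_learn('{{"zson_train", "x"}}');
```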
The resulting dictionary can be examined using this query:

```
select * from zson_dict;
```

Now the ZSON type can be used as a complete and transparent replacement of the
JSONB type:

```
zson_test=# create table zson_example(x zson);
CREATE TABLE

zson_test=# insert into zson_example values ('{"aaa": 123}');
INSERT 0 1

zson_test=# select x -> 'aaa' from zson_example;
-[ RECORD 1 ]-
?column? | 123
```

## Migrating to a new dictionary

When the schema of JSONB documents evolves, ZSON can be *re-learned*:

```
select zson_learn('{{"table1", "col1"}, {"table2", "col2"}}');
```

This time a *second* dictionary will be created. Dictionaries are cached in
memory, so it will take about a minute before ZSON realizes that there is a
new dictionary. After that, old documents will be decompressed using the old
dictionary, and new documents will be compressed and decompressed using the
new one.

To find out which dictionary is used for a given ZSON document, use the
zson\_info procedure:

```
zson_test=# select zson_info(x) from test_compress where id = 1;
-[ RECORD 1 ]---------------------------------------------------
zson_info | zson version = 0, dict version = 1, ...

zson_test=# select zson_info(x) from test_compress where id = 2;
-[ RECORD 1 ]---------------------------------------------------
zson_info | zson version = 0, dict version = 0, ...
```
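Before removing an old dictionary you may want to verify that no documents
reference it anymore. A possible check, assuming the zson\_info output format
shown above (the explicit cast is needed because zson\_info returns cstring):

```
select count(*) from test_compress
where zson_info(x) :: text like '%dict version = 0,%';
```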
If **all** ZSON documents are migrated to the new dictionary, the old one can
be safely removed:

```
delete from zson_dict where dict_id = 0;
```

In general, it's safer to keep old dictionaries just in case. Gaining a few KB
of disk space is not worth the risk of losing data.

## When is it time to re-learn?

Unfortunately, it's hard to recommend a general approach.

A good heuristic could be:

```
select pg_table_size('tt') / (select count(*) from tt)
```

... i.e. the average document size. When it suddenly starts to grow, it's time
to re-learn.

However, developers usually know when they change a schema significantly. It's
also easy to re-check whether the current schema differs a lot from the
original one using the zson\_dict table.

## Known limitations

Installing ZSON in a schema other than `public` is not supported (i.e. `CREATE
EXTENSION zson WITH SCHEMA ...`).

--------------------------------------------------------------------------------
/docs/benchmark.md:
--------------------------------------------------------------------------------
# Benchmark

**Disclaimer**: Synthetic benchmarks should not be trusted. Re-check everything
on your specific hardware, configuration, data and workload!

We used the following server:

* 12 cores (24 with HT)
* 24 GB of RAM
* HDD
* swap is off

To simulate the scenario when the database doesn't fit into memory, we used
`stress`:

```
sudo stress --vm-bytes 21500m --vm-keep -m 1 --vm-hang 0
```

OOM Killer configuration:

```
# allow up to (100 - 3)% of all memory to be used

echo 100 > /proc/sys/vm/overcommit_ratio
echo 2 > /proc/sys/vm/overcommit_memory # OVERCOMMIT_NEVER
```

Database configuration:

```
max_prepared_transactions = 100
shared_buffers = 1GB
wal_level = hot_standby
wal_keep_segments = 128
max_connections = 600
listen_addresses = '*'
autovacuum = off
```

Data:

```
\timing on

create extension zson;

create table jsonb_example(x jsonb);

create table test_nocompress(id SERIAL PRIMARY KEY, x jsonb);
create table test_compress(id SERIAL PRIMARY KEY, x zson);

-- "common" JSON document - example of Consul REST API response
insert into jsonb_example values ('
{
    "Member": {
        "DelegateCur": 4,
        "DelegateMax": 4,
        "DelegateMin": 2,
        "Name": "postgresql-master",
        "Addr": "10.0.3.245",
        "Port": 8301,
        "Tags": {
            "vsn_min": "1",
            "vsn_max": "3",
            "vsn": "2",
            "role": "consul",
            "port": "8300",
            "expect": "3",
            "dc": "dc1",
            "build": "0.6.1:68969ce5"
        },
        "Status": 1,
        "ProtocolMin": 1,
        "ProtocolMax": 3,
        "ProtocolCur": 2
    },
    "Coord": {
        "Height": 1.1121755379893715e-05,
        "Adjustment": -2.023610556800026e-05,
        "Error": 0.19443548095527025,
        "Vec": [
            -0.004713143383327869,
            -0.0032494905923075553,
            0.0007104109540813835,
            0.00472788328972092,
            -0.0015931587524407006,
            0.0014436856764788407,
            0.005688487740053884,
            -0.0039037697928507834
        ]
    },
    "Config": {
        "Reap": null,
        "SessionTTLMinRaw": "",
        "SessionTTLMin": 0,
        "UnixSockets": {
            "Perms": "",
            "Grp": "",
            "Usr": ""
        },
        "VersionPrerelease": "",
        "Version": "0.6.1",
        "Revision": "68969ce5f4499cbe3a4f946917be2e580f1b1936+CHANGES",
        "CAFile": "",
        "VerifyServerHostname": false,
        "VerifyOutgoing": false,
        "VerifyIncoming": false,
        "EnableDebug": false,
        "Protocol": 2,
        "DogStatsdTags": null,
        "DogStatsdAddr": "",
        "StatsdAddr": "",
        "StatsitePrefix": "consul",
        "StatsiteAddr": "",
        "SkipLeaveOnInt": false,
        "LeaveOnTerm": false,
        "Addresses": {
            "RPC": "",
            "HTTPS": "",
            "HTTP": "",
            "DNS": ""
        },
        "Ports": {
            "Server": 8300,
            "SerfWan": 8302,
            "SerfLan": 8301,
            "RPC": 8400,
            "HTTPS": -1,
            "HTTP": 8500,
            "DNS": 8600
        },
        "AdvertiseAddrWan": "10.0.3.245",
        "DNSRecursors": [],
        "DNSRecursor": "",
        "DataDir": "/var/lib/consul",
        "Datacenter": "dc1",
        "Server": true,
        "BootstrapExpect": 3,
        "Bootstrap": false,
        "DevMode": false,
        "DNSConfig": {
            "OnlyPassing": false,
            "MaxStale": 5e+09,
            "EnableTruncate": false,
            "AllowStale": false,
            "ServiceTTL": null,
            "NodeTTL": 0
        },
        "Domain": "consul.",
        "LogLevel": "INFO",
        "NodeName": "postgresql-master",
        "ClientAddr": "0.0.0.0",
        "BindAddr": "10.0.3.245",
        "AdvertiseAddr": "10.0.3.245",
        "AdvertiseAddrs": {
            "RPCRaw": "",
            "RPC": null,
            "SerfWanRaw": "",
            "SerfWan": null,
            "SerfLanRaw": "",
            "SerfLan": null
        },
        "CertFile": "",
        "KeyFile": "",
        "ServerName": "",
        "StartJoin": [],
        "StartJoinWan": [],
        "RetryJoin": [],
        "RetryMaxAttempts": 0,
        "RetryIntervalRaw": "",
        "RetryJoinWan": [],
        "RetryMaxAttemptsWan": 0,
        "RetryIntervalWanRaw": "",
        "EnableUi": false,
        "UiDir": "/usr/share/consul/web-ui",
        "PidFile": "",
        "EnableSyslog": false,
        "SyslogFacility": "LOCAL0",
        "RejoinAfterLeave": false,
        "CheckUpdateInterval": 3e+11,
        "ACLDatacenter": "",
        "ACLTTL": 3e+10,
        "ACLTTLRaw": "",
        "ACLDefaultPolicy": "allow",
        "ACLDownPolicy": "extend-cache",
        "Watches": null,
        "DisableRemoteExec": false,
        "DisableUpdateCheck": false,
        "DisableAnonymousSignature": false,
        "HTTPAPIResponseHeaders": null,
        "AtlasInfrastructure": "",
        "AtlasJoin": false,
        "AtlasEndpoint": "",
        "DisableCoordinates": false
    }
}
');

-- step 1

insert into test_nocompress select * from generate_series(1, 10000),
jsonb_example;

select zson_learn('{{"test_nocompress", "x"}}');

insert into test_compress select * from generate_series(1, 10000),
jsonb_example;


select before, after, after :: float / before as ratio
from (select pg_table_size('test_nocompress') as before,
pg_table_size('test_compress') as after) as temp;

-- step 2

insert into test_nocompress
select * from generate_series(10001, 1000000), jsonb_example;

insert into test_compress
select * from generate_series(10001, 1000000), jsonb_example;

select before, after, after :: float / before as ratio
from (select pg_table_size('test_nocompress') as before,
pg_table_size('test_compress') as after) as temp;

-- step 3

insert into test_nocompress
select * from generate_series(1000001, 3000000), jsonb_example;

insert into test_compress
select * from generate_series(1000001, 3000000), jsonb_example;

select before, after, after :: float / before as ratio
from (select pg_table_size('test_nocompress') as before,
pg_table_size('test_compress') as after) as temp;

-- VACUUM prevents PostgreSQL from *writing* data to disk during the benchmark
-- when only SELECT queries are executed.
-- For more details see https://wiki.postgresql.org/wiki/Hint_Bits

vacuum full verbose;

select before, after, after :: float / before as ratio
from (select pg_table_size('test_nocompress') as before,
pg_table_size('test_compress') as after) as temp;

```

File nocompress.pgbench:

```
\set maxid 3000000
\setrandom from_id 1 :maxid
select x -> 'time' from test_nocompress where id=:from_id;
```

File compress.pgbench:

```
\set maxid 3000000
\setrandom from_id 1 :maxid
select x -> 'time' from test_compress where id=:from_id;
```
On the PostgreSQL server:

```
# restart is required to clean shared buffers

sudo service postgresql restart
```

On the second server (same hardware, same rack):

```
pgbench -h 10.110.0.10 -f nocompress.pgbench -T 600 -P 1 -c 40 -j 12 zson_test
```

On the PostgreSQL server:

```
sudo service postgresql restart
```

On the second server:

```
pgbench -h 10.110.0.10 -f compress.pgbench -T 600 -P 1 -c 40 -j 12 zson_test
```

Benchmark results:

```
transaction type: nocompress.pgbench
scaling factor: 1
query mode: simple
number of clients: 40
number of threads: 12
duration: 600 s
number of transactions actually processed: 583183
latency average = 41.153 ms
latency stddev = 75.002 ms
tps = 971.802963 (including connections establishing)
tps = 971.810536 (excluding connections establishing)


transaction type: compress.pgbench
scaling factor: 1
query mode: simple
number of clients: 40
number of threads: 12
duration: 600 s
number of transactions actually processed: 651910
latency average = 36.814 ms
latency stddev = 52.750 ms
tps = 1086.386943 (including connections establishing)
tps = 1086.396431 (excluding connections establishing)
```

In this case ZSON gives about 11.8% more TPS.
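The figure follows directly from the numbers above:

```
651910 / 583183 ≈ 1.118      -- ~11.8% more transactions in the same 600 s
1086.39 / 971.80 ≈ 1.118     -- the same ratio in terms of TPS
```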
We can modify compress.pgbench and nocompress.pgbench so that only the
documents with id between 1 and 3000 are requested. This simulates the case
when all the data *does* fit into memory. In this case we see 141K TPS (JSONB)
vs 134K TPS (ZSON), i.e. ZSON is about 5% slower.

The compression ratio could be different depending on the documents, the
database schema, the number of rows, etc. But in general ZSON compression is
much better than the built-in PostgreSQL compression (PGLZ):

```
   before   |   after    |      ratio
------------+------------+------------------
 3961880576 | 1638834176 | 0.41365057440843
(1 row)

   before   |   after    |       ratio
------------+------------+-------------------
 8058904576 | 4916436992 | 0.610062688500061
(1 row)

   before    |   after    |       ratio
-------------+------------+-------------------
 14204420096 | 9832841216 | 0.692238130775149
```
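In terms of saved disk space, the same measurements read as:

```
step 1:  1 - 0.414 ≈ 59% saved   (3.96 GB -> 1.64 GB)
step 2:  1 - 0.610 ≈ 39% saved   (8.06 GB -> 4.92 GB)
step 3:  1 - 0.692 ≈ 31% saved   (14.20 GB -> 9.83 GB)
```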
Not only is disk space saved, but the data loaded into shared buffers is not
decompressed either. It means that memory is also saved, and more data can be
accessed without loading it from the disk.

--------------------------------------------------------------------------------
/docs/contributors-v1.2-draft.md:
--------------------------------------------------------------------------------
# Draft of contributors to v1.2

* René Leonhardt
* https://github.com/arssher
* https://github.com/zheka-13

--------------------------------------------------------------------------------
/expected/zson.out:
--------------------------------------------------------------------------------
CREATE EXTENSION zson;
SELECT zson_extract_strings('true');
 zson_extract_strings
----------------------
 {}
(1 row)

SELECT zson_extract_strings('"aaa"');
 zson_extract_strings
----------------------
 {aaa}
(1 row)

SELECT zson_extract_strings('["some", ["dummy", "array"], 456, false]');
 zson_extract_strings
----------------------
 {some,dummy,array}
(1 row)

SELECT zson_extract_strings('{"IP":"10.0.0.3","Roles":["master", "slave"]}');
       zson_extract_strings
----------------------------------
 {IP,Roles,10.0.0.3,master,slave}
(1 row)

CREATE TABLE nocompress(x jsonb);
INSERT INTO nocompress VALUES
('true'),
('123'),
('"aaa"'),
('["some", ["dummy", "array"], 456, false]'),
('
[
    {"ModifyIndex":1,"IP":"10.0.0.1","Roles":["master"]},
    {"ModifyIndex":2,"IP":"10.0.0.2","Roles":["slave"]},
    {"ModifyIndex":3,"IP":"10.0.0.3","Roles":["master", "slave"]}
]
');
SELECT zson_learn('{{"nocompress", "x"}}');
                                  zson_learn
-------------------------------------------------------------------------------
 Done! Run " select * from zson_dict where dict_id = 0; " to see a dictionary.
(1 row)

SELECT dict_id, word FROM zson_dict ORDER BY dict_id, word COLLATE "C";
 dict_id |    word
---------+-------------
       0 | IP
       0 | ModifyIndex
       0 | Roles
       0 | master
       0 | slave
(5 rows)

SELECT zson_learn('{{"nocompress", "x"}}', 10000, 1, 128, 1);
                                  zson_learn
-------------------------------------------------------------------------------
 Done! Run " select * from zson_dict where dict_id = 1; " to see a dictionary.
(1 row)

SELECT dict_id, word FROM zson_dict ORDER BY dict_id, word COLLATE "C";
 dict_id |    word
---------+-------------
       0 | IP
       0 | ModifyIndex
       0 | Roles
       0 | master
       0 | slave
       1 | 10.0.0.1
       1 | 10.0.0.2
       1 | 10.0.0.3
       1 | IP
       1 | ModifyIndex
       1 | Roles
       1 | aaa
       1 | array
       1 | dummy
       1 | master
       1 | slave
       1 | some
(17 rows)

SELECT '{"aaa": "ololo"}'::zson;
       zson
------------------
 {"aaa": "ololo"}
(1 row)

SELECT '{"aaa": "ololo"}'::zson -> 'aaa';
 ?column?
----------
 "ololo"
(1 row)

CREATE TABLE zson_test(x zson);
INSERT INTO zson_test VALUES('{"aaa":123}' :: jsonb);
SELECT dict_id, word FROM zson_dict ORDER BY dict_id, word COLLATE "C";
 dict_id |    word
---------+-------------
       0 | IP
       0 | ModifyIndex
       0 | Roles
       0 | master
       0 | slave
       1 | 10.0.0.1
       1 | 10.0.0.2
       1 | 10.0.0.3
       1 | IP
       1 | ModifyIndex
       1 | Roles
       1 | aaa
       1 | array
       1 | dummy
       1 | master
       1 | slave
       1 | some
(17 rows)

SELECT x :: jsonb FROM zson_test;
      x
--------------
 {"aaa": 123}
(1 row)

DROP TABLE zson_test;
DROP TABLE nocompress;
DROP EXTENSION zson;

--------------------------------------------------------------------------------
/img/zson-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/postgrespro/zson/e214c79711b9eb92786280756a549b9e4cb2a215/img/zson-logo.png
--------------------------------------------------------------------------------
/sql/zson.sql:
--------------------------------------------------------------------------------
CREATE EXTENSION zson;

SELECT zson_extract_strings('true');

SELECT zson_extract_strings('"aaa"');

SELECT zson_extract_strings('["some", ["dummy", "array"], 456, false]');

SELECT zson_extract_strings('{"IP":"10.0.0.3","Roles":["master", "slave"]}');

CREATE TABLE nocompress(x jsonb);

INSERT INTO nocompress VALUES
('true'),
('123'),
('"aaa"'),
('["some", ["dummy", "array"], 456, false]'),
('
[
    {"ModifyIndex":1,"IP":"10.0.0.1","Roles":["master"]},
    {"ModifyIndex":2,"IP":"10.0.0.2","Roles":["slave"]},
    {"ModifyIndex":3,"IP":"10.0.0.3","Roles":["master", "slave"]}
]
');

SELECT zson_learn('{{"nocompress", "x"}}');

SELECT dict_id, word FROM zson_dict ORDER BY dict_id, word COLLATE "C";

SELECT zson_learn('{{"nocompress", "x"}}', 10000, 1, 128, 1);

SELECT dict_id, word FROM zson_dict ORDER BY dict_id, word COLLATE "C";

SELECT '{"aaa": "ololo"}'::zson;

SELECT '{"aaa": "ololo"}'::zson -> 'aaa';

CREATE TABLE zson_test(x zson);

INSERT INTO zson_test VALUES('{"aaa":123}' :: jsonb);

SELECT dict_id, word FROM zson_dict ORDER BY dict_id, word COLLATE "C";

SELECT x :: jsonb FROM zson_test;

DROP TABLE zson_test;

DROP TABLE nocompress;

DROP EXTENSION zson;

--------------------------------------------------------------------------------
/zson--1.0--1.1.sql:
--------------------------------------------------------------------------------
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION zson" to load this file. \quit

SELECT pg_catalog.pg_extension_config_dump('zson_dict', '');
--------------------------------------------------------------------------------
/zson--1.1.sql:
--------------------------------------------------------------------------------
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION zson" to load this file. \quit

CREATE TYPE zson;

CREATE TABLE zson_dict (
    dict_id SERIAL NOT NULL,
    word_id INTEGER NOT NULL,
    word text NOT NULL,
    PRIMARY KEY(dict_id, word_id)
);

SELECT pg_catalog.pg_extension_config_dump('zson_dict', '');

-- Usage: select zson_learn('{{"table1", "col1"}, {"table2", "col2"}, ... }');
CREATE FUNCTION zson_learn(
    tables_and_columns text[][],
    max_examples int default 10000,
    min_length int default 2,
    max_length int default 128,
    min_count int default 2)
RETURNS text AS $$
DECLARE
    tabname text;
    colname text;
    query text := '';
    i int;
    next_dict_id int;
BEGIN
    IF cardinality(tables_and_columns) = 0 THEN
        RAISE NOTICE 'First argument should not be an empty array!';
        RETURN '';
    END IF;

    FOR i IN array_lower(tables_and_columns, 1) ..
            array_upper(tables_and_columns, 1)
    LOOP
        tabname := tables_and_columns[i][1];
        colname := tables_and_columns[i][2];

        IF (tabname IS NULL) OR (colname IS NULL) THEN
            RAISE NOTICE 'Invalid list of tables and columns!';
            RETURN '';
        ELSIF position('"' in tabname) <> 0 THEN
            RAISE NOTICE 'Invalid table name %', tabname;
            RETURN '';
        ELSIF position('"' in colname) <> 0 THEN
            RAISE NOTICE 'Invalid column name %', colname;
            RETURN '';
        ELSIF position('.' in tabname) <> 0 THEN
            tabname := quote_ident(split_part(tabname, '.', 1)) ||
                '.' || quote_ident(split_part(tabname, '.', 2));
        END IF;

        IF query <> '' THEN
            query := query || ' union all ';
        END IF;

        query := query || '( select unnest(zson_extract_strings(' ||
            quote_ident(colname) || ')) as t from ' || tabname || ' limit ' ||
            max_examples || ')';

    END LOOP;

    select coalesce(max(dict_id), -1) + 1 INTO next_dict_id from zson_dict;

    query := 'select t from (select t, count(*) as sum from ( ' ||
        query || ' ) as tt group by t) as s where length(t) >= ' ||
        min_length || ' and length(t) <= ' || max_length ||
        ' and sum >= ' || min_count || ' order by sum desc limit 65534';

    query := 'insert into zson_dict select ' || next_dict_id ||
        ' as dict_id, row_number() over () as word_id, t as word from ( ' ||
        query || ' ) as top_words';

    EXECUTE query;

    RETURN 'Done! Run " select * from zson_dict where dict_id = ' ||
        next_dict_id || '; " to see a dictionary.';
END;
$$ LANGUAGE plpgsql;

CREATE FUNCTION zson_extract_strings(x jsonb)
RETURNS text[] AS $$
DECLARE
    jtype text;
    jitem jsonb;
BEGIN
    jtype := jsonb_typeof(x);
    IF jtype = 'object' THEN
        RETURN array(select unnest(z) from (
            select array(select jsonb_object_keys(x)) as z
            union all (
                select zson_extract_strings(x -> k) as z from (
                    select jsonb_object_keys(x) as k
                ) as kk
            )
        ) as zz);
    ELSIF jtype = 'array' THEN
        RETURN ARRAY(select unnest(zson_extract_strings(t)) from
            (select jsonb_array_elements(x) as t) as tt);
    ELSIF jtype = 'string' THEN
        RETURN array[ x #>> array[] :: text[] ];
    ELSE -- 'number', 'boolean', 'null'
        RETURN array[] :: text[];
    END IF;
END;
$$ LANGUAGE plpgsql;

CREATE FUNCTION zson_in(cstring)
    RETURNS zson
    AS 'MODULE_PATHNAME'
    LANGUAGE C STRICT IMMUTABLE;

CREATE FUNCTION zson_out(zson)
    RETURNS cstring
    AS 'MODULE_PATHNAME'
    LANGUAGE C STRICT IMMUTABLE;

CREATE TYPE zson (
    INTERNALLENGTH = -1,
    INPUT = zson_in,
    OUTPUT = zson_out,
    STORAGE = extended -- try to compress
);

CREATE FUNCTION jsonb_to_zson(jsonb)
    RETURNS zson
    AS 'MODULE_PATHNAME'
    LANGUAGE C STRICT IMMUTABLE;

CREATE FUNCTION zson_to_jsonb(zson)
    RETURNS jsonb
    AS 'MODULE_PATHNAME'
    LANGUAGE C STRICT IMMUTABLE;

CREATE CAST (jsonb AS zson) WITH FUNCTION jsonb_to_zson(jsonb) AS ASSIGNMENT;
CREATE CAST (zson AS jsonb) WITH FUNCTION zson_to_jsonb(zson) AS IMPLICIT;

CREATE FUNCTION zson_info(zson)
    RETURNS cstring
    AS 'MODULE_PATHNAME'
    LANGUAGE C STRICT IMMUTABLE;

--CREATE FUNCTION debug_dump_jsonb(jsonb)
--    RETURNS cstring
--    AS 'MODULE_PATHNAME'
--    LANGUAGE C STRICT IMMUTABLE;
--------------------------------------------------------------------------------
/zson.c:
--------------------------------------------------------------------------------
/* The header names below were lost during extraction; this list is
 * reconstructed from what the code actually uses (SPI, jsonb, gettimeofday,
 * malloc/free, string and stdio routines). */
#include <postgres.h>
#include <fmgr.h>
#include <executor/spi.h>
#include <utils/builtins.h>
#include <utils/jsonb.h>
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// zson compression:
//
// VARHDRSZ
// zson_version [uint8]
// dict_version [uint32]
// decoded_size [uint32]
// hint [uint8 x PGLZ_HINT_SIZE]
// {
//     skip_bytes [uint8]
//     ... skip_bytes bytes ...
//     string_code [uint16], 0 = no_string
// } *

PG_MODULE_MAGIC;

PG_FUNCTION_INFO_V1(zson_in);
PG_FUNCTION_INFO_V1(zson_out);
PG_FUNCTION_INFO_V1(zson_to_jsonb);
PG_FUNCTION_INFO_V1(jsonb_to_zson);
PG_FUNCTION_INFO_V1(zson_info);
PG_FUNCTION_INFO_V1(debug_dump_jsonb);

/* In version 11 these macros have been changed */
#if PG_VERSION_NUM < 110000
#define PG_GETARG_JSONB_P(v) PG_GETARG_JSONB(v)
#define PG_RETURN_JSONB_P(v) PG_RETURN_JSONB(v)
#endif

#define ZSON_CURRENT_VERSION 0

#define ZSON_HEADER_SIZE (sizeof(uint8) + sizeof(uint32)*2)
#define ZSON_HEADER_VERSION(hdrp) (*(uint8*)hdrp)
#define ZSON_HEADER_DICT_VERSION(hdrp) \
    (*(uint32*)((uint8*)hdrp + sizeof(uint8)))
#define ZSON_HEADER_DECODED_SIZE(hdrp) \
    (*(uint32*)((uint8*)hdrp + sizeof(uint8) + sizeof(uint32)))

#define PGLZ_HINT_SIZE 32 // corner case: 0

#define DICT_MAX_WORDS (1 << 16)

typedef struct {
    uint16 code;
    bool check_next; // next word starts with the same nbytes bytes
    size_t nbytes;   // number of bytes (not letters) except \0
    char* word;
} Word;

typedef struct {
    int32 dict_id;
    uint32 nwords;
    Word words[DICT_MAX_WORDS];          // sorted by .word, word -> code
    uint16 code_to_word[DICT_MAX_WORDS]; // code -> word index
} Dict;

typedef struct DictListItem {
    Dict* pdict;
    union {
        time_t last_clean_sec; // for first list item
        time_t last_used_sec;  // for rest list items
    };
    struct DictListItem* next;
} DictListItem;

// update current dict_id every N seconds
#define DICT_ID_CACHE_TIME_SEC 60

// clean dictList every N seconds
#define DICT_LIST_CLEAN_INTERVAL_SEC 60

// keep dict items not used for at most N seconds
#define DICT_LIST_TTL_SEC 120

// reserved code
#define DICT_INVALID_CODE 0

static int32 cachedDictId = -1;
static time_t cachedDictIdLastUpdatedSec = 0;

static SPIPlanPtr savedPlanGetDictId = NULL;
static SPIPlanPtr savedPlanLoadDict = NULL;

static DictListItem dictList = { 0 };
static Dict*
dict_load(int32 dict_id)
{
    int row;
    Datum qvalues[] = { Int32GetDatum(dict_id) };
    Dict* pdict = calloc(1, sizeof(Dict));
    if(pdict == NULL)
        return NULL;

    pdict->dict_id = dict_id;

    SPI_connect();

    if(savedPlanLoadDict == NULL)
    {
        Oid argtypes[] = { INT4OID };
        savedPlanLoadDict = SPI_prepare(
            "select word_id, word from public.zson_dict where dict_id = $1 "
            "order by word",
            1, argtypes);
        if(savedPlanLoadDict == NULL)
            elog(ERROR, "Error preparing query");
        if(SPI_keepplan(savedPlanLoadDict))
            elog(ERROR, "Error keeping plan");
    }

    if(SPI_execute_plan(savedPlanLoadDict, qvalues, NULL,
            true, DICT_MAX_WORDS) < 0)
    {
        elog(ERROR, "Failed to load dictionary");
    }

    for(row = 0; row < SPI_processed; row++)
    {
        bool isnull;
        char* wordcopy;
        uint32 word_id = DatumGetInt32(
            SPI_getbinval(SPI_tuptable->vals[row], SPI_tuptable->tupdesc,
                1, &isnull)
        );
        char* word = DatumGetCString(DirectFunctionCall1(textout,
            SPI_getbinval(SPI_tuptable->vals[row], SPI_tuptable->tupdesc,
                2, &isnull)
        ));

        size_t wordlen = strlen(word);
        // in case user filled zson_dict manually
        if(wordlen < 2)
            continue;

        wordcopy = malloc(wordlen+1);
        if(wordcopy == NULL)
            elog(ERROR, "Failed to allocate memory");

        strcpy(wordcopy, word);
        pdict->words[pdict->nwords].code = (uint16)word_id;
        pdict->words[pdict->nwords].word = wordcopy;
        pdict->words[pdict->nwords].nbytes = wordlen;

        pdict->code_to_word[ (uint16)word_id ] = pdict->nwords;

        if((pdict->nwords > 0) &&
            ( pdict->words[pdict->nwords-1].nbytes <
                pdict->words[pdict->nwords].nbytes) )
        {
            pdict->words[pdict->nwords-1].check_next =
                (memcmp(pdict->words[pdict->nwords].word,
                    pdict->words[pdict->nwords-1].word,
                    pdict->words[pdict->nwords-1].nbytes
                ) == 0);
        }

        pdict->nwords++;
    }

    SPI_finish();

    return pdict;
}

static void
dict_free(Dict* pdict)
{
    uint32 i;

    for(i = 0; i < pdict->nwords; i++)
        free(pdict->words[i].word);

    free(pdict);
}

// binary search
static uint16
dict_find_match(const Dict* pdict,
    const uint8* buff, size_t buff_size, size_t* pnbytes)
{
    int res;
    int32 left = 0;
    int32 right = pdict->nwords-1;
    size_t best_nbytes = 0;
    uint16 best_code = DICT_INVALID_CODE;

    while(left <= right)
    {
        uint32 current = (left + right) / 2;
        size_t nbytes = pdict->words[current].nbytes;

        if(nbytes > buff_size)
        {
            /* current can be less or greater depending on the prefix */
            res = memcmp(pdict->words[current].word, buff, buff_size);

            /* if prefixes match, current is greater */
            if(res == 0)
                res = 1;
        } else
            res = memcmp(pdict->words[current].word, buff, nbytes);

        if(res == 0) // match
        {
            best_nbytes = nbytes;
            best_code = pdict->words[current].code;

            if((!pdict->words[current].check_next) || (nbytes == buff_size))
                break;

            // maybe there is a longer match
            left = current + 1;
        }
        else if(res < 0) // current is less
            left = current + 1;
        else // current is greater
            right = current - 1;
    }

    *pnbytes = best_nbytes;
    return best_code;
}
static Dict*
dict_get(int32 dict_id)
{
    DictListItem* dict_list_item = &dictList;
    struct timeval tv;

    gettimeofday(&tv, NULL);

    // clean cache if necessary

    if(tv.tv_sec - dictList.last_clean_sec > DICT_LIST_CLEAN_INTERVAL_SEC)
    {
        DictListItem* prev_dict_list_item = NULL;

        while(dict_list_item)
        {
            if(dict_list_item->pdict &&
                (tv.tv_sec - dict_list_item->last_used_sec >
                    DICT_LIST_TTL_SEC))
            {
                DictListItem* temp = dict_list_item->next;

                dict_free(dict_list_item->pdict);
                free(dict_list_item);

                prev_dict_list_item->next = temp;
                dict_list_item = temp;
            }
            else
            {
                prev_dict_list_item = dict_list_item;
                dict_list_item = dict_list_item->next;
            }
        }

        dict_list_item = &dictList;
    }

    // find an item

    while(dict_list_item)
    {
        if(dict_list_item->pdict &&
            (dict_list_item->pdict->dict_id == dict_id))
        {
            dict_list_item->last_used_sec = tv.tv_sec;
            return dict_list_item->pdict;
        }

        dict_list_item = dict_list_item->next;
    }

    // load dictionary and add it to the list

    dict_list_item = calloc(1, sizeof(DictListItem));
    if(!dict_list_item)
        return NULL;

    dict_list_item->pdict = dict_load(dict_id);
    if(dict_list_item->pdict == NULL)
    {
        free(dict_list_item);
        return NULL;
    }

    dict_list_item->last_used_sec = tv.tv_sec;
    dict_list_item->next = dictList.next;
    dictList.next = dict_list_item;

    return dict_list_item->pdict;
}

// < 0: zson_dict not initialized
// >= 0: current dict id
static int32
get_current_dict_id()
{
    int32 id;
    bool isnull;
    struct timeval tv;
    gettimeofday(&tv, NULL);

    if(cachedDictId >= 0 &&
        tv.tv_sec - cachedDictIdLastUpdatedSec < DICT_ID_CACHE_TIME_SEC)
        return cachedDictId;

    SPI_connect();

    if(savedPlanGetDictId == NULL)
    {
        savedPlanGetDictId = SPI_prepare(
            "select max(dict_id) from public.zson_dict;", 0, NULL);
        if (savedPlanGetDictId == NULL)
            elog(ERROR, "Error preparing query");
        if (SPI_keepplan(savedPlanGetDictId))
            elog(ERROR, "Error keeping plan");
    }

    if (SPI_execute_plan(savedPlanGetDictId, NULL, NULL, true, 1) < 0 ||
        SPI_processed != 1)
        elog(ERROR, "Failed to get current dict_id");

    id = DatumGetInt32(SPI_getbinval(SPI_tuptable->vals[0],
        SPI_tuptable->tupdesc, 1, &isnull));

    SPI_finish();

    id = isnull ? -1 : id;
    cachedDictId = id;
    cachedDictIdLastUpdatedSec = tv.tv_sec;
    return id;
}
// cstring -> zson
Datum
zson_in(PG_FUNCTION_ARGS)
{
    Datum string_datum = PG_GETARG_DATUM(0);
    Datum jsonb_datum = DirectFunctionCall1(jsonb_in, string_datum);
    Datum zson_datum = DirectFunctionCall1(jsonb_to_zson, jsonb_datum);
    bytea* zson_bytea = DatumGetByteaP(zson_datum);
    PG_RETURN_BYTEA_P(zson_bytea);
}

// zson -> cstring
Datum
zson_out(PG_FUNCTION_ARGS)
{
    bytea *zson_bytea = PG_GETARG_BYTEA_P(0);
    Datum zson_datum = PointerGetDatum(zson_bytea);
    Datum jsonb_datum = DirectFunctionCall1(zson_to_jsonb, zson_datum);
    Datum string_datum = DirectFunctionCall1(jsonb_out, jsonb_datum);
    PG_RETURN_CSTRING(DatumGetCString(string_datum));
}

inline static Size
zson_fastcompress_bound(Size size)
{
    return PGLZ_HINT_SIZE + (size / 2 + 1)*3;
}

static bool
zson_fastcompress(const Dict* pdict,
    const void* src_data_, Size src_size,
    void* encoded_data_, Size* pencoded_size)
{
    size_t nbytes;
    Size inoffset;
    Size outskipoffset = 0;
    Size outoffset = 1;
    uint8 skipbytes = 0;
    const uint8* src_data = src_data_;
    uint8* encoded_data = ((uint8*)encoded_data_) + PGLZ_HINT_SIZE;

    memset(encoded_data_, 0, PGLZ_HINT_SIZE);

    for(inoffset = 0; inoffset < src_size; )
    {
        uint16 code = dict_find_match(pdict, &(src_data[inoffset]),
            src_size - inoffset, &nbytes);

        if(code == DICT_INVALID_CODE)
        {
            skipbytes++;
            encoded_data[outoffset] = src_data[inoffset];
            outoffset++;
            inoffset++;

            if(skipbytes == 255)
            {
                encoded_data[outskipoffset] = skipbytes;
                encoded_data[outoffset++] = 0; // DICT_INVALID_CODE
                encoded_data[outoffset++] = 0;
                outskipoffset = outoffset++;
                skipbytes = 0;
            }
        }
        else
        {
            encoded_data[outskipoffset] = skipbytes;
            encoded_data[outoffset++] = code >> 8;
            encoded_data[outoffset++] = code & 0xFF;
            outskipoffset = outoffset++;
            skipbytes = 0;
            inoffset += nbytes;
        }
    }

    encoded_data[outskipoffset] = skipbytes;
    *pencoded_size = outoffset + PGLZ_HINT_SIZE;

    return true;
}

static bool
zson_fastdecompress(const Dict* pdict,
    const void* encoded_data_, Size encoded_size,
    void* decoded_data_, Size decoded_size)
{
    Size inoffset = 0;
    Size outoffset = 0;
    uint16 code, idx;
    uint8 skipbytes;
    const uint8* encoded_data = ((uint8*)encoded_data_) + PGLZ_HINT_SIZE;
    uint8* decoded_data = decoded_data_;
    encoded_size -= PGLZ_HINT_SIZE;

    for(inoffset = 0; inoffset < encoded_size; )
    {
        skipbytes = encoded_data[inoffset++];

        if(skipbytes > decoded_size - outoffset)
            return false;

        if(skipbytes > encoded_size - inoffset)
            return false;

        memcpy(
            &(decoded_data[outoffset]),
            &(encoded_data[inoffset]),
            skipbytes
        );

        outoffset += skipbytes;
        inoffset += skipbytes;

        if(encoded_size == inoffset && decoded_size == outoffset)
            break; /* end of input - it's OK */

        if(2 > encoded_size - inoffset)
            return false;

        code = (uint16)encoded_data[inoffset++];
        code = (code << 8) | (uint16)encoded_data[inoffset++];

        if(code != DICT_INVALID_CODE)
        {
            idx = pdict->code_to_word[code];

            if(pdict->words[idx].nbytes > decoded_size - outoffset)
                return false;

            memcpy(
                &(decoded_data[outoffset]),
                pdict->words[idx].word,
                pdict->words[idx].nbytes
            );

            outoffset += pdict->words[idx].nbytes;
        }
    }

    return true;
}
// jsonb -> zson
Datum
jsonb_to_zson(PG_FUNCTION_ARGS)
{
    Jsonb *jsonb = PG_GETARG_JSONB_P(0);
    uint8* jsonb_data = (uint8*)VARDATA(jsonb);
    Size jsonb_data_size = VARSIZE(jsonb) - VARHDRSZ;
    uint8 *encoded_buff, *encoded_header, *encoded_data;
    Size encoded_size;
    bool res;
    Dict* pdict;
    Size encoded_buff_size = VARHDRSZ + ZSON_HEADER_SIZE +
        zson_fastcompress_bound(jsonb_data_size);
    int32 dict_id = get_current_dict_id();

    if(dict_id < 0)
        ereport(ERROR,
            (
                errcode(ERRCODE_INTERNAL_ERROR),
                errmsg("Unable to compress jsonb"),
                errdetail("zson_dict is not initialized"),
                errhint("You probably forgot to execute zson_learn(). "
                    "In case you are restoring from a backup made via "
                    "pg_dump, just move the content of zson_dict table "
                    "above the current line.")
            ));

    pdict = dict_get(dict_id);
    if(pdict == NULL)
        elog(ERROR, "Unable to load dictionary");

    encoded_buff = palloc(encoded_buff_size);
    encoded_header = (uint8*)VARDATA(encoded_buff);
    encoded_data = encoded_header + ZSON_HEADER_SIZE;

    ZSON_HEADER_VERSION(encoded_header) = ZSON_CURRENT_VERSION;
    ZSON_HEADER_DICT_VERSION(encoded_header) = dict_id;
    ZSON_HEADER_DECODED_SIZE(encoded_header) = jsonb_data_size;

    encoded_size = encoded_buff_size - VARHDRSZ - ZSON_HEADER_SIZE;

    res = zson_fastcompress(pdict, jsonb_data, jsonb_data_size,
        encoded_data, &encoded_size);
    if(!res)
        ereport(ERROR,
            (
                errcode(ERRCODE_INTERNAL_ERROR),
                errmsg("Unable to compress jsonb"),
                errdetail("Procedure fastcompress() returned %d", res),
                errhint("You probably should report this to pgsql-bugs@")
            ));

    encoded_size += VARHDRSZ + ZSON_HEADER_SIZE;

    encoded_buff = repalloc(encoded_buff, encoded_size);
    SET_VARSIZE(encoded_buff, encoded_size);
    PG_RETURN_BYTEA_P(encoded_buff);
}

// zson -> jsonb
Datum
zson_to_jsonb(PG_FUNCTION_ARGS)
{
    bytea *encoded_buff = PG_GETARG_BYTEA_P(0);
    uint8* encoded_header = (uint8*)VARDATA(encoded_buff);
    uint8* encoded_data = encoded_header + ZSON_HEADER_SIZE;
    Size encoded_size = VARSIZE(encoded_buff) - VARHDRSZ - ZSON_HEADER_SIZE;
    int zson_version = ZSON_HEADER_VERSION(encoded_header);
    uint32 dict_id;      // cannot read until zson version is checked
    uint32 decoded_size; // cannot read until zson version is checked
    Jsonb* jsonb;
    uint8* jsonb_data;
    Dict* pdict;
    bool res;

    if(zson_version > ZSON_CURRENT_VERSION)
        ereport(ERROR,
            (
                errcode(ERRCODE_INTERNAL_ERROR),
                errmsg("Unsupported zson version"),
                errdetail("Saved zson version is %d, extension version is %d",
                    zson_version, ZSON_CURRENT_VERSION),
                errhint("You probably should upgrade zson extension "
                    "or report a bug to pgsql-bugs@")
            ));

    dict_id = ZSON_HEADER_DICT_VERSION(encoded_header);
    decoded_size = ZSON_HEADER_DECODED_SIZE(encoded_header);
    pdict = dict_get(dict_id);
    if(pdict == NULL)
        elog(ERROR, "Unable to load dictionary");

    jsonb = palloc(decoded_size + VARHDRSZ);
    jsonb_data = (uint8*)VARDATA(jsonb);

    res = zson_fastdecompress(pdict,
        encoded_data, encoded_size,
        jsonb_data, decoded_size);

    if(!res)
        ereport(ERROR,
            (
                errcode(ERRCODE_INTERNAL_ERROR),
                errmsg("Unable to uncompress zson"),
                errdetail("Procedure fastdecompress() returned %d", res),
                errhint("You probably should report this to pgsql-bugs@")
            ));

    decoded_size += VARHDRSZ;
    SET_VARSIZE(jsonb, decoded_size);
    PG_RETURN_JSONB_P(jsonb);
}
// zson -> "size, encoded size, other info"
Datum
zson_info(PG_FUNCTION_ARGS)
{
    bytea* zson = PG_GETARG_BYTEA_P(0);

    Size zson_size = VARSIZE(zson);
    uint8* zson_header = (uint8*)VARDATA(zson);
    uint32 zson_version = ZSON_HEADER_VERSION(zson_header);
    uint32 dict_version = ZSON_HEADER_DICT_VERSION(zson_header);
    uint32 decoded_size = ZSON_HEADER_DECODED_SIZE(zson_header) + VARHDRSZ;

    size_t buff_size = 1024;
    char* string_buff = palloc(buff_size);
    snprintf(string_buff, buff_size,
        "zson version = %u, dict version = %u, jsonb size = %u, "
        "zson size (without pglz compression) = %u (%.2f%%)",

        zson_version, dict_version, decoded_size,
        (uint32)zson_size, (float)zson_size*100/(float)decoded_size
    );

    PG_RETURN_CSTRING(string_buff);
}

/*
// jsonb -> "\xAA\xBB\xCC..." cstring
Datum
debug_dump_jsonb(PG_FUNCTION_ARGS)
{
    Jsonb *jsonb = PG_GETARG_JSONB(0);
    uint8* jsonb_data = (uint8*)VARDATA(jsonb);
    Size jsonb_data_size = VARSIZE(jsonb) - VARHDRSZ;
    Size string_buff_size = jsonb_data_size*4 + 1;
    char* string_buff = palloc(string_buff_size);
    int i;

    for(i = 0; i < jsonb_data_size; i++)
        sprintf(&string_buff[i*4], "\\x%02X", jsonb_data[i]);

    PG_RETURN_CSTRING((Datum)string_buff);
}
*/

--------------------------------------------------------------------------------
/zson.control:
--------------------------------------------------------------------------------
comment = 'ZSON: an extension for transparent JSONB compression'
default_version = '1.1'
module_pathname = '$libdir/zson'
relocatable = false

--------------------------------------------------------------------------------