├── META.json
├── Makefile
├── README.md
├── curlapi.c
├── curlapi.h
├── data
    ├── blk_-729487577044220672
    ├── customer_reviews_1998.1000.json.gz
    ├── data.json
    ├── data_broken.json
    └── invalid_gz_file.json.gz
├── expected
    └── .gitignore
├── gettickcount.c
├── gettickcount.h
├── input
    ├── basic_tests.source
    ├── customer_reviews.source
    ├── hdfs_block.source
    └── invalid_gz_file.source
├── json_fdw--1.0.sql
├── json_fdw.c
├── json_fdw.control
├── json_fdw.h
├── output
    ├── basic_tests.source
    ├── customer_reviews.source
    ├── hdfs_block.source
    └── invalid_gz_file.source
├── rciapi.c
├── rciapi.h
├── regexapi.c
├── regexapi.h
├── regexapi_helper.c
├── regexapi_helper.h
└── sql
    └── .gitignore


/META.json:
--------------------------------------------------------------------------------
 1 | {
 2 |    "name": "json_fdw",
 3 |    "abstract": "Foreign Data Wrapper for JSON files",
 4 |    "description": "PostgreSQL extension which implements a Foreign Data Wrapper (FDW) for JSON files.",
 5 |    "version": "1.3.0",
 6 |    "maintainer": "Hadi Moshayedi <hadi@citusdata.com>",
 7 |    "license": "gpl_3",
 8 |    "provides": {
 9 |       "json_fdw": {
10 |          "abstract": "Foreign Data Wrapper for JSON files",
11 |          "file": "json_fdw.c",
12 |          "docfile": "README.md",
13 |          "version": "1.3.0"
14 |       }
15 |    },
16 |    "prereqs": {
17 |       "runtime": {
18 |          "requires": {
19 |             "PostgreSQL": "9.2.0"
20 |          }
21 |       }
22 |    },
23 |    "resources": {
24 |       "bugtracker": {
25 |          "web": "http://github.com/citusdata/json_fdw/issues/"
26 |       },
27 |       "repository": {
28 |         "url":  "git://github.com/citusdata/json_fdw.git",
29 |         "web":  "https://github.com/citusdata/json_fdw/",
30 |         "type": "git"
31 |       }
32 |    },
33 |    "generated_by": "David E. Wheeler",
34 |    "meta-spec": {
35 |       "version": "1.0.0",
36 |       "url": "http://pgxn.org/meta/spec.txt"
37 |    },
38 |    "tags": [
39 |       "json",
40 |       "fdw",
41 |       "foreign data wrapper",
42 |       "json_fdw"
43 |    ]
44 | }
45 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # contrib/json_fdw/Makefile
 2 | 
 3 | MODULE_big = json_fdw
 4 | 
 5 | OBJS = json_fdw.o curlapi.o regexapi.o regexapi_helper.o gettickcount.o rciapi.o
 6 | 
 7 | ifneq ($(shell uname -s), Linux)
 8 |     # Linkers outside Linux (OS X in particular) reject the "-l:" syntax,
 9 |     # so name the library the ordinary way.
10 |     SHLIB_LINK = -lz -lyajl
11 | else
12 |     # Link yajl 2 by its exact file name, so it works in Ubuntu 12.04 too.
13 |     SHLIB_LINK = -lz -l:libyajl.so.2
14 | endif
15 | 
16 | EXTENSION = json_fdw
17 | DATA = json_fdw--1.0.sql
18 | 
19 | REGRESS = basic_tests customer_reviews hdfs_block invalid_gz_file
20 | 
21 | # "make clean" also removes the sql/<name>.sql and expected/<name>.out file
22 | # of every regression test named in REGRESS.
23 | EXTRA_CLEAN = $(foreach t,$(REGRESS),sql/$(t).sql expected/$(t).out)
24 | 
25 | #
26 | # Locate pg_config. Order of preference: a PG_CONFIG supplied by the user (make
27 | # command line or environment), the first pg_config on PATH, and only then a
28 | # filesystem search, e.g. make PG_CONFIG=/usr/local/pgsql/bin/pg_config
29 | #
30 | # The search result is cached in pg_config.loc; delete it to search again.
31 | OS := $(shell uname -s)
32 | PG_CONFIG ?= $(shell which pg_config 2>/dev/null)
33 | FIND_ROOTPATH := /
34 | ifeq ($(OS),Darwin)
35 | FIND_ROOTPATH := $(shell if [ -d "/Applications" ]; then echo "/Applications"; else echo "/"; fi)
36 | endif
37 | PG_CONFIG := $(or $(strip $(PG_CONFIG)),$(firstword $(shell [ -s pg_config.loc ] || find $(FIND_ROOTPATH) -name pg_config > pg_config.loc 2>/dev/null; cat pg_config.loc)))
38 | 
39 | # for locally built, uninstalled yajl libraries, do this
40 | YAJLDIR= ../yajl.git/build/yajl-2.1.1
41 | PG_CPPFLAGS+= -I$(YAJLDIR)/include
42 | SHLIB_LINK+= -L$(YAJLDIR)/lib
43 | # likewise for a locally built, uninstalled zlib
44 | ZLIBDIR= ../zlib-1.2.8
45 | PG_CPPFLAGS+= -I$(ZLIBDIR)
46 | SHLIB_LINK+= -L$(ZLIBDIR)
47 | 
48 | # for a locally built, uninstalled curl, do this
49 | CURLDIR= ../curl-7.40.0
50 | PG_CPPFLAGS+= -I$(CURLDIR)/include
51 | SHLIB_LINK+= -L$(CURLDIR)/lib/.libs -lcurl -lssl -lcrypto
52 | 
53 | # for the system version of curl, do this instead
54 | #CURL_CONFIG:= $(shell which curl-config)
55 | #PG_CPPFLAGS+= $(shell sh $(CURL_CONFIG) --cflags)
56 | #SHLIB_LINK+= $(shell sh $(CURL_CONFIG) --static-libs)
57 | # include the PGXS makefile whose path pg_config reports
58 | PGXS := $(shell $(PG_CONFIG) --pgxs)
59 | include $(PGXS)
60 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | json_fdw2
  2 | ========
  3 | 
  4 | **json_fdw2** is a fork of the [citusdata/json_fdw] PostgreSQL Foreign Data Wrapper (FDW) extension project, to query locally stored JSON files, and supports analytic queries against array types, nested fields, and
  5 | heterogeneous documents.
  6 | 
  7 | 
  8 | Project Goal
  9 | ---
 10 | 
 11 | The original project is only capable of **Select** operations. ie. read-only, and only from local JSON files. 
 12 | This fork's goals are:
 13 |  1. Add the ability to operate on remote JSON content via HTTP operations, in a RESTful style/manner.
 14 |  2. Add support for **Update**, **Insert** and **Delete** operations.
 15 | 
 16 | 
 17 | Progress
 18 | ---
 19 | 
 20 |  1. Done
 21 |  2. I have completed the work for **Update** and **Insert**, and believe them to both function correctly.
 22 | 
 23 | 
 24 | Todo
 25 | ---
 26 |  * Implement **Delete** operation support
 27 |  * Only execute remote ETAG re-validation after aging based on Cache-Control and / or Content-Expires headers.
 28 | 
 29 | 
 30 | Limitations
 31 | ---
 32 | 
 33 | * json\_fdw2 currently only works with PostgreSQL 9.4
 34 | 
 35 | * json\_fdw2 only supports files that consist of one JSON document per line. It
 36 |   doesn't support objects that span multiple lines.
 37 | 
 38 | * PostgreSQL limits column names to 63 characters by default. If you need column
 39 |   names that are longer, you can increase the NAMEDATALEN constant in
 40 |   src/include/pg\_config\_manual.h, compile, and reinstall.
 41 | 
 42 | 
 43 | Dependencies
 44 | ---
 45 | 
 46 |  * [nkhorman/yajl] You'll need to use the \`\`json_path'' branch. **Do not** use the yajl from http://github.com/lloyd/yajl, json\_fdw2 won't compile!
 47 |  * [libcurl-7.40.0] Only curl-7.40.0 has been tested.
 48 |  * zlib-1.2.8
 49 | 
 50 | 
 51 | Building
 52 | --------
 53 | 
 54 | The following build instructions are from the original project and are old: 
 55 | 
 56 | 
 57 |     ## Fedora 17+
 58 |     sudo yum install zlib-devel yajl-devel
 59 | 
 60 |     ## Ubuntu 12.10+
 61 |     sudo apt-get update
 62 |     sudo apt-get install zlib1g-dev libyajl-dev
 63 | 
 64 |     ## Other Linux Distributions
 65 |     (First install zlib-devel, cmake, and ruby)
 66 |     wget http://github.com/lloyd/yajl/tarball/2.0.1 -O yajl-2.0.1.tar.gz
 67 |     tar -xzvf yajl-2.0.1.tar.gz
 68 |     cd lloyd-yajl-f4b2b1a
 69 |     ./configure
 70 |     make
 71 |     sudo make install
 72 |     echo "/usr/local/lib" | sudo tee /etc/ld.so.conf.d/libyajl.conf
 73 |     sudo ldconfig
 74 | 
 75 | Once you have yajl and zlib installed on your machine, you are ready to build
 76 | json\_fdw2. For this, you need to include the pg\_config directory path in your
 77 | make command. This path is typically the same as your PostgreSQL installation's
 78 | bin/ directory path. For example:
 79 | 
 80 |     PATH=/usr/local/pgsql/bin/:$PATH make
 81 |     sudo PATH=/usr/local/pgsql/bin/:$PATH make install
 82 | 
 83 | **Note**: In RedHat 5.X and CentOS 5.X you may need to edit the Makefile and change "-l:libyajl.so.2" to "-lyajl".
 84 | 
 85 | 
 86 | Usage
 87 | -----
 88 | 
 89 | The following parameters can be set on a JSON foreign table object;
 90 | 
 91 |  * \`\`filename'': The absolute path of a json file or a gzipped json file.
 92 |  * \`\`max\_error\_count'': Maximum number of invalid json documents to skip before
 93 |    erroring out. Defaults to 0.
 94 | 
 95 | As an example, we demonstrate querying a compressed JSON file from scratch here. Note
 96 | that the underlying file contains JSON documents separated by newlines.
 97 | Start with downloading the file.
 98 | 
 99 |     wget http://examples.citusdata.com/customer_reviews_nested_1998.json.gz
100 | 
101 | Next, log into Postgres, and run the following commands to create a
102 | foreign table associated with this JSON file.
103 | 
104 |     -- load extension first time after install
105 |     CREATE EXTENSION json_fdw;
106 | 
107 |     -- create server object
108 |     CREATE SERVER json_server FOREIGN DATA WRAPPER json_fdw;
109 | 
110 |     -- create foreign table
111 |     CREATE FOREIGN TABLE customer_reviews
112 |     (
113 |         customer_id TEXT,
114 |         "review.date" DATE,
115 |         "review.rating" INTEGER,
116 |         "product.id" CHAR(10),
117 |         "product.group" TEXT,
118 |         "product.title" TEXT,
119 |         "product.similar_ids" CHAR(10)[]
120 |     )
121 |     SERVER json_server
122 |     OPTIONS (filename '/home/citusdata/customer_reviews_nested_1998.json.gz');
123 | 
124 |     -- optionally, collect data distribution statistics
125 |     ANALYZE customer_reviews;
126 | 
127 | Finally, let's run some example SQL queries on your JSON file.
128 | 
129 |     -- find all reviews a particular customer made on the Dune series in 1998
130 | 
131 |     SELECT
132 |         customer_id, "review.rating", "product.id", "product.title"
133 |     FROM
134 |         customer_reviews
135 |     WHERE
136 |         customer_id ='A27T7HVDXA3K2A' AND
137 |         "product.title" LIKE '%Dune%' AND
138 |         "review.date" >= '1998-01-01' AND
139 |         "review.date" <= '1998-12-31';
140 | 
141 |     -- do we have a correlation between a book's title's length and its review ratings?
142 | 
143 |     SELECT
144 |         width_bucket(length("product.title"), 1, 50, 5) title_length_bucket,
145 |         round(avg("review.rating"), 2) AS review_average,
146 |         count(*)
147 |     FROM
148 |         customer_reviews
149 |     WHERE
150 |         "product.group" = 'Book'
151 |     GROUP BY
152 |         title_length_bucket
153 |     ORDER BY
154 |         title_length_bucket;
155 | 
156 | 
157 | Fetching Remote Files
158 | ---------------------
159 | For remote fetch operations, the \`\`filename'' parameter is now overloaded as
160 | any valid HTTP URL, and an additional parameter has been introduced;
161 | 
162 | * \`\`http\_post\_vars'': A list of key value pairs separated by the \`\`&''
163 | symbol that are sent in a post operation.
164 | 
165 | Using key value pairs in the filename URL and in the http\_post\_vars option is
166 | not mutually exclusive; however, a given key value pair should only exist in
167 | one or the other.
168 | 
169 | The following example shows how to fetch remote files, that are then cached locally.
170 | Local caching of the remote content is done, and validated using Entity Tags (ETAG header) upon every query of the table content.
171 | 
172 | **Note**: the existing handling of Gzip files is still supported because, after the
173 | file is fetched, it is handed off to the existing file handling code, as if
174 | it were previously staged on disk.
175 | 
176 | Based on how libcurl is built the following are supported, but untested;
177 | 
178 |  * Both Content Encoding and Transport Encoding
179 |  * Https
180 | 
181 | Fictitious usage example, using a standard Get operation;
182 | 
183 |     -- create foreign table - using optional get parameters
184 |     CREATE FOREIGN TABLE an_example_table
185 |     (
186 |         fieldName1 TEXT,
187 |         fieldName2 INTEGER,
188 |         . .,
189 |         . .,
190 |         . .
191 |     )
192 |     SERVER json_server
193 |     OPTIONS (filename 'http://www.example.com/file/location/url/some.json.gz?optional=paramaters&separated=traditionally');
194 | 
195 | 
196 | Fictitious usage example, using a Post operation;
197 | 
198 |     -- create foreign table - using optional post and get parameters
199 |     CREATE FOREIGN TABLE another_example_table
200 |     (
201 |         fieldName1 TEXT,
202 |         fieldName2 INTEGER,
203 |         . .,
204 |         . .,
205 |         . .
206 |     )
207 |     SERVER json_server
208 |     OPTIONS (filename 'http://www.example.com/file/location/url/someother.json', http_post_vars 'another=parameter_set&separated=traditionally');
209 | 
210 | 
211 | Refining the original table example so that the "wget" operation and the query
212 | operation are performed in a single step, create the table as below.
213 | 
214 |     -- create foreign table
215 |     CREATE FOREIGN TABLE customer_reviews
216 |     (
217 |         customer_id TEXT,
218 |         "review.date" DATE,
219 |         "review.rating" INTEGER,
220 |         "product.id" CHAR(10),
221 |         "product.group" TEXT,
222 |         "product.title" TEXT,
223 |         "product.similar_ids" CHAR(10)[]
224 |     )
225 |     SERVER json_server
226 |     OPTIONS (filename 'http://examples.citusdata.com/customer_reviews_nested_1998.json.gz');
227 | 
228 | 
229 | The additional table options \`\`rom_url'' and \`\`rom_path'' are required for operations
230 | other than **Select**. Use of these two options is mutually exclusive with the \`\`filename'' and
231 | \`\`http_post_vars'' table options.
232 | 
233 | Rather than add additional table options for differing operations, i.e. Select, Insert, etc.,
234 | which would necessitate table destruction and re-creation to change, a more flexible approach was
235 | taken by using a json object to describe the operational characteristics. The location of
236 | the json object is specified by the \`\`rom_url'' option.
237 | 
238 | The \`\`rom_path'' option is used to specify which operation set to use, i.e. the name of
239 | the table to be operated on.
240 | 
241 | An example ROM (Remote Operations Mapping) json object follows;
242 | 
243 |     {
244 |     	"romschema": "2",
245 |     	"host": "",
246 |     	"url": "/some/uri/path",
247 |     
248 |     	"rom_path_1":
249 |     	{
250 |     		"url": "/",
251 |     		"select":{
252 |     			"method": "get",
253 |     			"url": "/",
254 |     			"query": [ {"name":"mode", "value":"multi-doc"}, {"name":"t", "value":3} ]
255 |     		},
256 |     		"insert":{
257 |     			"method": "put",
258 |     			"url": "/",
259 |     			"query": [ {"name":"t", "value":4} ]
260 |     			},
261 |     		"update":{
262 |     			"method": "put",
263 |     			"url": "/",
264 |     			"query": [ {"name":"mode", "value":"multi-doc"}, {"name":"t", "value":3} ]
265 |     		}
266 |     	},
267 | 	"rom_path_other":
268 | 	{
269 | 		"select":{
270 | 			"method":"get",
271 | 			"query": [ {"name":"other", "value":"foo"} ]
272 | 		}
273 | 	}
274 |     }
275 | 
276 | The "romschema" value of 2 is fixed, used as the only schema validation of the ROM.
277 | 
278 | The "url" string elements specified inside a given rom_path and or rom_path operation, are
279 | optional, and if specified as "/", will be ignored, however, if present, will be used to
280 | create the effective url. Each of the "query" arrayed object elements are concatenated with
281 | the effective url as request key value pairs. So for example, given the following table options;
282 | 
283 |     (rom_url 'http://www.example.com/object/rom.json', rom_path 'rom_path_1')
284 | 
285 | and an SQL Select operation with the rom_url pointing to the example rom above, the following
286 | url will be used;
287 | 
288 |     http://www.example.com/some/uri/path/?mode=multi-doc&t=3
289 | 
290 | as the fetch url for content to be retrieved, as if it had been used in the \`\`filename''
291 | table option.
292 | 
293 | The "host" string element at the root of the ROM is used to prepend the "url" string element.
294 | If specified as;
295 | 
296 |     http://api.example.com:8080
297 | 
298 | 
299 | Then an SQL Select operation would use the following url;
300 | 
301 |     http://api.example.com:8080/some/uri/path/?mode=multi-doc&t=3
302 | 
303 | **Note:** Only http based operations are supported for ROM actions. Also, presently, "get"
304 | is the only method supported for Select operations, and only "put" is supported for
305 | Insert, and Update operations.
306 | 
307 | 
308 | 
309 | Table Schema Conventions
310 | ------------------------
311 | 
312 | There are three things worth noting about table schemas. First, nested fields
313 | in JSON documents are referenced using dot separators. For example, a field defined
314 | as "review": { "rating" : 5 } in a JSON document is declared as "review.rating"
315 | in the foreign table schema. The quotes around "review.rating" are necessary, as
316 | identifiers that include dots aren't valid in Postgres otherwise.
317 | 
318 | Second, the foreign table schema is defined at read-time. If you have an additional
319 | field that you'd like to query, such as "review.votes", you can simply add the
320 | column name and start querying for data. You can even create multiple table schemas
321 | for the same underlying JSON, and query through them.
322 | 
323 | Third, json\_fdw2 assumes that underlying data can be heterogeneous. If you are
324 | querying for a column, and this field doesn't exist in a document, or the field's
325 | data type doesn't match the declared column type, json\_fdw2 considers that particular
326 | field to be null.
327 | 
328 | 
329 | Querying Multiple Sources
330 | -----------------------
331 | 
332 | json\_fdw2 borrows its semantics from file\_fdw, and associates one foreign table
333 | with one JSON source. If you'd like to query all your JSON sources from one table,
334 | you could use PostgreSQL's basic table partitioning feature, and manually create
335 | one child table per JSON file.
336 | 
337 | 
338 | Copyright
339 | ---------
340 | 
341 | Portions Copyright (c) 2015 Neal Horman
342 | 
343 | Portions Copyright (c) 2013 Citus Data, Inc.
344 | 
345 | This module is free software; you can redistribute it and/or modify it under the
346 | GNU GPL v3.0 License.
347 | 
348 | 
349 | 
350 | [citusdata/json_fdw]: <https://github.com/citusdata/json_fdw>
351 | [nkhorman/yajl]: <https://github.com/nkhorman/yajl>
352 | [libcurl-7.40.0]: <http://curl.haxx.se/libcurl>
353 | 


--------------------------------------------------------------------------------
/curlapi.c:
--------------------------------------------------------------------------------
  1 | /*--------------------------------------------------------------------*
  2 |  *
  3 |  * Developed by;
  4 |  *	Neal Horman - http://www.wanlink.com
  5 |  *	Copyright (c) 2015 Neal Horman. All Rights Reserved
  6 |  *
  7 |  *	This "source code" is free software: you can redistribute it and/or modify
  8 |  *	it under the terms of the GNU General Public License as published by
  9 |  *	the Free Software Foundation, either version 3 of the License, or
 10 |  *	(at your option) any later version.
 11 |  *
 12 |  *	This "source code" is distributed in the hope that it will be useful,
 13 |  *	but WITHOUT ANY WARRANTY; without even the implied warranty of
 14 |  *	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 15 |  *	GNU General Public License for more details.
 16 |  *
 17 |  *	You should have received a copy of the GNU General Public License
 18 |  *	along with this "source code".  If not, see <http://www.gnu.org/licenses/>.
 19 |  *
 20 |  *	RCSID:  $Id$
 21 |  *
 22 |  *--------------------------------------------------------------------*/
 23 | 
 24 | #include <stdio.h>
 25 | #include <stdlib.h>
 26 | #include <unistd.h>
 27 | #include <string.h>
 28 | #include <fcntl.h>
 29 | #include <stdarg.h>
 30 | 
 31 | #include <sys/types.h> // for struct dirent
 32 | #include <sys/dir.h> // for struct dirent
 33 | #include <sys/stat.h> // for mkdir
 34 | #include <openssl/md5.h> // for MD5_xxx foo
 35 | #include <pthread.h> // for pthread_self()
 36 | 
 37 | #include "curl/curl.h"
 38 | #include "curlapi.h"
 39 | #include "regexapi.h"
 40 | #include "regexapi_helper.h"
 41 | #include "gettickcount.h"
 42 | 
 43 | // Where files are downloaded to
 44 | #define CURL_BASE_DIR "/tmp/json_fdw_cache"
 45 | // Maximum length of on disk temporary file names
 46 | #define MAXFILENAME 1024
 47 | 
 48 | #define FREEPTR(a) do { if((a) != NULL) { free((a)); (a) = NULL; }; } while(0)
 49 | 
 50 | #ifdef DEBUG_WLOGIT
 51 | void (*gcurlLogFn)(const char *) = NULL;	// log sink called by curlLogIt(); NULL means nothing is logged
 52 | // Install pfn as the log sink; passing NULL turns logging off again
 53 | void curlLogItSet(void (*pfn)(const char *))
 54 | {
 55 | 	gcurlLogFn = pfn;
 56 | }
 57 | // Format a printf-style message and hand it to the installed log sink; does nothing when no sink is set
 58 | static void curlLogIt(const char *pFmt, ...)
 59 | {
 60 | 	if(gcurlLogFn != NULL)
 61 | 	{	va_list vl;
 62 | 		char *pStr = NULL;
 63 | 		int len;
 64 | 		va_start(vl, pFmt);
 65 | 		len = vasprintf(&pStr, pFmt, vl);
 66 | 		va_end(vl);
 67 | 		// vasprintf may leave pStr undefined when it fails, so go by its return value
 68 | 		if(len >= 0 && pStr != NULL)
 69 | 		{
 70 | 			gcurlLogFn(pStr);
 71 | 			free(pStr);
 72 | 		}
 73 | 	}
 74 | }
 75 | #endif
 76 | 
 77 | static const char *hexDigits = "0123456789ABCDEF";	// nibble value -> uppercase hex character
 78 | 
 79 | // An MD5 object: hash state plus the finished digest in raw and text form
 80 | typedef struct _cmd5_t
 81 | {
 82 | 	MD5_CTX ctx;	// running hash state, started by curlMd5Init()
 83 | 	unsigned char digest[MD5_DIGEST_LENGTH];	// raw digest, filled in by curlMd5Final()
 84 | 	char ascii[(MD5_DIGEST_LENGTH*2)+1];	// digest as hex text: two characters per byte plus a NUL
 85 | }cmd5_t; // Curl MD5 Type
 86 | 
 87 | // Allocate a zero-filled MD5 object and start its hash context
 88 | static cmd5_t *curlMd5Init(void)
 89 | {	cmd5_t *pNew = calloc(1, sizeof(*pNew));
 90 | 
 91 | 	if(pNew == NULL)
 92 | 		return NULL;	// out of memory
 93 | 	MD5_Init(&pNew->ctx);
 94 | 	return pNew;
 95 | }
 96 | 
 97 | // Release an MD5 object obtained from curlMd5Init(); NULL is accepted
 98 | static void curlMd5Free(cmd5_t *pMd5)
 99 | {
100 | 	free(pMd5);	// free(NULL) is a no-op
101 | }
102 | // Feed the characters of the NUL-terminated string pStr into the running hash
103 | static void curlMd5Hash(cmd5_t *pMd5, const char *pStr)
104 | {	size_t nBytes = strlen(pStr);	// the terminating NUL is not hashed
105 | 	MD5_Update(&pMd5->ctx, (const unsigned char *)pStr, nBytes);
106 | }
107 | 
108 | // Finish the hash, render the digest as uppercase hex text in pMd5->ascii,
109 | // and return a strdup()'d copy of that text, which the caller must free
110 | static char *curlMd5Final(cmd5_t *pMd5)
111 | {	char *pOut = pMd5->ascii;
112 | 	int n;
113 | 	MD5_Final(pMd5->digest, &pMd5->ctx);
114 | 
115 | 	// Two hex characters per digest byte, high nibble first
116 | 	for (n = 0; n < MD5_DIGEST_LENGTH; n++)
117 | 	{
118 | 		*pOut++ = hexDigits[pMd5->digest[n] >> 4];
119 | 		*pOut++ = hexDigits[pMd5->digest[n] & 0x0f];
120 | 	}
121 | 
122 | 	return strdup(pMd5->ascii);
123 | }
124 | 
125 | // CURL write callback: append the received bytes to the temp file; returns the bytes written (CURL aborts on a short count)
126 | static size_t curlWriteCallback(void *contents, size_t size, size_t nmemb, void *userp)
127 | {	ccf_t *pCcf = (ccf_t *)userp;
128 | 	// without an open file report 0 bytes rather than hand fwrite() a NULL stream
129 | 	return (pCcf != NULL && pCcf->pFile != NULL ? size * fwrite(contents, size, nmemb, pCcf->pFile) : 0);
130 | }
131 | 
132 | // If the header line pSrc (srcLen bytes, not necessarily NUL-terminated) starts with pHdr, compared without regard to case,
133 | // return a malloc'd copy of the rest with blanks and CR/LF trimmed off both ends, else NULL. The caller must free the result
134 | static char *curlHeaderCallbackMatch(const char *pSrc, size_t srcLen, const char *pHdr)
135 | {	char *pDst = NULL;
136 | 	size_t hdrLen = strlen(pHdr);
137 | 
138 | 	// only a line longer than the header name can carry a value
139 | 	if(srcLen > hdrLen && strncasecmp(pSrc, pHdr, hdrLen) == 0)
140 | 	{	const char *pl = pSrc + hdrLen;
141 | 		const char *pr = pSrc + srcLen;	// one past the last byte of the line
142 | 
143 | 		// left trim, never reading beyond the end of the line
144 | 		while(pl < pr && (*pl == ' ' || *pl == '\t'))
145 | 			pl++;
146 | 		// right trim blanks and the line terminator
147 | 		while(pr > pl && (pr[-1] == ' ' || pr[-1] == '\t' || pr[-1] == '\n' || pr[-1] == '\r'))
148 | 			pr--;
149 | 
150 | 		if(pr > pl)
151 | 		{	int l = (int)(pr - pl);
152 | 			// asprintf may leave pDst undefined when it fails
153 | 			if(asprintf(&pDst, "%.*s", l, pl) < 0)
154 | 				pDst = NULL;
155 | 		}
156 | 	}
157 | 	return pDst;
158 | }
159 | 
160 | // CURL header callback: receives one response header line per call and
161 | // keeps a private copy of the value of every header we are interested in
162 | static size_t curlHeaderCallback(void *contents, size_t size, size_t nmemb, void *userp)
163 | {	cfr_t *pCfr = (cfr_t *)userp;
164 | 	size_t nBytes = size * nmemb;
165 | 
166 | 	if(pCfr != NULL)
167 | 	{	ccf_t *pCcf = &pCfr->ccf;
168 | 		size_t n;
169 | 		// Header name to look for, and the pCcf->pHdrs[] slot its value goes in
170 | 		struct hdra_t
171 | 		{
172 | 			const char *str;
173 | 			size_t idx;
174 | 		} wanted[] =
175 | 		{
176 | 			// Order in this array doesn't matter,
177 | 			// but the number of elements in this
178 | 			// array must be no more than HDR_COUNT
179 | 			{HDR_STR_ETAG, HDR_IDX_ETAG},
180 | 			{HDR_STR_LASTMODIFIED, HDR_IDX_LASTMODIFIED},
181 | 			{HDR_STR_CACHECONTROL, HDR_IDX_CACHECONTROL}
182 | 		};
183 | 
184 | 		// Try each wanted header in turn and stop at the first one that matches
185 | 		for(n = 0; n < sizeof(wanted)/sizeof(wanted[0]); n++)
186 | 		{
187 | 			// on a match the helper hands back an allocated copy of the value
188 | 			char *pVal = curlHeaderCallbackMatch((const char *)contents, nBytes, wanted[n].str);
189 | 
190 | 			if(pVal == NULL)
191 | 				continue;
192 | 
193 | 			// replace whatever value was stored for this header before
194 | 			FREEPTR(pCcf->pHdrs[wanted[n].idx]);
195 | 			pCcf->pHdrs[wanted[n].idx] = pVal;
196 | 			break;
197 | 		}
198 | 	}
199 | 
200 | 	return nBytes;	// always report the full line length
201 | }
202 | 
203 | // Create a temporary file possibly to write into,
204 | // if we receive content from the fetch operation
205 | // Also, figure out what filename we should use for
206 | // content caching purposes (stored in pCcf->pFileName).
207 | static void curlCacheFileOpen(ccf_t *pCcf)
208 | {	int fd = -1;
209 | 	char tmpfnamebuf[MAXFILENAME];
210 | 
211 | 	// make sure we can store our files; the result is ignored (the directory may already exist)
212 | 	mkdir(CURL_BASE_DIR, 0755);
213 | 
214 | 	// create a temporary file, for possible use later
215 | 	memset(tmpfnamebuf, 0, sizeof(tmpfnamebuf));
216 | 	snprintf(tmpfnamebuf, sizeof(tmpfnamebuf), "%s/tmpXXXXXXXXXX", CURL_BASE_DIR);
217 | 
218 | 	if((fd = mkstemp(tmpfnamebuf)) != -1)
219 | 	{
220 | 		pCcf->pFileNameTmp = strdup(tmpfnamebuf);
221 | 		pCcf->bNeedUnlink = true;
222 | 
223 | 		// Get a FILE pointer; if that fails, close the descriptor instead of leaking it
224 | 		if((pCcf->pFile = fdopen(fd, "w")) == NULL)
225 | 			close(fd);
226 | 	}
227 | 	// Both branches below store a fresh name, so release any earlier one first
228 | 	FREEPTR(pCcf->pFileName);
229 | 	if(pCcf->pUrlBaseName == NULL || !*pCcf->pUrlBaseName)
230 | 	{
231 | 		FREEPTR(pCcf->pUrlBaseName);
232 | 		// The URL didn't specify a file, use the urlhash as the filename
233 | 		if(asprintf(&pCcf->pFileName, "%s/%s", CURL_BASE_DIR, pCcf->pUrlHash) < 0)
234 | 			pCcf->pFileName = NULL;	// asprintf may leave the pointer undefined on failure
235 | 	}
236 | 	// Otherwise use the basename from the URL, so that file handling semantics based on filenames work
237 | 	else if(asprintf(&pCcf->pFileName, "%s/%s", CURL_BASE_DIR, pCcf->pUrlBaseName) < 0)
238 | 		pCcf->pFileName = NULL;
239 | }
240 | 
241 | // Test if pUrl is a CURL supported URL, i.e. whether regexapi_url() matches it
242 | // If so, grab the basename, for use later (strdup'd into pCcf->pUrlBaseName)
243 | static bool curlIsUrl(const char *pUrl, ccf_t *pCcf)
244 | {	bool bIsUrl = false;
245 | 	regexapi_t *pRat = regexapi_url(pUrl);
246 | 
247 | 	// If we found a regex match, then we assume that CURL supports the url
248 | 	if(pRat != NULL)
249 | 	{	int regexNSubs = regexapi_nsubs(pRat, 0);
250 | 		// Assume that the last subcomponent of the regex is the filename portion
251 | 		const char *pRegexSub = (regexNSubs > 1 ? regexapi_sub(pRat, 0, regexNSubs - 1) : NULL);
252 | 		// and get the basename of that, starting at its last '/'
253 | 		char *pBaseName = (pRegexSub != NULL ? strrchr(pRegexSub, '/') : NULL);
254 | 		// NOTE(review): the last '/' is located before the query string is cut off below, so a '/' inside the query would be taken as the start of the basename - confirm
255 | 		bIsUrl = (pBaseName != NULL && *pBaseName);
256 | 		if(bIsUrl)
257 | 		{	char *pTerm = strchr(pBaseName, '?');
258 | 
259 | 			// The string returned to us is not const, so we'll terminate it
260 | 			// at the URI point, so as to not have silly basenames
261 | 			if(pTerm != NULL)
262 | 				*pTerm = 0;
263 | 
264 | 			// no basename, just a plain url ? (step over the '/' that strrchr() found)
265 | 			if(*pBaseName == '/')
266 | 				pBaseName++;
267 | 			// only a non-empty basename is kept; NOTE(review): an earlier pUrlBaseName would be overwritten without being freed
268 | 			if(*pBaseName)
269 | 				pCcf->pUrlBaseName = strdup(pBaseName);
270 | 		}
271 | 
272 | 		// Cleanup the regex
273 | 		regexapi_free(pRat);
274 | 	}
275 | 
276 | 	return bIsUrl;
277 | }
278 | 
279 | // Returns a url character encoded string (NULL for a NULL src or when out of memory)
280 | // The caller must free() the result
281 | static char *curlEncodeUrlCharacters(const char *src)
282 | {	// worst case every byte becomes a three character %XX escape, plus the terminating NUL
283 | 	char *dst = (src != NULL ? calloc(1, strlen(src)*3 + 1) : NULL);
284 | 	char *str = dst;
285 | 	if(dst != NULL)
286 | 	{	int eq = 0; // set once the `=' separating key from value (ie val=data) has been copied; any later `=' is encoded
287 | 
288 | 		while(*src)
289 | 		{
290 | 			// http://en.wikipedia.org/wiki/Percent-encoding#Character_data plus a few more
291 | 			if((eq != 0 && *src == '=') || strchr("\"%-.<>\\^_`{|}~[],:#@?;\r\n", *src))
292 | 			{	unsigned char c = (unsigned char)*src;
293 | 
294 | 				// eq is deliberately left alone: escaping a byte must not re-arm the separator
295 | 
296 | 				*(dst++) = '%';
297 | 				*(dst++) = hexDigits[c >> 4];
298 | 				*(dst++) = hexDigits[c & 0x0f];
299 | 			}
300 | 			else if(*src == ' ')
301 | 				*(dst++) = '+';
302 | 			else
303 | 			{
304 | 				if(*src == '&' || *src == '=')
305 | 					eq = (*src == '=');	// `=' opens the value, `&' starts the next pair
306 | 				*(dst++) = *src;
307 | 			}
308 | 			src++;
309 | 		}
310 | 	}
311 | 
312 | 	return str;
313 | }
314 | // Flush and close the cache file stream of pCfr, if one is open
315 | static void curlCfrClose(cfr_t *pCfr)
316 | {	FILE *pStream;
317 | 
318 | 	if(pCfr == NULL || pCfr->ccf.pFile == NULL)
319 | 		return;	// nothing open
320 | 
321 | 	pStream = pCfr->ccf.pFile;
322 | 	fflush(pStream);
323 | 	fclose(pStream);
324 | 	// forget the stream so that it is not closed a second time
325 | 	pCfr->ccf.pFile = NULL;
326 | }
327 | 
328 | // Close the cache file, remove the temp file while it is still flagged
329 | // for removal, then free every owned string and the structure itself
330 | void curlCfrFree(cfr_t *pCfr)
331 | {	ccf_t *pCcf;
332 | 	int n;
333 | 
334 | 	if(pCfr == NULL)
335 | 		return;
336 | 
337 | 	pCcf = &pCfr->ccf;
338 | 	curlCfrClose(pCfr);
339 | 
340 | 	// the temp file is removed only while bNeedUnlink is still set
341 | 	if(pCcf->pFileNameTmp != NULL && pCcf->bNeedUnlink)
342 | 		unlink(pCcf->pFileNameTmp);
343 | 
344 | 	FREEPTR(pCcf->pUrlBaseName);
345 | 	FREEPTR(pCcf->pFileName);
346 | 	FREEPTR(pCcf->pUrlHash);
347 | 	FREEPTR(pCcf->pFileNameTmp);
348 | 	for(n = 0; n < HDR_COUNT; n++)
349 | 		FREEPTR(pCcf->pHdrs[n]);
350 | 
351 | 	FREEPTR(pCfr->pContentType);
352 | 
353 | 	free(pCfr);
354 | }
355 | 
356 | // Build an md5 hash for the URL being requested (and its post vars, when given)
357 | // The caller must free the result; NULL is returned when out of memory
358 | static char *curlUrlHash(const char *pUrl, const char *pHttpPostVars)
359 | {	cmd5_t * pMd5 = curlMd5Init();
360 | 	char *pUrlHash = NULL;
361 | 	if(pMd5 != NULL)	// the md5 helpers dereference the object, so skip them if the allocation failed
362 | 	{	curlMd5Hash(pMd5, pUrl);
363 | 		if(pHttpPostVars != NULL)
364 | 			curlMd5Hash(pMd5, pHttpPostVars);
365 | 		pUrlHash = curlMd5Final(pMd5);
366 | 		curlMd5Free(pMd5);
367 | 	}
368 | 	return pUrlHash;
369 | }
370 | 
371 | /*
372 | static ccf_t *curlCacheMetaSet(const char *pFileName
373 | 	, const char *pEtag
374 | 	, const char *pLastModified
375 | 	, const char *pCacheControl
376 | 	)
377 | {	ccf_t *pCcf = calloc(1, sizeof(ccf_t));
378 | 
379 | 	if(pCcf != NULL)
380 | 	{
381 | 		pCcf->pFileName = strdup(pFileName);
382 | 		pCcf->pHdrs[HDR_IDX_ETAG] = strdup(pEtag);
383 | 		pCcf->pHdrs[HDR_IDX_LASTMODIFIED] = strdup(pLastModified);
384 | 		pCcf->pHdrs[HDR_IDX_CACHECONTROL] = strdup(pCacheControl);
385 | 	}
386 | 
387 | 	return pCcf;
388 | }
389 | */
390 | 
391 | // Sort of like strtok, but more convenient.
392 | // Scribbles in the source.
393 | // Returns a pointer to the beginning of the 'delim'ited string,
394 | // white space trimmed on the left (NULL when there is no source string).
395 | // Also advances the source pointer past the delimiter, which is
396 | // overwritten with a NUL, and past any white space that follows it.
397 | static char *stradvtok(char **ppSrc, char delim)
398 | {
399 | 	char *dst = (ppSrc != NULL ? *ppSrc : NULL);
400 | 	char *src = dst;
401 | 
402 | 	if(src == NULL)	// nothing to scan, and *ppSrc is left untouched
403 | 		return NULL;
404 | 	while(*src && *src != delim)
405 | 	{	// dst follows src only through the leading white space
406 | 		if(dst == src && (*src == ' ' || *src == '\t' || *src == '\r' || *src == '\n'))
407 | 			dst++;
408 | 		src++;
409 | 	}
410 | 	if(*src != '\0')	// stopped on the delimiter rather than at the end of the string
411 | 	{	// cut the token here, then skip the white space in front of the next one
412 | 		*src = '\0';
413 | 		src++;
414 | 		while(*src == ' ' || *src == '\t' || *src == '\r' || *src == '\n')
415 | 			src++;
416 | 	}
417 | 	*ppSrc = src;
418 | 
419 | 	return dst;
420 | }
421 | 
422 | // Load the cached file name and header values for pCcf->pUrlHash from its ".meta" file, when that file exists
423 | static void curlCacheMetaGet(ccf_t *pCcf)
424 | {	char *pFname = NULL;
425 | 
426 | 	// asprintf may leave pFname undefined when it fails, so go by its return value
427 | 	if(asprintf(&pFname, "%s/%s.meta", CURL_BASE_DIR, pCcf->pUrlHash) >= 0 && pFname != NULL)
428 | 	{	FILE *fin = fopen(pFname, "r");
429 | 
430 | 		if(fin != NULL)
431 | 		{	char buf[4096];
432 | 			char *p1, *p2, *p3, *p4;
433 | 			char *pbuf;
434 | 
435 | 			memset(buf, 0, sizeof(buf));
436 | 			pbuf = fgets(buf, sizeof(buf)-1, fin);
437 | 
438 | 			if(pbuf != NULL)
439 | 			{	// one record: filename|etag|last-modified|cache-control|
440 | 				p1 = stradvtok(&pbuf, '|');
441 | 				p2 = stradvtok(&pbuf, '|');
442 | 				p3 = stradvtok(&pbuf, '|');
443 | 				p4 = stradvtok(&pbuf, '|');
444 | 				// release values that are already set, so that loading does not leak them
445 | 				FREEPTR(pCcf->pFileName);
446 | 				FREEPTR(pCcf->pHdrs[HDR_IDX_ETAG]);
447 | 				FREEPTR(pCcf->pHdrs[HDR_IDX_LASTMODIFIED]);
448 | 				FREEPTR(pCcf->pHdrs[HDR_IDX_CACHECONTROL]);
449 | 				pCcf->pFileName = strdup(p1);
450 | 				pCcf->pHdrs[HDR_IDX_ETAG] = strdup(p2);
451 | 				pCcf->pHdrs[HDR_IDX_LASTMODIFIED] = strdup(p3);
452 | 				pCcf->pHdrs[HDR_IDX_CACHECONTROL] = strdup(p4);
453 | 			}
454 | 			fclose(fin);
455 | 		}
456 | 
457 | 		free(pFname);
458 | 	}
459 | }
460 | 
461 | #define NOTNULLPTR(a) ((a) != NULL ? (a) : "")
462 | 
463 | static void curlCacheMetaPut(ccf_t *pCcf)
464 | {	char *pFname = NULL;
465 | 
466 | 	asprintf(&pFname, "%s/%s.meta", CURL_BASE_DIR, pCcf->pUrlHash);
467 | 
468 | 	// TODO - lock operation to prevent contention races
469 | 	if(pFname != NULL)
470 | 	{	FILE *fout = fopen(pFname, "w");
471 | 
472 | 		if(fout != NULL)
473 | 		{
474 | 			fprintf(fout,"%s|%s|%s|%s|"
475 | 				, NOTNULLPTR(pCcf->pFileName)
476 | 				, NOTNULLPTR(pCcf->pHdrs[HDR_IDX_ETAG])
477 | 				, NOTNULLPTR(pCcf->pHdrs[HDR_IDX_LASTMODIFIED])
478 | 				, NOTNULLPTR(pCcf->pHdrs[HDR_IDX_CACHECONTROL])
479 | 				);
480 | 			fclose(fout);
481 | 		}
482 | 		free(pFname);
483 | 	}
484 | }
485 | 
486 | // Move the temp file to the cached file ?
487 | static void curlCacheFileFinalize(cfr_t *pCfr)
488 | {
489 | 	if(pCfr != NULL)
490 | 	{
491 | 		// TODO;
492 | 		// 	1. set unlink flag based on cache-control
493 | 		switch(pCfr->httpResponseCode)
494 | 		{
495 | 			case 200: // new content, remove old, use new
496 | 				// TODO - lock operation to prevent contention races
497 | 				unlink(pCfr->ccf.pFileName);
498 | 				rename(pCfr->ccf.pFileNameTmp, pCfr->ccf.pFileName);
499 | 				pCfr->ccf.bNeedUnlink = false;
500 | 				break;
501 | 			case 304:	// no new content, remove temp file
502 | 			default:
503 | 				unlink(pCfr->ccf.pFileNameTmp);
504 | 				break;
505 | 		}
506 | 	}
507 | }
508 | 
509 | static CURL *curlCoreInit(const char *pUrl, void *pHeaderFn, void *pHeaderData)
510 | {	CURL *curl_handle = NULL;
511 | 
512 | 	curl_global_init(CURL_GLOBAL_ALL);
513 | 	curl_handle = curl_easy_init();
514 | 	curl_easy_setopt(curl_handle, CURLOPT_URL, pUrl);
515 | 
516 | 	curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, "json_fdw/1.2 (+http://github.com/nkhorman/json_fdw) libcurl-agent/1.0");
517 | 	curl_easy_setopt(curl_handle, CURLOPT_TIMEOUT, 30); // TODO - table option ?
518 | 
519 | 	curl_easy_setopt(curl_handle, CURLOPT_ACCEPT_ENCODING, ""); // turn on builtin supported default content dencoding
520 | 	//curl_easy_setopt(curl_handle, CURLOPT_TRANSFER_ENCODING, 1L); // turn on transfer decoding
521 | 
522 | 	curl_easy_setopt(curl_handle, CURLOPT_FOLLOWLOCATION, 1L); // turn on redirection following
523 | 	curl_easy_setopt(curl_handle, CURLOPT_MAXREDIRS, 5); // for a maximum of 5
524 | 	curl_easy_setopt(curl_handle, CURLOPT_POSTREDIR, CURL_REDIR_POST_ALL); // maintain a post as a post on redirects
525 | 	curl_easy_setopt(curl_handle, CURLOPT_AUTOREFERER, 1L); // turn on Refer when redirecting
526 | 
527 | 	if(pHeaderFn != NULL)
528 | 	{
529 | 		curl_easy_setopt(curl_handle, CURLOPT_HEADERFUNCTION, pHeaderFn);
530 | 		curl_easy_setopt(curl_handle, CURLOPT_HEADERDATA, pHeaderData);
531 | 	}
532 | 
533 | 	return curl_handle;
534 | }
535 | 
536 | static CURL *curlCoreInitGetOrPost(const char *pUrl, void *pWriteFn, void *pWriteData, void *pHeaderFn, void *pHeaderData, const char *pPostStr)
537 | {	CURL *curl_handle = curlCoreInit(pUrl, pHeaderFn, pHeaderData);
538 | 
539 | 	if(pWriteFn != NULL)
540 | 	{
541 | 		curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, pWriteFn);
542 | 		curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, pWriteData);
543 | 	}
544 | 
545 | 	if(pPostStr != NULL && *pPostStr)
546 | 	{
547 | 		curl_easy_setopt(curl_handle, CURLOPT_POST, 1L);
548 | 		curl_easy_setopt(curl_handle, CURLOPT_POSTFIELDS, pPostStr);
549 | 		curl_easy_setopt(curl_handle, CURLOPT_POSTFIELDSIZE, strlen(pPostStr));
550 | 	}
551 | 
552 | 	return curl_handle;
553 | }
554 | 
typedef struct _cprfc_t
{
	const char *buffer; // data to send
	size_t len; // size to send
	size_t index; // current index into buffer where the next send operations should start from
}cprfc_t; // Curl Put Read Fn Callback Type

// Read callback; hands the next piece of the cprfc_t buffer to the caller.
//	buffer - destination, with room for size * nmemb bytes
//	instream - the cprfc_t describing the data and how much of it was sent
// Returns the number of bytes copied. 0 means that there is nothing (more)
// to send, which is also the answer for a missing cprfc_t, a missing data
// buffer, or an index that is already at or past len.
static size_t curlPutReadFnCallback(char *buffer, size_t size, size_t nmemb, void *instream)
{	cprfc_t *pCprfc = (cprfc_t *)instream;
	size_t curl_size = nmemb * size;
	size_t left_to_copy;
	size_t to_copy;

	// index >= len is checked before subtracting, len - index would wrap around otherwise
	if(pCprfc == NULL || pCprfc->buffer == NULL || buffer == NULL || pCprfc->index >= pCprfc->len)
		return 0;

	left_to_copy = pCprfc->len - pCprfc->index;
	to_copy = (left_to_copy < curl_size) ? left_to_copy : curl_size;

	memcpy(buffer, &pCprfc->buffer[pCprfc->index], to_copy);
	pCprfc->index += to_copy;

	return to_copy;
}
573 | 
// Header callback that throws the response headers away.
// Returns size * nmemb, that is, it claims to have handled everything it was
// given. The count is kept in a size_t; going through an int could change
// the value that is returned.
static size_t curlPutHeaderFnCallback(void *buffer, size_t size, size_t nmemb, void *userp)
{	size_t curl_size = nmemb * size;

	(void)buffer;
	(void)userp;

	//printf("%s:%d header '%.*s'\n", __func__, __LINE__, (int)curl_size, (const char *)buffer);

	return curl_size;
}
581 | 
// Write callback that throws the response body away.
// Returns size * nmemb, that is, it claims to have handled everything it was
// given. The count is kept in a size_t; going through an int could change
// the value that is returned.
static size_t curlPutWriteFnCallback(void *buffer, size_t size, size_t nmemb, void *userp)
{	size_t curl_size = nmemb * size;

	(void)buffer;
	(void)userp;

	//printf("%s:%d '%.*s'\n", __func__, __LINE__, (int)curl_size, (const char *)buffer);

	return curl_size;
}
589 | 
590 | static CURL *curlCoreInitPut(const char *pUrl, void *pReadFn, void *pReadData, void *pHeaderFn, void *pHeaderData, size_t size)
591 | {	CURL *curl_handle = curlCoreInit(pUrl, pHeaderFn, pHeaderData);
592 | 
593 | 	if(pReadFn != NULL)
594 | 	{
595 | 		curl_easy_setopt(curl_handle, CURLOPT_READFUNCTION, pReadFn);
596 | 		curl_easy_setopt(curl_handle, CURLOPT_READDATA, pReadData);
597 | 	}
598 | 
599 | 	curl_easy_setopt(curl_handle, CURLOPT_PUT, 1L);
600 | 	curl_easy_setopt(curl_handle, CURLOPT_UPLOAD, 1L);
601 | 	curl_easy_setopt(curl_handle, CURLOPT_INFILESIZE_LARGE, (curl_off_t)size);
602 | 
603 | 	// don't let output go to stdout
604 | 	curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, &curlPutWriteFnCallback);
605 | 	curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, NULL);
606 | 
607 | 	curl_easy_setopt(curl_handle, CURLOPT_HEADERFUNCTION, &curlPutHeaderFnCallback);
608 | 	curl_easy_setopt(curl_handle, CURLOPT_HEADERDATA, NULL);
609 | 
610 | 	return curl_handle;
611 | }
612 | 
613 | /*
614 | void curlCoreInitAuth(CURL *curl_handle)
615 | {
616 | 	// TODO - auth foo - possibly some or all of these
617 | 	//	CURLOPT_USERPWD or (CURLOPT_USERNAME and CURLOPT_PASSWORD)
618 | 	//	CURLOPT_LOGIN_OPTIONS
619 | 	//	CURLOPT_PROXYUSERNAME and CURLOPT_PROXYPASSWORD
620 | 	//	CURLOPT_HTTPAUTH
621 | 	//	CURLOPT_TLSAUTH_USERNAME and CURLOPT_TLSAUTH_PASSWORD
622 | 	//	CURLOPT_PROXYAUTH
623 | 	//	CURLOPT_SASL_IR
624 | 	//	CURLOPT_XOAUTH2_BEARER
625 | 	//
626 | }
627 | */
628 | 
629 | // Add a header to the header list
630 | static struct curl_slist *curlCoreInitHeader(CURL *curl_handle, struct curl_slist *pChunk, const char *pName, const char *pValue)
631 | {	char *pHdr = NULL;
632 | 
633 | 	asprintf(&pHdr, "%s: %s", pName, pValue);
634 | 	if(pHdr != NULL)
635 | 	{
636 | 		pChunk = curl_slist_append(pChunk, pHdr);
637 | 		curl_easy_setopt(curl_handle, CURLOPT_HTTPHEADER, pChunk);
638 | 		free(pHdr);
639 | 	}
640 | 
641 | 	return pChunk;
642 | }
643 | 
644 | // Fetch the file from the url
645 | cfr_t *curlFetchFile(const char *pUrl, const char *pHttpPostVars)
646 | {	cfr_t *pCfr = calloc(1,sizeof(cfr_t));
647 | 
648 | 	if(!curlIsUrl(pUrl, &pCfr->ccf))
649 | 		FREEPTR(pCfr);
650 | 
651 | 	if(pCfr != NULL)
652 | 	{	struct curl_slist *chunk = NULL;
653 | 		CURLcode res;
654 | 		char *pPostStr = curlEncodeUrlCharacters(pHttpPostVars);
655 | 		CURL *curl_handle = curlCoreInitGetOrPost(pUrl, curlWriteCallback, (void *)&pCfr->ccf, curlHeaderCallback, (void *)&pCfr->ccf, pPostStr);
656 | 		unsigned long queryStart = 0;
657 | 
658 | 		pCfr->ccf.pUrlHash = curlUrlHash(pUrl, pHttpPostVars);
659 | 		curlCacheMetaGet(&pCfr->ccf);
660 | 		curlCacheFileOpen(&pCfr->ccf);
661 | 
662 | 		// inject etag header request ?
663 | 		// TODO;
664 | 		//	1. don't if the actual file is missing, so that we get a new one
665 | 		// 	2. don't if stale acording to cache-control
666 | 		if(pCfr->ccf.pHdrs[HDR_IDX_ETAG] != NULL)
667 | 			chunk = curlCoreInitHeader(curl_handle, chunk, "If-None-Match", pCfr->ccf.pHdrs[HDR_IDX_ETAG]);
668 | 
669 | 		// the file should already be open, get it
670 | 		queryStart = GetTickCount();
671 | 		res = curl_easy_perform(curl_handle);
672 | 		pCfr->queryDuration = GetTickCount() - queryStart; // how long did the fetch take ?
673 | 
674 | 		// clean up post data
675 | 		FREEPTR(pPostStr);
676 | 
677 | 		// close the open file
678 | 		curlCfrClose(pCfr);
679 | 
680 | 		// this means that we communicated with the server
681 | 		if(res == CURLE_OK)
682 | 		{	char *pContentType = NULL;
683 | 
684 | 			curl_easy_getinfo(curl_handle, CURLINFO_RESPONSE_CODE, &pCfr->httpResponseCode);
685 | 			curl_easy_getinfo(curl_handle, CURLINFO_CONTENT_TYPE, &pContentType);
686 | 
687 | 			if(pContentType != NULL)
688 | 				pCfr->pContentType = strdup(pContentType);
689 | 
690 | 			curlCacheMetaPut(&pCfr->ccf);
691 | 
692 | 			switch(pCfr->httpResponseCode)
693 | 			{
694 | 				case 200:
695 | 					pCfr->bFileFetched = true;
696 | #ifndef JSON_CONTENT_TYPE_NONE
697 | 					// make sure it's the correct content type
698 | 					pCfr->bFileFetched &= (
699 | #ifdef JSON_CONTENT_TYPE_NULL
700 | 						// Highly non-conforming server/application
701 | 						pContentType == NULL ||
702 | #endif
703 | #ifdef JSON_CONTENT_TYPE_LIBERAL
704 | 						// If your using a badly configured/coded/non-conforming server
705 | 						// application, you might get one or more of these mime types
706 | 						(pContentType != NULL && strcasecmp("application/x-javascript", pContentType) == 0) ||
707 | 						(pContentType != NULL && strcasecmp("text/javascript", pContentType) == 0) ||
708 | 						(pContentType != NULL && strcasecmp("text/x-javascript", pContentType) == 0) ||
709 | 						(pContentType != NULL && strcasecmp("text/x-json", pContentType) == 0) ||
710 | 						(pContentType != NULL && strcasecmp("text/html", pContentType) == 0) ||
711 | #endif
712 | 						// The content might be a straight up gzip compressed file
713 | 						(pContentType != NULL && strcasecmp("application/x-gzip", pContentType) == 0) ||
714 | 						// If it is uncompressed, it should look like this
715 | 						(pContentType != NULL && strcasecmp("application/json", pContentType) == 0)
716 | 						);
717 | #endif
718 | 					break;
719 | 				case 304:
720 | 					// we lie here, because we already have the file
721 | 					pCfr->bFileFetched = true;
722 | 					break;
723 | 				default:
724 | 					break;
725 | 			}
726 | 
727 | 			curlCacheFileFinalize(pCfr);
728 | 		}
729 | 
730 | 		// all done, cleanup
731 | 		curl_easy_cleanup(curl_handle);
732 | 		curl_global_cleanup();
733 | 		curl_slist_free_all(chunk);
734 | 	}
735 | 
736 | 	return pCfr;
737 | }
738 | 
739 | // Put
740 | int curlPut(const char *pUrl, const char *pBuffer, size_t bufferSize, const char *pContentType)
741 | {	int ok = 0;
742 | 
743 | 	CURLcode res;
744 | 	cprfc_t cprfc = { pBuffer, bufferSize, 0 };
745 | 	CURL *curl_handle = curlCoreInitPut(pUrl, &curlPutReadFnCallback, &cprfc, NULL, NULL, cprfc.len);
746 | 	struct curl_slist *chunk = curlCoreInitHeader(curl_handle, NULL, "Content-Type", pContentType);
747 | 
748 | 	res = curl_easy_perform(curl_handle);
749 | 
750 | 	// this means that we communicated with the server
751 | 	if(res == CURLE_OK)
752 | 	{	unsigned long httpResponseCode = 0;
753 | 
754 | 		curl_easy_getinfo(curl_handle, CURLINFO_RESPONSE_CODE, &httpResponseCode);
755 | 		ok = (httpResponseCode == 200);
756 | 	}
757 | 
758 | 	// all done, cleanup
759 | 	curl_easy_cleanup(curl_handle);
760 | 	curl_global_cleanup();
761 | 	curl_slist_free_all(chunk);
762 | 
763 | 	return ok;
764 | }
765 | 
766 | #ifdef _CURL_UNIT_TEST
// Non-zero turns on the unit test's extra output
int debug = 0;

// Print p, followed by a newline, on stdout - but only when 'debug' is on.
void logit(const char *p)
{
	if(!debug)
		return;

	printf("%s\n",p);
}
774 | 
775 | void test1(int argc, char **argv)
776 | {
777 | 	cfr_t *pCfr = NULL;
778 | 	const char *pUrl = NULL;
779 | 	const char *pHttpPostVars = NULL;
780 | 	const char *pFileName = NULL;
781 | 	int i = 0;
782 | 
783 | 	pUrl = (argc >= i ? argv[i] : NULL);
784 | 	i++;
785 | 	pHttpPostVars = (argc >= i ? argv[i] : NULL);
786 | 
787 | 	pCfr = curlFetchFile(pUrl, pHttpPostVars);
788 | 	if(pCfr != NULL)
789 | 		pFileName = pCfr->ccf.pFileName;
790 | 
791 | 	printf("'%s' --> '%s' == %s\n", pUrl, pFileName, (pCfr && pCfr->bFileFetched ? "OK" : "FAIL"));
792 | 	if(pCfr && pCfr->bFileFetched)
793 | 	{	char *pCmd = NULL;
794 | 
795 | 		printf("HTTP response code %lu\n", pCfr->httpResponseCode);
796 | 		printf("%s duration %lums\n", (pCfr->httpResponseCode == 200 ? "Fetch" : "Query"), pCfr->queryDuration);
797 | 		if(pCfr->pContentType != NULL && strcasecmp("application/json", pCfr->pContentType) == 0)
798 | 		{
799 | 			if(debug)
800 | 				asprintf(&pCmd, "ls -la %s/; cat %s", CURL_BASE_DIR, pFileName);
801 | 			else
802 | 				asprintf(&pCmd, "cat %s", pFileName);
803 | 			system(pCmd);
804 | 			free(pCmd);
805 | 		}
806 | 
807 | 		asprintf(&pCmd, "ls -la %s/", CURL_BASE_DIR);
808 | 		system(pCmd);
809 | 		free(pCmd);
810 | 	}
811 | 
812 | 	curlCfrFree(pCfr);
813 | }
814 | 
// Unit test 2; PUT a string to a url as application/json, and show if that
// worked.
// argv[0] is the "-2" switch itself, argv[1] is the url, and argv[2] is the
// string to send.
void test2(int argc, char **argv)
{
	const char *pUrl = (argc > 1 ? argv[1] : NULL);
	const char *pBuffer = (argc > 2 ? argv[2] : NULL);
	int ok = 0;

	// strlen() below needs an actual string
	if(pUrl == NULL || pBuffer == NULL)
	{
		printf("test 2 needs a url and the data to put\n");
		return;
	}

	ok = curlPut(pUrl, pBuffer, strlen(pBuffer), "application/json");

	printf("'%s' --> '%s' == %s\n", pUrl, pBuffer, (ok ? "OK" : "FAIL"));
}
830 | 
// Unit test 3; run a fixed sample of post vars through
// curlEncodeUrlCharacters(), and show the sample next to the result.
// The arguments are not used.
void test3(int argc, char **argv)
{
	static const char sample[] = "a=1&b=2&json={\"query\":[[3,0,0]]}&d=4";
	char *pEncoded;

	pEncoded = curlEncodeUrlCharacters(sample);
	printf("in '%s' out '%s'\n", sample, pEncoded);
	free(pEncoded);
}
840 | 
841 | int main(int argc, char **argv)
842 | {
843 | 	int i = 1;
844 | 	int c;
845 | 
846 | #ifdef DEBUG_WLOGIT
847 | 	curlLogItSet(&logit);
848 | #endif
849 | 
850 | 	if(argc == 1)
851 | 	{
852 | 		printf("%s: [-d] [-1 [url] [optional post vars]]\n", argv[0]);
853 | 		exit(0);
854 | 	}
855 | 
856 | 	while(i < argc)
857 | 	{
858 | 		if(argv[i][0] == '-')
859 | 		{
860 | 			switch(argv[i][1])
861 | 			{
862 | 				case 'd': debug = 1; i++; break;
863 | 				case '1': test1(argc-i, argv+i); i += 2; break;
864 | 				case '2': test2(argc-i, argv+i); i += 2; break;
865 | 				case '3': test3(argc-i, argv+i); i++; break;
866 | 				default: i++; printf("unknown cli arg '%s'\n", argv[i]); break;
867 | 			}
868 | 		}
869 | 		else
870 | 			i++;
871 | 	}
872 | 
873 | 	return 0;
874 | }
875 | #endif
876 | 


--------------------------------------------------------------------------------
/curlapi.h:
--------------------------------------------------------------------------------
 1 | /*--------------------------------------------------------------------*
 2 |  *
 3 |  * Developed by;
 4 |  *	Neal Horman - http://www.wanlink.com
 5 |  *	Copyright (c) 2015 Neal Horman. All Rights Reserved
 6 |  *
 7 |  *	This "source code" is free software: you can redistribute it and/or modify
 8 |  *	it under the terms of the GNU General Public License as published by
 9 |  *	the Free Software Foundation, either version 3 of the License, or
10 |  *	(at your option) any later version.
11 |  *
12 |  *	This "source code" is distributed in the hope that it will be useful,
13 |  *	but WITHOUT ANY WARRANTY; without even the implied warranty of
14 |  *	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 |  *	GNU General Public License for more details.
16 |  *
17 |  *	You should have received a copy of the GNU General Public License
18 |  *	along with this "source code".  If not, see <http://www.gnu.org/licenses/>.
19 |  *
20 |  *	RCSID:  $Id$
21 |  *
22 |  *--------------------------------------------------------------------*/
23 | 
24 | #ifndef _CURLAPI_H_
25 | #define _CURLAPI_H_
26 | 
27 | #include <stdio.h>
28 | #include <stdlib.h>
29 | #include <unistd.h>
30 | #include <stdbool.h>
31 | 
// Names of the response headers that are kept with a cached file, each with
// its trailing ": " separator.
// NOTE(review): presumably these are matched against the raw header lines by
// the header callback in curlapi.c - confirm there.
#define HDR_STR_ETAG "ETag: "
#define HDR_STR_LASTMODIFIED "Last-Modified: "
#define HDR_STR_CACHECONTROL "Cache-Control: "

// Indexes into ccf_t.pHdrs[], one for each of the headers above.
// HDR_IDX_xx values must be zero relative, and consecutive
enum
{
HDR_IDX_ETAG,
HDR_IDX_LASTMODIFIED,
HDR_IDX_CACHECONTROL,

HDR_COUNT // must always be last
};
45 | 
// What is known about one cached url; its file names, and the response
// headers that were kept for it.
typedef struct _ccf_t
{
	char *pUrlBaseName; // NOTE(review): presumably the base name taken from the url - confirm in curlapi.c
	char *pFileName; // the cached file; also the first field of the .meta record
	char *pUrlHash; // hash over the url and the post vars; names the .meta record
	char *pFileNameTmp; // temp file for a fetch; renamed to pFileName after a 200 response, removed otherwise
	FILE* pFile; // NOTE(review): presumably the open file that a fetch writes to - confirm in curlapi.c
	bool bNeedUnlink; // set to false once a 200 response has been finalized; NOTE(review): confirm who acts on it
	char *pHdrs[HDR_COUNT]; // header values, indexed by HDR_IDX_xx; NULL when not known; kept in the .meta record
}ccf_t; // CurlCacheFile_Type
56 | 
// The result of a curlFetchFile() call
typedef struct _cfr_t
{
	ccf_t ccf; // the cache entry that the fetch worked on
	bool bFileFetched; // true when there is a usable file; a 200 with an accepted content type, or a 304
	unsigned long httpResponseCode; // as reported by curl; stays 0 when the transfer itself failed
	char *pContentType; // copy of the response content type; NULL when there was none
	unsigned long queryDuration; // how long the transfer took, in GetTickCount() ticks (milliseconds)
} cfr_t; // "CurlFetchResult_Type"
65 | 
// Fetch pUrl into the cache, as a POST when pHttpPostVars is given.
// Returns NULL when pUrl is not accepted as a url, otherwise a result to
// hand to curlCfrFree() when done with it.
cfr_t *curlFetchFile(const char *pUrl, const char *pHttpPostVars);
// NOTE(review): declared here, but no definition was found while reviewing
// the end of curlapi.c - confirm that it exists.
void curlPost(const char *pUrl, const char *pHttpPostVars);
// Dispose of a curlFetchFile() result.
void curlCfrFree(cfr_t *pCfr);

// Upload bufferSize bytes of pBuffer to pUrl with a PUT, sending pContentType
// as the Content-Type. Returns non-zero when the server reported success.
int curlPut(const char *pUrl, const char *pBuffer, size_t bufferSize, const char *pContentType);

#ifdef DEBUG_WLOGIT
// Install the function that is to receive the log messages.
void curlLogItSet(void (*pfn)(const char *));
// NOTE(review): being static, this declares a separate, undefined function in
// every file that includes this header - it probably belongs in curlapi.c.
static void curlLogIt(const char *pFmt, ...);
#endif
76 | 
77 | #endif
78 | 


--------------------------------------------------------------------------------
/data/blk_-729487577044220672:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nkhorman/json_fdw/65b8e7d4dc39bb2844879bb3c0199588e9cfa8a2/data/blk_-729487577044220672


--------------------------------------------------------------------------------
/data/customer_reviews_1998.1000.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nkhorman/json_fdw/65b8e7d4dc39bb2844879bb3c0199588e9cfa8a2/data/customer_reviews_1998.1000.json.gz


--------------------------------------------------------------------------------
/data/data.json:
--------------------------------------------------------------------------------
1 | {"id": 1, "type": "person", "name": "Beatus Henk", "birthdate": "1973-06-24", "actions": [1]}
2 | {"id": 2, "type": "person", "name": "Lugos Alfons", "birthdate": "1961-08-30"}
3 | {"id": 3, "type": "person", "name": "Temür Essa", "birthdate": "1995-07-28", "actions": [2, 2, 1, 3]}
4 | {"id": 4, "type": "resturaunt", "name": "Mingus Kitchen", "position": {"lat": -4.83798e1, "lon": -65.43274, "address": {"country": "Argentina"}}, "last_update": "2013-01-02 12:05:01"}
5 | {"id": 5, "type": "resturaunt", "name": "Café Utopia Lounge", "position": {"lat": 429.7208e-1, "lon": 143.39097}, "last_update_tz": "2013-01-02 12:05:01 America/New_York"}
6 | {"id": 6, "type": "invalid_record", "birthdate": null, "last_update": "invalid time format", "position": "Canada"}
7 | {"id": 9223372036854775807}
8 | {"id": -9223372036854775808}
9 | 


--------------------------------------------------------------------------------
/data/data_broken.json:
--------------------------------------------------------------------------------
1 | {"a": 1, "b": 2}
2 | {"a": 2, "b": 3}
3 | {"a": 3, "b": 4}
4 | {"a": 3, 
5 | 


--------------------------------------------------------------------------------
/data/invalid_gz_file.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nkhorman/json_fdw/65b8e7d4dc39bb2844879bb3c0199588e9cfa8a2/data/invalid_gz_file.json.gz


--------------------------------------------------------------------------------
/expected/.gitignore:
--------------------------------------------------------------------------------
1 | # This directory will be populated when testing
2 | # Ignore everything in this directory
3 | *
4 | # Except this file
5 | !.gitignore
6 | 


--------------------------------------------------------------------------------
/gettickcount.c:
--------------------------------------------------------------------------------
  1 | 
  2 | /*--------------------------------------------------------------------*
  3 |  *
  4 |  * Developed by;
  5 |  *	Neal Horman - http://www.wanlink.com
  6 |  *	Copyright (c) 2015 Neal Horman. All Rights Reserved
  7 |  *
  8 |  *	This "source code" is free software: you can redistribute it and/or modify
  9 |  *	it under the terms of the GNU General Public License as published by
 10 |  *	the Free Software Foundation, either version 3 of the License, or
 11 |  *	(at your option) any later version.
 12 |  *
 13 |  *	This "source code" is distributed in the hope that it will be useful,
 14 |  *	but WITHOUT ANY WARRANTY; without even the implied warranty of
 15 |  *	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 16 |  *	GNU General Public License for more details.
 17 |  *
 18 |  *	You should have received a copy of the GNU General Public License
 19 |  *	along with this "source code".  If not, see <http://www.gnu.org/licenses/>.
 20 |  *
 21 |  *	RCSID:  $Id$
 22 |  *
 23 |  *--------------------------------------------------------------------*/
 24 | 
 25 | #ifndef _WIN32
 26 | 
 27 | #include <stdlib.h>
 28 | #include <unistd.h>
 29 | #include <time.h>
 30 | 
 31 | #define NsInAMs 1000000
 32 | #define MsInASec 1000
 33 | 
 34 | #ifdef __APPLE__
 35 | 
 36 | // see https://developer.apple.com/library/mac/qa/qa1398/_index.html
 37 | // and http://stackoverflow.com/questions/3269321/osx-programmatically-get-uptime
 38 | 
 39 | #include <assert.h>
 40 | #include <CoreServices/CoreServices.h>
 41 | #include <mach/mach.h>
 42 | #include <mach/mach_time.h>
 43 | #include <unistd.h>
 44 | 
 45 | #include "gettickcount.h"
 46 | 
 47 | unsigned long GetTickCount(void)
 48 | {
 49 | 	static mach_timebase_info_data_t    sTimebaseInfo;
 50 | 
 51 | 	// If this is the first time we've run, get the timebase.
 52 | 	// We can use denom == 0 to indicate that sTimebaseInfo is 
 53 | 	// uninitialised because it makes no sense to have a zero 
 54 | 	// denominator is a fraction.
 55 | 
 56 | 	if ( sTimebaseInfo.denom == 0 )
 57 | 		(void) mach_timebase_info(&sTimebaseInfo);
 58 | 
 59 | 	return (mach_absolute_time() * (sTimebaseInfo.numer / sTimebaseInfo.denom)) / NsInAMs;
 60 | }
 61 | 
 62 | #else
 63 | 
 64 | unsigned long GetTickCount(void)
 65 | {	struct timespec ts_uptime;
 66 | 	unsigned long uptimeInMs = 0;
 67 | 
 68 | 	if(clock_gettime(CLOCK_UPTIME, &ts_uptime) == 0)
 69 | 	{
 70 | 		uptimeInMs = (ts_uptime.tv_sec * MsInASec);
 71 | 
 72 | 		if(ts_uptime.tv_nsec > 0)
 73 | 			uptimeInMs += (ts_uptime.tv_nsec / NsInAMs);
 74 | 	}
 75 | 
 76 | 	return uptimeInMs;
 77 | }
 78 | #endif // __APPLE__
 79 | #endif // _WINDOWS
 80 | 
 81 | #ifdef UNIT_TEST
 82 | // to compile - gcc -DUNIT_TEST -o uptime gettickcount.c && while [ 1 ]; do clear; ./uptime; sleep 1; done
 83 | #include <stdio.h>
 84 | 
 85 | int main(int argc, char **argv)
 86 | {	unsigned long nTicks = GetTickCount();
 87 | 	unsigned long secs = (nTicks > 0 ? nTicks / 1000 : 0);
 88 | 	unsigned long Secs=0, Mins=0, Hrs=0, Days=0;
 89 | 
 90 | 	if(secs > 0)
 91 | 	{
 92 | 		Secs = secs % 60;
 93 | 		Mins = (secs / 60) % 60;
 94 | 		Hrs = (secs / (60 * 60)) % 24;
 95 | 		Days = (secs / (60 * 60 * 24));
 96 | 	}
 97 | 	printf("uptime in ms %lu = %lu days and %02lu:%02lu:%02lu",nTicks,Days,Hrs,Mins,Secs);
 98 | 
 99 | 	return 0;
100 | }
101 | #endif
102 | 


--------------------------------------------------------------------------------
/gettickcount.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /*--------------------------------------------------------------------*
 3 |  *
 4 |  * Developed by;
 5 |  *	Neal Horman - http://www.wanlink.com
 6 |  *	Copyright (c) 2015 Neal Horman. All Rights Reserved
 7 |  *
 8 |  *	This "source code" is free software: you can redistribute it and/or modify
 9 |  *	it under the terms of the GNU General Public License as published by
10 |  *	the Free Software Foundation, either version 3 of the License, or
11 |  *	(at your option) any later version.
12 |  *
13 |  *	This "source code" is distributed in the hope that it will be useful,
14 |  *	but WITHOUT ANY WARRANTY; without even the implied warranty of
15 |  *	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 |  *	GNU General Public License for more details.
17 |  *
18 |  *	You should have received a copy of the GNU General Public License
19 |  *	along with this "source code".  If not, see <http://www.gnu.org/licenses/>.
20 |  *
21 |  *	RCSID:  $Id$
22 |  *
23 |  *--------------------------------------------------------------------*/
24 | 
25 | #ifndef _GETTICKCOUNT_H_
26 | #define _GETTICKCOUNT_H_
27 | 
28 | #ifndef _WIN32
29 | 
30 | #ifdef __cplusplus
31 | extern "C" {
32 | #endif
33 | 
	// Tick count in milliseconds, see gettickcount.c.
	// Only declared for non Windows builds.
	extern unsigned long GetTickCount(void);
35 | 
36 | #ifdef __cplusplus
37 | }
38 | #endif
39 | 
40 | #endif // _WIN32
41 | 
42 | #endif // _GETTICKCOUNT_H_
43 | 


--------------------------------------------------------------------------------
/input/basic_tests.source:
--------------------------------------------------------------------------------
 1 | --
 2 | -- Test json foreign data wrapper.
 3 | --
 4 | 
 5 | -- Settings to make the result deterministic
 6 | SET datestyle = "ISO, YMD";
 7 | 
 8 | 
 9 | -- Install json_fdw
10 | CREATE EXTENSION json_fdw;
11 | 
12 | CREATE SERVER json_server FOREIGN DATA WRAPPER json_fdw;
13 | 
14 | 
15 | -- validator tests
16 | CREATE FOREIGN TABLE test_validator_filename_missing () 
17 | 	SERVER json_server; -- ERROR
18 | 
19 | CREATE FOREIGN TABLE test_validator_invalid_option () 
20 | 	SERVER json_server 
21 | 	OPTIONS(filename 'data.json', bad_option_name '1'); -- ERROR
22 | 
23 | 
24 | -- data conversion tests
25 | CREATE FOREIGN TABLE json_data (id int8, type char(20), name text, 
26 | 	birthdate date, actions int[], "position.lat" float, "position.lon" float, 
27 | 	"position.address.country" varchar(50), last_update timestamp,
28 | 	last_update_tz timestamp with time zone
29 | 	) SERVER json_server OPTIONS(filename '@abs_srcdir@/data/data.json');
30 | 
31 | SELECT id, type, name FROM json_data ORDER BY id;
32 | 
33 | SELECT id, name, birthdate FROM json_data WHERE type = 'person' ORDER BY id;
34 | 
35 | SELECT id, "position.lat" AS lat, "position.lon" AS lon, 
36 | 	"position.address.country" AS country, last_update 
37 | 	FROM json_data WHERE type = 'resturaunt' ORDER BY id;
38 | 
39 | SELECT id, type, birthdate, last_update, "position.lon" as lon 
40 | 	FROM json_data WHERE type = 'invalid_record' ORDER BY id;
41 | 
42 | SELECT last_update_tz AT TIME ZONE 'UTC' FROM json_data 
43 | 	WHERE last_update_tz IS NOT NULL;
44 | 
45 | 
46 | -- max error count test
47 | CREATE FOREIGN TABLE test_skip_broken_on (a integer, b integer) 
48 | 	SERVER json_server 
49 | 	OPTIONS (filename '@abs_srcdir@/data/data_broken.json', max_error_count '1');
50 | 
51 | SELECT * FROM test_skip_broken_on ORDER BY a;
52 | 
53 | CREATE FOREIGN TABLE test_skip_broken_off (a integer, b integer) 
54 | 	SERVER json_server 
55 | 	OPTIONS (filename '@abs_srcdir@/data/data_broken.json', max_error_count '0');
56 | 
57 | SELECT * FROM test_skip_broken_off; -- ERROR
58 | 
59 | 
60 | -- error scenarios
61 | CREATE FOREIGN TABLE test_missing_file () SERVER json_server 
62 | 	OPTIONS (filename '@abs_srcdir@/data/missing_file.json');
63 | 
64 | SELECT * FROM test_missing_file; -- ERROR
65 | 
66 | CREATE FOREIGN TABLE test_string_length_check (type CHAR(6)) SERVER json_server
67 | 	OPTIONS (filename '@abs_srcdir@/data/data.json');
68 | 
69 | SELECT * FROM test_string_length_check; -- ERROR
70 | 
71 | CREATE FOREIGN TABLE test_int_range_check (id int4) SERVER json_server
72 | 	OPTIONS (filename '@abs_srcdir@/data/data.json');
73 | 
74 | SELECT * FROM test_int_range_check; -- ERROR
75 | 
76 | CREATE FOREIGN TABLE test_decimal_range_check ("position.lat" decimal(3, 2))
77 | 	SERVER json_server OPTIONS (filename '@abs_srcdir@/data/data.json');
78 | 
79 | SELECT * FROM test_decimal_range_check; -- ERROR
80 | 
81 | 


--------------------------------------------------------------------------------
/input/customer_reviews.source:
--------------------------------------------------------------------------------
 1 | --
 2 | -- Test customer reviews dataset queries.
 3 | --
 4 | 
 5 | CREATE FOREIGN TABLE customer_reviews
 6 | (
 7 |     customer_id TEXT not null,
 8 |     "review.date" DATE not null,
 9 |     "review.rating" INTEGER not null,
10 |     "review.votes" INTEGER,
11 |     "review.helpful_votes" INTEGER,
12 |     "product.id" CHAR(10) not null,
13 |     "product.title" TEXT not null,
14 |     "product.sales_rank" BIGINT,
15 |     "product.group" TEXT,
16 |     "product.category" TEXT,
17 |     "product.subcategory" TEXT,
18 |     similar_product_ids CHAR(10)[]
19 | )
20 | SERVER json_server
21 | OPTIONS(filename '@abs_srcdir@/data/customer_reviews_1998.1000.json.gz');
22 | 
23 | 
24 | -- How people rate your products?
25 | 
26 | SELECT
27 |     extract(month from "review.date") AS review_month,
28 |     round(avg("review.rating"), 2),
29 |     count(*)
30 | FROM
31 |     customer_reviews
32 | GROUP BY
33 |     review_month
34 | ORDER BY
35 |     review_month;
36 | 
37 | -- Do we have a correlation between a book's title's length and its review ratings?
38 | 
39 | SELECT
40 |     width_bucket(length("product.title"), 1, 50, 5) title_length_bucket,
41 |     round(avg("review.rating"), 2) AS review_average,
42 |     count(*)
43 | FROM
44 |    customer_reviews
45 | WHERE
46 |     "product.group" = 'Book'
47 | GROUP BY
48 |     title_length_bucket
49 | ORDER BY
50 |     title_length_bucket;
51 | 
52 | -- Does the average review rating change by product category?
53 | 
54 | SELECT
55 |     "product.category",
56 |     round(avg("review.rating"), 2),
57 |     count(*)
58 | FROM
59 |     customer_reviews
60 | GROUP BY
61 |     "product.category"
62 | ORDER BY
63 |     count(*) DESC, "product.category"
64 | LIMIT 20;


--------------------------------------------------------------------------------
/input/hdfs_block.source:
--------------------------------------------------------------------------------
 1 | --
 2 | -- Test customer reviews dataset which is stored as a HDFS block.
 3 | --
 4 | 
 5 | CREATE FOREIGN TABLE customer_reviews_hdfs_block
 6 | (
 7 |     customer_id TEXT not null,
 8 |     "review.date" DATE not null,
 9 |     "review.rating" INTEGER not null,
10 |     "review.votes" INTEGER,
11 |     "review.helpful_votes" INTEGER,
12 |     "product.id" CHAR(10) not null,
13 |     "product.title" TEXT not null,
14 |     "product.sales_rank" BIGINT,
15 |     "product.group" TEXT,
16 |     "product.category" TEXT,
17 |     "product.subcategory" TEXT,
18 |     similar_product_ids CHAR(10)[]
19 | )
20 | SERVER json_server
21 | OPTIONS(filename '@abs_srcdir@/data/blk_-729487577044220672', 
22 |         max_error_count '2');
23 | 
24 | -- Does the average review rating change by product category?
25 | SELECT
26 |     "product.category",
27 |     round(avg("review.rating"), 2),
28 |     count(*)
29 | FROM
30 |     customer_reviews_hdfs_block
31 | GROUP BY
32 |     "product.category"
33 | ORDER BY
34 |     count(*) DESC, "product.category"
35 | LIMIT 20;


--------------------------------------------------------------------------------
/input/invalid_gz_file.source:
--------------------------------------------------------------------------------
 1 | --
 2 | -- Test that we handle invalid gzip files properly.
 3 | --
 4 | 
 5 | \set VERBOSITY terse
 6 | 
 7 | CREATE FOREIGN TABLE invalid_gz_file_table
 8 | (
 9 |     customer_id TEXT not null,
10 |     "review.date" DATE not null,
11 |     "review.rating" INTEGER not null,
12 |     "review.votes" INTEGER,
13 |     "review.helpful_votes" INTEGER,
14 |     "product.id" CHAR(10) not null,
15 |     "product.title" TEXT not null,
16 |     "product.sales_rank" BIGINT,
17 |     "product.group" TEXT,
18 |     "product.category" TEXT,
19 |     "product.subcategory" TEXT,
20 |     similar_product_ids CHAR(10)[]
21 | )
22 | SERVER json_server
23 | OPTIONS(filename '@abs_srcdir@/data/invalid_gz_file.json.gz');
24 | 
25 | select count(*) from invalid_gz_file_table;
26 | 
27 | \set VERBOSITY default


--------------------------------------------------------------------------------
/json_fdw--1.0.sql:
--------------------------------------------------------------------------------
 1 | /* contrib/json_fdw/json_fdw--1.0.sql */
 2 | 
 3 | -- complain if script is sourced in psql, rather than via CREATE EXTENSION
 4 | \echo Use "CREATE EXTENSION json_fdw" to load this file. \quit
 5 | 
-- Handler entry point of the wrapper. It is implemented in C inside the
-- extension's shared library (MODULE_PATHNAME) and returns fdw_handler.
CREATE FUNCTION json_fdw_handler()
RETURNS fdw_handler
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;

-- Validator entry point, also implemented in C. The first argument carries the
-- options being validated and the second the OID of the catalog they belong to.
CREATE FUNCTION json_fdw_validator(text[], oid)
RETURNS void
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;

-- Register the wrapper itself and attach the two functions declared above.
CREATE FOREIGN DATA WRAPPER json_fdw
  HANDLER json_fdw_handler
  VALIDATOR json_fdw_validator;
19 | 


--------------------------------------------------------------------------------
/json_fdw.c:
--------------------------------------------------------------------------------
   1 | /*-------------------------------------------------------------------------
   2 |  *
   3 |  * json_fdw.c
   4 |  *
   5 |  * Function definitions for JSON foreign data wrapper.
   6 |  *
   7 |  * Copyright (c) 2013, Citus Data, Inc.
   8 |  *
   9 |  * $Id$
  10 |  *
  11 |  *-------------------------------------------------------------------------
  12 |  */
  13 | 
  14 | 
  15 | // http://wiki.postgresql.org/images/6/67/Pg-fdw.pdf
  16 | 
  17 | #include <stdio.h>
  18 | #include <stdbool.h>
  19 | #include <sys/stat.h>
  20 | 
  21 | #include "postgres.h"
  22 | #include "json_fdw.h"
  23 | 
  24 | #include <yajl/yajl_tree.h>
  25 | #include <yajl/yajl_tree_path.h>
  26 | 
  27 | #include <zlib.h>
  28 | 
  29 | #include "access/reloptions.h"
  30 | #include "catalog/pg_foreign_table.h"
  31 | #include "catalog/pg_type.h"
  32 | #include "commands/defrem.h"
  33 | #include "commands/explain.h"
  34 | #include "commands/vacuum.h"
  35 | #include "foreign/fdwapi.h"
  36 | #include "foreign/foreign.h"
  37 | #include "miscadmin.h"
  38 | #include "nodes/makefuncs.h"
  39 | #include "optimizer/cost.h"
  40 | #include "optimizer/plancat.h"
  41 | #include "optimizer/pathnode.h"
  42 | #include "optimizer/planmain.h"
  43 | #include "optimizer/restrictinfo.h"
  44 | #include "optimizer/var.h"
  45 | #include "port.h"
  46 | #include "storage/fd.h"
  47 | #include "utils/array.h"
  48 | #include "utils/builtins.h"
  49 | #include "utils/date.h"
  50 | #include "utils/datetime.h"
  51 | #include "utils/int8.h"
  52 | #include "utils/timestamp.h"
  53 | #include "utils/hsearch.h"
  54 | #include "utils/lsyscache.h"
  55 | #include "utils/memutils.h"
  56 | #include "utils/rel.h"
  57 | #include "parser/parsetree.h"
  58 | #include "nodes/relation.h"
  59 | 
  60 | #if PG_VERSION_NUM >= 90300
  61 | 	#include "access/htup_details.h"
  62 | #endif
  63 | 
  64 | #include "curlapi.h"
  65 | #include "rciapi.h"
  66 | 
  67 | 
/*
 * ELog reports a message at the given level: elog_start() records the current
 * file, line and function name, then elog_finish() receives the level and the
 * printf-style arguments. When elevel is a compile-time constant of ERROR or
 * above, pg_unreachable() marks the code after the macro as not reached.
 * The do { } while(0) wrapper lets the macro be used like a single statement.
 */
#define ELog(elevel, ...)  \
do { \
	elog_start(__FILE__, __LINE__, PG_FUNCNAME_MACRO); \
	elog_finish(elevel, __VA_ARGS__); \
	if (__builtin_constant_p(elevel) && (elevel) >= ERROR) \
		pg_unreachable(); \
} while(0)
  75 | 
  76 | 
  77 | // Local functions forward declarations
  78 | static StringInfo OptionNamesString(Oid currentContextId);
  79 | static void JsonGetForeignRelSize(PlannerInfo *root, RelOptInfo *baserel,
  80 | 								  Oid foreignTableId);
  81 | static void JsonGetForeignPaths(PlannerInfo *root, RelOptInfo *baserel,
  82 | 								Oid foreignTableId);
  83 | static ForeignScan * JsonGetForeignPlan(PlannerInfo *root, RelOptInfo *baserel,
  84 | 				   						Oid foreignTableId, ForeignPath *bestPath,
  85 | 				   						List *targetList, List *scanClauses);
  86 | static void JsonExplainForeignScan(ForeignScanState *scanState, 
  87 | 								   ExplainState *explainState);
  88 | static void JsonBeginForeignScan(ForeignScanState *scanState, int executorFlags);
  89 | static TupleTableSlot * JsonIterateForeignScan(ForeignScanState *scanState);
  90 | static void JsonReScanForeignScan(ForeignScanState *scanState);
  91 | static void JsonEndForeignScan(ForeignScanState *scanState);
  92 | static JsonFdwOptions * JsonGetOptions(Oid foreignTableId);
  93 | static char * JsonGetOptionValue(Oid foreignTableId, const char *optionName);
  94 | static double TupleCount(RelOptInfo *baserel, const char *filename);
  95 | static BlockNumber PageCount(const char *filename);
  96 | static List * ColumnList(RelOptInfo *baserel);
  97 | static HTAB * ColumnMappingHash(Oid foreignTableId, List *columnList);
  98 | static bool GzipFilename(const char *filename);
  99 | static bool HdfsBlockName(const char *filename);
 100 | static StringInfo ReadLineFromFile(FILE *filePointer);
 101 | static StringInfo ReadLineFromGzipFile(gzFile gzFilePointer);
 102 | static void FillTupleSlot(const yajl_val jsonObject, const char *jsonObjectKey,
 103 | 						  HTAB *columnMappingHash, Datum *columnValues,
 104 | 						  bool *columnNulls);
 105 | static bool ColumnTypesCompatible(yajl_val jsonValue, Oid columnTypeId);
 106 | static bool ValidDateTimeFormat(const char *dateTimeString);
 107 | static Datum ColumnValueArray(yajl_val jsonArray, Oid valueTypeId, Oid valueTypeMod);
 108 | static Datum ColumnValue(yajl_val jsonValue, Oid columnTypeId, int32 columnTypeMod);
 109 | static bool JsonAnalyzeForeignTable(Relation relation,
 110 | 									AcquireSampleRowsFunc *acquireSampleRowsFunc,
 111 | 									BlockNumber *totalPageCount);
 112 | static int JsonAcquireSampleRows(Relation relation, int logLevel,
 113 | 								 HeapTuple *sampleRows, int targetRowCount,
 114 | 								 double *totalRowCount, double *totalDeadRowCount);
 115 | 
 116 | static List *JsonPlanForeignModify(PlannerInfo *root, ModifyTable *plan, Index resultRelation, int subplan_index);
 117 | static void JsonBeginForeignModify( ModifyTableState *mtstate, ResultRelInfo *resultRelInfo, List *fdw_private, int subplan_index, int eflags);
 118 | static TupleTableSlot *JsonExecForeignInsert( EState *estate, ResultRelInfo *resultRelInfo, TupleTableSlot *slot, TupleTableSlot *planSlot);
 119 | static void JsonAddForeignUpdateTargets(Query *parsetree, RangeTblEntry *target_rte, Relation target_relation);
 120 | static TupleTableSlot * JsonExecForeignUpdate( EState *estate, ResultRelInfo *resultRelInfo, TupleTableSlot *slot, TupleTableSlot *planSlot);
 121 | static void JsonEndForeignModify(EState *estate, ResultRelInfo *resultRelInfo);
 122 | 
 123 | 
/*
 * Array of options that are valid for json_fdw. Each entry pairs an option
 * name with the catalog (context) in which that option may be given; the
 * validator and OptionNamesString both walk this table.
 */
static const JsonValidOption ValidOptionArray[] =
{
	// foreign table options
	{ OPTION_NAME_FILENAME, ForeignTableRelationId },
	{ OPTION_NAME_MAX_ERROR_COUNT, ForeignTableRelationId },
	{ OPTION_NAME_HTTP_POST_VARS, ForeignTableRelationId },
	{ OPTION_NAME_ROM_URL, ForeignTableRelationId },
	{ OPTION_NAME_ROM_PATH, ForeignTableRelationId },
};
// Entry count derived from the array itself, so it never has to be kept in
// sync by hand when options are added or removed.
static const uint32 ValidOptionCount = (sizeof(ValidOptionArray)/sizeof(ValidOptionArray[0]));
 136 | 
 137 | 
 138 | // Declarations for dynamic loading
 139 | PG_MODULE_MAGIC;
 140 | 
 141 | PG_FUNCTION_INFO_V1(json_fdw_handler);
 142 | PG_FUNCTION_INFO_V1(json_fdw_validator);
 143 | 
 144 | 
 145 | /*
 146 |  * json_fdw_handler creates and returns a struct with pointers to foreign table
 147 |  * callback functions.
 148 |  */
 149 | Datum
 150 | json_fdw_handler(PG_FUNCTION_ARGS)
 151 | {
 152 | 	FdwRoutine *fdwRoutine = makeNode(FdwRoutine);
 153 | 
 154 | 	fdwRoutine->GetForeignRelSize = JsonGetForeignRelSize;
 155 | 	fdwRoutine->GetForeignPaths = JsonGetForeignPaths;
 156 | 	fdwRoutine->GetForeignPlan = JsonGetForeignPlan;
 157 | 	fdwRoutine->ExplainForeignScan = JsonExplainForeignScan;
 158 | 	fdwRoutine->BeginForeignScan = JsonBeginForeignScan;
 159 | 	fdwRoutine->IterateForeignScan = JsonIterateForeignScan;
 160 | 	fdwRoutine->ReScanForeignScan = JsonReScanForeignScan;
 161 | 	fdwRoutine->EndForeignScan = JsonEndForeignScan;
 162 | 	fdwRoutine->AnalyzeForeignTable = JsonAnalyzeForeignTable;
 163 | 
 164 | 	fdwRoutine->PlanForeignModify = JsonPlanForeignModify;
 165 | 	fdwRoutine->BeginForeignModify = JsonBeginForeignModify;
 166 | 	fdwRoutine->AddForeignUpdateTargets = JsonAddForeignUpdateTargets; // update and delete
 167 | 	fdwRoutine->ExecForeignInsert = JsonExecForeignInsert;
 168 | 	fdwRoutine->ExecForeignUpdate = JsonExecForeignUpdate;
 169 | 	//fdwRoutine->ExecForeignDelete = JsonExecForeignDelete;
 170 | 	fdwRoutine->EndForeignModify = JsonEndForeignModify;
 171 | 
 172 | 	PG_RETURN_POINTER(fdwRoutine);
 173 | }
 174 | 
 175 | 
 176 | /*
 177 |  * json_fdw_validator validates options given to one of the following commands:
 178 |  * foreign data wrapper, server, user mapping, or foreign table. This function
 179 |  * errors out if the given option name or its value is considered invalid. The
 180 |  * filename option is required by the foreign table, so we error out if it is
 181 |  * not provided.
 182 |  */
 183 | Datum
 184 | json_fdw_validator(PG_FUNCTION_ARGS)
 185 | {
 186 | 	Datum optionArray = PG_GETARG_DATUM(0);
 187 | 	Oid optionContextId = PG_GETARG_OID(1);
 188 | 	List *optionList = untransformRelOptions(optionArray);
 189 | 	ListCell *optionCell = NULL;
 190 | 	int filenameFound = 0;
 191 | 	int romUrlFound = 0;
 192 | 	int romPathFound = 0;
 193 | 
 194 | 	foreach(optionCell, optionList)
 195 | 	{
 196 | 		DefElem *optionDef = (DefElem *) lfirst(optionCell);
 197 | 		char *optionName = optionDef->defname;
 198 | 		bool optionValid = false;
 199 | 
 200 | 		int32 optionIndex = 0;
 201 | 		for (optionIndex = 0; optionIndex < ValidOptionCount; optionIndex++)
 202 | 		{
 203 | 			const JsonValidOption *validOption = &(ValidOptionArray[optionIndex]);
 204 | 
 205 | 			if ((optionContextId == validOption->optionContextId) &&
 206 | 				(strncmp(optionName, validOption->optionName, NAMEDATALEN) == 0))
 207 | 			{
 208 | 				optionValid = true;
 209 | 				break;
 210 | 			}
 211 | 		}
 212 | 
 213 | 		// if invalid option, display an informative error message
 214 | 		if (!optionValid)
 215 | 		{
 216 | 			StringInfo optionNamesString = OptionNamesString(optionContextId);
 217 | 
 218 | 			ereport(ERROR, (errcode(ERRCODE_FDW_INVALID_OPTION_NAME),
 219 | 							errmsg("invalid option \"%s\"", optionName),
 220 | 							errhint("Valid options in this context are: %s",
 221 | 									optionNamesString->data)));
 222 | 		}
 223 | 		else // test for particular option existence
 224 | 		{
 225 | 			filenameFound |= (strncmp(optionName, OPTION_NAME_FILENAME, NAMEDATALEN) == 0);
 226 | 			romUrlFound |= (strncmp(optionName, OPTION_NAME_ROM_URL, NAMEDATALEN) == 0);
 227 | 			romPathFound |= (strncmp(optionName, OPTION_NAME_ROM_PATH, NAMEDATALEN) == 0);
 228 | 		}
 229 | 	}
 230 | 
 231 | 	if (optionContextId == ForeignTableRelationId)
 232 | 	{
 233 | 		// make sure either filename or rom_url and rom_path, not both
 234 | 		if( !(filenameFound || (romUrlFound && romPathFound)))
 235 | 		{
 236 | 			ereport(ERROR, (errcode(ERRCODE_FDW_DYNAMIC_PARAMETER_VALUE_NEEDED),
 237 | 				errmsg("Either the ``filename'' or the ``rom_url'' and ``rom_path'' options are required for foreign tables")));
 238 | 		}
 239 | 		else if(filenameFound && (romUrlFound || romPathFound))
 240 | 		{
 241 | 			ereport(ERROR, (errcode(ERRCODE_FDW_DYNAMIC_PARAMETER_VALUE_NEEDED),
 242 | 				errmsg("Do not mix the ``filename'' option with the ``rom_url'' and ``rom_path'' options for foreign tables")));
 243 | 		}
 244 | 	}
 245 | 
 246 | 	PG_RETURN_VOID();
 247 | }
 248 | 
 249 | 
 250 | /*
 251 |  * OptionNamesString finds all options that are valid for the current context,
 252 |  * and concatenates these option names in a comma separated string. The function
 253 |  * is unchanged from mongo_fdw.
 254 |  */
 255 | static StringInfo
 256 | OptionNamesString(Oid currentContextId)
 257 | {
 258 | 	StringInfo optionNamesString = makeStringInfo();
 259 | 	bool firstOptionAppended = false;
 260 | 
 261 | 	int32 optionIndex = 0;
 262 | 	for (optionIndex = 0; optionIndex < ValidOptionCount; optionIndex++)
 263 | 	{
 264 | 		const JsonValidOption *validOption = &(ValidOptionArray[optionIndex]);
 265 | 
 266 | 		// if option belongs to current context, append option name
 267 | 		if (currentContextId == validOption->optionContextId)
 268 | 		{
 269 | 			if (firstOptionAppended)
 270 | 			{
 271 | 				appendStringInfoString(optionNamesString, ", ");
 272 | 			}
 273 | 
 274 | 			appendStringInfoString(optionNamesString, validOption->optionName);
 275 | 			firstOptionAppended = true;
 276 | 		}
 277 | 	}
 278 | 
 279 | 	return optionNamesString;
 280 | }
 281 | 
 282 | 
 283 | /*
 284 |  * JsonGetForeignRelSize obtains relation size estimates for a foreign table and
 285 |  * puts its estimate for row count into baserel->rows.
 286 |  */
 287 | static void
 288 | JsonGetForeignRelSize(PlannerInfo *root, RelOptInfo *baserel, Oid foreignTableId)
 289 | {
 290 | 	JsonFdwOptions *options = JsonGetOptions(foreignTableId);
 291 | 
 292 | 	double tupleCount = TupleCount(baserel, options->filename);
 293 | 	double rowSelectivity = clauselist_selectivity(root, baserel->baserestrictinfo,
 294 | 					   0, JOIN_INNER, NULL);
 295 | 
 296 | 	double outputRowCount = clamp_row_est(tupleCount * rowSelectivity);
 297 | 	baserel->rows = outputRowCount;
 298 | 	//ELog(DEBUG1, "%s:%d", __func__, __LINE__);
 299 | }
 300 | 
 301 | 
 302 | /*
 303 |  * JsonGetForeignPaths creates possible access paths for a scan on the foreign
 304 |  * table. Currently we only have one possible access path, which simply returns
 305 |  * all records in the order they appear in the underlying file.
 306 |  */
 307 | static void
 308 | JsonGetForeignPaths(PlannerInfo *root, RelOptInfo *baserel, Oid foreignTableId)
 309 | {
 310 | 	Path *foreignScanPath = NULL;
 311 | 	JsonFdwOptions *options = JsonGetOptions(foreignTableId);
 312 | 
 313 | 	BlockNumber pageCount = PageCount(options->filename);
 314 | 	double tupleCount = TupleCount(baserel, options->filename);
 315 | 
 316 | 	/*
 317 | 	 * We estimate costs almost the same way as cost_seqscan(), thus assuming
 318 | 	 * that I/O costs are equivalent to a regular table file of the same size.
 319 | 	 * However, we take per-tuple CPU costs as 10x of a seqscan to account for
 320 | 	 * the cost of parsing records.
 321 | 	 */
 322 | 	double tupleParseCost = cpu_tuple_cost * JSON_TUPLE_COST_MULTIPLIER;
 323 | 	double tupleFilterCost = baserel->baserestrictcost.per_tuple;
 324 | 	double cpuCostPerTuple = tupleParseCost + tupleFilterCost;
 325 | 	double executionCost = (seq_page_cost * pageCount) + (cpuCostPerTuple * tupleCount);
 326 | 
 327 | 	double startupCost = baserel->baserestrictcost.startup;
 328 | 	double totalCost  = startupCost + executionCost;
 329 | 
 330 | 	// create a foreign path node and add it as the only possible path
 331 | 	foreignScanPath = (Path *) create_foreignscan_path(root, baserel, baserel->rows,
 332 | 								   startupCost, totalCost,
 333 | 								   NIL,  // no known ordering
 334 | 								   NULL, // not parameterized
 335 | 								   NIL); // no fdw_private
 336 | 
 337 | 	add_path(baserel, foreignScanPath);
 338 | 	//ELog(DEBUG1, "%s:%d", __func__, __LINE__);
 339 | }
 340 | 
 341 | 
 342 | /*
 343 |  * JsonGetForeignPlan creates a ForeignScan plan node for scanning the foreign
 344 |  * table. We also add the query column list to scan nodes private list, because
 345 |  * we need it later for mapping columns.
 346 |  */
 347 | static ForeignScan *
 348 | JsonGetForeignPlan(PlannerInfo *root, RelOptInfo *baserel, Oid foreignTableId,
 349 | 				   ForeignPath *bestPath, List *targetList, List *scanClauses)
 350 | {
 351 | 	ForeignScan *foreignScan = NULL;
 352 | 	List *columnList = NULL;
 353 | 	List *foreignPrivateList = NIL;
 354 | 
 355 | 	/*
 356 | 	 * We have no native ability to evaluate restriction clauses, so we just
 357 | 	 * put all the scanClauses into the plan node's qual list for the executor
 358 | 	 * to check.
 359 | 	 */
 360 | 	scanClauses = extract_actual_clauses(scanClauses, false);
 361 | 
 362 | 	/*
 363 | 	 * As an optimization, we only add columns that are present in the query to
 364 | 	 * the column mapping hash. To find these columns, we need baserel. We don't
 365 | 	 * have access to baserel in executor's callback functions, so we get the
 366 | 	 * column list here and put it into foreign scan node's private list.
 367 | 	 */
 368 | 	columnList = ColumnList(baserel);
 369 | 	foreignPrivateList = list_make1(columnList);
 370 | 
 371 | 	// create the foreign scan node
 372 | 	foreignScan = make_foreignscan(
 373 | 		targetList, scanClauses, baserel->relid
 374 | 		, NIL // no expressions to evaluate
 375 | 		, foreignPrivateList
 376 | #if PG_VERSION_NUM >= 90500
 377 | 		,NIL // no fdw_scan_tlist
 378 | #endif
 379 | 		);
 380 | 
 381 | 	//ELog(DEBUG1, "%s:%d", __func__, __LINE__);
 382 | 	return foreignScan;
 383 | }
 384 | 
 385 | 
 386 | // JsonExplainForeignScan produces extra output for the Explain command.
 387 | static void
 388 | JsonExplainForeignScan(ForeignScanState *scanState, ExplainState *explainState)
 389 | {
 390 | 	Oid foreignTableId = RelationGetRelid(scanState->ss.ss_currentRelation);
 391 | 	JsonFdwOptions *options = JsonGetOptions(foreignTableId);
 392 | 
 393 | 	ExplainPropertyText("Json File", options->filename, explainState);
 394 | 	ExplainPropertyText("HTTP Post Vars", options->pHttpPostVars, explainState);
 395 | 	ExplainPropertyText("Rom URL", options->pRomUrl, explainState);
 396 | 	ExplainPropertyText("Rom PATH", options->pRomPath, explainState);
 397 | 
 398 | 	// supress file size if we're not showing cost details
 399 | 	if (explainState->costs)
 400 | 	{
 401 | 		struct stat statBuffer;
 402 | 
 403 | 		int statResult = stat(options->filename, &statBuffer);
 404 | 		if (statResult == 0)
 405 | 		{
 406 | 			ExplainPropertyLong("Json File Size", (long) statBuffer.st_size,
 407 | 								explainState);
 408 | 		}
 409 | 	}
 410 | 	//ELog(DEBUG1, "%s:%d", __func__, __LINE__);
 411 | }
 412 | 
 413 | static int rciMethod(rci_t *pRci, char const *pMethod, char const *pRomUrl, char const *pRomPath)
 414 | {	int methodOk = 0;
 415 | 
 416 | 	if(pRci != NULL)
 417 | 	{
 418 | 		if(pRci->pMethod != NULL)
 419 | 		{
 420 | 			// special case, (pMethod == NULL) means any method
 421 | 			methodOk = (pMethod != NULL ? (strcasecmp(pRci->pMethod, pMethod) == 0) : 1);
 422 | 			if(!methodOk)
 423 | 			{
 424 | 				ereport(ERROR, (errmsg("Method not supported."),
 425 | 					errhint("URL '%s' path '%s' operation '%s' method '%s'"
 426 | 						, pRomUrl
 427 | 						, pRomPath
 428 | 						, pRci->pAction
 429 | 						, pRci->pMethod
 430 | 						)));
 431 | 			}
 432 | 		}
 433 | 		else
 434 | 		{
 435 | 			ereport(ERROR, (errmsg("Method not specified for ROM path operation."),
 436 | 				errhint("URL '%s' path '%s' operation '%s'", pRomUrl, pRomPath, pRci->pAction)));
 437 | 		}
 438 | 	}
 439 | 
 440 | 	return methodOk;
 441 | }
 442 | 
 443 | static int rciError(rci_t *pRci, char const *pRomUrl, char const *pRomPath)
 444 | {	int error = 1;
 445 | 
 446 | 	if(pRci != NULL)
 447 | 	{
 448 | 		if(pRci->romRoot != NULL)
 449 | 		{
 450 | 			if(pRci->romRootAction != NULL)
 451 | 				error = 0;
 452 | 			else
 453 | 			{
 454 | 				ereport(ERROR, (errmsg("Path does not support operation."),
 455 | 					errhint("URL '%s' path '%s' operation '%s'", pRomUrl, pRomPath, pRci->pAction)));
 456 | 			}
 457 | 		}
 458 | 		else
 459 | 		{
 460 | 			ereport(ERROR, (errmsg("Invalid rom_path."),
 461 | 				errhint("URL '%s' path '%s'", pRomUrl, pRomPath)));
 462 | 		}
 463 | 	}
 464 | 	else
 465 | 	{
 466 | 		ereport(ERROR, (errmsg("Unable to access ROM."),
 467 | 			errhint("URL '%s' path '%s'", pRomUrl, pRomPath)));
 468 | 	}
 469 | 
 470 | 	return error;
 471 | }
 472 | 
 473 | /*
 474 |  * JsonBeginForeignScan opens the underlying json file for reading. The function
 475 |  * also creates a hash table that maps referenced column names to column index
 476 |  * and type information.
 477 |  */
 478 | static void
 479 | JsonBeginForeignScan(ForeignScanState *scanState, int executorFlags)
 480 | {
 481 | 	JsonFdwExecState *execState = NULL;
 482 | 	ForeignScan *foreignScan = NULL;
 483 | 	List *foreignPrivateList = NULL;
 484 | 	Oid foreignTableId = InvalidOid;
 485 | 	JsonFdwOptions *options = NULL;
 486 | 	List *columnList = NULL;
 487 | 	HTAB *columnMappingHash = NULL;
 488 | 	bool gzipFile = false;
 489 | 	bool hdfsBlock = false;
 490 | 	FILE *filePointer = NULL;
 491 | 	gzFile gzFilePointer = NULL;
 492 | 	bool openError = false;
 493 | 	const char *filename = NULL;
 494 | 	const char *postVars = NULL;
 495 | 	cfr_t *pCfr = NULL;
 496 | 
 497 | 	//ELog(DEBUG1, "%s:%d", __func__, __LINE__);
 498 | 
 499 | 	// if Explain with no Analyze, do nothing
 500 | 	if (executorFlags & EXEC_FLAG_EXPLAIN_ONLY)
 501 | 	{
 502 | 		return;
 503 | 	}
 504 | 
 505 | 	foreignTableId = RelationGetRelid(scanState->ss.ss_currentRelation);
 506 | 	options = JsonGetOptions(foreignTableId);
 507 | 
 508 | 	foreignScan = (ForeignScan *) scanState->ss.ps.plan;
 509 | 	foreignPrivateList = (List *) foreignScan->fdw_private;
 510 | 
 511 | 	columnList = (List *) linitial(foreignPrivateList);
 512 | 	columnMappingHash = ColumnMappingHash(foreignTableId, columnList);
 513 | 
 514 | 	filename = options->filename;
 515 | 	postVars = options->pHttpPostVars;
 516 | 
 517 | 	// if a ROM is specified, get/build an off box url
 518 | 	if(options->pRomUrl != NULL && *options->pRomUrl
 519 | 		&& options->pRomPath != NULL && *options->pRomPath
 520 | 		)
 521 | 	{
 522 | 		rci_t *pRci = rciFetch(options->pRomUrl, options->pRomPath, RCI_ACTION_SELECT);
 523 | 
 524 | 		//ELog(DEBUG1, "%s:%d", __func__, __LINE__);
 525 | 		if(!rciError(pRci, options->pRomUrl, options->pRomPath)
 526 | 			&& rciMethod(pRci, "get", options->pRomUrl, options->pRomPath)
 527 | 			)
 528 | 		{
 529 | 			filename = pstrdup(pRci->pUrl); // dupe the url
 530 | 			postVars = NULL;
 531 | 		}
 532 | 		rciFree(pRci);
 533 | 	}
 534 | 
 535 | 	// See if this is an off box url, and try to fetch it
 536 | 	// and then pass it off to one of the native file handlers
 537 | 	if(filename != NULL && *filename)
 538 | 		pCfr = curlFetchFile(filename, postVars);
 539 | 	else
 540 | 		openError = 1;
 541 | 
 542 | 	// if fetched
 543 | 	if(pCfr != NULL)
 544 | 	{
 545 | 		openError = !pCfr->bFileFetched;
 546 | 		if(!openError)
 547 | 			// replace the url with the on box filename of the file that we just
 548 | 			// downloaded so that the existing file handlers can just use a file
 549 | 			filename = pCfr->ccf.pFileName;
 550 | 			/*
 551 | 			ELog(DEBUG1, "%s:%u fetched %u, took %lu ms, http response %lu, content type '%s'"
 552 | 				, __func__, __LINE__
 553 | 				, pCfr->bFileFetched
 554 | 				, pCfr->queryDuration
 555 | 				, pCfr->httpResponseCode
 556 | 				, pCfr->pContentType
 557 | 				);
 558 | 			*/
 559 | 	}
 560 | 
 561 | 	if(!openError && filename != NULL && *filename)
 562 | 	{
 563 | 		gzipFile = GzipFilename(filename);
 564 | 		hdfsBlock = HdfsBlockName(filename);
 565 | 
 566 | 		if (gzipFile || hdfsBlock)
 567 | 		{
 568 | 			gzFilePointer = gzopen(filename, PG_BINARY_R);
 569 | 			openError = (gzFilePointer == NULL);
 570 | 		}
 571 | 		else
 572 | 		{
 573 | 			filePointer = AllocateFile(filename, PG_BINARY_R);
 574 | 			openError = (filePointer == NULL);
 575 | 		}
 576 | 	}
 577 | 
 578 | 	if(openError || filename == NULL || !*filename)
 579 | 	{
 580 | 		ereport(ERROR, (errcode_for_file_access(),
 581 | 						errmsg("could not open file \"%s\" for reading: %m",
 582 | 							   filename)));
 583 | 		curlCfrFree(pCfr);
 584 | 		pCfr = NULL;
 585 | 	}
 586 | 
 587 | 	execState = (JsonFdwExecState *) palloc(sizeof(JsonFdwExecState));
 588 | 	execState->filename = filename;
 589 | 	execState->filePointer = filePointer;
 590 | 	execState->gzFilePointer = gzFilePointer;
 591 | 	execState->columnMappingHash = columnMappingHash;
 592 | 	execState->maxErrorCount = options->maxErrorCount;
 593 | 	execState->errorCount = 0;
 594 | 	execState->currentLineNumber = 0;
 595 | 	// we pass this off to EndForeignScan to manage
 596 | 	execState->pCfr = pCfr;
 597 | 
 598 | 	scanState->fdw_state = (void *) execState;
 599 | }
 600 | 
 601 | 
 602 | /*
 603 |  * JsonIterateForeignScan reads the next record from the data file, converts it 
 604 |  * to PostgreSQL tuple, and stores the converted tuple into the ScanTupleSlot as
 605 |  * a virtual tuple.
 606 |  */
 607 | static TupleTableSlot *
 608 | JsonIterateForeignScan(ForeignScanState *scanState)
 609 | {
 610 | 	JsonFdwExecState *execState = (JsonFdwExecState *) scanState->fdw_state;
 611 | 	TupleTableSlot *tupleSlot = scanState->ss.ss_ScanTupleSlot;
 612 | 	HTAB *columnMappingHash = execState->columnMappingHash;
 613 | 	char errorBuffer[ERROR_BUFFER_SIZE];
 614 | 	yajl_val jsonValue = NULL;
 615 | 	bool endOfFile = false;
 616 | 	bool jsonObjectValid = false;
 617 | 	bool errorCountExceeded = false;
 618 | 
 619 | 	TupleDesc tupleDescriptor = tupleSlot->tts_tupleDescriptor;
 620 | 	Datum *columnValues = tupleSlot->tts_values;
 621 | 	bool *columnNulls = tupleSlot->tts_isnull;
 622 | 	int columnCount = tupleDescriptor->natts;
 623 | 
 624 | 	//ELog(DEBUG1, "%s:%d", __func__, __LINE__);
 625 | 	// initialize all values for this row to null
 626 | 	memset(columnValues, 0, columnCount * sizeof(Datum));
 627 | 	memset(columnNulls, true, columnCount * sizeof(bool));
 628 | 
 629 | 	ExecClearTuple(tupleSlot);
 630 | 
 631 | 	/*
 632 | 	 * Loop until we reach the end of file, or we read a line that parses to be
 633 | 	 * a valid json object, or we exceed the maximum allowed error count.
 634 | 	 */
 635 | 	while (!(endOfFile || jsonObjectValid || errorCountExceeded))
 636 | 	{
 637 | 		StringInfo lineData = NULL;
 638 | 		if (execState->gzFilePointer != NULL)
 639 | 			lineData = ReadLineFromGzipFile(execState->gzFilePointer);
 640 | 		else
 641 | 			lineData = ReadLineFromFile(execState->filePointer);
 642 | 
 643 | 		if (lineData->len == 0)
 644 | 			endOfFile = true;
 645 | 		else
 646 | 		{
 647 | 			execState->currentLineNumber++;
 648 | 
 649 | 			jsonValue = yajl_tree_parse(lineData->data, errorBuffer, sizeof(errorBuffer));
 650 | 
 651 | 			jsonObjectValid = YAJL_IS_OBJECT(jsonValue);
 652 | 			if (!jsonObjectValid)
 653 | 			{
 654 | 				yajl_tree_free(jsonValue);
 655 | 
 656 | 				execState->errorCount++;
 657 | 			}
 658 | 
 659 | 			if (execState->errorCount > execState->maxErrorCount)
 660 | 				errorCountExceeded = true;
 661 | 		}
 662 | 	}
 663 | 
 664 | 	if (jsonObjectValid)
 665 | 	{
 666 | 		FillTupleSlot(jsonValue, NULL, columnMappingHash, columnValues, columnNulls);
 667 | 		ExecStoreVirtualTuple(tupleSlot);
 668 | 
 669 | 		yajl_tree_free(jsonValue);
 670 | 	}
 671 | 	else if (errorCountExceeded)
 672 | 	{
 673 | 		ereport(ERROR, (errmsg("could not parse %u json objects", execState->errorCount),
 674 | 						errhint("Last error message at line: %u: %s",
 675 | 								execState->currentLineNumber, errorBuffer)));
 676 | 	}
 677 | 
 678 | 	return tupleSlot;
 679 | }
 680 | 
 681 | 
 682 | // JsonReScanForeignScan rescans the foreign table.
 683 | static void
 684 | JsonReScanForeignScan(ForeignScanState *scanState)
 685 | {
 686 | 	//ELog(DEBUG1, "%s:%d", __func__, __LINE__);
 687 | 	JsonEndForeignScan(scanState);
 688 | 	JsonBeginForeignScan(scanState, 0);
 689 | }
 690 | 
 691 | 
 692 | /*
 693 |  * JsonEndForeignScan finishes scanning the foreign table, and frees the acquired
 694 |  * resources.
 695 |  */
 696 | static void
 697 | JsonEndForeignScan(ForeignScanState *scanState)
 698 | {
 699 | 	JsonFdwExecState *executionState = (JsonFdwExecState *) scanState->fdw_state;
 700 | 	//ELog(DEBUG1, "%s:%d", __func__, __LINE__);
 701 | 	if (executionState == NULL)
 702 | 	{
 703 | 		return;
 704 | 	}
 705 | 
 706 | 	if (executionState->filePointer != NULL)
 707 | 	{
 708 | 		int closeStatus = FreeFile(executionState->filePointer);
 709 | 		if (closeStatus != 0)
 710 | 		{
 711 | 			ereport(ERROR, (errcode_for_file_access(),
 712 | 					 		errmsg("could not close file \"%s\": %m",
 713 | 								   executionState->filename)));
 714 | 		}
 715 | 	}
 716 | 
 717 | 	if (executionState->gzFilePointer != NULL)
 718 | 	{
 719 | 		int closeStatus = gzclose(executionState->gzFilePointer);
 720 | 		if (closeStatus != Z_OK)
 721 | 		{
 722 | 			ereport(ERROR, (errcode_for_file_access(),
 723 | 					 		errmsg("could not close file \"%s\": %m",
 724 | 								   executionState->filename)));
 725 | 		}
 726 | 	}
 727 | 
 728 | 	if (executionState->columnMappingHash != NULL)
 729 | 	{
 730 | 		hash_destroy(executionState->columnMappingHash);
 731 | 	}
 732 | 
 733 | 	curlCfrFree(executionState->pCfr);
 734 | 
 735 | 	pfree(executionState);
 736 | }
 737 | 
 738 | 
 739 | /*
 740 |  * JsonGetOptions returns the option values to be used when reading and parsing 
 741 |  * the json file. To resolve these values, the function checks options for the
 742 |  * foreign table, and if not present, falls back to default values.
 743 |  */
 744 | static JsonFdwOptions *
 745 | JsonGetOptions(Oid foreignTableId)
 746 | {
 747 | 	JsonFdwOptions *jsonFdwOptions = (JsonFdwOptions *) palloc0(sizeof(JsonFdwOptions));
 748 | 	//ELog(DEBUG1, "%s:%d", __func__, __LINE__);
 749 | 
 750 | 	if(jsonFdwOptions != NULL)
 751 | 	{	char *maxErrorCountString = JsonGetOptionValue(foreignTableId, OPTION_NAME_MAX_ERROR_COUNT);
 752 | 
 753 | 		jsonFdwOptions->maxErrorCount = (maxErrorCountString != NULL
 754 | 			? pg_atoi(maxErrorCountString, sizeof(int32), 0)
 755 | 			: DEFAULT_MAX_ERROR_COUNT
 756 | 			);
 757 | 		jsonFdwOptions->filename = JsonGetOptionValue(foreignTableId, OPTION_NAME_FILENAME);
 758 | 		jsonFdwOptions->pHttpPostVars = JsonGetOptionValue(foreignTableId, OPTION_NAME_HTTP_POST_VARS);
 759 | 		jsonFdwOptions->pRomUrl = JsonGetOptionValue(foreignTableId, OPTION_NAME_ROM_URL);
 760 | 		jsonFdwOptions->pRomPath = JsonGetOptionValue(foreignTableId, OPTION_NAME_ROM_PATH);
 761 | 	}
 762 | 
 763 | 	return jsonFdwOptions;
 764 | }
 765 | 
 766 | 
 767 | /*
 768 |  * Json GetOptionValue walks over foreign table and foreign server options, and
 769 |  * looks for the option with the given name. If found, the function returns the
 770 |  * option's value. This function is unchanged from mongo_fdw.
 771 |  */
 772 | static char *
 773 | JsonGetOptionValue(Oid foreignTableId, const char *optionName)
 774 | {
 775 | 	ForeignTable *foreignTable = NULL;
 776 | 	ForeignServer *foreignServer = NULL;
 777 | 	List *optionList = NIL;
 778 | 	ListCell *optionCell = NULL;
 779 | 	char *optionValue = NULL;
 780 | 
 781 | 	//ELog(DEBUG1, "%s:%d", __func__, __LINE__);
 782 | 	foreignTable = GetForeignTable(foreignTableId);
 783 | 	foreignServer = GetForeignServer(foreignTable->serverid);
 784 | 
 785 | 	optionList = list_concat(optionList, foreignTable->options);
 786 | 	optionList = list_concat(optionList, foreignServer->options);
 787 | 
 788 | 	foreach(optionCell, optionList)
 789 | 	{
 790 | 		DefElem *optionDef = (DefElem *) lfirst(optionCell);
 791 | 		char *optionDefName = optionDef->defname;
 792 | 
 793 | 		if (strncmp(optionDefName, optionName, NAMEDATALEN) == 0)
 794 | 		{
 795 | 			optionValue = defGetString(optionDef);
 796 | 			break;
 797 | 		}
 798 | 	}
 799 | 
 800 | 	return optionValue;
 801 | }
 802 | 
 803 | 
 804 | // TupleCount estimates the number of base relation tuples in the given file.
 805 | static double
 806 | TupleCount(RelOptInfo *baserel, const char *filename)
 807 | {
 808 | 	double tupleCount = 0.0;
 809 | 
 810 | 	BlockNumber pageCountEstimate = baserel->pages;
 811 | 	if (pageCountEstimate > 0)
 812 | 	{
 813 | 		/*
 814 | 		 * We have number of pages and number of tuples from pg_class (from a
 815 | 		 * previous Analyze), so compute a tuples-per-page estimate and scale
 816 | 		 * that by the current file size.
 817 | 		 */
 818 | 		double density = baserel->tuples / (double) pageCountEstimate;
 819 | 		BlockNumber pageCount = PageCount(filename);
 820 | 
 821 | 		tupleCount = clamp_row_est(density * (double) pageCount);
 822 | 	}
 823 | 	else
 824 | 	{
 825 | 		/*
 826 | 		 * Otherwise we have to fake it. We back into this estimate using the
 827 | 		 * planner's idea of relation width, which may be inaccurate. For better
 828 | 		 * estimates, users need to run Analyze.
 829 | 		 */
 830 | 		struct stat statBuffer;
 831 | 		int tupleWidth = 0;
 832 | 
 833 | 		int statResult = stat(filename, &statBuffer);
 834 | 		if (statResult < 0)
 835 | 		{
 836 | 			// file may not be there at plan time, so use a default estimate
 837 | 			statBuffer.st_size = 10 * BLCKSZ;
 838 | 		}
 839 | 
 840 | 		tupleWidth = MAXALIGN(baserel->width) + MAXALIGN(sizeof(HeapTupleHeaderData));
 841 | 		tupleCount = clamp_row_est((double) statBuffer.st_size / (double) tupleWidth);
 842 | 	}
 843 | 
 844 | 	return tupleCount;
 845 | }
 846 | 
 847 | 
 848 | // PageCount calculates and returns the number of pages in a file.
 849 | static BlockNumber
 850 | PageCount(const char *filename)
 851 | {
 852 | 	BlockNumber pageCount = 0;
 853 | 	struct stat statBuffer;
 854 | 
 855 | 	// if file doesn't exist at plan time, use default estimate for its size
 856 | 	int statResult = stat(filename, &statBuffer);
 857 | 	if (statResult < 0)
 858 | 	{
 859 | 		statBuffer.st_size = 10 * BLCKSZ;
 860 | 	}
 861 | 
 862 | 	pageCount = (statBuffer.st_size + (BLCKSZ - 1)) / BLCKSZ;
 863 | 	if (pageCount < 1)
 864 | 	{
 865 | 		pageCount = 1;
 866 | 	}
 867 | 
 868 | 	return pageCount;
 869 | }
 870 | 
 871 | 
 872 | /*
 873 |  * ColumnList takes in the planner's information about this foreign table. The
 874 |  * function then finds all columns needed for query execution, including those
 875 |  * used in projections, joins, and filter clauses, de-duplicates these columns,
 876 |  * and returns them in a new list. This function is unchanged from mongo_fdw. 
 877 |  */
 878 | static List *
 879 | ColumnList(RelOptInfo *baserel)
 880 | {
 881 | 	List *columnList = NIL;
 882 | 	List *neededColumnList = NIL;
 883 | 	AttrNumber columnIndex = 1;
 884 | 	AttrNumber columnCount = baserel->max_attr;
 885 | 	List *targetColumnList = baserel->reltargetlist;
 886 | 	List *restrictInfoList = baserel->baserestrictinfo;
 887 | 	ListCell *restrictInfoCell = NULL;
 888 | 
 889 | 	// first add the columns used in joins and projections
 890 | 	neededColumnList = list_copy(targetColumnList);
 891 | 
 892 | 	// then walk over all restriction clauses, and pull up any used columns
 893 | 	foreach(restrictInfoCell, restrictInfoList)
 894 | 	{
 895 | 		RestrictInfo *restrictInfo = (RestrictInfo *) lfirst(restrictInfoCell);
 896 | 		Node *restrictClause = (Node *) restrictInfo->clause;
 897 | 		List *clauseColumnList = NIL;
 898 | 
 899 | 		// recursively pull up any columns used in the restriction clause
 900 | 		clauseColumnList = pull_var_clause(restrictClause,
 901 | 						   PVC_RECURSE_AGGREGATES,
 902 | 						   PVC_RECURSE_PLACEHOLDERS);
 903 | 
 904 | 		neededColumnList = list_union(neededColumnList, clauseColumnList);
 905 | 	}
 906 | 
 907 | 	// walk over all column definitions, and de-duplicate column list
 908 | 	for (columnIndex = 1; columnIndex <= columnCount; columnIndex++)
 909 | 	{
 910 | 		ListCell *neededColumnCell = NULL;
 911 | 		Var *column = NULL;
 912 | 
 913 | 		// look for this column in the needed column list
 914 | 		foreach(neededColumnCell, neededColumnList)
 915 | 		{
 916 | 			Var *neededColumn = (Var *) lfirst(neededColumnCell);
 917 | 			if (neededColumn->varattno == columnIndex)
 918 | 			{
 919 | 				column = neededColumn;
 920 | 				break;
 921 | 			}
 922 | 		}
 923 | 
 924 | 		if (column != NULL)
 925 | 		{
 926 | 			columnList = lappend(columnList, column);
 927 | 		}
 928 | 	}
 929 | 
 930 | 	return columnList;
 931 | }
 932 | 
 933 | 
 934 | /*
 935 |  * ColumnMappingHash creates a hash table that maps column names to column index
 936 |  * and types. This table helps us quickly translate JSON document key/values to
 937 |  * corresponding PostgreSQL columns. This function is unchanged from mongo_fdw.
 938 |  */
 939 | static HTAB *
 940 | ColumnMappingHash(Oid foreignTableId, List *columnList)
 941 | {
 942 | 	HTAB *columnMappingHash = NULL;
 943 | 	ListCell *columnCell = NULL;
 944 | 	const long hashTableSize = 2048;
 945 | 
 946 | 	// create hash table
 947 | 	HASHCTL hashInfo;
 948 | 	memset(&hashInfo, 0, sizeof(hashInfo));
 949 | 	hashInfo.keysize = NAMEDATALEN;
 950 | 	hashInfo.entrysize = sizeof(ColumnMapping);
 951 | 	hashInfo.hash = string_hash;
 952 | 	hashInfo.hcxt = CurrentMemoryContext;
 953 | 
 954 | 	columnMappingHash = hash_create("Column Mapping Hash", hashTableSize, &hashInfo,
 955 | 									(HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT));
 956 | 	Assert(columnMappingHash != NULL);
 957 | 
 958 | 	foreach(columnCell, columnList)
 959 | 	{
 960 | 		Var *column = (Var *) lfirst(columnCell);
 961 | 		AttrNumber columnId = column->varattno;
 962 | 
 963 | 		ColumnMapping *columnMapping = NULL;
 964 | 		char *columnName = NULL;
 965 | 		bool handleFound = false;
 966 | 		void *hashKey = NULL;
 967 | 
 968 | 		columnName = get_relid_attribute_name(foreignTableId, columnId);
 969 | 		hashKey = (void *) columnName;
 970 | 
 971 | 		columnMapping = (ColumnMapping *) hash_search(columnMappingHash, hashKey,
 972 | 													  HASH_ENTER, &handleFound);
 973 | 		Assert(columnMapping != NULL);
 974 | 
 975 | 		columnMapping->columnIndex = columnId - 1;
 976 | 		columnMapping->columnTypeId = column->vartype;
 977 | 		columnMapping->columnTypeMod = column->vartypmod;
 978 | 		columnMapping->columnArrayTypeId = get_element_type(column->vartype);
 979 | 	}
 980 | 
 981 | 	return columnMappingHash;
 982 | }
 983 | 
 984 | 
 985 | // GzipFilename returns true if the filename ends with a gzip file extension.
 986 | static bool
 987 | GzipFilename(const char *filename)
 988 | {
 989 | 	bool gzipFile = false;
 990 | 	const char *extension = NULL;
 991 | 
 992 | 	extension = strrchr(filename, '.');
 993 | 	if (extension != NULL)
 994 | 	{
 995 | 		if (strncmp(extension, GZIP_FILE_EXTENSION, MAXPGPATH) == 0)
 996 | 		{
 997 | 			gzipFile = true;
 998 | 		}
 999 | 	}
1000 | 
1001 | 	return gzipFile;
1002 | }
1003 | 
1004 | 
1005 | // HdfsBlockName returns true if filename belongs to a hdfs block.
1006 | static bool
1007 | HdfsBlockName(const char *filename)
1008 | {
1009 | 	bool hdfsBlock = false;
1010 | 	const char *basename = NULL;
1011 | 
1012 | 	const char *lastDirSeparator = last_dir_separator(filename);
1013 | 	if (lastDirSeparator == NULL)
1014 | 	{
1015 | 		basename = filename;
1016 | 	}
1017 | 	else
1018 | 	{
1019 | 		basename = lastDirSeparator + 1;
1020 | 	}
1021 | 
1022 | 	if (strncmp(basename, HDFS_BLOCK_PREFIX, HDFS_BLOCK_PREFIX_LENGTH) == 0)
1023 | 	{
1024 | 		hdfsBlock = true;
1025 | 	}
1026 | 
1027 | 	return hdfsBlock;
1028 | }
1029 | 
1030 | 
1031 | /*
1032 |  * ReadLineFromFile reads and returns the next line in the file. If the function
1033 |  * reaches the end of file without reading input, it returns an empty string.
1034 |  */
1035 | static StringInfo
1036 | ReadLineFromFile(FILE *filePointer)
1037 | {
1038 | 	StringInfo lineData = makeStringInfo();
1039 | 	bool endOfFile = false;
1040 | 	bool endOfLine = false;
1041 | 	char buffer[READ_BUFFER_SIZE];
1042 | 
1043 | 	// read from file until either we reach end of file or end of line
1044 | 	while (!endOfFile && !endOfLine)
1045 | 	{
1046 | 		char *fgetsResult;
1047 | 		
1048 | 		memset(buffer, 0, sizeof(buffer));
1049 | 		fgetsResult = fgets(buffer, sizeof(buffer), filePointer);
1050 | 		if (fgetsResult == NULL)
1051 | 		{
1052 | 			int errorResult = ferror(filePointer);
1053 | 			if (errorResult != 0)
1054 | 			{
1055 | 				ereport(ERROR, (errcode_for_file_access(),
1056 | 								errmsg("could not read from json file: %m")));
1057 | 			}
1058 | 
1059 | 			endOfFile = true;
1060 | 		}
1061 | 		else
1062 | 		{
1063 | 			// check if we read a new line
1064 | 			endOfLine = (buffer[strlen(buffer) - 1] == '\n');
1065 | 
1066 | 			appendStringInfoString(lineData, buffer);
1067 | 		}
1068 | 	}
1069 | 
1070 | 	return lineData;
1071 | }
1072 | 
1073 | 
1074 | /*
1075 |  * ReadLineFromFile reads and returns the next line in the file. If the function
1076 |  * reaches the end of file without reading input, it returns an empty string.
1077 |  */
1078 | static StringInfo
1079 | ReadLineFromGzipFile(gzFile gzFilePointer)
1080 | {
1081 | 	StringInfo lineData = makeStringInfo();
1082 | 	bool endOfFile = false;
1083 | 	bool endOfLine = false;
1084 | 	char buffer[READ_BUFFER_SIZE];
1085 | 
1086 | 	// read from file until either we reach end of file or end of line
1087 | 	while (!endOfFile && !endOfLine)
1088 | 	{
1089 | 		char *getsResult = gzgets(gzFilePointer, buffer, sizeof(buffer));
1090 | 		if (getsResult == NULL)
1091 | 		{
1092 | 			int errorResult = 0;
1093 | 			const char *message = gzerror(gzFilePointer, &errorResult);
1094 | 			if (errorResult != Z_OK && errorResult != Z_STREAM_END)
1095 | 			{
1096 | 				ereport(ERROR, (errmsg("could not read from json file"), 
1097 | 								errhint("%s", message)));
1098 | 			}
1099 | 
1100 | 			endOfFile = true;
1101 | 		}
1102 | 		else
1103 | 		{
1104 | 			// check if we read a new line
1105 | 			endOfLine = (buffer[strlen(buffer) - 1] == '\n');
1106 | 
1107 | 			appendStringInfoString(lineData, buffer);
1108 | 		}
1109 | 	}
1110 | 
1111 | 	return lineData;
1112 | }
1113 | 
1114 | 
1115 | /*
1116 |  * FillTupleSlot walks over all key/value pairs in the given document. For each
1117 |  * pair, the function checks if the key appears in the column mapping hash, and
1118 |  * if the value type is compatible with the one specified for the column. If so
1119 |  * the function converts the value and fills the corresponding tuple position.
1120 |  * The jsonObjectKey parameter is used for recursion, and should always be
1121 |  * passed as NULL. This function is based on the function with the same name in
1122 |  * mongo_fdw.
1123 |  */
1124 | static void
1125 | FillTupleSlot(const yajl_val jsonObject, const char *jsonObjectKey,
1126 | 			  HTAB *columnMappingHash, Datum *columnValues, bool *columnNulls)
1127 | {
1128 | 	uint32 jsonKeyCount = jsonObject->u.object.len;
1129 | 	const char **jsonKeyArray = jsonObject->u.object.keys;
1130 | 	yajl_val *jsonValueArray = jsonObject->u.object.values;
1131 | 	uint32 jsonKeyIndex = 0;
1132 | 
1133 | 	// loop over key/value pairs of the json object
1134 | 	for (jsonKeyIndex = 0; jsonKeyIndex < jsonKeyCount; jsonKeyIndex++)
1135 | 	{
1136 | 		const char *jsonKey = jsonKeyArray[jsonKeyIndex];
1137 | 		yajl_val jsonValue = jsonValueArray[jsonKeyIndex];
1138 | 
1139 | 		ColumnMapping *columnMapping = NULL;
1140 | 		Oid columnTypeId = InvalidOid;
1141 | 		Oid columnArrayTypeId = InvalidOid;
1142 | 		Oid columnTypeMod = InvalidOid;
1143 | 		bool compatibleTypes = false;
1144 | 		bool handleFound = false;
1145 | 		const char *jsonFullKey = NULL;
1146 | 		void *hashKey = NULL;
1147 | 
1148 | 		if (jsonObjectKey != NULL)
1149 | 		{
1150 | 			/*
1151 | 			 * For fields in nested json objects, we use fully qualified field
1152 | 			 * name to check the column mapping.
1153 | 			 */
1154 | 			StringInfo jsonFullKeyString = makeStringInfo();
1155 | 			appendStringInfo(jsonFullKeyString, "%s.%s", jsonObjectKey, jsonKey);
1156 | 			jsonFullKey = jsonFullKeyString->data;
1157 | 		}
1158 | 		else
1159 | 		{
1160 | 			jsonFullKey = jsonKey;
1161 | 		}
1162 | 
1163 | 		// recurse into nested objects
1164 | 		if (YAJL_IS_OBJECT(jsonValue))
1165 | 		{
1166 | 			FillTupleSlot(jsonValue, jsonFullKey, columnMappingHash,
1167 | 						  columnValues, columnNulls);
1168 | 			continue;
1169 | 		}
1170 | 
1171 | 		// look up the corresponding column for this json key
1172 | 		hashKey = (void *) jsonFullKey;
1173 | 		columnMapping = (ColumnMapping *) hash_search(columnMappingHash, hashKey,
1174 | 													  HASH_FIND, &handleFound);
1175 | 
1176 | 		// if no corresponding column or null json value, continue
1177 | 		if (columnMapping == NULL || YAJL_IS_NULL(jsonValue))
1178 | 		{
1179 | 			continue;
1180 | 		}
1181 | 
1182 | 		// check if columns have compatible types
1183 | 		columnTypeId = columnMapping->columnTypeId;
1184 | 		columnArrayTypeId = columnMapping->columnArrayTypeId;
1185 | 		columnTypeMod = columnMapping->columnTypeMod;
1186 | 
1187 | 		if (OidIsValid(columnArrayTypeId))
1188 | 		{
1189 | 			compatibleTypes = YAJL_IS_ARRAY(jsonValue);
1190 | 		}
1191 | 		else
1192 | 		{
1193 | 			compatibleTypes = ColumnTypesCompatible(jsonValue, columnTypeId);
1194 | 		}
1195 | 
1196 | 		// if types are incompatible, leave this column null
1197 | 		if (!compatibleTypes)
1198 | 		{
1199 | 			continue;
1200 | 		}
1201 | 
1202 | 		// fill in corresponding column value and null flag
1203 | 		if (OidIsValid(columnArrayTypeId))
1204 | 		{
1205 | 			uint32 columnIndex = columnMapping->columnIndex;
1206 | 			columnValues[columnIndex] = ColumnValueArray(jsonValue, columnArrayTypeId,
1207 | 														 columnTypeMod);
1208 | 			columnNulls[columnIndex] = false;
1209 | 		}
1210 | 		else
1211 | 		{
1212 | 			uint32 columnIndex = columnMapping->columnIndex;
1213 | 			columnValues[columnIndex] = ColumnValue(jsonValue, columnTypeId,
1214 | 													columnTypeMod);
1215 | 			columnNulls[columnIndex] = false;
1216 | 		}
1217 | 	}
1218 | }
1219 | 
1220 | 
1221 | /*
1222 |  * ColumnTypesCompatible checks if the given json value can be converted to the
1223 |  * given PostgreSQL type.
1224 |  */
1225 | static bool
1226 | ColumnTypesCompatible(yajl_val jsonValue, Oid columnTypeId)
1227 | {
1228 | 	bool compatibleTypes = false;
1229 | 
1230 | 	// we consider the PostgreSQL column type as authoritative
1231 | 	switch(columnTypeId)
1232 | 	{
1233 | 		case INT2OID: case INT4OID:
1234 | 		case INT8OID: case FLOAT4OID:
1235 | 		case FLOAT8OID: case NUMERICOID:
1236 | 		{
1237 | 			if (YAJL_IS_NUMBER(jsonValue))
1238 | 			{
1239 | 				compatibleTypes = true;
1240 | 			}
1241 | 			break;
1242 | 		}
1243 | 		case BOOLOID:
1244 | 		{
1245 | 			if (YAJL_IS_TRUE(jsonValue) || YAJL_IS_FALSE(jsonValue))
1246 | 			{
1247 | 				compatibleTypes = true;
1248 | 			}
1249 | 			break;
1250 | 		}
1251 | 		case BPCHAROID:
1252 | 		case VARCHAROID:
1253 | 		case TEXTOID:
1254 | 		{
1255 | 			if (YAJL_IS_STRING(jsonValue))
1256 | 			{
1257 | 				compatibleTypes = true;
1258 | 			}
1259 | 			break;
1260 | 		}
1261 | 		case DATEOID:
1262 | 		case TIMESTAMPOID:
1263 | 		case TIMESTAMPTZOID:
1264 | 		{
1265 | 			if (YAJL_IS_STRING(jsonValue))
1266 | 			{
1267 | 				const char *stringValue = (char *) YAJL_GET_STRING(jsonValue);
1268 | 
1269 | 				bool validDateTimeFormat = ValidDateTimeFormat(stringValue);
1270 | 				if (validDateTimeFormat)
1271 | 				{
1272 | 					compatibleTypes = true;
1273 | 				}
1274 | 			}
1275 | 			break;
1276 | 		}
1277 | 		default:
1278 | 		{
1279 | 			/*
1280 | 			 * We currently error out on other data types. Some types such as
1281 | 			 * byte arrays are easy to add, but they need testing. Other types
1282 | 			 * such as money or inet, do not have equivalents in JSON.
1283 | 			 */
1284 | 			ereport(ERROR, (errcode(ERRCODE_FDW_INVALID_DATA_TYPE),
1285 | 							errmsg("cannot convert json type to column type"),
1286 | 							errhint("column type: %u", (uint32) columnTypeId)));
1287 | 			break;
1288 | 		}
1289 | 	}
1290 | 
1291 | 	return compatibleTypes;
1292 | }
1293 | 
1294 | 
1295 | /*
1296 |  * ValidDateTimeFormat checks if the given dateTimeString can be parsed and decoded
1297 |  * as a date/timestamp. The algorithm used here is based on date_in, timestamp_in,
1298 |  * and timestamptz_in functions.
1299 |  */
1300 | static bool
1301 | ValidDateTimeFormat(const char *dateTimeString)
1302 | {
1303 | 	bool validDateTimeFormat = false;
1304 | 	char workBuffer[MAXDATELEN + 1];
1305 | 	char *fieldArray[MAXDATEFIELDS];
1306 | 	int fieldTypeArray[MAXDATEFIELDS];
1307 | 	int fieldCount = 0;
1308 | 
1309 | 	int parseError = ParseDateTime(dateTimeString, workBuffer, sizeof(workBuffer),
1310 | 								   fieldArray, fieldTypeArray, MAXDATEFIELDS, 
1311 | 								   &fieldCount);
1312 | 
1313 | 	if (parseError == 0)
1314 | 	{
1315 | 		int dateType = 0;
1316 | 		struct pg_tm dateTime;
1317 | 		fsec_t fractionalSecond = 0;
1318 | 		int timezone = 0;
1319 | 
1320 | 		int decodeError = DecodeDateTime(fieldArray, fieldTypeArray, fieldCount,
1321 | 										 &dateType, &dateTime, &fractionalSecond,
1322 | 										 &timezone);
1323 | 		if (decodeError == 0)
1324 | 		{
1325 | 			/* 
1326 | 			 * We only accept DTK_DATE, DTK_EPOCH, DTK_LATE, and DTK_EARLY date
1327 | 			 * types. For other date types, input functions raise an error.
1328 | 			 */
1329 | 			if (dateType == DTK_DATE || dateType == DTK_EPOCH ||
1330 | 				dateType == DTK_LATE || dateType == DTK_EARLY)
1331 | 			{
1332 | 				validDateTimeFormat = true;
1333 | 			}
1334 | #ifdef DEBUG
1335 | 			else
1336 | 				ereport(DEBUG1, (errmsg("%s:%s:%u invlalid format", __FILE__, __func__, __LINE__)));
1337 | #endif
1338 | 		}
1339 | #ifdef DEBUG
1340 | 		else
1341 | 			ereport(DEBUG1, (errmsg("%s:%s:%u decode error", __FILE__, __func__, __LINE__)));
1342 | #endif
1343 | 	}
1344 | #ifdef DEBUG
1345 | 	else
1346 | 		ereport(DEBUG1, (errmsg("%s:%s:%u parse error", __FILE__, __func__, __LINE__)));
1347 | #endif
1348 | 
1349 | 	return validDateTimeFormat;
1350 | }
1351 | 
1352 | 
1353 | /*
1354 |  * ColumnValueArray uses array element type id to read the current array pointed
1355 |  * to by the jsonArray, and converts each array element with matching type to 
1356 |  * the corresponding PostgreSQL datum. Then, the function constructs an array
1357 |  * datum from element datums, and returns the array datum. This function ignores
1358 |  * values that aren't type compatible with valueTypeId.
1359 |  */
1360 | static Datum
1361 | ColumnValueArray(yajl_val jsonArray, Oid valueTypeId, Oid valueTypeMod)
1362 | {
1363 | 	Datum columnValueDatum = 0;
1364 | 	ArrayType *columnValueObject = NULL;
1365 | 	bool typeByValue = false;
1366 | 	char typeAlignment = 0;
1367 | 	int16 typeLength = 0;
1368 | 
1369 | 	uint32 jsonValueCount = jsonArray->u.array.len;
1370 | 	yajl_val *jsonValueArray = jsonArray->u.array.values;
1371 | 
1372 | 	// allocate enough room for datum array's maximum possible size
1373 | 	Datum *datumArray = palloc0(jsonValueCount * sizeof(Datum));
1374 | 	uint32 datumArraySize = 0;
1375 | 
1376 | 	uint32 jsonValueIndex = 0;
1377 | 	for (jsonValueIndex = 0; jsonValueIndex < jsonValueCount; jsonValueIndex++)
1378 | 	{
1379 | 		yajl_val jsonValue = jsonValueArray[jsonValueIndex];
1380 | 
1381 | 		bool compatibleTypes = ColumnTypesCompatible(jsonValue, valueTypeId);
1382 | 		if (compatibleTypes)
1383 | 		{
1384 | 			datumArray[datumArraySize] = ColumnValue(jsonValue, valueTypeId,
1385 | 													 valueTypeMod);
1386 | 			datumArraySize++;
1387 | 		}
1388 | 	}
1389 | 
1390 | 	get_typlenbyvalalign(valueTypeId, &typeLength, &typeByValue, &typeAlignment);
1391 | 	columnValueObject = construct_array(datumArray, datumArraySize, valueTypeId,
1392 | 										typeLength, typeByValue, typeAlignment);
1393 | 
1394 | 	columnValueDatum = PointerGetDatum(columnValueObject);
1395 | 	return columnValueDatum;
1396 | }
1397 | 
1398 | 
1399 | /*
1400 |  * ColumnValue uses column type information to read the current value pointed to
1401 |  * by jsonValue, and converts this value to the corresponding PostgreSQL datum.
1402 |  * The function then returns this datum.
1403 |  */
1404 | static Datum
1405 | ColumnValue(yajl_val jsonValue, Oid columnTypeId, int32 columnTypeMod)
1406 | {
1407 | 	Datum columnValue = 0;
1408 | 
1409 | 	switch(columnTypeId)
1410 | 	{
1411 | 		case INT2OID:
1412 | 		{
1413 | 			const char *value = YAJL_GET_NUMBER(jsonValue);
1414 | 			columnValue = DirectFunctionCall1(int2in, CStringGetDatum(value));
1415 | 			break;
1416 | 		}
1417 | 		case INT4OID:
1418 | 		{
1419 | 			const char *value = YAJL_GET_NUMBER(jsonValue);
1420 | 			columnValue = DirectFunctionCall1(int4in, CStringGetDatum(value));
1421 | 			break;
1422 | 		}
1423 | 		case INT8OID:
1424 | 		{
1425 | 			const char *value = YAJL_GET_NUMBER(jsonValue);
1426 | 			columnValue = DirectFunctionCall1(int8in, CStringGetDatum(value));
1427 | 			break;
1428 | 		}
1429 | 		case FLOAT4OID:
1430 | 		{
1431 | 			const char *value = YAJL_GET_NUMBER(jsonValue);
1432 | 			columnValue = DirectFunctionCall1(float4in, CStringGetDatum(value));
1433 | 			break;
1434 | 		}
1435 | 		case FLOAT8OID:
1436 | 		{
1437 | 			const char *value = YAJL_GET_NUMBER(jsonValue);
1438 | 			columnValue = DirectFunctionCall1(float8in, CStringGetDatum(value));
1439 | 			break;
1440 | 		}
1441 | 		case NUMERICOID:
1442 | 		{
1443 | 			const char *value = YAJL_GET_NUMBER(jsonValue);
1444 | 			columnValue = DirectFunctionCall3(numeric_in, CStringGetDatum(value),
1445 | 											  ObjectIdGetDatum(InvalidOid),
1446 | 											  Int32GetDatum(columnTypeMod));
1447 | 			break;
1448 | 		}
1449 | 		case BOOLOID:
1450 | 		{
1451 | 			bool value = YAJL_IS_TRUE(jsonValue);
1452 | 			columnValue = BoolGetDatum(value);
1453 | 			break;
1454 | 		}
1455 | 		case BPCHAROID:
1456 | 		{
1457 | 			const char *value = YAJL_GET_STRING(jsonValue);
1458 | 			columnValue = DirectFunctionCall3(bpcharin, CStringGetDatum(value),
1459 | 											  ObjectIdGetDatum(InvalidOid),
1460 | 											  Int32GetDatum(columnTypeMod));
1461 | 			break;
1462 | 		}
1463 | 		case VARCHAROID:
1464 | 		{
1465 | 			const char *value = YAJL_GET_STRING(jsonValue);
1466 | 			columnValue = DirectFunctionCall3(varcharin, CStringGetDatum(value),
1467 | 											  ObjectIdGetDatum(InvalidOid),
1468 | 											  Int32GetDatum(columnTypeMod));
1469 | 			break;
1470 | 		}
1471 | 		case TEXTOID:
1472 | 		{
1473 | 			const char *value = YAJL_GET_STRING(jsonValue);
1474 | 			columnValue = CStringGetTextDatum(value);
1475 | 			break;
1476 | 		}
1477 | 		case DATEOID:
1478 | 		{
1479 | 			const char *value = YAJL_GET_STRING(jsonValue);
1480 | 			columnValue = DirectFunctionCall1(date_in, CStringGetDatum(value));
1481 | 			break;
1482 | 		}
1483 | 		case TIMESTAMPOID:
1484 | 		{
1485 | 			const char *value = YAJL_GET_STRING(jsonValue);
1486 | 			columnValue = DirectFunctionCall3(timestamp_in, CStringGetDatum(value),
1487 | 											  ObjectIdGetDatum(InvalidOid),
1488 | 											  Int32GetDatum(columnTypeMod));
1489 | 			break;
1490 | 		}
1491 | 		case TIMESTAMPTZOID:
1492 | 		{
1493 | 			const char *value = YAJL_GET_STRING(jsonValue);
1494 | 			columnValue = DirectFunctionCall3(timestamptz_in, CStringGetDatum(value),
1495 | 											  ObjectIdGetDatum(InvalidOid),
1496 | 											  Int32GetDatum(columnTypeMod));
1497 | 			break;
1498 | 		}
1499 | 		default:
1500 | 		{
1501 | 			ereport(ERROR, (errcode(ERRCODE_FDW_INVALID_DATA_TYPE),
1502 | 							errmsg("cannot convert json type to column type"),
1503 | 							errhint("column type: %u", (uint32) columnTypeId)));
1504 | 			break;
1505 | 		}
1506 | 	}
1507 | 
1508 | 	return columnValue;
1509 | }
1510 | 
1511 | 
1512 | /*
1513 |  * JsonAnalyzeForeignTable sets the total page count and the function pointer
1514 |  * used to acquire a random sample of rows from the foreign file.
1515 |  */
1516 | static bool
1517 | JsonAnalyzeForeignTable(Relation relation,
1518 | 						AcquireSampleRowsFunc *acquireSampleRowsFunc,
1519 | 						BlockNumber *totalPageCount)
1520 | {
1521 | 	Oid foreignTableId = RelationGetRelid(relation);
1522 | 	JsonFdwOptions *options = JsonGetOptions(foreignTableId);
1523 | 	BlockNumber pageCount = 0;
1524 | 	struct stat statBuffer;
1525 | 
1526 | 	int statResult = stat(options->filename, &statBuffer);
1527 | 	//ELog(DEBUG1, "%s:%d", __func__, __LINE__);
1528 | 	if (statResult < 0)
1529 | 	{
1530 | 		ereport(ERROR, (errcode_for_file_access(),
1531 | 				 		errmsg("could not stat file \"%s\": %m",
1532 | 							   options->filename)));
1533 | 	}
1534 | 
1535 | 	/*
1536 | 	 * Our estimate should return at least 1 so that we can tell later on that
1537 | 	 * pg_class.relpages is not default.
1538 | 	 */
1539 | 	pageCount = (statBuffer.st_size + (BLCKSZ - 1)) / BLCKSZ;
1540 | 	if (pageCount < 1)
1541 | 	{
1542 | 		pageCount = 1;
1543 | 	}
1544 | 
1545 | 	(*totalPageCount) = pageCount;
1546 | 	(*acquireSampleRowsFunc) = JsonAcquireSampleRows;
1547 | 
1548 | 	return true;
1549 | }
1550 | 
1551 | 
/*
 * JsonAcquireSampleRows acquires a random sample of rows from the foreign
 * table. Selected rows are returned in the caller allocated sampleRows array,
 * which must have at least target row count entries. The actual number of rows
 * selected is returned as the function result. We also count the number of rows
 * in the collection and return it in total row count. We also always set dead
 * row count to zero.
 *
 * Note that the returned list of rows does not always follow their actual order
 * in the JSON file. Therefore, correlation estimates derived later could be
 * inaccurate, but that's OK. We currently don't use correlation estimates (the
 * planner only pays attention to correlation for index scans).
 *
 * The function reads rows by building a ForeignScanState by hand and driving it
 * through JsonBeginForeignScan, JsonIterateForeignScan, and JsonEndForeignScan.
 * The message about row counts is emitted at the given logLevel.
 */
static int
JsonAcquireSampleRows(Relation relation, int logLevel,
					  HeapTuple *sampleRows, int targetRowCount,
					  double *totalRowCount, double *totalDeadRowCount)
{
	int sampleRowCount = 0;
	double rowCount = 0.0;
	double rowCountToSkip = -1;	// -1 means not set yet
	double selectionState = 0;
	// remember the caller's context; sampled tuples are formed in it
	MemoryContext oldContext = CurrentMemoryContext;
	MemoryContext tupleContext = NULL;
	Datum *columnValues = NULL;
	bool *columnNulls = NULL;
	TupleTableSlot *scanTupleSlot = NULL;
	List *columnList = NIL;
	List *foreignPrivateList = NULL;
	ForeignScanState *scanState = NULL;
	ForeignScan *foreignScan = NULL;
	char *relationName = NULL;
	int executorFlags = 0;

	TupleDesc tupleDescriptor = RelationGetDescr(relation);
	int columnCount = tupleDescriptor->natts;
	Form_pg_attribute *attributes = tupleDescriptor->attrs;

	// create list of columns of the relation
	int columnIndex = 0;
	//ELog(DEBUG1, "%s:%d", __func__, __LINE__);
	for (columnIndex = 0; columnIndex < columnCount; columnIndex++)
	{
		Var *column = (Var *) palloc0(sizeof(Var));

		// only assign required fields for column mapping hash
		column->varattno = columnIndex + 1;
		column->vartype = attributes[columnIndex]->atttypid;
		column->vartypmod = attributes[columnIndex]->atttypmod;

		columnList = lappend(columnList, column);
	}

	// setup foreign scan plan node; the column list is its only private data
	foreignPrivateList = list_make1(columnList);
	foreignScan = makeNode(ForeignScan);
	foreignScan->fdw_private = foreignPrivateList;

	// setup tuple slot, backed by the value and null arrays allocated here
	columnValues = (Datum *) palloc0(columnCount * sizeof(Datum));
	columnNulls = (bool *) palloc0(columnCount * sizeof(bool));	
	scanTupleSlot = MakeTupleTableSlot();
	scanTupleSlot->tts_tupleDescriptor = tupleDescriptor;
	scanTupleSlot->tts_values = columnValues;
	scanTupleSlot->tts_isnull = columnNulls;

	// setup scan state
	scanState = makeNode(ForeignScanState);
	scanState->ss.ss_currentRelation = relation;
	scanState->ss.ps.plan = (Plan *) foreignScan;
	scanState->ss.ss_ScanTupleSlot = scanTupleSlot;

	JsonBeginForeignScan(scanState, executorFlags);

	/*
	 * Use per-tuple memory context to prevent leak of memory used to read and
	 * parse rows from the file using ReadLineFromFile and FillTupleSlot.
	 */
	tupleContext = AllocSetContextCreate(CurrentMemoryContext,
					 "json_fdw temporary context",
					 ALLOCSET_DEFAULT_MINSIZE,
					 ALLOCSET_DEFAULT_INITSIZE,
					 ALLOCSET_DEFAULT_MAXSIZE);

	// prepare for sampling rows
	selectionState = anl_init_selection_state(targetRowCount);

	for (;;)
	{
		// check for user-requested abort or sleep
		vacuum_delay_point();

		// start every row with all columns null
		memset(columnValues, 0, columnCount * sizeof(Datum));
		memset(columnNulls, true, columnCount * sizeof(bool));

		// discard the previous row's allocations, then read in the per-tuple context
		MemoryContextReset(tupleContext);
		MemoryContextSwitchTo(tupleContext);

		// read the next record
		JsonIterateForeignScan(scanState);

		// switch back so that tuples formed below are not freed by the next reset
		MemoryContextSwitchTo(oldContext);

		// if there are no more records to read, break
		if (scanTupleSlot->tts_isempty)
		{
			break;
		}

		/*
		 * The first targetRowCount sample rows are simply copied into the
		 * reservoir. Then we start replacing tuples in the sample until we
		 * reach the end of the relation. This algorithm is from Jeff Vitter's
		 * paper (see more info in commands/analyze.c).
		 */
		if (sampleRowCount < targetRowCount)
		{
			sampleRows[sampleRowCount++] = heap_form_tuple(tupleDescriptor, 
								   columnValues,
								   columnNulls);
		}
		else
		{
			/*
			 * t in Vitter's paper is the number of records already processed.
			 * If we need to compute a new S value, we must use the "not yet
			 * incremented" value of rowCount as t.
			 */
			if (rowCountToSkip < 0)
			{
				rowCountToSkip = anl_get_next_S(rowCount, targetRowCount, &selectionState);
			}

			if (rowCountToSkip <= 0)
			{
				/*
				 * Found a suitable tuple, so save it, replacing one old tuple
				 * at random.
				 */
				int rowIndex = (int) (targetRowCount * anl_random_fract());
				Assert(rowIndex >= 0);
				Assert(rowIndex < targetRowCount);

				// free the replaced tuple before overwriting its slot
				heap_freetuple(sampleRows[rowIndex]);
				sampleRows[rowIndex] = heap_form_tuple(tupleDescriptor, columnValues, columnNulls);
			}

			// one fewer row to skip; a negative value requests a new S next time
			rowCountToSkip -= 1;
		}

		rowCount += 1;
	}

	// clean up
	MemoryContextDelete(tupleContext);
	pfree(columnValues);
	pfree(columnNulls);

	JsonEndForeignScan(scanState);

	// emit some interesting relation info
	relationName = RelationGetRelationName(relation);
	ereport(logLevel, (errmsg("\"%s\": file contains %.0f rows; %d rows in sample",
				  relationName, rowCount, sampleRowCount)));

	(*totalRowCount) = rowCount;
	(*totalDeadRowCount) = 0;

	return sampleRowCount;
}
1722 | 
1723 | // *** All the stuff below here, was broken by Neal Horman ;)
1724 | static char *JsonAttributeNameGet(int varno, int varattno, PlannerInfo *root)
1725 | {
1726 | 	RangeTblEntry *rte = planner_rt_fetch(varno, root);
1727 | 	List *options = GetForeignColumnOptions(rte->relid, varattno);
1728 | 	char *colname = NULL;
1729 | 	ListCell *lc;
1730 | 
1731 | 	foreach(lc, options)
1732 | 	{
1733 | 		DefElem *def = (DefElem *) lfirst(lc);
1734 | 
1735 | 		if (strcmp(def->defname, "column_name") == 0)
1736 | 		{
1737 | 			colname = defGetString(def);
1738 | 			break;
1739 | 		}
1740 | 	}
1741 | 
1742 | 	if(colname == NULL)
1743 | 		colname = get_relid_attribute_name(rte->relid, varattno);
1744 | 
1745 | 	return colname;
1746 | }
1747 | 
1748 | /*
1749 |  * An insert operation consists of
1750 |  *	PlanForeignModify
1751 |  *	BeginForeignModify
1752 |  *	ExecForeignInsert
1753 |  *	EndForeignModify
1754 |  */
1755 | 
1756 | static List *JsonPlanForeignModify(PlannerInfo *root, ModifyTable *plan, Index resultRelation, int subplan_index)
1757 | {
1758 | 	CmdType		operation = plan->operation;
1759 | 	RangeTblEntry	*rte = planner_rt_fetch(resultRelation, root);
1760 | 	Relation	rel = heap_open(rte->relid, NoLock);
1761 | 	ForeignTable	*table = GetForeignTable(RelationGetRelid(rel));
1762 | 	char		*tableName = RelationGetRelationName(rel);
1763 | 	List		*targetAttrs = NULL;
1764 | 	List		*targetNames = NULL;
1765 | 	ListCell	*lc;
1766 | 	char const	*pRomUrl = NULL;
1767 | 	char const	*pRomPath = NULL;
1768 | 	rci_t		*pRci = NULL;
1769 | 	StringInfoData	strUrl;
1770 | 
1771 | 	initStringInfo(&strUrl);
1772 | 
1773 | 	// find the ROM url and path options
1774 | 	foreach(lc, table->options)
1775 | 	{
1776 | 		DefElem *def = (DefElem *) lfirst(lc);
1777 | 		const char *str = defGetString(def);
1778 | 
1779 | 		//ELog(DEBUG1, "%s:%d '%s' --> '%s'", __func__, __LINE__, def->defname, str);
1780 | 		if(strcasecmp(def->defname, OPTION_NAME_ROM_URL) == 0)
1781 | 			pRomUrl = str;
1782 | 		else if(strcasecmp(def->defname, OPTION_NAME_ROM_PATH) == 0)
1783 | 			pRomPath = str;
1784 | 	}
1785 | 
1786 | 	//ELog(DEBUG1, "%s:%d table name '%s'", __func__, __LINE__, tableName);
1787 | 
1788 | 	// fetch the ROM
1789 | 	pRci = rciFetch(pRomUrl, pRomPath, 
1790 | 			(
1791 | 			operation == CMD_INSERT ? RCI_ACTION_INSERT :
1792 | 			operation == CMD_UPDATE ? RCI_ACTION_UPDATE :
1793 | 			//operation == CMD_DELETE ? RCI_ACTION_DELETE :
1794 | 			RCI_ACTION_NONE
1795 | 			)
1796 | 		);
1797 | 
1798 | 	if(!rciError(pRci, pRomUrl, pRomPath)
1799 | 		&& rciMethod(pRci, "put", pRomUrl, pRomPath)
1800 | 		)
1801 | 	{
1802 | 		appendStringInfoString(&strUrl, pRci->pUrl);
1803 | 		//ELog(DEBUG1, "%s:%d url '%s'", __func__, __LINE__, strUrl.data);
1804 | 	}
1805 | 	rciFree(pRci);
1806 | 
1807 | 	switch (operation)
1808 | 	{
1809 | 		case CMD_INSERT:
1810 | 		case CMD_UPDATE:
1811 | 			{
1812 | 				TupleDesc tupdesc = RelationGetDescr(rel);
1813 | 				int attnum;
1814 | 
1815 | 				// collect relation information
1816 | 				for (attnum = 1; attnum <= tupdesc->natts; attnum++)
1817 | 				{
1818 | 					Form_pg_attribute attr = tupdesc->attrs[attnum - 1];
1819 | 
1820 | 					if (!attr->attisdropped)
1821 | 					{
1822 | 						// collect the name of the attribute
1823 | 						char *colname = JsonAttributeNameGet(resultRelation, attnum, root);
1824 | 						targetNames = lappend(targetNames, colname);
1825 | 
1826 | 						// collect the index of the attribute
1827 | 						targetAttrs = lappend_int(targetAttrs, attnum);
1828 | 
1829 | 						//ELog(DEBUG1, "%s:%d %s", __func__, __LINE__, colname);
1830 | 					}
1831 | 				}
1832 | 			}
1833 | 			break;
1834 | 		default:
1835 | 			break;
1836 | 	}
1837 | 
1838 | 	heap_close(rel, NoLock);
1839 | 	return list_make3(targetNames, targetAttrs, strUrl.data);
1840 | }
1841 | 
1842 | static void JsonBeginForeignModify(
1843 | 	ModifyTableState *mtstate,
1844 | 	ResultRelInfo *resultRelInfo,
1845 | 	List *fdw_private,
1846 | 	int subplan_index,
1847 | 	int eflags
1848 | 	)
1849 | {
1850 | 	//ELog(DEBUG1, "%s:%d", __func__, __LINE__);
1851 | 
1852 | 	if(!(eflags & EXEC_FLAG_EXPLAIN_ONLY))
1853 | 	{
1854 | 		AttrNumber n_params = 0;
1855 | 		Oid typefnoid = InvalidOid;
1856 | 		bool isvarlena = false;
1857 | 		ListCell *lc = NULL;
1858 | 		EState *estate = mtstate->ps.state;
1859 | 		Relation rel = resultRelInfo->ri_RelationDesc;
1860 | 		Oid foreignTableId = RelationGetRelid(rel);
1861 | 		ForeignTable *table = GetForeignTable(foreignTableId);
1862 | 		jfmes_t *pJfmes = (jfmes_t *) palloc0(sizeof(jfmes_t));
1863 | 
1864 | 		//ELog(DEBUG1, "%s:%d", __func__, __LINE__);
1865 | 		if(pJfmes != NULL)
1866 | 		{
1867 | 			pJfmes->rel = rel;
1868 | 
1869 | 			pJfmes->retrieved_names = (List *) list_nth(fdw_private, 0);
1870 | 			pJfmes->retrieved_attrs = (List *) list_nth(fdw_private, 1);
1871 | 			pJfmes->pUrl = (char const *) list_nth(fdw_private, 2);
1872 | 			pJfmes->table_options = table->options;
1873 | 
1874 | 			n_params = list_length(pJfmes->retrieved_attrs) + 1;
1875 | 			pJfmes->p_flinfo = (FmgrInfo *) palloc0(sizeof(FmgrInfo) * n_params);
1876 | 			pJfmes->p_nums = 0;
1877 | 
1878 | 			pJfmes->temp_cxt = AllocSetContextCreate(
1879 | 				estate->es_query_cxt,
1880 | 				"json_fdw temporary data",
1881 | 				ALLOCSET_SMALL_MINSIZE,
1882 | 				ALLOCSET_SMALL_INITSIZE,
1883 | 				ALLOCSET_SMALL_MAXSIZE
1884 | 				);
1885 | 
1886 | 			//ELog(DEBUG1, "%s:%d put url '%s'", __func__, __LINE__, pJfmes->pUrl);
1887 | 			// collect accessor functions for each attribute
1888 | 			foreach(lc, pJfmes->retrieved_attrs)
1889 | 			{
1890 | 				int attnum = lfirst_int(lc);
1891 | 				Form_pg_attribute attr = RelationGetDescr(rel)->attrs[attnum - 1];
1892 | 
1893 | 				Assert(!attr->attisdropped);
1894 | 
1895 | 				getTypeOutputInfo(attr->atttypid, &typefnoid, &isvarlena);
1896 | 				fmgr_info(typefnoid, &pJfmes->p_flinfo[pJfmes->p_nums]);
1897 | 				pJfmes->p_nums++;
1898 | 				//ELog(DEBUG1, "%s:%d", __func__, __LINE__);
1899 | 			}
1900 | 			Assert(pJfmes->p_nums <= n_params);
1901 | 		}
1902 | 
1903 | 		resultRelInfo->ri_FdwState = pJfmes;
1904 | 	}
1905 | }
1906 | 
1907 | static int JsonPg2Json(StringInfo Str, Oid type, Datum value, const char *name, bool *isnull);
1908 | 
1909 | static TupleTableSlot *JsonExecForeignInsert(
1910 | 	EState *estate,
1911 | 	ResultRelInfo *resultRelInfo,
1912 | 	TupleTableSlot *slot,
1913 | 	TupleTableSlot *planSlot
1914 | 	)
1915 | {
1916 | 	jfmes_t *pJfmes = (jfmes_t *) resultRelInfo->ri_FdwState;
1917 | 	MemoryContext oldContext = MemoryContextSwitchTo(pJfmes->temp_cxt);
1918 | 	int nParams = list_length(pJfmes->retrieved_attrs);
1919 | 
1920 | 	if(nParams == list_length(pJfmes->retrieved_names))
1921 | 	{
1922 | 		bool *isnull = (bool*) palloc0(sizeof(bool) * nParams);
1923 | 		ListCell *lcAttrs = NULL;
1924 | 		ListCell *lcNames = list_head(pJfmes->retrieved_names);
1925 | 		StringInfoData str;
1926 | 		int paramNum = 0;
1927 | 		int paramCount = 0;
1928 | 		int ok = 0;
1929 | 
1930 | 		// count the number of non-null attributes
1931 | 		foreach(lcAttrs, pJfmes->retrieved_attrs)
1932 | 		{
1933 | 			bool bIsNull = true;
1934 | 			slot_getattr(slot, lfirst_int(lcAttrs), &bIsNull);
1935 | 			paramCount += (!bIsNull);
1936 | 		}
1937 | 
1938 | 		// build json object document string
1939 | 		initStringInfo(&str);
1940 | 		appendStringInfoString(&str, "{ ");
1941 | 		foreach(lcAttrs, pJfmes->retrieved_attrs)
1942 | 		{
1943 | 			int attnum = lfirst_int(lcAttrs) - 1;
1944 | 			Datum value = slot_getattr(slot, attnum + 1, &isnull[attnum]);
1945 | 			Oid type = slot->tts_tupleDescriptor->attrs[attnum]->atttypid;
1946 | 
1947 | 			//ELog(DEBUG1, "%s:%d %u/%u %s %u", __func__, __LINE__, attnum, nParams, lfirst(lcNames), isnull[attnum]);
1948 | 			if(JsonPg2Json(&str, type, value, lfirst(lcNames), &isnull[attnum])
1949 | 				// if not last attribute
1950 | 				&& paramNum < paramCount - 1
1951 | 				&& !isnull[attnum]
1952 | 			)
1953 | 			{
1954 | 				appendStringInfoString(&str, ", ");
1955 | 				paramNum++;
1956 | 			}
1957 | 
1958 | 			lcNames = lnext(lcNames);
1959 | 		}
1960 | 		appendStringInfoString(&str, " }");
1961 | 
1962 | 		// send the json object to the remote server
1963 | 		ok = curlPut(pJfmes->pUrl, str.data, strlen(str.data), "application/json");
1964 | 
1965 | 		//ELog(DEBUG1, "%s:%d '%s' --> %s %s", __func__, __LINE__, str.data, pJfmes->pUrl, ok ? "OK" : "FAIL");
1966 | 	}
1967 | 
1968 | 	MemoryContextSwitchTo(oldContext);
1969 | 	MemoryContextReset(pJfmes->temp_cxt);
1970 | 
1971 | 	return slot;
1972 | }
1973 | 
1974 | 
1975 | /*
1976 |  *
1977 |  * An update operation consists of
1978 |  *	AddForeignUpdateTargets
1979 |  *
1980 |  *	GetForeignRelSize
1981 |  *	GetForeignPaths
1982 |  *	GetForeignPlan
1983 |  *
1984 |  *	PlanForeignModify
1985 |  *	BeginForeignScan
1986 |  *	BeginForeignModify
1987 |  *	EndForeignModify
1988 |  *
1989 |  *	EndForeignScan
1990 |  */
1991 | 
1992 | static void JsonAddForeignUpdateTargets(Query *parsetree, RangeTblEntry *target_rte, Relation target_relation)
1993 | {
1994 | 	// What we need is the rowid which is the first column
1995 | 	Form_pg_attribute attr = RelationGetDescr(target_relation)->attrs[0];
1996 | 	// Make a Var representing the desired value
1997 | 	Var *var = makeVar(parsetree->resultRelation, 1, attr->atttypid, attr->atttypmod, InvalidOid, 0);
1998 | 	// Wrap it in a TLE with the right name ...
1999 | 	const char *attrname = NameStr(attr->attname);
2000 | 
2001 | 	TargetEntry *tle = makeTargetEntry((Expr *) var,
2002 | 		list_length(parsetree->targetList) + 1,
2003 | 		pstrdup(attrname),
2004 | 		true
2005 | 		);
2006 | 
2007 | 	//ELog(DEBUG1, "%s:%d", __func__, __LINE__);
2008 | 	// ... and add it to the query's targetlist
2009 | 	parsetree->targetList = lappend(parsetree->targetList, tle);
2010 | }
2011 | 
2012 | static TupleTableSlot * JsonExecForeignUpdate(
2013 | 	EState *estate,
2014 | 	ResultRelInfo *resultRelInfo,
2015 | 	TupleTableSlot *slot,
2016 | 	TupleTableSlot *planSlot
2017 | 	)
2018 | {
2019 | 	jfmes_t *pJfmes = (jfmes_t *) resultRelInfo->ri_FdwState;
2020 | 	int nParams = list_length(pJfmes->retrieved_attrs);
2021 | 
2022 | 	if(nParams == list_length(pJfmes->retrieved_names))
2023 | 	{
2024 | 		bool *isnull = palloc0(sizeof(bool) * nParams);
2025 | 		ListCell *lcAttrs = NULL;
2026 | 		ListCell *lcNames = list_head(pJfmes->retrieved_names);
2027 | 		StringInfoData str;
2028 | 		int paramNum = 0;
2029 | 		int ok = 0;
2030 | 
2031 | 		//ELog(DEBUG1, "%s:%d nparams %u", __func__, __LINE__, nParams);
2032 | 
2033 | 		// build json object document string
2034 | 		initStringInfo(&str);
2035 | 		appendStringInfoString(&str, "{ ");
2036 | 		foreach(lcAttrs, pJfmes->retrieved_attrs)
2037 | 		{
2038 | 			Datum value = 0;
2039 | 			int attnum = lfirst_int(lcAttrs) - 1;
2040 | 			Oid type;
2041 | 
2042 | 			type = slot->tts_tupleDescriptor->attrs[attnum]->atttypid;
2043 | 			value = slot_getattr(slot, attnum + 1, (bool*)(&isnull[attnum]));
2044 | 
2045 | 			if(JsonPg2Json(&str, type, value, lfirst(lcNames), &isnull[attnum])
2046 | 				// if not last attribute
2047 | 				&& paramNum < nParams -1
2048 | 				&& !isnull[attnum]
2049 | 			)
2050 | 			{
2051 | 				appendStringInfoString(&str, ", ");
2052 | 			}
2053 | 			lcNames = lnext(lcNames);
2054 | 			paramNum ++;
2055 | 		}
2056 | 		appendStringInfoString(&str, " }");
2057 | 
2058 | 		// send the json object to the remote server
2059 | 		ok = curlPut(pJfmes->pUrl, str.data, strlen(str.data), "application/json");
2060 | 		//ELog(DEBUG1, "%s:%d '%s' --> %s %s", __func__, __LINE__, str.data, pJfmes->pUrl, ok ? "OK" : "FAIL");
2061 | 	}
2062 | 
2063 | 	return slot;
2064 | }
2065 | 
2066 | static void JsonEndForeignModify(EState *estate, ResultRelInfo *resultRelInfo)
2067 | {
2068 | 	jfmes_t *pJfmes = (jfmes_t *) resultRelInfo->ri_FdwState;
2069 | 
2070 | 	//ELog(DEBUG1, "%s:%d", __func__, __LINE__);
2071 | }
2072 | 
// Scanner states used while a postgres text array is transmuted into a
// json text array
enum
{
	SM_UNQUOTED = 0,	// between elements, or right after a quoted element ended
	SM_QUOTED = 1,		// inside an element that arrived in double quotes
	SM_NEEDQUOTE = 2	// inside a bare element; its closing json quote is still owed
};
2080 | 
2081 | // Append to the "outStr" string, an array of data that is Text,
2082 | // while dealing with quoting conversion from sql to json
2083 | static void JsonPgTextArray2Json(StringInfo outStr, const char *inStr, int len)
2084 | {	int state = SM_UNQUOTED;
2085 | 	int i;
2086 | 
2087 | 	if(len)
2088 | 		appendStringInfoCharMacro(outStr, '[');
2089 | 
2090 | 	for(i=0; i<len; i++)
2091 | 	{
2092 | 		switch(state)
2093 | 		{
2094 | 			case SM_UNQUOTED:
2095 | 				if(inStr[i] == '\\' && i+1 < len)
2096 | 					appendStringInfoCharMacro(outStr, inStr[i++]);
2097 | 				else if(inStr[i] == '"')
2098 | 					state = SM_QUOTED;
2099 | 				else if(inStr[i] != ',')
2100 | 				{
2101 | 					appendStringInfoCharMacro(outStr, '"');
2102 | 					state = SM_NEEDQUOTE;
2103 | 				}
2104 | 				appendStringInfoCharMacro(outStr, inStr[i]);
2105 | 				break;
2106 | 
2107 | 			case SM_NEEDQUOTE:
2108 | 				if(inStr[i] == '\\' && i+1 < len)
2109 | 					appendStringInfoCharMacro(outStr, inStr[i++]);
2110 | 				else if(inStr[i] == ',')
2111 | 				{
2112 | 					appendStringInfoCharMacro(outStr, '"');
2113 | 					state = SM_UNQUOTED;
2114 | 				}
2115 | 				appendStringInfoCharMacro(outStr, inStr[i]);
2116 | 				break;
2117 | 
2118 | 			case SM_QUOTED:
2119 | 				if(inStr[i] == '\\' && i+1 < len)
2120 | 					appendStringInfoCharMacro(outStr, inStr[i++]);
2121 | 				else if(inStr[i] == '"')
2122 | 					state = SM_UNQUOTED;
2123 | 				appendStringInfoCharMacro(outStr, inStr[i]);
2124 | 				break;
2125 | 		}
2126 | 	}
2127 | 
2128 | 	if(state == SM_NEEDQUOTE)
2129 | 		appendStringInfoCharMacro(outStr, '"');
2130 | 	if(i)
2131 | 		appendStringInfoCharMacro(outStr, ']');
2132 | }
2133 | 
2134 | // Convert a postgres attribute value into a json name and value pair
2135 | // and append it to the json object string "str"
2136 | // On return, tell the consumer if we did that for this attribute.
2137 | static int JsonPg2Json(StringInfo str, Oid type, Datum value, const char *name, bool *isnull)
2138 | {	int oldLen = str->len;
2139 | 
2140 | 	if(!*isnull)
2141 | 	{
2142 | 		switch(type)
2143 | 		{
2144 | 			case INT2OID: appendStringInfo(str, "\"%s\": %u", name, (int16)DatumGetInt16(value)); break;
2145 | 			case INT4OID: appendStringInfo(str, "\"%s\": %u", name, (int32)DatumGetInt32(value)); break;
2146 | 			case INT8OID: appendStringInfo(str, "\"%s\": %lu", name, (int64)DatumGetInt64(value)); break;
2147 | 			case FLOAT4OID: appendStringInfo(str, "\"%s\": %f", name, (float4)DatumGetFloat4(value)); break;
2148 | 			case FLOAT8OID: appendStringInfo(str, "\"%s\": %f", name, (float8)DatumGetFloat8(value)); break;
2149 | 
2150 | 			case NUMERICOID:
2151 | 				{	Datum valueDatum = DirectFunctionCall1(numeric_float8, value);
2152 | 
2153 | 					appendStringInfo(str, "\"%s\": %f", name, (float8)DatumGetFloat8(valueDatum));
2154 | 				}
2155 | 				break;
2156 | 
2157 | 			//case BOOLOID: appendStringInfo(str, "\"%s\": %s", name, (((int32)DatumGetInt32(value))) ? "true", "false"); break;
2158 | 			case BOOLOID: appendStringInfo(str, "\"%s\": %u", name, (int32)DatumGetInt32(value)); break;
2159 | 
2160 | 			case BPCHAROID:
2161 | 			case VARCHAROID:
2162 | 			case TEXTOID:
2163 | 			case NAMEOID:
2164 | 				{	char *outputString = NULL;
2165 | 					Oid outputFunctionId = InvalidOid;
2166 | 					bool typeVarLength = false;
2167 | 
2168 | 					getTypeOutputInfo(type, &outputFunctionId, &typeVarLength);
2169 | 					outputString = OidOutputFunctionCall(outputFunctionId, value);
2170 | 
2171 | 					appendStringInfo(str, "\"%s\": \"%s\"", name, outputString);
2172 | 				}
2173 | 				break;
2174 | 
2175 | 			case DATEOID:
2176 | 			case TIMEOID:
2177 | 			case TIMESTAMPOID:
2178 | 			case TIMESTAMPTZOID:
2179 | 				{
2180 | 					int pgtz;
2181 | 					struct pg_tm pgtm;
2182 | 					fsec_t fsec;
2183 | 					const char *pgtzn;
2184 | 					struct tm tm;
2185 | 					char buffer [128];
2186 | 					Timestamp valueTimestamp;
2187 | 
2188 | 					// get pg time
2189 | 					if(type == DATEOID)
2190 | 					{	Datum valueDatum = DirectFunctionCall1(date_timestamp, value);
2191 | 
2192 | 						valueTimestamp = DatumGetTimestamp(valueDatum);
2193 | 					}
2194 | 					else
2195 | 						valueTimestamp = DatumGetTimestamp(value);
2196 | 
2197 | 					// extract pg time
2198 | 					timestamp2tm(valueTimestamp, &pgtz, &pgtm, &fsec, &pgtzn, pg_tzset("UTC"));
2199 | 
2200 | 					// map to unix time
2201 | 					tm.tm_sec = pgtm.tm_sec;
2202 | 					tm.tm_min = pgtm.tm_min;
2203 | 					tm.tm_hour = pgtm.tm_hour;
2204 | 					tm.tm_mday = pgtm.tm_mday;
2205 | 					tm.tm_mon = pgtm.tm_mon - 1;
2206 | 					tm.tm_year = pgtm.tm_year - 1900;
2207 | 					tm.tm_wday = pgtm.tm_wday;
2208 | 					tm.tm_yday = pgtm.tm_yday;
2209 | 					tm.tm_isdst = pgtm.tm_isdst;
2210 | 					tm.tm_gmtoff = pgtm.tm_gmtoff;
2211 | 					tm.tm_zone = (char *)pgtm.tm_zone;
2212 | 
2213 | 					memset(buffer, 0, sizeof(buffer));
2214 | 					// convert to string in ISO format
2215 | 					strftime(buffer, sizeof(buffer)-1, "%Y-%m-%d %H:%M:%S %Z", &tm);
2216 | 
2217 | 					appendStringInfo(str, "\"%s\": \"%s\"", name, buffer);
2218 | 				}
2219 | 				break;
2220 | 		/*
2221 | 			case BITOID:
2222 | 			{
2223 | 				int32 dat;
2224 | 				int32 *bufptr = palloc0(sizeof(int32));
2225 | 				char *outputString = NULL;
2226 | 				Oid outputFunctionId = InvalidOid;
2227 | 				bool typeVarLength = false;
2228 | 				getTypeOutputInfo(type, &outputFunctionId, &typeVarLength);
2229 | 				outputString = OidOutputFunctionCall(outputFunctionId, value);
2230 | 
2231 | 				dat = bin_dec(atoi(outputString));
2232 | 				memcpy(bufptr, (char*)&dat, sizeof(int32));
2233 | 				binds[attnum].buffer = bufptr;
2234 | 				break;
2235 | 			}
2236 | 		*/
2237 | 			case INT4ARRAYOID:
2238 | 			case INT2ARRAYOID:
2239 | 			case FLOAT4ARRAYOID:
2240 | 			case TEXTARRAYOID:
2241 | 				{	Oid outputFunctionId;
2242 | 					bool typeVarLength = false;
2243 | 					char *outputString = NULL;
2244 | 					int l;
2245 | 
2246 | 					getTypeOutputInfo(type, &outputFunctionId, &typeVarLength);
2247 | 					outputString = OidOutputFunctionCall(outputFunctionId, value);
2248 | 
2249 | 					// trim left and right curly braces
2250 | 					outputString++;
2251 | 					l = strlen(outputString) - 1;
2252 | 
2253 | 					if(type != TEXTARRAYOID)
2254 | 						appendStringInfo(str, "\"%s\": [%*.*s]", name, l, l, outputString);
2255 | 					else
2256 | 					{
2257 | 						appendStringInfo(str, "\"%s\": ", name);
2258 | 						JsonPgTextArray2Json(str, outputString, l);
2259 | 					}
2260 | 				}
2261 | 				break;
2262 | 
2263 | 			//case OIDARRAYOID:
2264 | 			default:
2265 | 			{
2266 | 				ereport(ERROR, (errcode(ERRCODE_FDW_INVALID_DATA_TYPE),
2267 | 								errmsg("cannot convert constant value to JSON value"),
2268 | 								errhint("Constant value data type: %u", type)));
2269 | 				break;
2270 | 			}
2271 | 		}
2272 | 	}
2273 | 
2274 | 	return (str->len > oldLen); // we appended new characters
2275 | }
2276 | 


--------------------------------------------------------------------------------
/json_fdw.control:
--------------------------------------------------------------------------------
1 | # json_fdw extension
2 | comment = 'foreign-data wrapper for json file access'
3 | default_version = '1.0'
4 | module_pathname = '$libdir/json_fdw'
5 | relocatable = true
6 | 


--------------------------------------------------------------------------------
/json_fdw.h:
--------------------------------------------------------------------------------
  1 | /*-------------------------------------------------------------------------
  2 |  *
  3 |  * json_fdw.h
  4 |  *
  5 |  * Type and function declarations for JSON foreign data wrapper.
  6 |  *
  7 |  * Copyright (c) 2013, Citus Data, Inc.
  8 |  *
  9 |  * $Id$
 10 |  *
 11 |  *-------------------------------------------------------------------------
 12 |  */
 13 | 
 14 | #ifndef JSON_FDW_H
 15 | #define JSON_FDW_H
 16 | 
 17 | #include "fmgr.h"
 18 | #include "catalog/pg_foreign_server.h"
 19 | #include "catalog/pg_foreign_table.h"
 20 | #include "utils/hsearch.h"
 21 | #include "nodes/pg_list.h"
 22 | #include "utils/rel.h"
 23 | 
 24 | #include "curlapi.h"
 25 | 
 26 | 
 27 | /* Defines for valid option names and default values */
 28 | #define OPTION_NAME_FILENAME "filename"
 29 | #define OPTION_NAME_MAX_ERROR_COUNT "max_error_count"
 30 | #define DEFAULT_MAX_ERROR_COUNT 0
 31 | 
 32 | #define OPTION_NAME_HTTP_POST_VARS "http_post_vars"
 33 | #define OPTION_NAME_ROM_URL "rom_url"
 34 | #define OPTION_NAME_ROM_PATH "rom_path"
 35 | 
 36 | #define JSON_TUPLE_COST_MULTIPLIER 10
 37 | #define ERROR_BUFFER_SIZE 1024
 38 | #define READ_BUFFER_SIZE 4096
 39 | #define GZIP_FILE_EXTENSION ".gz"
 40 | #define HDFS_BLOCK_PREFIX "blk_"
 41 | #define HDFS_BLOCK_PREFIX_LENGTH 4
 42 | 
 43 | 
 44 | /*
 45 |  * JsonValidOption keeps an option name and a context. When an option is passed
 46 |  * into json_fdw objects (server and foreign table), we compare this option's
 47 |  * name and context against those of valid options.
 48 |  */
 49 | typedef struct JsonValidOption
 50 | {
 51 | 	const char *optionName;		// name of the option
 52 | 	Oid optionContextId;		// context the option is valid in; presumably a catalog oid (server vs. foreign table) - TODO confirm
 53 | 
 54 | } JsonValidOption;
 55 | 
 56 | 
 57 | /*
 58 |  * JsonFdwOptions holds the option values to be used when reading and parsing
 59 |  * the json file. To resolve these values, we first check the foreign table's
 60 |  * options and, if an option is not present, fall back to the default values
 61 |  * specified above. NOTE(review): field/option pairing below is inferred from names.
 62 |  */
 63 | typedef struct JsonFdwOptions
 64 | {
 65 | 	char const *filename;		// presumably the "filename" option
 66 | 	int32 maxErrorCount;		// presumably the "max_error_count" option
 67 | 	char const *pHttpPostVars;	// presumably the "http_post_vars" option
 68 | 	char const *pRomUrl;		// presumably the "rom_url" option
 69 | 	char const *pRomPath;		// presumably the "rom_path" option
 70 | } JsonFdwOptions;
 71 | 
 72 | 
 73 | /*
 74 |  * JsonFdwExecState keeps foreign data wrapper specific execution state that we
 75 |  * create and hold onto when executing the query.
 76 |  */
 77 | typedef struct JsonFdwExecState
 78 | {
 79 | 	char const *filename;		// on disk file name of json content
 80 | 	FILE *filePointer;		// file pointer to on disk content
 81 | 	void *gzFilePointer;		// gz file pointer to on disk content
 82 | 
 83 | 	uint32 maxErrorCount;		// presumably the number of parse errors tolerated - TODO confirm
 84 | 	uint32 errorCount;		// presumably the number of parse errors seen so far - TODO confirm
 85 | 	uint32 currentLineNumber;	// presumably the line of the file being read - TODO confirm
 86 | 	HTAB *columnMappingHash;	// presumably column name to ColumnMapping entries - TODO confirm
 87 | 
 88 | 	cfr_t *pCfr;			// curl fetch result
 89 | } JsonFdwExecState;
 90 | 
 91 | typedef struct _jfmes_t
 92 | {
 93 | 	Relation rel;			// relcache entry for the foreign table
 94 | 	int p_nums;			// number of parameters to transmit
 95 | 	FmgrInfo *p_flinfo;		// output conversion functions for them
 96 | 
 97 | 	List *retrieved_attrs;		// integer list of target attribute numbers
 98 | 	List *retrieved_names;		// list of target attribute names, parallel to retrieved_attrs
 99 | 	List *table_options;		// options of the foreign table
100 | 	char const *pUrl;		// put url
101 | 
102 | 	MemoryContext temp_cxt;		// context for per-tuple temp data
103 | 
104 | } jfmes_t; // Json Fdw Modify Exec State Type
105 | 
106 | 
107 | /*
108 |  * ColumnMapping represents a hash table entry that maps a column name to column
109 |  * related information. We construct these hash table entries to speed up the
110 |  * conversion from JSON documents to PostgreSQL tuples; and each hash entry maps
111 |  * the column name to the column's tuple index and its type-related information.
112 |  */
113 | typedef struct ColumnMapping
114 | {
115 | 	char columnName[NAMEDATALEN];
116 | 	uint32 columnIndex;
117 | 	Oid columnTypeId;
118 | 	int32 columnTypeMod;
119 | 	Oid columnArrayTypeId;
120 | 
121 | } ColumnMapping;
122 | 
123 | 
124 | /* Function declarations for foreign data wrapper */
125 | extern Datum json_fdw_handler(PG_FUNCTION_ARGS);
126 | extern Datum json_fdw_validator(PG_FUNCTION_ARGS);
127 | 
128 | 
129 | #endif   /* JSON_FDW_H */
130 | 


--------------------------------------------------------------------------------
/output/basic_tests.source:
--------------------------------------------------------------------------------
  1 | --
  2 | -- Test json foreign data wrapper.
  3 | --
  4 | -- Settings to make the result deterministic
  5 | SET datestyle = "ISO, YMD";
  6 | -- Install json_fdw
  7 | CREATE EXTENSION json_fdw;
  8 | CREATE SERVER json_server FOREIGN DATA WRAPPER json_fdw;
  9 | -- validator tests
 10 | CREATE FOREIGN TABLE test_validator_filename_missing () 
 11 | 	SERVER json_server; -- ERROR
 12 | ERROR:  filename is required for json_fdw foreign tables
 13 | CREATE FOREIGN TABLE test_validator_invalid_option () 
 14 | 	SERVER json_server 
 15 | 	OPTIONS(filename 'data.json', bad_option_name '1'); -- ERROR
 16 | ERROR:  invalid option "bad_option_name"
 17 | HINT:  Valid options in this context are: filename, max_error_count, hdfs_directory_path
 18 | -- data conversion tests
 19 | CREATE FOREIGN TABLE json_data (id int8, type char(20), name text, 
 20 | 	birthdate date, actions int[], "position.lat" float, "position.lon" float, 
 21 | 	"position.address.country" varchar(50), last_update timestamp,
 22 | 	last_update_tz timestamp with time zone
 23 | 	) SERVER json_server OPTIONS(filename '@abs_srcdir@/data/data.json');
 24 | SELECT id, type, name FROM json_data ORDER BY id;
 25 |           id          |         type         |        name        
 26 | ----------------------+----------------------+--------------------
 27 |  -9223372036854775808 |                      | 
 28 |                     1 | person               | Beatus Henk
 29 |                     2 | person               | Lugos Alfons
 30 |                     3 | person               | Temür Essa
 31 |                     4 | resturaunt           | Mingus Kitchen
 32 |                     5 | resturaunt           | Café Utopia Lounge
 33 |                     6 | invalid_record       | 
 34 |   9223372036854775807 |                      | 
 35 | (8 rows)
 36 | 
 37 | SELECT id, name, birthdate FROM json_data WHERE type = 'person' ORDER BY id;
 38 |  id |     name     | birthdate  
 39 | ----+--------------+------------
 40 |   1 | Beatus Henk  | 1973-06-24
 41 |   2 | Lugos Alfons | 1961-08-30
 42 |   3 | Temür Essa   | 1995-07-28
 43 | (3 rows)
 44 | 
 45 | SELECT id, "position.lat" AS lat, "position.lon" AS lon, 
 46 | 	"position.address.country" AS country, last_update 
 47 | 	FROM json_data WHERE type = 'resturaunt' ORDER BY id;
 48 |  id |   lat    |    lon    |  country  |     last_update     
 49 | ----+----------+-----------+-----------+---------------------
 50 |   4 | -48.3798 | -65.43274 | Argentina | 2013-01-02 12:05:01
 51 |   5 | 42.97208 | 143.39097 |           | 
 52 | (2 rows)
 53 | 
 54 | SELECT id, type, birthdate, last_update, "position.lon" as lon 
 55 | 	FROM json_data WHERE type = 'invalid_record' ORDER BY id;
 56 |  id |         type         | birthdate | last_update | lon 
 57 | ----+----------------------+-----------+-------------+-----
 58 |   6 | invalid_record       |           |             |    
 59 | (1 row)
 60 | 
 61 | SELECT last_update_tz AT TIME ZONE 'UTC' FROM json_data 
 62 | 	WHERE last_update_tz IS NOT NULL;
 63 |       timezone       
 64 | ---------------------
 65 |  2013-01-02 17:05:01
 66 | (1 row)
 67 | 
 68 | -- max error count test
 69 | CREATE FOREIGN TABLE test_skip_broken_on (a integer, b integer) 
 70 | 	SERVER json_server 
 71 | 	OPTIONS (filename '@abs_srcdir@/data/data_broken.json', max_error_count '1');
 72 | SELECT * FROM test_skip_broken_on ORDER BY a;
 73 |  a | b 
 74 | ---+---
 75 |  1 | 2
 76 |  2 | 3
 77 |  3 | 4
 78 | (3 rows)
 79 | 
 80 | CREATE FOREIGN TABLE test_skip_broken_off (a integer, b integer) 
 81 | 	SERVER json_server 
 82 | 	OPTIONS (filename '@abs_srcdir@/data/data_broken.json', max_error_count '0');
 83 | SELECT * FROM test_skip_broken_off; -- ERROR
 84 | ERROR:  could not parse 1 json objects
 85 | HINT:  Last error message at line: 4: parse error: premature EOF
 86 |                                        {"a": 3,  
 87 |                      (right here) ------^
 88 | 
 89 | -- error scenarios
 90 | CREATE FOREIGN TABLE test_missing_file () SERVER json_server 
 91 | 	OPTIONS (filename '@abs_srcdir@/data/missing_file.json');
 92 | SELECT * FROM test_missing_file; -- ERROR
 93 | ERROR:  could not open file "@abs_srcdir@/data/missing_file.json" for reading: No such file or directory
 94 | CREATE FOREIGN TABLE test_string_length_check (type CHAR(6)) SERVER json_server
 95 | 	OPTIONS (filename '@abs_srcdir@/data/data.json');
 96 | SELECT * FROM test_string_length_check; -- ERROR
 97 | ERROR:  value too long for type character(6)
 98 | CREATE FOREIGN TABLE test_int_range_check (id int4) SERVER json_server
 99 | 	OPTIONS (filename '@abs_srcdir@/data/data.json');
100 | SELECT * FROM test_int_range_check; -- ERROR
101 | ERROR:  value "9223372036854775807" is out of range for type integer
102 | CREATE FOREIGN TABLE test_decimal_range_check ("position.lat" decimal(3, 2))
103 | 	SERVER json_server OPTIONS (filename '@abs_srcdir@/data/data.json');
104 | SELECT * FROM test_decimal_range_check; -- ERROR
105 | ERROR:  numeric field overflow
106 | DETAIL:  A field with precision 3, scale 2 must round to an absolute value less than 10^1.
107 | 


--------------------------------------------------------------------------------
/output/customer_reviews.source:
--------------------------------------------------------------------------------
  1 | --
  2 | -- Test customer reviews dataset queries.
  3 | --
  4 | CREATE FOREIGN TABLE customer_reviews
  5 | (
  6 |     customer_id TEXT not null,
  7 |     "review.date" DATE not null,
  8 |     "review.rating" INTEGER not null,
  9 |     "review.votes" INTEGER,
 10 |     "review.helpful_votes" INTEGER,
 11 |     "product.id" CHAR(10) not null,
 12 |     "product.title" TEXT not null,
 13 |     "product.sales_rank" BIGINT,
 14 |     "product.group" TEXT,
 15 |     "product.category" TEXT,
 16 |     "product.subcategory" TEXT,
 17 |     similar_product_ids CHAR(10)[]
 18 | )
 19 | SERVER json_server
 20 | OPTIONS(filename '@abs_srcdir@/data/customer_reviews_1998.1000.json.gz');
 21 | -- How people rate your products?
 22 | SELECT
 23 |     extract(month from "review.date") AS review_month,
 24 |     round(avg("review.rating"), 2),
 25 |     count(*)
 26 | FROM
 27 |     customer_reviews
 28 | GROUP BY
 29 |     review_month
 30 | ORDER BY
 31 |     review_month;
 32 |  review_month | round | count 
 33 | --------------+-------+-------
 34 |             1 |  4.48 |   224
 35 |             2 |  4.42 |   149
 36 |             6 |  4.50 |     2
 37 |             7 |  4.63 |    71
 38 |             8 |  4.61 |    75
 39 |             9 |  4.57 |   101
 40 |            10 |  4.42 |   130
 41 |            11 |  4.59 |   143
 42 |            12 |  4.54 |   105
 43 | (9 rows)
 44 | 
 45 | -- Do we have a correlation between a book's title's length and its review ratings?
 46 | SELECT
 47 |     width_bucket(length("product.title"), 1, 50, 5) title_length_bucket,
 48 |     round(avg("review.rating"), 2) AS review_average,
 49 |     count(*)
 50 | FROM
 51 |    customer_reviews
 52 | WHERE
 53 |     "product.group" = 'Book'
 54 | GROUP BY
 55 |     title_length_bucket
 56 | ORDER BY
 57 |     title_length_bucket;
 58 |  title_length_bucket | review_average | count 
 59 | ---------------------+----------------+-------
 60 |                    1 |           4.50 |   135
 61 |                    2 |           4.48 |   364
 62 |                    3 |           4.53 |   190
 63 |                    4 |           4.52 |   151
 64 |                    5 |           4.60 |    99
 65 |                    6 |           4.62 |    55
 66 | (6 rows)
 67 | 
 68 | -- Does the average review rating change by product category?
 69 | SELECT
 70 |     "product.category",
 71 |     round(avg("review.rating"), 2),
 72 |     count(*)
 73 | FROM
 74 |     customer_reviews
 75 | GROUP BY
 76 |     "product.category"
 77 | ORDER BY
 78 |     count(*) DESC, "product.category"
 79 | LIMIT 20;
 80 |      product.category      | round | count 
 81 | ---------------------------+-------+-------
 82 |  Science Fiction & Fantasy |  4.44 |   189
 83 |  Literature & Fiction      |  4.62 |   149
 84 |  Mystery & Thrillers       |  3.79 |    71
 85 |  Books on Tape             |  4.75 |    65
 86 |  Children's Books          |  4.49 |    65
 87 |  Nonfiction                |  4.56 |    57
 88 |  Religion & Spirituality   |  4.67 |    52
 89 |  Science                   |  4.47 |    36
 90 |  Health, Mind & Body       |  4.97 |    30
 91 |  Computers & Internet      |  4.50 |    26
 92 |  Horror                    |  4.32 |    25
 93 |  Business & Investing      |  4.57 |    21
 94 |  Biographies & Memoirs     |  4.55 |    20
 95 |  History                   |  4.45 |    20
 96 |  Teens                     |  4.26 |    19
 97 |  Entertainment             |  4.89 |    18
 98 |  Home & Garden             |  4.24 |    17
 99 |  Reference                 |  4.71 |    14
100 |  Romance                   |  4.77 |    13
101 |  Sports                    |  4.50 |    10
102 | (20 rows)
103 | 
104 | 


--------------------------------------------------------------------------------
/output/hdfs_block.source:
--------------------------------------------------------------------------------
 1 | --
 2 | -- Test customer reviews dataset which is stored as a HDFS block.
 3 | --
 4 | CREATE FOREIGN TABLE customer_reviews_hdfs_block
 5 | (
 6 |     customer_id TEXT not null,
 7 |     "review.date" DATE not null,
 8 |     "review.rating" INTEGER not null,
 9 |     "review.votes" INTEGER,
10 |     "review.helpful_votes" INTEGER,
11 |     "product.id" CHAR(10) not null,
12 |     "product.title" TEXT not null,
13 |     "product.sales_rank" BIGINT,
14 |     "product.group" TEXT,
15 |     "product.category" TEXT,
16 |     "product.subcategory" TEXT,
17 |     similar_product_ids CHAR(10)[]
18 | )
19 | SERVER json_server
20 | OPTIONS(filename '@abs_srcdir@/data/blk_-729487577044220672', 
21 |         max_error_count '2');
22 | -- Does the average review rating change by product category?
23 | SELECT
24 |     "product.category",
25 |     round(avg("review.rating"), 2),
26 |     count(*)
27 | FROM
28 |     customer_reviews_hdfs_block
29 | GROUP BY
30 |     "product.category"
31 | ORDER BY
32 |     count(*) DESC, "product.category"
33 | LIMIT 20;
34 |      product.category      | round | count 
35 | ---------------------------+-------+-------
36 |  Science Fiction & Fantasy |  4.44 |   189
37 |  Literature & Fiction      |  4.62 |   149
38 |  Mystery & Thrillers       |  3.79 |    71
39 |  Books on Tape             |  4.75 |    65
40 |  Children's Books          |  4.49 |    65
41 |  Nonfiction                |  4.56 |    57
42 |  Religion & Spirituality   |  4.67 |    52
43 |  Science                   |  4.47 |    36
44 |  Health, Mind & Body       |  4.97 |    30
45 |  Computers & Internet      |  4.50 |    26
46 |  Horror                    |  4.32 |    25
47 |  Business & Investing      |  4.57 |    21
48 |  Biographies & Memoirs     |  4.55 |    20
49 |  History                   |  4.45 |    20
50 |  Teens                     |  4.26 |    19
51 |  Entertainment             |  4.89 |    18
52 |  Home & Garden             |  4.24 |    17
53 |  Reference                 |  4.71 |    14
54 |  Romance                   |  4.77 |    13
55 |  Sports                    |  4.50 |    10
56 | (20 rows)
57 | 
58 | 


--------------------------------------------------------------------------------
/output/invalid_gz_file.source:
--------------------------------------------------------------------------------
 1 | --
 2 | -- Test that we handle invalid gzip files properly.
 3 | --
 4 | \set VERBOSITY terse
 5 | CREATE FOREIGN TABLE invalid_gz_file_table
 6 | (
 7 |     customer_id TEXT not null,
 8 |     "review.date" DATE not null,
 9 |     "review.rating" INTEGER not null,
10 |     "review.votes" INTEGER,
11 |     "review.helpful_votes" INTEGER,
12 |     "product.id" CHAR(10) not null,
13 |     "product.title" TEXT not null,
14 |     "product.sales_rank" BIGINT,
15 |     "product.group" TEXT,
16 |     "product.category" TEXT,
17 |     "product.subcategory" TEXT,
18 |     similar_product_ids CHAR(10)[]
19 | )
20 | SERVER json_server
21 | OPTIONS(filename '@abs_srcdir@/data/invalid_gz_file.json.gz');
22 | select count(*) from invalid_gz_file_table;
23 | ERROR:  could not read from json file
24 | \set VERBOSITY default
25 | 


--------------------------------------------------------------------------------
/rciapi.c:
--------------------------------------------------------------------------------
  1 | /*--------------------------------------------------------------------*
  2 |  *
  3 |  * Developed by;
  4 |  *	Neal Horman - http://www.wanlink.com
  5 |  *	Copyright (c) 2015 Neal Horman. All Rights Reserved
  6 |  *
  7 |  *	This "source code" is free software: you can redistribute it and/or modify
  8 |  *	it under the terms of the GNU General Public License as published by
  9 |  *	the Free Software Foundation, either version 3 of the License, or
 10 |  *	(at your option) any later version.
 11 |  *
 12 |  *	This "source code" is distributed in the hope that it will be useful,
 13 |  *	but WITHOUT ANY WARRANTY; without even the implied warranty of
 14 |  *	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 15 |  *	GNU General Public License for more details.
 16 |  *
 17 |  *	You should have received a copy of the GNU General Public License
 18 |  *	along with this "source code".  If not, see <http://www.gnu.org/licenses/>.
 19 |  *
 20 |  *	RCSID:  $Id$
 21 |  *
 22 |  *--------------------------------------------------------------------*/
 23 | 
 24 | #include <stdio.h>
 25 | #include <stdlib.h>
 26 | #include <unistd.h>
 27 | #include <string.h>
 28 | 
 29 | #include <yajl/yajl_tree.h>
 30 | #include <yajl/yajl_tree_path.h>
 31 | 
 32 | // same as in json_fdw.h
 33 | #define ERROR_BUFFER_SIZE 1024
 34 | #define READ_BUFFER_SIZE 4096
 35 | 
 36 | #include "curlapi.h"
 37 | #include "regexapi.h"
 38 | #include "regexapi_helper.h"
 39 | #include "rciapi.h"
 40 | 
 41 | static yajl_val romRootFetch(char const *pRomUrl, char const *pRomPath)
 42 | {	yajl_val root = NULL;
 43 | 
 44 | 	if(pRomUrl != NULL && pRomPath != NULL && *pRomUrl && *pRomPath)
 45 | 	{	cfr_t *pCfr = curlFetchFile(pRomUrl, NULL);
 46 | 
 47 | 		if(pCfr != NULL && pCfr->bFileFetched)
 48 | 		{	FILE *fin = fopen(pCfr->ccf.pFileName, "r");
 49 | 
 50 | 			if(fin != NULL)
 51 | 			{	char errorBuffer[ERROR_BUFFER_SIZE];
 52 | 
 53 | 				root = yajl_tree_parse_file(fin, READ_BUFFER_SIZE, NULL, errorBuffer, sizeof(errorBuffer));
 54 | 				fclose(fin);
 55 | 			}
 56 | 
 57 | 			// must be an object with schema 2,
 58 | 			// else, not the rom we are looking for
 59 | 			if(
 60 | 				!YAJL_IS_OBJECT(root)
 61 | 				 || atoi( ytp_get(root, "romschema", NULL)) != 2
 62 | 				)
 63 | 			{
 64 | 				// free and null, if failure
 65 | 				yajl_tree_free(root);
 66 | 				root = NULL;
 67 | 			}
 68 | 		}
 69 | 		curlCfrFree(pCfr);
 70 | 	}
 71 | 
 72 | 	return root;
 73 | }
 74 | 
 75 | void rciFree(rci_t *pRci)
 76 | {
 77 | 	if(pRci != NULL)
 78 | 	{
 79 | 		if(pRci->pUrl != NULL)
 80 | 			free(pRci->pUrl);
 81 | 		if(pRci->pQuery != NULL)
 82 | 			free(pRci->pQuery);
 83 | 		if(pRci->pAction != NULL)
 84 | 			free(pRci->pAction);
 85 | 		if(pRci->romRoot != NULL)
 86 | 			yajl_tree_free(pRci->romRoot);
 87 | 		free(pRci);
 88 | 	}
 89 | }
 90 | 
 91 | // strcat but dst is realloc'd to add src
 92 | static char *strcatr(char *dst, char const *src)
 93 | {
 94 | 	if(src != NULL && *src)
 95 | 		dst = strcat(realloc(dst, (dst != NULL ? strlen(dst) : 0) + strlen(src) + 1), src);
 96 | 
 97 | 	return  dst;
 98 | }
 99 | 
// special case strcatr, don't concat "/blah/" and "/" into "/blah//"
//
// dst - heap string to extend (may be NULL or empty).
// src - url fragment to append (may be NULL).
//
// Returns dst unchanged when dst already ends in '/' and src is exactly "/",
// otherwise the result of strcatr(dst, src).
static char *strcatrurl(char *dst, char const *src)
{	size_t dstLen = (dst != NULL ? strlen(dst) : 0);
	// dstLen is checked first so that an empty dst never indexes dst[-1]
	int dstEndsWithSlash = (dstLen > 0 && dst[dstLen-1] == '/');
	int srcIsLoneSlash = (src != NULL && src[0] == '/' && src[1] == 0);

	// won't be "/blah//" ?
	if(!(dstEndsWithSlash && srcIsLoneSlash))
		dst = strcatr(dst, src); // so concat the two

	return dst;
}
109 | 
110 | rci_t *rciFetch(char const *pRomUrl, char const *pRomPath, int action)
111 | {	rci_t *pRci = calloc(1, sizeof(rci_t));
112 | 
113 | 	if(pRci != NULL)
114 | 	{
115 | 		pRci->romRoot = romRootFetch(pRomUrl, pRomPath);
116 | 		// the schema has already been validated
117 | 		if(pRci->romRoot != NULL && action != RCI_ACTION_NONE)
118 | 		{
119 | 			yajl_val rootTable = ytp_get(pRci->romRoot, pRomPath, NULL);
120 | 			yajl_val rootQuery;
121 | 
122 | 			pRci->pAction = strdup(action == RCI_ACTION_INSERT ? "insert"
123 | 				: action == RCI_ACTION_UPDATE ? "update"
124 | 				: action == RCI_ACTION_DELETE ? "delete"
125 | 				: "select"
126 | 				);
127 | 
128 | 			pRci->romRootAction = ytp_get(rootTable, pRci->pAction, NULL);
129 | 			pRci->pMethod = ytp_get(pRci->romRootAction, "method", NULL);
130 | 			pRci->pUrl = strcatr(NULL, ytp_GetPath(pRci->romRoot, "$.host"));
131 | 
132 | 			rootQuery = ytp_get(pRci->romRootAction, "query");
133 | 
134 | 			// If no host specified in ROM, use the
135 | 			// host specification of the ROM url
136 | 			if(pRci->pUrl == NULL || !*pRci->pUrl)
137 | 			{	// split the ROM url into pieces
138 | 				regexapi_t *pRat = regexapi_url(pRomUrl);
139 | 
140 | 				// use the pieces ?
141 | 				if(pRat != NULL)
142 | 				{	int regexNSubs = regexapi_nsubs(pRat, 0);
143 | 
144 | 					if(regexNSubs >= 2)
145 | 					{
146 | 						if(pRci->pUrl != NULL)
147 | 							free(pRci->pUrl);
148 | 						asprintf(&pRci->pUrl, "%s://%s"
149 | 							, regexapi_sub(pRat, 0, 0) // protocol specification
150 | 							, regexapi_sub(pRat, 0, 1) // host specification
151 | 							);
152 | 					}
153 | 					regexapi_free(pRat);
154 | 				}
155 | 			}
156 | 
157 | 			// concat / build the url based on the path selected
158 | 			pRci->pUrl = strcatrurl(pRci->pUrl, ytp_GetPath(pRci->romRoot, "$.url"));
159 | 			pRci->pUrl = strcatrurl(pRci->pUrl, ytp_get(rootTable, "url", NULL));
160 | 			pRci->pUrl = strcatrurl(pRci->pUrl, ytp_GetPath(pRci->romRootAction, "$.url"));
161 | 
162 | 			// use the query array objects to build a set
163 | 			// of url named parameters with values ?
164 | 			if(YAJL_IS_ARRAY(rootQuery))
165 | 			{	int i,q,first=1;
166 | 				char const *pStrName;
167 | 				char const *pStrValue;
168 | 
169 | 				// each query object
170 | 				for(i=0,q=rootQuery->u.array.len; i<q; i++)
171 | 				{
172 | 					pStrName = ytp_get(rootQuery, i+1, "name", NULL);
173 | 					pStrValue = ytp_get(rootQuery, i+1, "value", NULL);
174 | 					//printf("%s:%d i %u name '%s' value '%s'\n", __func__, __LINE__, i, pStrName, pStrValue);
175 | 					if(
176 | 						pStrName != NULL && *pStrName
177 | 						&& pStrValue != NULL && *pStrValue
178 | 						)
179 | 					{
180 | 
181 | 						if(first)
182 | 						{
183 | 							// This supposes that the url as built above this
184 | 							// code section, doesn't already have paramenters.
185 | 							// TODO - figure out if this has already been done.
186 | 							pRci->pUrl = strcatr(pRci->pUrl, "?");
187 | 							first = 0;
188 | 						}
189 | 						else
190 | 							pRci->pUrl = strcatr(pRci->pUrl, "&");
191 | 
192 | 						pRci->pUrl = strcatr(pRci->pUrl, pStrName);
193 | 						pRci->pUrl = strcatr(pRci->pUrl, "=");
194 | 						pRci->pUrl = strcatr(pRci->pUrl, pStrValue);
195 | 					}
196 | 				}
197 | 			}
198 | 		}
199 | 		else if(pRci->romRoot == NULL)
200 | 		{
201 | 			rciFree(pRci);
202 | 			pRci = NULL;
203 | 		}
204 | 	}
205 | 
206 | 	return pRci;
207 | }
208 | 
209 | 
210 | #ifdef _UNIT_TEST_RCI
211 | 
212 | void test1(int argc, char **argv)
213 | {
214 | 	char const	*pRomUrl = "http://127.0.0.1:9734/files/rom.json";
215 | 	char const	*pRomPath = "devicestate";
216 | 	rci_t		*pRci = NULL;
217 | 
218 | 	pRci = rciFetch(pRomUrl, pRomPath, RCI_ACTION_SELECT);
219 | 
220 | 	if(pRci != NULL)
221 | 		printf("url '%s' method '%s'\n", pRci->pUrl, pRci->pMethod);
222 | 	else
223 | 		printf("rciFetch failed\n");
224 | 
225 | 	rciFree(pRci);
226 | }
227 | 
// Unit test entry point (built only with _UNIT_TEST_RCI): runs test1() and exits with 0
int main(int argc, char **argv)
{
	test1(argc, argv);
	return EXIT_SUCCESS;
}
234 | #endif
235 | 


--------------------------------------------------------------------------------
/rciapi.h:
--------------------------------------------------------------------------------
 1 | /*--------------------------------------------------------------------*
 2 |  *
 3 |  * Developed by;
 4 |  *	Neal Horman - http://www.wanlink.com
 5 |  *	Copyright (c) 2015 Neal Horman. All Rights Reserved
 6 |  *
 7 |  *	This "source code" is free software: you can redistribute it and/or modify
 8 |  *	it under the terms of the GNU General Public License as published by
 9 |  *	the Free Software Foundation, either version 3 of the License, or
10 |  *	(at your option) any later version.
11 |  *
12 |  *	This "source code" is distributed in the hope that it will be useful,
13 |  *	but WITHOUT ANY WARRANTY; without even the implied warranty of
14 |  *	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 |  *	GNU General Public License for more details.
16 |  *
17 |  *	You should have received a copy of the GNU General Public License
18 |  *	along with this "source code".  If not, see <http://www.gnu.org/licenses/>.
19 |  *
20 |  *	RCSID:  $Id$
21 |  *
22 |  *--------------------------------------------------------------------*/
23 | 
24 | #ifndef _RCIAPI_H_
25 | #define _RCIAPI_H_
26 | 
27 | /*
 28 |  * Remote Operations Map (ROM) - is inspired by, but is not, Swagger
29 |  *
30 |  * It specifies how to do sql type operations on remote
31 |  * json objects via a server based api.
32 |  *
 33 |  * The ROM is itself a json object.
34 |  *
35 |  * This is a codified helper interface to that ROM
36 |  *
 37 |  * It is expected that the ROM resides remotely, and may be
38 |  * cached locally in json form, but is read in, and retained
39 |  * in memory for later use.
40 |  *
41 |  */
42 | 
43 | /* An example ROM, supporting select and insert operations
44 |  * of a local table schema of at least;
45 |  *	create foreign table sometable
46 |  *		(t integer, st integer, id integer, data integer[])
47 |  *	server json_server
48 |  *	options
49 |  *		(rom_url 'http://server.example.com/rom.json', rom_path 'devicestate')
50 |  * where the remote data could be at least '{ "t":3, "st":2, "id":4, "data":[ 1, 2, 3] }'
51 | 
52 | {
53 | 	"romschema": "2",
54 | 	"host": "",
55 | 	"url": "/omsgsql",
56 | 
57 | 	"devicestate":
58 | 	{
59 | 		"url": "/devices",
60 | 		"select":{
61 | 			"method": "get",
62 | 			"url": "/",
63 | 			"query": [ { "name":"st", "type":"integer"}, { "name":"id", "type":"integer"} ]
64 | 		},
65 | 		"insert":{
66 | 			"method": "put",
67 | 			"url": "/",
68 | 			"query": [ { "name":"st", "type":"integer"}, { "name":"id", "type":"integer"}, {"name":"data", "type":"integer[]"} ]
69 | 			},
70 | 		"delete":{ "method": "", "url": "", "schema": [ ] },
71 | 		"update":{ "method": "", "url": "", "schema": [ ] }
72 | 	}
73 | }
74 | 
75 | */
76 | 
77 | #include <yajl/yajl_tree.h>
78 | 
// Request context derived from a ROM: filled in by rciFetch(), released by rciFree()
typedef struct _rci_t
{
	char *pUrl; // must be free()'d
	char *pQuery; // must be free()'d
	char const *pMethod; // not released by rciFree()
	char const *pAction; // must be free()'d
	yajl_val romRoot; // must be yajl_tree_free()'d
	yajl_val romRootAction; // do not yajl_tree_free(), is subnode of romRoot
} rci_t; // Rom Context Info Type;
88 | 
// Values for the action argument of rciFetch()
enum { RCI_ACTION_NONE, RCI_ACTION_SELECT, RCI_ACTION_INSERT, RCI_ACTION_UPDATE, RCI_ACTION_DELETE };

// Release a context returned by rciFetch(); NULL is accepted
void rciFree(rci_t *pRci);
// Fetch the ROM at pRomUrl and build the context for action on the table at pRomPath;
// returns NULL on failure, otherwise a context to be released with rciFree()
rci_t *rciFetch(char const *pRomUrl, char const *pRomPath, int action);
93 | 
94 | #endif
95 | 


--------------------------------------------------------------------------------
/regexapi.c:
--------------------------------------------------------------------------------
  1 | /*--------------------------------------------------------------------*
  2 |  *
  3 |  *	This "source code" is part of Spamilter - http://www.spamilter.org
  4 |  *	Additionally, this "source code" is herby, also incorporated as part json_fdw
  5 |  *
  6 |  * Developed by;
  7 |  *	Neal Horman - http://www.wanlink.com
  8 |  *	Copyright (c) 2010-2015 Neal Horman. All Rights Reserved
  9 |  *
 10 |  *	Redistribution and use in source and binary forms, with or without
 11 |  *	modification, are permitted provided that the following conditions
 12 |  *	are met;
 13 |  *
 14 |  *		1. Redistributions of source code must retain the above copyright
 15 |  *		   notice, this list of conditions and the following disclaimer.
 16 |  *		2. Redistributions in binary form must reproduce the above copyright
 17 |  *		   notice, this list of conditions and the following disclaimer in the
 18 |  *		   documentation and/or other materials provided with the distribution.
 19 |  *		3. All advertising materials mentioning features or use of this software
 20 |  *		   must display the following acknowledgement:
 21 |  *		This product includes software developed by Neal Horman.
 22 |  *		4. Neither the name Neal Horman nor the names of any contributors
 23 |  *		   may be used to endorse or promote products derived from this software
 24 |  *		   without specific prior written permission.
 25 |  *		
 26 |  *		THIS SOFTWARE IS PROVIDED BY NEAL HORMAN AND ANY CONTRIBUTORS ``AS IS'' AND
 27 |  *		ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 28 |  *		IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 29 |  *		ARE DISCLAIMED.  IN NO EVENT SHALL NEAL HORMAN OR ANY CONTRIBUTORS BE LIABLE
 30 |  *		FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 31 |  *		DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 32 |  *		OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 33 |  *		HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 34 |  *		LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 35 |  *		OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 36 |  *		SUCH DAMAGE.
 37 |  *
 38 |  *		Alternately;
 39 |  *
 40 |  *		This "source code" is free software: you can redistribute it and/or modify
 41 |  *		it under the terms of the GNU General Public License as published by
 42 |  *		the Free Software Foundation, either version 3 of the License, or
 43 |  *		(at your option) any later version.
 44 |  *
 45 |  *		This "source code" is distributed in the hope that it will be useful,
 46 |  *		but WITHOUT ANY WARRANTY; without even the implied warranty of
 47 |  *		MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 48 |  *		GNU General Public License for more details.
 49 |  *
 50 |  *		You should have received a copy of the GNU General Public License
 51 |  *		along with this "source code".  If not, see <http://www.gnu.org/licenses/>.
 52 |  *
 53 |  *	RCSID:  $Id$
 54 |  *
 55 |  *--------------------------------------------------------------------*/
 56 | 
 57 | static char const rcsid[] = "@(#)$Id$";
 58 | 
 59 | #include <stdlib.h>
 60 | #include <stdio.h>
 61 | #include <string.h>
 62 | #include <regex.h>
 63 | 
 64 | #define _IS_REGEXAPI_
 65 | #include "regexapi.h"
 66 | 
 67 | void regexapi_free(regexapi_t *prat)
 68 | {
 69 | 	if(prat != NULL)
 70 | 	{	unsigned int i,j;
 71 | 
 72 | 		for(i=0; i<prat->matches; i++)
 73 | 		{	regexapimatch_t *pmatch = prat->pmatches+i;
 74 | 		
 75 | 			if(pmatch->nsubs && pmatch->ppsubs != NULL)
 76 | 			{
 77 | 				for(j=0; j<pmatch->nsubs; j++)
 78 | 					free(*(pmatch->ppsubs+j));
 79 | 				free(pmatch->ppsubs);
 80 | 			}
 81 | 		}
 82 | 		if(prat->matches && prat->pmatches != NULL)
 83 | 			free(prat->pmatches);
 84 | 		regfree(&prat->re);
 85 | 		if(prat->preerr != NULL)
 86 | 			free(prat->preerr);
 87 | 		free(prat);
 88 | 	}
 89 | }
 90 | 
 91 | const char *regexapi_sub(regexapi_t *prat, size_t match, size_t nsub)
 92 | {
 93 | 	return (prat != NULL && match < prat->matches && nsub <= (prat->pmatches+match)->nsubs ? *((prat->pmatches+match)->ppsubs+nsub) : NULL);
 94 | }
 95 | 
 96 | int regexapi_nsubs(regexapi_t *prat, size_t match)
 97 | {
 98 | 	return (prat != NULL && match < prat->matches ? (prat->pmatches+match)->nsubs : 0);
 99 | }
100 | 
101 | int regexapi_matches(regexapi_t *prat)
102 | {
103 | 	return (prat != NULL ? prat->matches : 0);
104 | }
105 | 
106 | int regexapi_err(regexapi_t *prat)
107 | {
108 | 	return (prat != NULL ? prat->rerc : 0);
109 | }
110 | 
111 | const char *regexapi_errStr(regexapi_t *prat)
112 | {
113 | 	return (prat != NULL && prat->preerr != NULL ? prat->preerr : "");
114 | }
115 | 
116 | static void regexapi_buildErrStr(regexapi_t *prat)
117 | {
118 | 	if(prat != NULL)
119 | 	{	char errbuf[1024];
120 | 
121 | 		memset(&errbuf,0,sizeof(errbuf));
122 | 		regerror(prat->rerc,&prat->re,errbuf,sizeof(errbuf));
123 | 		prat->preerr = strdup(errbuf);
124 | 	}
125 | }
126 | 
127 | regexapi_t *regexapi_exec(const char *pstr, const char *pregex, unsigned int cflags, unsigned int findCount)
128 | {	regexapi_t *prat = calloc(sizeof(regexapi_t),1);
129 | 
130 | #ifdef _REGEX_DEBUG
131 | 	printf("%s:%d - pstr '%s' pregex '%s' cflags 0x%04X findCount %u\n" , __func__, __LINE__ , pstr, pregex, cflags, findCount);
132 | #endif
133 | 	if(prat != NULL)
134 | 	{
135 | 		prat->rerc = regcomp(&prat->re,pregex,cflags);
136 | 
137 | #ifdef _REGEX_DEBUG
138 | 		if(prat->rerc == 0)
139 | 			printf("%s:%d - regcomp() = %d,  nsub = %d\n", __func__, __LINE__, prat->rerc, prat->re.re_nsub);
140 | #endif
141 | 		if(prat->rerc == 0)
142 | 		{	size_t i;
143 | 			char *pdst = NULL;
144 | 			regmatch_t *presubs = (regmatch_t *)calloc(sizeof(regmatch_t),prat->re.re_nsub+1);
145 | 			regexapimatch_t *pmatch = NULL;
146 | 			size_t last = 0;
147 | 
148 | 			// don't allow iteration for more subs than actually exist
149 | 			if(prat->re.re_nsub < findCount)
150 | 				findCount = prat->re.re_nsub;
151 | 
152 | 			while((prat->rerc = regexec(&prat->re,pstr+last,prat->re.re_nsub+1,presubs,0)) == 0 && findCount != 0)
153 | 			{
154 | 				findCount --;
155 | 				prat->matches ++;
156 | 				prat->pmatches = realloc(prat->pmatches,sizeof(regexapimatch_t)*prat->matches);
157 | 				pmatch = prat->pmatches+(prat->matches-1);
158 | #ifdef _REGEX_DEBUG
159 | 				printf("%s:%d - regexec() = %d\n", __func__, __LINE__, prat->rerc);
160 | #endif
161 | 				pmatch->nsubs = 0;
162 | 				pmatch->ppsubs = (char **)calloc(1,sizeof(char *)*prat->re.re_nsub);
163 | 
164 | 				if(pmatch->ppsubs != NULL)
165 | 				{
166 | 					for(i=1; i<prat->re.re_nsub+1; i++)
167 | 					{	size_t so = (presubs+i)->rm_so + last;
168 | 						size_t eo = (presubs+i)->rm_eo + last;
169 | 						size_t qo = (eo - so);
170 | 
171 | 						pmatch->nsubs++;
172 | 						pdst = *(pmatch->ppsubs+(i-1)) = (char *)calloc(qo+1,1);
173 | 						strncpy(pdst,(pstr+so),qo);
174 | 						*(pdst+qo) = 0;
175 | 
176 | 						if(i == prat->re.re_nsub)
177 | 							last = eo;
178 | #ifdef _REGEX_DEBUG
179 | 						printf("%s:%d - sub %d: so %d eo %d qo %d - '%*.*s'\n", __func__, __LINE__, i, so, eo, qo, qo, qo, pdst);
180 | #endif
181 | 					}
182 | 				}
183 | 			}
184 | 
185 | 			if(presubs != NULL)
186 | 				free(presubs);
187 | 
188 | 			if(prat->matches > 0 && prat->rerc == 1)
189 | 				prat->rerc = 0;
190 | 		}
191 | 	}
192 | 
193 | 	if(prat != NULL && prat->rerc)
194 | 	{
195 | 		regexapi_buildErrStr(prat);
196 | #ifdef _REGEX_DEBUG
197 | 		printf("regex error: %d/'%s'\n",prat->rerc,regexapi_errStr(prat));
198 | #endif
199 | 	}
200 | 
201 | 	return prat;
202 | }
203 | 
204 | int regexapi(const char *pstr, const char *pregex, int cflags)
205 | {	regexapi_t *prat = regexapi_exec(pstr,pregex,cflags,1);
206 | 	int rc = regexapi_matches(prat) != 0;
207 | 
208 | 	if(prat != NULL)
209 | 		regexapi_free(prat);
210 | 
211 | 	return rc;
212 | }
213 | 
214 | 
215 | #ifdef _REGEX_UNIT_TEST
216 | int main(int argc, char **argv)
217 | {
218 | 	if(argc == 3)
219 | 	{	int i = 1;
220 | 		regexapi_t *prat = regexapi_exec(argv[i],argv[i+1],REGEX_DEFAULT_CFLAGS,REGEX_FIND_ALL);
221 | 
222 | 		printf("%s: '%s' %c= '%s'\n", argv[0], argv[i], (regexapi_matches(prat) ? '=' : '!'), argv[i+1]);
223 | 		if(regexapi_matches(prat))
224 | 		{	int q;
225 | 
226 | 			for(i=0,q=regexapi_nsubs(prat,0); i<q; i++)
227 | 				printf("sub %d: '%s'\n",i+1,regexapi_sub(prat,0,i));
228 | 		}
229 | 
230 | 		regexapi_free(prat);
231 | 	}
232 | 	else
233 | 		printf("%s [string to test] [regex]\n", argv[0]);
234 | 
235 | 	return 0;
236 | }
237 | #endif
238 | 


--------------------------------------------------------------------------------
/regexapi.h:
--------------------------------------------------------------------------------
  1 | /*--------------------------------------------------------------------*
  2 |  *
  3 |  *	This "source code" is part of Spamilter - http://www.spamilter.org
  4 |  *	Additionally, this "source code" is herby, also incorporated as part json_fdw
  5 |  *
  6 |  * Developed by;
  7 |  *	Neal Horman - http://www.wanlink.com
  8 |  *	Copyright (c) 2010-2015 Neal Horman. All Rights Reserved
  9 |  *
 10 |  *	Redistribution and use in source and binary forms, with or without
 11 |  *	modification, are permitted provided that the following conditions
 12 |  *	are met;
 13 |  *
 14 |  *		1. Redistributions of source code must retain the above copyright
 15 |  *		   notice, this list of conditions and the following disclaimer.
 16 |  *		2. Redistributions in binary form must reproduce the above copyright
 17 |  *		   notice, this list of conditions and the following disclaimer in the
 18 |  *		   documentation and/or other materials provided with the distribution.
 19 |  *		3. All advertising materials mentioning features or use of this software
 20 |  *		   must display the following acknowledgement:
 21 |  *		This product includes software developed by Neal Horman.
 22 |  *		4. Neither the name Neal Horman nor the names of any contributors
 23 |  *		   may be used to endorse or promote products derived from this software
 24 |  *		   without specific prior written permission.
 25 |  *		
 26 |  *		THIS SOFTWARE IS PROVIDED BY NEAL HORMAN AND ANY CONTRIBUTORS ``AS IS'' AND
 27 |  *		ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 28 |  *		IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 29 |  *		ARE DISCLAIMED.  IN NO EVENT SHALL NEAL HORMAN OR ANY CONTRIBUTORS BE LIABLE
 30 |  *		FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 31 |  *		DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 32 |  *		OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 33 |  *		HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 34 |  *		LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 35 |  *		OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 36 |  *		SUCH DAMAGE.
 37 |  *
 38 |  *		Alternately;
 39 |  *
 40 |  *		This "source code" is free software: you can redistribute it and/or modify
 41 |  *		it under the terms of the GNU General Public License as published by
 42 |  *		the Free Software Foundation, either version 3 of the License, or
 43 |  *		(at your option) any later version.
 44 |  *
 45 |  *		This "source code" is distributed in the hope that it will be useful,
 46 |  *		but WITHOUT ANY WARRANTY; without even the implied warranty of
 47 |  *		MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 48 |  *		GNU General Public License for more details.
 49 |  *
 50 |  *		You should have received a copy of the GNU General Public License
 51 |  *		along with this "source code".  If not, see <http://www.gnu.org/licenses/>.
 52 |  *
 53 |  *	RCSID:  $Id$
 54 |  *
 55 |  *--------------------------------------------------------------------*/
 56 | 
 57 | #ifndef _REGEXAPI_H_
 58 | #define _REGEXAPI_H_
 59 | 
 60 | #ifdef __cplusplus
 61 | extern "C" {
 62 | #endif
 63 | 
 64 | 	#include <regex.h>
 65 | 
	// regcomp() flags used by default: extended syntax, case-insensitive
	#define REGEX_DEFAULT_CFLAGS (REG_EXTENDED | REG_ICASE)
	// findCount value with all bits set
	#define REGEX_FIND_ALL (~0)
 68 | 
// The full layout is only visible when _IS_REGEXAPI_ is defined before this
// header is included (regexapi.c does so); otherwise regexapi_t is opaque.
#ifdef _IS_REGEXAPI_
	// Captured subexpression strings of one match
	typedef struct _regexapimatch_t
	{
		size_t nsubs; // number of entries in ppsubs
		char **ppsubs; // array of heap-allocated strings, released by regexapi_free()
	}regexapimatch_t;

	// Compiled expression plus the matches collected by regexapi_exec()
	typedef struct _regexapi_t
	{
		regex_t re; // compiled by regcomp(), released with regfree()
		int rerc; // return code of regcomp() / regexec()
		char *preerr; // strdup'd regerror() text, or NULL

		unsigned int matches; // number of entries in pmatches
		regexapimatch_t *pmatches;
	}regexapi_t;
#else
	typedef struct _regexapi_t regexapi_t;
#endif
 88 | 
	// release a context returned by regexapi_exec(); NULL is accepted
	void regexapi_free(regexapi_t *prat);
	// captured text of subexpression nsub (0-based) of match `match` (0-based), or NULL
	const char *regexapi_sub(regexapi_t *prat, size_t match, size_t nsub);
	// number of captured subexpressions of the given match, 0 if there is none
	int regexapi_nsubs(regexapi_t *prat, size_t match);
	// number of matches recorded, 0 for a NULL context
	int regexapi_matches(regexapi_t *prat);
	// stored regex return code, 0 for a NULL context
	int regexapi_err(regexapi_t *prat);
	// stored error text, "" when there is none
	const char *regexapi_errStr(regexapi_t *prat);
	// compile pregex with cflags and record up to findCount matches in pstr
	regexapi_t *regexapi_exec(const char *pstr, const char *pregex, unsigned int cflags, unsigned int findCount);

	// for simplicity: returns non-zero when pregex matches pstr
	int regexapi(const char *pstr, const char *pregex, int cflags);
 99 | 
100 | #ifdef __cplusplus
101 | }
102 | #endif
103 | 
104 | #endif
105 | 
106 | 


--------------------------------------------------------------------------------
/regexapi_helper.c:
--------------------------------------------------------------------------------
 1 | /*--------------------------------------------------------------------*
 2 |  *
 3 |  * Developed by;
 4 |  *	Neal Horman - http://www.wanlink.com
 5 |  *	Copyright (c) 2015 Neal Horman. All Rights Reserved
 6 |  *
 7 |  *	This "source code" is free software: you can redistribute it and/or modify
 8 |  *	it under the terms of the GNU General Public License as published by
 9 |  *	the Free Software Foundation, either version 3 of the License, or
10 |  *	(at your option) any later version.
11 |  *
12 |  *	This "source code" is distributed in the hope that it will be useful,
13 |  *	but WITHOUT ANY WARRANTY; without even the implied warranty of
14 |  *	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 |  *	GNU General Public License for more details.
16 |  *
17 |  *	You should have received a copy of the GNU General Public License
18 |  *	along with this "source code".  If not, see <http://www.gnu.org/licenses/>.
19 |  *
20 |  *	RCSID:  $Id$
21 |  *
22 |  *--------------------------------------------------------------------*/
23 | 
24 | #include <stdio.h>
25 | #include <stdlib.h>
26 | #include <unistd.h>
27 | 
28 | #include "regexapi.h"
29 | #include "regexapi_helper.h"
30 | 
// URL validation support
// One entry of a pattern table; each field is passed on to regexapi_exec()
typedef struct _regexapilist_t
{
	const char *pattern; // regex source; NULL marks the end of a table
	int flags; // regcomp() flags
	int findCount; // maximum number of matches to record
}regexapilist_t;
38 | 
// Building blocks of the URL pattern (extended regex syntax).
// Every octet of URLHOSTIPV4 is [0-9]{1,3}; a "[0.9]" class would only
// accept the characters '0', '.' and '9'.
#define URLHOSTNAME "([a-z0-9][a-z0-9._-]*[.][a-z]{2,})"
#define URLHOSTIPV4 "([0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3})"
#define URLHOSTLOCAL "(localhost)"
#define URLHOST "(" URLHOSTNAME "|" URLHOSTLOCAL "|" URLHOSTIPV4 ")"
#define URLPORT "(:[0-9]+)*"
#define URLSPEC URLHOST URLPORT
#define URISPEC "/.*"

// Fully expanded, "(http[s]?)://(" URLSPEC ")(" URISPEC ")" reads:
// (http[s]?)://((([a-z0-9][a-z0-9._-]*[.][a-z]{2,})|(localhost)|([0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}))(:[0-9]+)*)(/.*)
49 | 
50 | // List of valid URL regexes that CURL supports
51 | static regexapilist_t const regexUrls[] =
52 | {
53 | 	{ "(http[s]?)://(" URLSPEC ")(" URISPEC ")", ( REG_EXTENDED | REG_ICASE ), 2 },
54 | 	{ NULL, 0, 0 },
55 | };
56 | 
57 | // Supported URL regex validation iterator
58 | static regexapi_t *regexapi_exec_list(const char *subject, regexapilist_t const *pRegexList)
59 | {	regexapi_t *pRat = NULL;
60 | 
61 | 	while(pRat == NULL && pRegexList->pattern != NULL)
62 | 		pRat = regexapi_exec(subject, pRegexList->pattern, pRegexList->flags, pRegexList->findCount);
63 | 
64 | 	return pRat;
65 | }
66 | 
67 | regexapi_t *regexapi_url(char const *subject)
68 | {
69 | 	return regexapi_exec_list(subject, regexUrls);
70 | }
71 | 


--------------------------------------------------------------------------------
/regexapi_helper.h:
--------------------------------------------------------------------------------
 1 | /*--------------------------------------------------------------------*
 2 |  *
 3 |  * Developed by;
 4 |  *	Neal Horman - http://www.wanlink.com
 5 |  *	Copyright (c) 2015 Neal Horman. All Rights Reserved
 6 |  *
 7 |  *	This "source code" is free software: you can redistribute it and/or modify
 8 |  *	it under the terms of the GNU General Public License as published by
 9 |  *	the Free Software Foundation, either version 3 of the License, or
10 |  *	(at your option) any later version.
11 |  *
12 |  *	This "source code" is distributed in the hope that it will be useful,
13 |  *	but WITHOUT ANY WARRANTY; without even the implied warranty of
14 |  *	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 |  *	GNU General Public License for more details.
16 |  *
17 |  *	You should have received a copy of the GNU General Public License
18 |  *	along with this "source code".  If not, see <http://www.gnu.org/licenses/>.
19 |  *
20 |  *	RCSID:  $Id$
21 |  *
22 |  *--------------------------------------------------------------------*/
23 | 
24 | #ifndef _REGEXAPI_HELPER_H_
25 | #define _REGEXAPI_HELPER_H_
26 | 
27 | regexapi_t *regexapi_url(char const *subject);
28 | #endif
29 | 


--------------------------------------------------------------------------------
/sql/.gitignore:
--------------------------------------------------------------------------------
1 | # This directory will be populated when testing from input directory
2 | # Ignore everything in this directory
3 | *
4 | # Except this file
5 | !.gitignore
6 | 


--------------------------------------------------------------------------------