├── META.json ├── Makefile ├── README.md ├── curlapi.c ├── curlapi.h ├── data ├── blk_-729487577044220672 ├── customer_reviews_1998.1000.json.gz ├── data.json ├── data_broken.json └── invalid_gz_file.json.gz ├── expected └── .gitignore ├── gettickcount.c ├── gettickcount.h ├── input ├── basic_tests.source ├── customer_reviews.source ├── hdfs_block.source └── invalid_gz_file.source ├── json_fdw--1.0.sql ├── json_fdw.c ├── json_fdw.control ├── json_fdw.h ├── output ├── basic_tests.source ├── customer_reviews.source ├── hdfs_block.source └── invalid_gz_file.source ├── rciapi.c ├── rciapi.h ├── regexapi.c ├── regexapi.h ├── regexapi_helper.c ├── regexapi_helper.h └── sql └── .gitignore /META.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "json_fdw", 3 | "abstract": "Foreign Data Wrapper for JSON files", 4 | "description": "PostgreSQL extension which implements a Foreign Data Wrapper (FDW) for JSON files.", 5 | "version": "1.3.0", 6 | "maintainer": "Hadi Moshayedi ", 7 | "license": "gpl_3", 8 | "provides": { 9 | "json_fdw": { 10 | "abstract": "Foreign Data Wrapper for JSON files", 11 | "file": "json_fdw.c", 12 | "docfile": "README.md", 13 | "version": "1.3.0" 14 | } 15 | }, 16 | "prereqs": { 17 | "runtime": { 18 | "requires": { 19 | "PostgreSQL": "9.2.0" 20 | } 21 | } 22 | }, 23 | "resources": { 24 | "bugtracker": { 25 | "web": "http://github.com/citusdata/json_fdw/issues/" 26 | }, 27 | "repository": { 28 | "url": "git://github.com/citusdata/json_fdw.git", 29 | "web": "https://github.com/citusdata/json_fdw/", 30 | "type": "git" 31 | } 32 | }, 33 | "generated_by": "David E. 
Wheeler", 34 | "meta-spec": { 35 | "version": "1.0.0", 36 | "url": "http://pgxn.org/meta/spec.txt" 37 | }, 38 | "tags": [ 39 | "json", 40 | "fdw", 41 | "foreign data wrapper", 42 | "json_fdw" 43 | ] 44 | } 45 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # contrib/json_fdw/Makefile 2 | 3 | MODULE_big = json_fdw 4 | 5 | OBJS = json_fdw.o curlapi.o regexapi.o regexapi_helper.o gettickcount.o rciapi.o 6 | 7 | ifeq ($(shell uname -s), Linux) 8 | # Directly link against yajl 2, so it works in Ubuntu 12.04 too. 9 | SHLIB_LINK = -lz -l:libyajl.so.2 10 | else 11 | # Non-linux OS's (in particular, OS X) don't support "-l:" syntax, 12 | # so use the -lyajl flag instead. 13 | SHLIB_LINK = -lz -lyajl 14 | endif 15 | 16 | EXTENSION = json_fdw 17 | DATA = json_fdw--1.0.sql 18 | 19 | REGRESS = basic_tests customer_reviews hdfs_block invalid_gz_file 20 | EXTRA_CLEAN = sql/basic_tests.sql expected/basic_tests.out \ 21 | sql/customer_reviews.sql expected/customer_reviews.out \ 22 | sql/hdfs_block.sql expected/hdfs_block.out \ 23 | sql/invalid_gz_file.sql expected/invalid_gz_file.out 24 | 25 | # 26 | # Users need to specify their Postgres installation path through pg_config. For 27 | # example: /usr/local/pgsql/bin/pg_config or /usr/lib/postgresql/9.2/bin/pg_config 28 | # 29 | 30 | # find pg_config 31 | OS:=$(shell uname -s) 32 | PG_CONFIG:= $(shell which pg_config) 33 | FIND_ROOTPATH:= "/" 34 | ifeq (${OS},Darwin) 35 | FIND_ROOTPATH:= $(shell if [ -d "/Applications" ]; then echo "/Applications"; else echo "/"; fi) 36 | endif 37 | PG_CONFIG:= $(shell if [ ! 
-e "pg_config.loc" ]; then find $(FIND_ROOTPATH) -name pg_config > pg_config.loc; fi; cat pg_config.loc) 38 | 39 | # for locally built uninstalled libraries, do this 40 | YAJLDIR= ../yajl.git/build/yajl-2.1.1 41 | PG_CPPFLAGS+= -I$(YAJLDIR)/include 42 | SHLIB_LINK+= -L$(YAJLDIR)/lib 43 | 44 | ZLIBDIR= ../zlib-1.2.8 45 | PG_CPPFLAGS+= -I$(ZLIBDIR) 46 | SHLIB_LINK+= -L$(ZLIBDIR) 47 | 48 | # for locally built uninstalled curl, do this 49 | CURLDIR= ../curl-7.40.0 50 | PG_CPPFLAGS+= -I$(CURLDIR)/include 51 | SHLIB_LINK+= -L$(CURLDIR)/lib/.libs -lcurl -lssl -lcrypto 52 | 53 | # for system version of curl, do this 54 | #CURL_CONFIG:= $(shell which curl-config) 55 | #PG_CPPFLAGS+= $(shell sh $(CURL_CONFIG) --cflags) 56 | #SHLIB_LINK+= $(shell sh $(CURL_CONFIG) --static-libs) 57 | 58 | PGXS := $(shell $(PG_CONFIG) --pgxs) 59 | include $(PGXS) 60 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | json_fdw2 2 | ======== 3 | 4 | **json_fdw2** is a fork of the [citusdata/json_fdw] PostgreSQL Foreign Data Wrapper (FDW) extension project, to query locally stored JSON files, and supports analytic queries against array types, nested fields, and 5 | heterogeneous documents. 6 | 7 | 8 | Project Goal 9 | --- 10 | 11 | The original project is only capable of **Select** operations, i.e. read-only, and only from local JSON files. 12 | This fork's goals are: 13 | 1. Add the ability to operate on remote JSON content via HTTP operations, in a RESTful style/manner. 14 | 2. Add support for **Update**, **Insert** and **Delete** operations. 15 | 16 | 17 | Progress 18 | --- 19 | 20 | 1. Done 21 | 2. I have completed the work for **Update** and **Insert**, and believe them to both function correctly. 
22 | 23 | 24 | Todo 25 | --- 26 | * Implement **Delete** operation support 27 | * Only execute remote ETAG re-validation after aging based on Cache-Control and/or Expires headers. 28 | 29 | 30 | Limitations 31 | --- 32 | 33 | * json\_fdw2 currently only works with PostgreSQL 9.4 34 | 35 | * json\_fdw2 only supports files that consist of one JSON document per line. It 36 | doesn't support objects that span multiple lines. 37 | 38 | * PostgreSQL limits column names to 63 characters by default. If you need column 39 | names that are longer, you can increase the NAMEDATALEN constant in 40 | src/include/pg\_config\_manual.h, compile, and reinstall. 41 | 42 | 43 | Dependencies 44 | --- 45 | 46 | * [nkhorman/yajl] You'll need to use the \`\`json_path'' branch. **Do not** use the yajl from http://github.com/lloyd/yajl, json\_fdw2 won't compile! 47 | * [libcurl-7.40.0] Only curl-7.40.0 has been tested. 48 | * zlib-1.2.8 49 | 50 | 51 | Building 52 | -------- 53 | 54 | The following build instructions are from the original project and are old: 55 | 56 | 57 | ## Fedora 17+ 58 | sudo yum install zlib-devel yajl-devel 59 | 60 | ## Ubuntu 12.10+ 61 | sudo apt-get update 62 | sudo apt-get install zlib1g-dev libyajl-dev 63 | 64 | ## Other Linux Distributions 65 | (First install zlib-devel, cmake, and ruby) 66 | wget http://github.com/lloyd/yajl/tarball/2.0.1 -O yajl-2.0.1.tar.gz 67 | tar -xzvf yajl-2.0.1.tar.gz 68 | cd lloyd-yajl-f4b2b1a 69 | ./configure 70 | make 71 | sudo make install 72 | echo "/usr/local/lib" | sudo tee /etc/ld.so.conf.d/libyajl.conf 73 | sudo ldconfig 74 | 75 | Once you have yajl and zlib installed on your machine, you are ready to build 76 | json\_fdw2. For this, you need to include the pg\_config directory path in your 77 | make command. This path is typically the same as your PostgreSQL installation's 78 | bin/ directory path. 
For example: 79 | 80 | PATH=/usr/local/pgsql/bin/:$PATH make 81 | sudo PATH=/usr/local/pgsql/bin/:$PATH make install 82 | 83 | **Note**: In RedHat 5.X and CentOS 5.X you may need to edit the Makefile and change "-l:libyajl.so.2" to "-lyajl". 84 | 85 | 86 | Usage 87 | ----- 88 | 89 | The following parameters can be set on a JSON foreign table object; 90 | 91 | * \`\`filename'': The absolute path of a json file or a gzipped json file. 92 | * \`\`max\_error\_count'': Maximum number of invalid json documents to skip before 93 | erroring out. Defaults to 0. 94 | 95 | As an example, we demonstrate querying a compressed JSON file from scratch here. Note 96 | that the underlying file contains JSON documents separated by newlines. 97 | Start with downloading the file. 98 | 99 | wget http://examples.citusdata.com/customer_reviews_nested_1998.json.gz 100 | 101 | Next, log into Postgres, and run the following commands to create a 102 | foreign table associated with this JSON file. 103 | 104 | -- load extension first time after install 105 | CREATE EXTENSION json_fdw; 106 | 107 | -- create server object 108 | CREATE SERVER json_server FOREIGN DATA WRAPPER json_fdw; 109 | 110 | -- create foreign table 111 | CREATE FOREIGN TABLE customer_reviews 112 | ( 113 | customer_id TEXT, 114 | "review.date" DATE, 115 | "review.rating" INTEGER, 116 | "product.id" CHAR(10), 117 | "product.group" TEXT, 118 | "product.title" TEXT, 119 | "product.similar_ids" CHAR(10)[] 120 | ) 121 | SERVER json_server 122 | OPTIONS (filename '/home/citusdata/customer_reviews_nested_1998.json.gz'); 123 | 124 | -- optionally, collect data distribution statistics 125 | ANALYZE customer_reviews; 126 | 127 | Finally, let's run some example SQL queries on your JSON file. 
128 | 129 | -- find all reviews a particular customer made on the Dune series in 1998 130 | 131 | SELECT 132 | customer_id, "review.rating", "product.id", "product.title" 133 | FROM 134 | customer_reviews 135 | WHERE 136 | customer_id ='A27T7HVDXA3K2A' AND 137 | "product.title" LIKE '%Dune%' AND 138 | "review.date" >= '1998-01-01' AND 139 | "review.date" <= '1998-12-31'; 140 | 141 | -- do we have a correlation between a book's title's length and its review ratings? 142 | 143 | SELECT 144 | width_bucket(length("product.title"), 1, 50, 5) title_length_bucket, 145 | round(avg("review.rating"), 2) AS review_average, 146 | count(*) 147 | FROM 148 | customer_reviews 149 | WHERE 150 | "product.group" = 'Book' 151 | GROUP BY 152 | title_length_bucket 153 | ORDER BY 154 | title_length_bucket; 155 | 156 | 157 | Fetching Remote Files 158 | --------------------- 159 | For remote fetch operations, the \`\`filename'' parameter is now overloaded as 160 | any valid HTTP URL, and an additional parameter has been introduced; 161 | 162 | * \`\`http\_post\_vars'': A list of key value pairs separated by the \`\`&'' 163 | symbol that are sent in a post operation. 164 | 165 | Using key value pairs in the filename URL and in the http\_post\_vars option is 166 | not mutually exclusive; however, a given key value pair should only exist in 167 | one or the other. 168 | 169 | The following example shows how to fetch remote files, that are then cached locally. 170 | Local caching of the remote content is done, and validated using Entity Tags (ETAG header) upon every query of the table content. 171 | 172 | **Note**: the existing handling of Gzip files is supported, because, after the 173 | file is fetched, it is handed off to the existing file handling code, as if 174 | it were previously staged on disk. 
175 | 176 | Based on how libcurl is built the following are supported, but untested; 177 | 178 | * Both Content Encoding and Transport Encoding 179 | * Https 180 | 181 | Fictitious usage example, using a standard Get operation; 182 | 183 | -- create foreign table - using optional get parameters 184 | CREATE FOREIGN TABLE an_example_table 185 | ( 186 | fieldName1 TEXT, 187 | fieldName2 INTEGER, 188 | . ., 189 | . ., 190 | . . 191 | ) 192 | SERVER json_server 193 | OPTIONS (filename 'http://www.example.com/file/location/url/some.json.gz?optional=paramaters&separated=traditionally'); 194 | 195 | 196 | Fictitious usage example, using a Post operation; 197 | 198 | -- create foreign table - using optional post and get parameters 199 | CREATE FOREIGN TABLE another_example_table 200 | ( 201 | fieldName1 TEXT, 202 | fieldName2 INTEGER, 203 | . ., 204 | . ., 205 | . . 206 | ) 207 | SERVER json_server 208 | OPTIONS (filename 'http://www.example.com/file/location/url/someother.json', http_post_vars 'another=parameter_set&separated=traditionally'); 209 | 210 | 211 | Refining the original table example, the "wget" operation and query 212 | operation are performed in a single step, create the table as below. 213 | 214 | -- create foreign table 215 | CREATE FOREIGN TABLE customer_reviews 216 | ( 217 | customer_id TEXT, 218 | "review.date" DATE, 219 | "review.rating" INTEGER, 220 | "product.id" CHAR(10), 221 | "product.group" TEXT, 222 | "product.title" TEXT, 223 | "product.similar_ids" CHAR(10)[] 224 | ) 225 | SERVER json_server 226 | OPTIONS (filename 'http://examples.citusdata.com/customer_reviews_nested_1998.json.gz'); 227 | 228 | 229 | The additional table options \`\`rom_url'' and \`\`rom_path'' are required for operations 230 | other than **Select**. Use of these two options is mutually exclusive with the \`\`filename'' and 231 | \`\`http_post_vars'' table options. 
232 | 233 | Rather than add additional table options for differing operations, ie Select, Insert, etc., 234 | which necessitate table destruction and re-creation to change, a more flexible approach was 235 | taken by using a json object to describe the operational characteristics. The location of 236 | the json object is specified by the \`\`rom_url'' option. 237 | 238 | The \`\`rom_path'' option is used to specify which operation set to use. ie. the name of 239 | the table to be operated on. 240 | 241 | An example ROM (Remote Operations Mapping) json object follows; 242 | 243 | { 244 | "romschema": "2", 245 | "host": "", 246 | "url": "/some/uri/path", 247 | 248 | "rom_path_1": 249 | { 250 | "url": "/", 251 | "select":{ 252 | "method": "get", 253 | "url": "/", 254 | "query": [ {"name":"mode", "value":"multi-doc"}, {"name":"t", "value":3} ] 255 | }, 256 | "insert":{ 257 | "method": "put", 258 | "url": "/", 259 | "query": [ {"name":"t", "value":4} ] 260 | }, 261 | "update":{ 262 | "method": "put", 263 | "url": "/", 264 | "query": [ {"name":"mode", "value":"multi-doc"}, {"name":"t", "value":3} ] 265 | } 266 | }, 267 | "rom_path_other": 268 | { 269 | "select":{ 270 | "method":"get", 271 | "query": [ {"name":"other", "value":"foo"} ] 272 | } 273 | } 274 | } 275 | 276 | The "romschema" value of 2 is fixed, used as the only schema validation of the ROM. 277 | 278 | The "url" string elements specified inside a given rom_path and/or rom_path operation, are 279 | optional, and if specified as "/", will be ignored, however, if present, will be used to 280 | create the effective url. Each of the "query" arrayed object elements are concatenated with 281 | the effective url as request key value pairs. 
So for example, given the following table options; 282 | 283 | (rom_url 'http://www.example.com/object/rom.json', rom_path 'rom_path_1') 284 | 285 | and an SQL Select operation with the rom_url pointing to the example rom above, the following 286 | url will be used; 287 | 288 | http://www.example.com/some/uri/path/?mode=multi-doc&t=3 289 | 290 | as the fetch url for content to be retrieved, as if it had been used in the \`\`filename'' 291 | table option. 292 | 293 | The "host" string element at the root of the ROM is used to prepend the "url" string element. 294 | If specified as; 295 | 296 | http://api.example.com:8080 297 | 298 | 299 | Then an SQL Select operation would use the following url; 300 | 301 | http://api.example.com:8080/some/uri/path/?mode=multi-doc&t=3 302 | 303 | **Note:** Only http based operations are supported for ROM actions. Also, presently, "get" 304 | is the only method supported for Select operations, and only "put" is supported for 305 | Insert, and Update operations. 306 | 307 | 308 | 309 | Table Schema Conventions 310 | ------------------------ 311 | 312 | There are three things worth noting about table schemas. First, nested fields 313 | in JSON documents are referenced using dot separators. For example, a field defined 314 | as "review": { "rating" : 5 } in a JSON document is declared as "review.rating" 315 | in the foreign table schema. The quotes around "review.rating" are necessary, as 316 | identifiers that include dots aren't valid in Postgres otherwise. 317 | 318 | Second, the foreign table schema is defined at read-time. If you have an additional 319 | field that you'd like to query, such as "review.votes", you can simply add the 320 | column name and start querying for data. You can even create multiple table schemas 321 | for the same underlying JSON, and query through them. 322 | 323 | Third, json\_fdw2 assumes that underlying data can be heterogeneous. 
If you are 324 | querying for a column, and this field doesn't exist in a document, or the field's 325 | data type doesn't match the declared column type, json\_fdw2 considers that particular 326 | field to be null. 327 | 328 | 329 | Querying Multiple Sources 330 | ----------------------- 331 | 332 | json\_fdw2 borrows its semantics from file\_fdw, and associates one foreign table 333 | with one JSON source. If you'd like to query all your JSON sources from one table, 334 | you could use PostgreSQL's basic table partitioning feature, and manually create 335 | one child table per JSON file. 336 | 337 | 338 | Copyright 339 | --------- 340 | 341 | Portions Copyright (c) 2015 Neal Horman 342 | 343 | Portions Copyright (c) 2013 Citus Data, Inc. 344 | 345 | This module is free software; you can redistribute it and/or modify it under the 346 | GNU GPL v3.0 License. 347 | 348 | 349 | 350 | [citusdata/json_fdw]: 351 | [nkhorman/yajl]: 352 | [libcurl-7.40.0]: 353 | -------------------------------------------------------------------------------- /curlapi.c: -------------------------------------------------------------------------------- 1 | /*--------------------------------------------------------------------* 2 | * 3 | * Developed by; 4 | * Neal Horman - http://www.wanlink.com 5 | * Copyright (c) 2015 Neal Horman. All Rights Reserved 6 | * 7 | * This "source code" is free software: you can redistribute it and/or modify 8 | * it under the terms of the GNU General Public License as published by 9 | * the Free Software Foundation, either version 3 of the License, or 10 | * (at your option) any later version. 11 | * 12 | * This "source code" is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU General Public License for more details. 
16 | * 17 | * You should have received a copy of the GNU General Public License 18 | * along with this "source code". If not, see . 19 | * 20 | * RCSID: $Id$ 21 | * 22 | *--------------------------------------------------------------------*/ 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #include // for struct dirent 32 | #include // for struct dirent 33 | #include // for mkdir 34 | #include // for MD5_xxx foo 35 | #include // for pthread_self() 36 | 37 | #include "curl/curl.h" 38 | #include "curlapi.h" 39 | #include "regexapi.h" 40 | #include "regexapi_helper.h" 41 | #include "gettickcount.h" 42 | 43 | // Where files are downloaded to 44 | #define CURL_BASE_DIR "/tmp/json_fdw_cache" 45 | // Maximum length of on disk tempoarary file names 46 | #define MAXFILENAME 1024 47 | 48 | #define FREEPTR(a) do { if((a) != NULL) { free((a)); (a) = NULL; }; } while(0) 49 | 50 | #ifdef DEBUG_WLOGIT 51 | void (*gcurlLogFn)(const char *) = NULL; 52 | 53 | void curlLogItSet(void (*pfn)(const char *)) 54 | { 55 | gcurlLogFn = pfn; 56 | } 57 | 58 | static void curlLogIt(const char *pFmt, ...) 
59 | { 60 | if(gcurlLogFn != NULL) 61 | { va_list vl; 62 | char *pStr = NULL; 63 | 64 | va_start(vl, pFmt); 65 | vasprintf(&pStr, pFmt, vl); 66 | va_end(vl); 67 | 68 | if(pStr != NULL) 69 | { 70 | gcurlLogFn(pStr); 71 | free(pStr); 72 | } 73 | } 74 | } 75 | #endif 76 | 77 | static const char *hexDigits = "0123456789ABCDEF"; 78 | 79 | // An MD5 object 80 | typedef struct _cmd5_t 81 | { 82 | MD5_CTX ctx; 83 | unsigned char digest[MD5_DIGEST_LENGTH]; 84 | char ascii[(MD5_DIGEST_LENGTH*2)+1]; 85 | }cmd5_t; // Curl MD5 Type 86 | 87 | // Alloc and Init MD5 object 88 | static cmd5_t *curlMd5Init(void) 89 | { cmd5_t *pMd5 = calloc(1, sizeof(cmd5_t)); 90 | 91 | if(pMd5 != NULL) 92 | MD5_Init(&pMd5->ctx); 93 | 94 | return pMd5; 95 | } 96 | 97 | // Free an MD5 object 98 | static void curlMd5Free(cmd5_t *pMd5) 99 | { 100 | FREEPTR(pMd5); 101 | } 102 | 103 | static void curlMd5Hash(cmd5_t *pMd5, const char *pStr) 104 | { 105 | MD5_Update(&pMd5->ctx, (const unsigned char *)pStr, strlen(pStr)); 106 | } 107 | 108 | // Finalize the MD5 object, and build an ASCII string 109 | // of the digest, then strdup it 110 | static char *curlMd5Final(cmd5_t *pMd5) 111 | { int i; 112 | 113 | MD5_Final(pMd5->digest, &pMd5->ctx); 114 | 115 | // Convert MD5 digest into ASCII string 116 | for (i = 0; i < MD5_DIGEST_LENGTH; i++) 117 | { 118 | pMd5->ascii[i+i] = hexDigits[pMd5->digest[i] >> 4]; 119 | pMd5->ascii[i+i+1] = hexDigits[pMd5->digest[i] & 0x0f]; 120 | } 121 | 122 | return strdup(pMd5->ascii); 123 | } 124 | 125 | // Callback from CURL to write contents to disk 126 | static size_t curlWriteCallback(void *contents, size_t size, size_t nmemb, void *userp) 127 | { ccf_t *pCcf = (ccf_t *)userp; 128 | 129 | return fwrite(contents, size, nmemb, pCcf->pFile); 130 | } 131 | 132 | // If pHdr matches the first of pSrc, then duplicate the balance of the header 133 | // The caller must free the result 134 | static char *curlHeaderCallbackMatch(const char *pSrc, size_t srcLen, const char *pHdr) 135 | { 
char *pDst = NULL; 136 | size_t hdrLen = strlen(pHdr); 137 | 138 | // capture the etag header value 139 | if(srcLen > hdrLen && strncasecmp(pSrc, pHdr, hdrLen) == 0) 140 | { const char *pl = pSrc + hdrLen; 141 | const char *pr = pSrc + srcLen - 1; 142 | 143 | // left trim 144 | while(*pl == ' ' || *pl == '\t') 145 | pl++; 146 | // right trim 147 | while(*pl == ' ' || *pl == '\t' || *pr == '\n' || *pr == '\r') 148 | pr--; 149 | 150 | if(pr>pl) 151 | { int l = pr-pl+1; 152 | 153 | asprintf(&pDst, "%*.*s", l, l, pl); 154 | } 155 | } 156 | 157 | return pDst; 158 | } 159 | 160 | // Callback from CURL for header examination 161 | // Collect header values that we are interested in 162 | static size_t curlHeaderCallback(void *contents, size_t size, size_t nmemb, void *userp) 163 | { cfr_t *pCfr = (cfr_t *)userp; 164 | size_t len = size * nmemb; 165 | 166 | if(pCfr != NULL) 167 | { char *pHdrVal = NULL; 168 | ccf_t *pCcf = &pCfr->ccf; 169 | int i; 170 | struct hdra_t 171 | { 172 | const char *str; 173 | size_t idx; 174 | } pHdrs[] = 175 | { 176 | // Order in this array doesn't matter, 177 | // but the number of elements in this 178 | // array must be no more than HDR_COUNT 179 | {HDR_STR_ETAG, HDR_IDX_ETAG}, 180 | {HDR_STR_LASTMODIFIED, HDR_IDX_LASTMODIFIED}, 181 | {HDR_STR_CACHECONTROL, HDR_IDX_CACHECONTROL} 182 | }; 183 | 184 | // Search the array of header keys, find the one that matches what 185 | // was just passed into us in contents, and, if not already set to 186 | // non-null, store the duplicated header value 187 | for(i=0; pHdrVal == NULL && i < sizeof(pHdrs)/sizeof(pHdrs[0]); i++) 188 | { 189 | // pHdrVal is already strdup'd for us 190 | pHdrVal = curlHeaderCallbackMatch((const char *)contents, len, pHdrs[i].str); 191 | 192 | if(pHdrVal != NULL) 193 | { 194 | FREEPTR(pCcf->pHdrs[pHdrs[i].idx]); 195 | pCcf->pHdrs[pHdrs[i].idx] = pHdrVal; 196 | } 197 | } 198 | } 199 | 200 | return len; 201 | } 202 | 203 | // Create a temporary file possibly to write into, 204 | // 
if we receive content from the fetch operation 205 | // Also, figure out what filename we should use for 206 | // content caching purposes. 207 | static void curlCacheFileOpen(ccf_t *pCcf) 208 | { int fd = -1; 209 | char tmpfnamebuf[MAXFILENAME]; 210 | 211 | // make sure we can store our files 212 | mkdir(CURL_BASE_DIR, 0755); 213 | 214 | // create a temporary file, for possible use later 215 | memset(tmpfnamebuf, 0, sizeof(tmpfnamebuf)); 216 | sprintf(tmpfnamebuf, "%s/tmpXXXXXXXXXX", CURL_BASE_DIR); 217 | 218 | if((fd = mkstemp(tmpfnamebuf)) != -1) 219 | { 220 | pCcf->pFileNameTmp = strdup(tmpfnamebuf); 221 | pCcf->bNeedUnlink = true; 222 | 223 | // Get a FILE pointer 224 | pCcf->pFile = (fd != -1 ? fdopen(fd, "w") : NULL); 225 | } 226 | 227 | // Figure out what the on disk filename should be after the retrieval 228 | if(pCcf->pUrlBaseName == NULL || !*pCcf->pUrlBaseName) 229 | { 230 | FREEPTR(pCcf->pUrlBaseName); 231 | FREEPTR(pCcf->pFileName); 232 | 233 | // The URL didn't specify a file, use the urlhash as the filename 234 | asprintf(&pCcf->pFileName, "%s/%s", CURL_BASE_DIR, pCcf->pUrlHash); 235 | } 236 | else // Use the specified basename of the filename from the URL 237 | // so that file handling semantics based on filenames work 238 | asprintf(&pCcf->pFileName, "%s/%s", CURL_BASE_DIR, pCcf->pUrlBaseName); 239 | } 240 | 241 | // Test if pUrl is a CURL supported URL 242 | // If so, grab the basename, for use later 243 | static bool curlIsUrl(const char *pUrl, ccf_t *pCcf) 244 | { bool bIsUrl = false; 245 | regexapi_t *pRat = regexapi_url(pUrl); 246 | 247 | // If we found a regex match, then we assume that CURL supports the url 248 | if(pRat != NULL) 249 | { int regexNSubs = regexapi_nsubs(pRat, 0); 250 | // Assume that the last subcomponent of the regex is the filename portion 251 | const char *pRegexSub = (regexNSubs > 1 ? regexapi_sub(pRat, 0, regexNSubs - 1) : NULL); 252 | // and get the basename of that 253 | char *pBaseName = (pRegexSub != NULL ? 
strrchr(pRegexSub, '/') : NULL); 254 | 255 | bIsUrl = (pBaseName != NULL && *pBaseName); 256 | if(bIsUrl) 257 | { char *pTerm = strchr(pBaseName, '?'); 258 | 259 | // The string returned to us is not const, so we'll terminate it 260 | // at the URI point, so as to not have silly basenames 261 | if(pTerm != NULL) 262 | *pTerm = 0; 263 | 264 | // no basename, just a plain url ? 265 | if(*pBaseName == '/') 266 | pBaseName++; 267 | 268 | if(*pBaseName) 269 | pCcf->pUrlBaseName = strdup(pBaseName); 270 | } 271 | 272 | // Cleanup the regex 273 | regexapi_free(pRat); 274 | } 275 | 276 | return bIsUrl; 277 | } 278 | 279 | // Returns a url character encoded string 280 | // The caller must free() the result 281 | static char *curlEncodeUrlCharacters(const char *src) 282 | { char *dst = (src != NULL ? calloc(1,strlen(src)*3) : NULL); 283 | char *str = dst; 284 | 285 | if(src != NULL) 286 | { int eq = 0; // we assume that the first `=' is the kvp separator (ie val=data), so don't encode it. 287 | 288 | while(*src) 289 | { 290 | // http://en.wikipedia.org/wiki/Percent-encoding#Character_data plus a few more 291 | if((eq != 0 && *src == '=') || strchr("\"%-.<>\\^_`{|}~[],:#@?;\r\n", *src)) 292 | { char c = *src; 293 | 294 | eq = (*src == '='); 295 | 296 | *(dst++) = '%'; 297 | *(dst++) = hexDigits[c >> 4]; 298 | *(dst++) = hexDigits[c & 0x0f]; 299 | } 300 | else if(*src == ' ') 301 | *(dst++) = '+'; 302 | else 303 | { 304 | if(*src == '&') 305 | eq = 0; 306 | *(dst++) = *src; 307 | } 308 | src++; 309 | } 310 | } 311 | 312 | return str; 313 | } 314 | 315 | static void curlCfrClose(cfr_t *pCfr) 316 | { 317 | if(pCfr != NULL) 318 | { 319 | if(pCfr->ccf.pFile != NULL) 320 | { 321 | fflush(pCfr->ccf.pFile); 322 | fclose(pCfr->ccf.pFile); 323 | pCfr->ccf.pFile = NULL; 324 | } 325 | } 326 | } 327 | 328 | // Free the structure and sub-components 329 | void curlCfrFree(cfr_t *pCfr) 330 | { 331 | if(pCfr != NULL) 332 | { int i; 333 | 334 | curlCfrClose(pCfr); 335 | 336 | 
if(pCfr->ccf.pFileNameTmp != NULL) 337 | { 338 | if(pCfr->ccf.bNeedUnlink) 339 | unlink(pCfr->ccf.pFileNameTmp); 340 | } 341 | 342 | FREEPTR(pCfr->ccf.pUrlBaseName); 343 | FREEPTR(pCfr->ccf.pFileName); 344 | FREEPTR(pCfr->ccf.pUrlHash); 345 | FREEPTR(pCfr->ccf.pFileNameTmp); 346 | 347 | for(i=0; iccf.pHdrs[i]); 349 | 350 | FREEPTR(pCfr->pContentType); 351 | 352 | free(pCfr); 353 | } 354 | } 355 | 356 | // Build an md5 hash for the URL being requested 357 | // The caller must free the result 358 | static char *curlUrlHash(const char *pUrl, const char *pHttpPostVars) 359 | { cmd5_t * pMd5 = curlMd5Init(); 360 | char *pUrlHash = NULL; 361 | 362 | curlMd5Hash(pMd5, pUrl); 363 | if(pHttpPostVars != NULL) 364 | curlMd5Hash(pMd5, pHttpPostVars); 365 | pUrlHash = curlMd5Final(pMd5); 366 | curlMd5Free(pMd5); 367 | 368 | return pUrlHash; 369 | } 370 | 371 | /* 372 | static ccf_t *curlCacheMetaSet(const char *pFileName 373 | , const char *pEtag 374 | , const char *pLastModified 375 | , const char *pCacheControl 376 | ) 377 | { ccf_t *pCcf = calloc(1, sizeof(ccf_t)); 378 | 379 | if(pCcf != NULL) 380 | { 381 | pCcf->pFileName = strdup(pFileName); 382 | pCcf->pHdrs[HDR_IDX_ETAG] = strdup(pEtag); 383 | pCcf->pHdrs[HDR_IDX_LASTMODIFIED] = strdup(pLastModified); 384 | pCcf->pHdrs[HDR_IDX_CACHECONTROL] = strdup(pCacheControl); 385 | } 386 | 387 | return pCcf; 388 | } 389 | */ 390 | 391 | // Sort of like strtok, but more convienient. 392 | // Scribbles in the source. 393 | // Return a pointer to the begining of the 'delim'ited string, 394 | // white space trimmed on the left. 395 | // Also addvances the source pointer to the delimited point, 396 | // zero terminates it, then white space trimmed on the right. 
397 | static char *stradvtok(char **ppSrc, char delim) 398 | { 399 | char *dst = *ppSrc; 400 | char *src = *ppSrc; 401 | 402 | while(src != NULL && *src && *src != delim) 403 | { 404 | if(dst == src && *src != delim && (*src == ' ' || *src == '\t' || *src == '\r' || *src == '\n')) 405 | dst++; 406 | src++; 407 | } 408 | 409 | if(*src == delim) 410 | { 411 | *src = '\0'; 412 | src++; 413 | while(*src == ' ' || *src == '\t' || *src == '\r' || *src == '\n') 414 | src++; 415 | } 416 | 417 | *ppSrc = src; 418 | 419 | return dst; 420 | } 421 | 422 | 423 | static void curlCacheMetaGet(ccf_t *pCcf) 424 | { char *pFname = NULL; 425 | 426 | asprintf(&pFname, "%s/%s.meta", CURL_BASE_DIR, pCcf->pUrlHash); 427 | if(pFname != NULL) 428 | { FILE *fin = fopen(pFname, "r"); 429 | 430 | if(fin != NULL) 431 | { char buf[4096]; 432 | char *p1; 433 | char *p2; 434 | char *p3; 435 | char *p4; 436 | char *pbuf; 437 | 438 | memset(buf, 0, sizeof(buf)); 439 | pbuf = fgets(buf, sizeof(buf)-1, fin); 440 | 441 | if(pbuf != NULL) 442 | { 443 | p1 = stradvtok(&pbuf, '|'); 444 | p2 = stradvtok(&pbuf, '|'); 445 | p3 = stradvtok(&pbuf, '|'); 446 | p4 = stradvtok(&pbuf, '|'); 447 | 448 | pCcf->pFileName = strdup(p1); 449 | pCcf->pHdrs[HDR_IDX_ETAG] = strdup(p2); 450 | pCcf->pHdrs[HDR_IDX_LASTMODIFIED] = strdup(p3); 451 | pCcf->pHdrs[HDR_IDX_CACHECONTROL] = strdup(p4); 452 | } 453 | 454 | fclose(fin); 455 | } 456 | 457 | free(pFname); 458 | } 459 | } 460 | 461 | #define NOTNULLPTR(a) ((a) != NULL ? 
(a) : "") 462 | 463 | static void curlCacheMetaPut(ccf_t *pCcf) 464 | { char *pFname = NULL; 465 | 466 | asprintf(&pFname, "%s/%s.meta", CURL_BASE_DIR, pCcf->pUrlHash); 467 | 468 | // TODO - lock operation to prevent contention races 469 | if(pFname != NULL) 470 | { FILE *fout = fopen(pFname, "w"); 471 | 472 | if(fout != NULL) 473 | { 474 | fprintf(fout,"%s|%s|%s|%s|" 475 | , NOTNULLPTR(pCcf->pFileName) 476 | , NOTNULLPTR(pCcf->pHdrs[HDR_IDX_ETAG]) 477 | , NOTNULLPTR(pCcf->pHdrs[HDR_IDX_LASTMODIFIED]) 478 | , NOTNULLPTR(pCcf->pHdrs[HDR_IDX_CACHECONTROL]) 479 | ); 480 | fclose(fout); 481 | } 482 | free(pFname); 483 | } 484 | } 485 | 486 | // Move the temp file to the cached file ? 487 | static void curlCacheFileFinalize(cfr_t *pCfr) 488 | { 489 | if(pCfr != NULL) 490 | { 491 | // TODO; 492 | // 1. set unlink flag based on cache-control 493 | switch(pCfr->httpResponseCode) 494 | { 495 | case 200: // new content, remove old, use new 496 | // TODO - lock operation to prevent contention races 497 | unlink(pCfr->ccf.pFileName); 498 | rename(pCfr->ccf.pFileNameTmp, pCfr->ccf.pFileName); 499 | pCfr->ccf.bNeedUnlink = false; 500 | break; 501 | case 304: // no new content, remove temp file 502 | default: 503 | unlink(pCfr->ccf.pFileNameTmp); 504 | break; 505 | } 506 | } 507 | } 508 | 509 | static CURL *curlCoreInit(const char *pUrl, void *pHeaderFn, void *pHeaderData) 510 | { CURL *curl_handle = NULL; 511 | 512 | curl_global_init(CURL_GLOBAL_ALL); 513 | curl_handle = curl_easy_init(); 514 | curl_easy_setopt(curl_handle, CURLOPT_URL, pUrl); 515 | 516 | curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, "json_fdw/1.2 (+http://github.com/nkhorman/json_fdw) libcurl-agent/1.0"); 517 | curl_easy_setopt(curl_handle, CURLOPT_TIMEOUT, 30); // TODO - table option ? 
518 | 519 | curl_easy_setopt(curl_handle, CURLOPT_ACCEPT_ENCODING, ""); // turn on builtin supported default content dencoding 520 | //curl_easy_setopt(curl_handle, CURLOPT_TRANSFER_ENCODING, 1L); // turn on transfer decoding 521 | 522 | curl_easy_setopt(curl_handle, CURLOPT_FOLLOWLOCATION, 1L); // turn on redirection following 523 | curl_easy_setopt(curl_handle, CURLOPT_MAXREDIRS, 5); // for a maximum of 5 524 | curl_easy_setopt(curl_handle, CURLOPT_POSTREDIR, CURL_REDIR_POST_ALL); // maintain a post as a post on redirects 525 | curl_easy_setopt(curl_handle, CURLOPT_AUTOREFERER, 1L); // turn on Refer when redirecting 526 | 527 | if(pHeaderFn != NULL) 528 | { 529 | curl_easy_setopt(curl_handle, CURLOPT_HEADERFUNCTION, pHeaderFn); 530 | curl_easy_setopt(curl_handle, CURLOPT_HEADERDATA, pHeaderData); 531 | } 532 | 533 | return curl_handle; 534 | } 535 | 536 | static CURL *curlCoreInitGetOrPost(const char *pUrl, void *pWriteFn, void *pWriteData, void *pHeaderFn, void *pHeaderData, const char *pPostStr) 537 | { CURL *curl_handle = curlCoreInit(pUrl, pHeaderFn, pHeaderData); 538 | 539 | if(pWriteFn != NULL) 540 | { 541 | curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, pWriteFn); 542 | curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, pWriteData); 543 | } 544 | 545 | if(pPostStr != NULL && *pPostStr) 546 | { 547 | curl_easy_setopt(curl_handle, CURLOPT_POST, 1L); 548 | curl_easy_setopt(curl_handle, CURLOPT_POSTFIELDS, pPostStr); 549 | curl_easy_setopt(curl_handle, CURLOPT_POSTFIELDSIZE, strlen(pPostStr)); 550 | } 551 | 552 | return curl_handle; 553 | } 554 | 555 | typedef struct _cprfc_t 556 | { 557 | const char *buffer; // data to send 558 | size_t len; // size to send 559 | size_t index; // current index into buffer where the next send operations should start from 560 | }cprfc_t; // Curl Put Read Fn Callback Type 561 | 562 | static size_t curlPutReadFnCallback(char *buffer, size_t size, size_t nmemb, void *instream) 563 | { cprfc_t *pCprfc = (cprfc_t *)instream; 564 
| size_t curl_size = nmemb * size; 565 | size_t left_to_copy = pCprfc->len - pCprfc->index; 566 | size_t to_copy = (left_to_copy < curl_size) ? left_to_copy : curl_size; 567 | 568 | memcpy(buffer, &pCprfc->buffer[pCprfc->index], to_copy); 569 | pCprfc->index += to_copy; 570 | 571 | return to_copy; 572 | } 573 | 574 | static size_t curlPutHeaderFnCallback(void *buffer, size_t size, size_t nmemb, void *userp) 575 | { int curl_size = nmemb * size; 576 | 577 | //printf("%s:%d header '%*.*s'\n", __func__, __LINE__, curl_size-2, curl_size-2, buffer); 578 | 579 | return curl_size; 580 | } 581 | 582 | static size_t curlPutWriteFnCallback(void *buffer, size_t size, size_t nmemb, void *userp) 583 | { int curl_size = nmemb * size; 584 | 585 | //printf("%s:%d '%*.*s'\n", __func__, __LINE__, curl_size-2, curl_size-2, buffer); 586 | 587 | return curl_size; 588 | } 589 | 590 | static CURL *curlCoreInitPut(const char *pUrl, void *pReadFn, void *pReadData, void *pHeaderFn, void *pHeaderData, size_t size) 591 | { CURL *curl_handle = curlCoreInit(pUrl, pHeaderFn, pHeaderData); 592 | 593 | if(pReadFn != NULL) 594 | { 595 | curl_easy_setopt(curl_handle, CURLOPT_READFUNCTION, pReadFn); 596 | curl_easy_setopt(curl_handle, CURLOPT_READDATA, pReadData); 597 | } 598 | 599 | curl_easy_setopt(curl_handle, CURLOPT_PUT, 1L); 600 | curl_easy_setopt(curl_handle, CURLOPT_UPLOAD, 1L); 601 | curl_easy_setopt(curl_handle, CURLOPT_INFILESIZE_LARGE, (curl_off_t)size); 602 | 603 | // don't let output go to stdout 604 | curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, &curlPutWriteFnCallback); 605 | curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, NULL); 606 | 607 | curl_easy_setopt(curl_handle, CURLOPT_HEADERFUNCTION, &curlPutHeaderFnCallback); 608 | curl_easy_setopt(curl_handle, CURLOPT_HEADERDATA, NULL); 609 | 610 | return curl_handle; 611 | } 612 | 613 | /* 614 | void curlCoreInitAuth(CURL *curl_handle) 615 | { 616 | // TODO - auth foo - possibly some or all of these 617 | // CURLOPT_USERPWD or 
(CURLOPT_USERNAME and CURLOPT_PASSWORD) 618 | // CURLOPT_LOGIN_OPTIONS 619 | // CURLOPT_PROXYUSERNAME and CURLOPT_PROXYPASSWORD 620 | // CURLOPT_HTTPAUTH 621 | // CURLOPT_TLSAUTH_USERNAME and CURLOPT_TLSAUTH_PASSWORD 622 | // CURLOPT_PROXYAUTH 623 | // CURLOPT_SASL_IR 624 | // CURLOPT_XOAUTH2_BEARER 625 | // 626 | } 627 | */ 628 | 629 | // Add a header to the header list 630 | static struct curl_slist *curlCoreInitHeader(CURL *curl_handle, struct curl_slist *pChunk, const char *pName, const char *pValue) 631 | { char *pHdr = NULL; 632 | 633 | asprintf(&pHdr, "%s: %s", pName, pValue); 634 | if(pHdr != NULL) 635 | { 636 | pChunk = curl_slist_append(pChunk, pHdr); 637 | curl_easy_setopt(curl_handle, CURLOPT_HTTPHEADER, pChunk); 638 | free(pHdr); 639 | } 640 | 641 | return pChunk; 642 | } 643 | 644 | // Fetch the file from the url 645 | cfr_t *curlFetchFile(const char *pUrl, const char *pHttpPostVars) 646 | { cfr_t *pCfr = calloc(1,sizeof(cfr_t)); 647 | 648 | if(!curlIsUrl(pUrl, &pCfr->ccf)) 649 | FREEPTR(pCfr); 650 | 651 | if(pCfr != NULL) 652 | { struct curl_slist *chunk = NULL; 653 | CURLcode res; 654 | char *pPostStr = curlEncodeUrlCharacters(pHttpPostVars); 655 | CURL *curl_handle = curlCoreInitGetOrPost(pUrl, curlWriteCallback, (void *)&pCfr->ccf, curlHeaderCallback, (void *)&pCfr->ccf, pPostStr); 656 | unsigned long queryStart = 0; 657 | 658 | pCfr->ccf.pUrlHash = curlUrlHash(pUrl, pHttpPostVars); 659 | curlCacheMetaGet(&pCfr->ccf); 660 | curlCacheFileOpen(&pCfr->ccf); 661 | 662 | // inject etag header request ? 663 | // TODO; 664 | // 1. don't if the actual file is missing, so that we get a new one 665 | // 2. 
don't if stale acording to cache-control 666 | if(pCfr->ccf.pHdrs[HDR_IDX_ETAG] != NULL) 667 | chunk = curlCoreInitHeader(curl_handle, chunk, "If-None-Match", pCfr->ccf.pHdrs[HDR_IDX_ETAG]); 668 | 669 | // the file should already be open, get it 670 | queryStart = GetTickCount(); 671 | res = curl_easy_perform(curl_handle); 672 | pCfr->queryDuration = GetTickCount() - queryStart; // how long did the fetch take ? 673 | 674 | // clean up post data 675 | FREEPTR(pPostStr); 676 | 677 | // close the open file 678 | curlCfrClose(pCfr); 679 | 680 | // this means that we communicated with the server 681 | if(res == CURLE_OK) 682 | { char *pContentType = NULL; 683 | 684 | curl_easy_getinfo(curl_handle, CURLINFO_RESPONSE_CODE, &pCfr->httpResponseCode); 685 | curl_easy_getinfo(curl_handle, CURLINFO_CONTENT_TYPE, &pContentType); 686 | 687 | if(pContentType != NULL) 688 | pCfr->pContentType = strdup(pContentType); 689 | 690 | curlCacheMetaPut(&pCfr->ccf); 691 | 692 | switch(pCfr->httpResponseCode) 693 | { 694 | case 200: 695 | pCfr->bFileFetched = true; 696 | #ifndef JSON_CONTENT_TYPE_NONE 697 | // make sure it's the correct content type 698 | pCfr->bFileFetched &= ( 699 | #ifdef JSON_CONTENT_TYPE_NULL 700 | // Highly non-conforming server/application 701 | pContentType == NULL || 702 | #endif 703 | #ifdef JSON_CONTENT_TYPE_LIBERAL 704 | // If your using a badly configured/coded/non-conforming server 705 | // application, you might get one or more of these mime types 706 | (pContentType != NULL && strcasecmp("application/x-javascript", pContentType) == 0) || 707 | (pContentType != NULL && strcasecmp("text/javascript", pContentType) == 0) || 708 | (pContentType != NULL && strcasecmp("text/x-javascript", pContentType) == 0) || 709 | (pContentType != NULL && strcasecmp("text/x-json", pContentType) == 0) || 710 | (pContentType != NULL && strcasecmp("text/html", pContentType) == 0) || 711 | #endif 712 | // The content might be a straight up gzip compressed file 713 | (pContentType != 
NULL && strcasecmp("application/x-gzip", pContentType) == 0) || 714 | // If it is uncompressed, it should look like this 715 | (pContentType != NULL && strcasecmp("application/json", pContentType) == 0) 716 | ); 717 | #endif 718 | break; 719 | case 304: 720 | // we lie here, because we already have the file 721 | pCfr->bFileFetched = true; 722 | break; 723 | default: 724 | break; 725 | } 726 | 727 | curlCacheFileFinalize(pCfr); 728 | } 729 | 730 | // all done, cleanup 731 | curl_easy_cleanup(curl_handle); 732 | curl_global_cleanup(); 733 | curl_slist_free_all(chunk); 734 | } 735 | 736 | return pCfr; 737 | } 738 | 739 | // Put 740 | int curlPut(const char *pUrl, const char *pBuffer, size_t bufferSize, const char *pContentType) 741 | { int ok = 0; 742 | 743 | CURLcode res; 744 | cprfc_t cprfc = { pBuffer, bufferSize, 0 }; 745 | CURL *curl_handle = curlCoreInitPut(pUrl, &curlPutReadFnCallback, &cprfc, NULL, NULL, cprfc.len); 746 | struct curl_slist *chunk = curlCoreInitHeader(curl_handle, NULL, "Content-Type", pContentType); 747 | 748 | res = curl_easy_perform(curl_handle); 749 | 750 | // this means that we communicated with the server 751 | if(res == CURLE_OK) 752 | { unsigned long httpResponseCode = 0; 753 | 754 | curl_easy_getinfo(curl_handle, CURLINFO_RESPONSE_CODE, &httpResponseCode); 755 | ok = (httpResponseCode == 200); 756 | } 757 | 758 | // all done, cleanup 759 | curl_easy_cleanup(curl_handle); 760 | curl_global_cleanup(); 761 | curl_slist_free_all(chunk); 762 | 763 | return ok; 764 | } 765 | 766 | #ifdef _CURL_UNIT_TEST 767 | int debug = 0; 768 | 769 | void logit(const char *p) 770 | { 771 | if(debug) 772 | printf("%s\n",p); 773 | } 774 | 775 | void test1(int argc, char **argv) 776 | { 777 | cfr_t *pCfr = NULL; 778 | const char *pUrl = NULL; 779 | const char *pHttpPostVars = NULL; 780 | const char *pFileName = NULL; 781 | int i = 0; 782 | 783 | pUrl = (argc >= i ? argv[i] : NULL); 784 | i++; 785 | pHttpPostVars = (argc >= i ? 
/*
 * Test 2: PUT a buffer to a url as application/json.
 *
 * main() passes the argument vector starting at the "-2" flag, so
 * argv[1] is the url and argv[2] the data to send.
 */
void test2(int argc, char **argv)
{
	const char *pUrl = NULL;
	const char *pBuffer = NULL;
	int ok = 0;

	// argv[0] is the flag itself; the original took it as the url and the url as the data
	if(argc > 1)
		pUrl = argv[1];
	if(argc > 2)
		pBuffer = argv[2];

	// strlen(NULL) is undefined behavior, so refuse to run without both arguments
	if(pUrl == NULL || pBuffer == NULL)
	{
		printf("test 2 needs a url and the data to put\n");
		return;
	}

	ok = curlPut(pUrl, pBuffer, strlen(pBuffer), "application/json");

	printf("'%s' --> '%s' == %s\n", pUrl, pBuffer, (ok ? "OK" : "FAIL"));
}
"OK" : "FAIL")); 829 | } 830 | 831 | void test3(int argc, char **argv) 832 | { 833 | const char *pStrIn = "a=1&b=2&json={\"query\":[[3,0,0]]}&d=4"; 834 | char *pStrOut = curlEncodeUrlCharacters(pStrIn); 835 | 836 | printf("in '%s' out '%s'\n", pStrIn, pStrOut); 837 | 838 | free(pStrOut); 839 | } 840 | 841 | int main(int argc, char **argv) 842 | { 843 | int i = 1; 844 | int c; 845 | 846 | #ifdef DEBUG_WLOGIT 847 | curlLogItSet(&logit); 848 | #endif 849 | 850 | if(argc == 1) 851 | { 852 | printf("%s: [-d] [-1 [url] [optional post vars]]\n", argv[0]); 853 | exit(0); 854 | } 855 | 856 | while(i < argc) 857 | { 858 | if(argv[i][0] == '-') 859 | { 860 | switch(argv[i][1]) 861 | { 862 | case 'd': debug = 1; i++; break; 863 | case '1': test1(argc-i, argv+i); i += 2; break; 864 | case '2': test2(argc-i, argv+i); i += 2; break; 865 | case '3': test3(argc-i, argv+i); i++; break; 866 | default: i++; printf("unknown cli arg '%s'\n", argv[i]); break; 867 | } 868 | } 869 | else 870 | i++; 871 | } 872 | 873 | return 0; 874 | } 875 | #endif 876 | -------------------------------------------------------------------------------- /curlapi.h: -------------------------------------------------------------------------------- 1 | /*--------------------------------------------------------------------* 2 | * 3 | * Developed by; 4 | * Neal Horman - http://www.wanlink.com 5 | * Copyright (c) 2015 Neal Horman. All Rights Reserved 6 | * 7 | * This "source code" is free software: you can redistribute it and/or modify 8 | * it under the terms of the GNU General Public License as published by 9 | * the Free Software Foundation, either version 3 of the License, or 10 | * (at your option) any later version. 11 | * 12 | * This "source code" is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU General Public License for more details. 
16 | * 17 | * You should have received a copy of the GNU General Public License 18 | * along with this "source code". If not, see . 19 | * 20 | * RCSID: $Id$ 21 | * 22 | *--------------------------------------------------------------------*/ 23 | 24 | #ifndef _CURLAPI_H_ 25 | #define _CURLAPI_H_ 26 | 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | // HDR_IDX_xx values must be zero relative, and consecutive 33 | #define HDR_STR_ETAG "ETag: " 34 | #define HDR_STR_LASTMODIFIED "Last-Modified: " 35 | #define HDR_STR_CACHECONTROL "Cache-Control: " 36 | 37 | enum 38 | { 39 | HDR_IDX_ETAG, 40 | HDR_IDX_LASTMODIFIED, 41 | HDR_IDX_CACHECONTROL, 42 | 43 | HDR_COUNT // must always be last 44 | }; 45 | 46 | typedef struct _ccf_t 47 | { 48 | char *pUrlBaseName; 49 | char *pFileName; 50 | char *pUrlHash; 51 | char *pFileNameTmp; 52 | FILE* pFile; 53 | bool bNeedUnlink; 54 | char *pHdrs[HDR_COUNT]; 55 | }ccf_t; // CurlCacheFile_Type 56 | 57 | typedef struct _cfr_t 58 | { 59 | ccf_t ccf; 60 | bool bFileFetched; 61 | unsigned long httpResponseCode; 62 | char *pContentType; 63 | unsigned long queryDuration; 64 | } cfr_t; // "CurlFetchResult_Type" 65 | 66 | cfr_t *curlFetchFile(const char *pUrl, const char *pHttpPostVars); 67 | void curlPost(const char *pUrl, const char *pHttpPostVars); 68 | void curlCfrFree(cfr_t *pCfr); 69 | 70 | int curlPut(const char *pUrl, const char *pBuffer, size_t bufferSize, const char *pContentType); 71 | 72 | #ifdef DEBUG_WLOGIT 73 | void curlLogItSet(void (*pfn)(const char *)); 74 | static void curlLogIt(const char *pFmt, ...); 75 | #endif 76 | 77 | #endif 78 | -------------------------------------------------------------------------------- /data/blk_-729487577044220672: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nkhorman/json_fdw/65b8e7d4dc39bb2844879bb3c0199588e9cfa8a2/data/blk_-729487577044220672 
-------------------------------------------------------------------------------- /data/customer_reviews_1998.1000.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nkhorman/json_fdw/65b8e7d4dc39bb2844879bb3c0199588e9cfa8a2/data/customer_reviews_1998.1000.json.gz -------------------------------------------------------------------------------- /data/data.json: -------------------------------------------------------------------------------- 1 | {"id": 1, "type": "person", "name": "Beatus Henk", "birthdate": "1973-06-24", "actions": [1]} 2 | {"id": 2, "type": "person", "name": "Lugos Alfons", "birthdate": "1961-08-30"} 3 | {"id": 3, "type": "person", "name": "Temür Essa", "birthdate": "1995-07-28", "actions": [2, 2, 1, 3]} 4 | {"id": 4, "type": "resturaunt", "name": "Mingus Kitchen", "position": {"lat": -4.83798e1, "lon": -65.43274, "address": {"country": "Argentina"}}, "last_update": "2013-01-02 12:05:01"} 5 | {"id": 5, "type": "resturaunt", "name": "Café Utopia Lounge", "position": {"lat": 429.7208e-1, "lon": 143.39097}, "last_update_tz": "2013-01-02 12:05:01 America/New_York"} 6 | {"id": 6, "type": "invalid_record", "birthdate": null, "last_update": "invalid time format", "position": "Canada"} 7 | {"id": 9223372036854775807} 8 | {"id": -9223372036854775808} 9 | -------------------------------------------------------------------------------- /data/data_broken.json: -------------------------------------------------------------------------------- 1 | {"a": 1, "b": 2} 2 | {"a": 2, "b": 3} 3 | {"a": 3, "b": 4} 4 | {"a": 3, 5 | -------------------------------------------------------------------------------- /data/invalid_gz_file.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nkhorman/json_fdw/65b8e7d4dc39bb2844879bb3c0199588e9cfa8a2/data/invalid_gz_file.json.gz 
-------------------------------------------------------------------------------- /expected/.gitignore: -------------------------------------------------------------------------------- 1 | # This directory will be populated when testing 2 | # Ignore everything in this directory 3 | * 4 | # Except this file 5 | !.gitignore 6 | -------------------------------------------------------------------------------- /gettickcount.c: -------------------------------------------------------------------------------- 1 | 2 | /*--------------------------------------------------------------------* 3 | * 4 | * Developed by; 5 | * Neal Horman - http://www.wanlink.com 6 | * Copyright (c) 2015 Neal Horman. All Rights Reserved 7 | * 8 | * This "source code" is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This "source code" is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this "source code". If not, see . 
20 | * 21 | * RCSID: $Id$ 22 | * 23 | *--------------------------------------------------------------------*/ 24 | 25 | #ifndef _WIN32 26 | 27 | #include 28 | #include 29 | #include 30 | 31 | #define NsInAMs 1000000 32 | #define MsInASec 1000 33 | 34 | #ifdef __APPLE__ 35 | 36 | // see https://developer.apple.com/library/mac/qa/qa1398/_index.html 37 | // and http://stackoverflow.com/questions/3269321/osx-programmatically-get-uptime 38 | 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | 45 | #include "gettickcount.h" 46 | 47 | unsigned long GetTickCount(void) 48 | { 49 | static mach_timebase_info_data_t sTimebaseInfo; 50 | 51 | // If this is the first time we've run, get the timebase. 52 | // We can use denom == 0 to indicate that sTimebaseInfo is 53 | // uninitialised because it makes no sense to have a zero 54 | // denominator is a fraction. 55 | 56 | if ( sTimebaseInfo.denom == 0 ) 57 | (void) mach_timebase_info(&sTimebaseInfo); 58 | 59 | return (mach_absolute_time() * (sTimebaseInfo.numer / sTimebaseInfo.denom)) / NsInAMs; 60 | } 61 | 62 | #else 63 | 64 | unsigned long GetTickCount(void) 65 | { struct timespec ts_uptime; 66 | unsigned long uptimeInMs = 0; 67 | 68 | if(clock_gettime(CLOCK_UPTIME, &ts_uptime) == 0) 69 | { 70 | uptimeInMs = (ts_uptime.tv_sec * MsInASec); 71 | 72 | if(ts_uptime.tv_nsec > 0) 73 | uptimeInMs += (ts_uptime.tv_nsec / NsInAMs); 74 | } 75 | 76 | return uptimeInMs; 77 | } 78 | #endif // __APPLE__ 79 | #endif // _WINDOWS 80 | 81 | #ifdef UNIT_TEST 82 | // to compile - gcc -DUNIT_TEST -o uptime gettickcount.c && while [ 1 ]; do clear; ./uptime; sleep 1; done 83 | #include 84 | 85 | int main(int argc, char **argv) 86 | { unsigned long nTicks = GetTickCount(); 87 | unsigned long secs = (nTicks > 0 ? 
nTicks / 1000 : 0); 88 | unsigned long Secs=0, Mins=0, Hrs=0, Days=0; 89 | 90 | if(secs > 0) 91 | { 92 | Secs = secs % 60; 93 | Mins = (secs / 60) % 60; 94 | Hrs = (secs / (60 * 60)) % 24; 95 | Days = (secs / (60 * 60 * 24)); 96 | } 97 | printf("uptime in ms %lu = %lu days and %02lu:%02lu:%02lu",nTicks,Days,Hrs,Mins,Secs); 98 | 99 | return 0; 100 | } 101 | #endif 102 | -------------------------------------------------------------------------------- /gettickcount.h: -------------------------------------------------------------------------------- 1 | 2 | /*--------------------------------------------------------------------* 3 | * 4 | * Developed by; 5 | * Neal Horman - http://www.wanlink.com 6 | * Copyright (c) 2015 Neal Horman. All Rights Reserved 7 | * 8 | * This "source code" is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This "source code" is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this "source code". If not, see . 
20 | * 21 | * RCSID: $Id$ 22 | * 23 | *--------------------------------------------------------------------*/ 24 | 25 | #ifndef _GETTICKCOUNT_H_ 26 | #define _GETTICKCOUNT_H_ 27 | 28 | #ifndef _WIN32 29 | 30 | #ifdef __cplusplus 31 | extern "C" { 32 | #endif 33 | 34 | extern unsigned long GetTickCount(void); 35 | 36 | #ifdef __cplusplus 37 | } 38 | #endif 39 | 40 | #endif // _WIN32 41 | 42 | #endif // _GETTICKCOUNT_H_ 43 | -------------------------------------------------------------------------------- /input/basic_tests.source: -------------------------------------------------------------------------------- 1 | -- 2 | -- Test json foreign data wrapper. 3 | -- 4 | 5 | -- Settings to make the result deterministic 6 | SET datestyle = "ISO, YMD"; 7 | 8 | 9 | -- Install json_fdw 10 | CREATE EXTENSION json_fdw; 11 | 12 | CREATE SERVER json_server FOREIGN DATA WRAPPER json_fdw; 13 | 14 | 15 | -- validator tests 16 | CREATE FOREIGN TABLE test_validator_filename_missing () 17 | SERVER json_server; -- ERROR 18 | 19 | CREATE FOREIGN TABLE test_validator_invalid_option () 20 | SERVER json_server 21 | OPTIONS(filename 'data.json', bad_option_name '1'); -- ERROR 22 | 23 | 24 | -- data conversion tests 25 | CREATE FOREIGN TABLE json_data (id int8, type char(20), name text, 26 | birthdate date, actions int[], "position.lat" float, "position.lon" float, 27 | "position.address.country" varchar(50), last_update timestamp, 28 | last_update_tz timestamp with time zone 29 | ) SERVER json_server OPTIONS(filename '@abs_srcdir@/data/data.json'); 30 | 31 | SELECT id, type, name FROM json_data ORDER BY id; 32 | 33 | SELECT id, name, birthdate FROM json_data WHERE type = 'person' ORDER BY id; 34 | 35 | SELECT id, "position.lat" AS lat, "position.lon" AS lon, 36 | "position.address.country" AS country, last_update 37 | FROM json_data WHERE type = 'resturaunt' ORDER BY id; 38 | 39 | SELECT id, type, birthdate, last_update, "position.lon" as lon 40 | FROM json_data WHERE type = 'invalid_record' 
ORDER BY id; 41 | 42 | SELECT last_update_tz AT TIME ZONE 'UTC' FROM json_data 43 | WHERE last_update_tz IS NOT NULL; 44 | 45 | 46 | -- max error count test 47 | CREATE FOREIGN TABLE test_skip_broken_on (a integer, b integer) 48 | SERVER json_server 49 | OPTIONS (filename '@abs_srcdir@/data/data_broken.json', max_error_count '1'); 50 | 51 | SELECT * FROM test_skip_broken_on ORDER BY a; 52 | 53 | CREATE FOREIGN TABLE test_skip_broken_off (a integer, b integer) 54 | SERVER json_server 55 | OPTIONS (filename '@abs_srcdir@/data/data_broken.json', max_error_count '0'); 56 | 57 | SELECT * FROM test_skip_broken_off; -- ERROR 58 | 59 | 60 | -- error scenarios 61 | CREATE FOREIGN TABLE test_missing_file () SERVER json_server 62 | OPTIONS (filename '@abs_srcdir@/data/missing_file.json'); 63 | 64 | SELECT * FROM test_missing_file; -- ERROR 65 | 66 | CREATE FOREIGN TABLE test_string_length_check (type CHAR(6)) SERVER json_server 67 | OPTIONS (filename '@abs_srcdir@/data/data.json'); 68 | 69 | SELECT * FROM test_string_length_check; -- ERROR 70 | 71 | CREATE FOREIGN TABLE test_int_range_check (id int4) SERVER json_server 72 | OPTIONS (filename '@abs_srcdir@/data/data.json'); 73 | 74 | SELECT * FROM test_int_range_check; -- ERROR 75 | 76 | CREATE FOREIGN TABLE test_decimal_range_check ("position.lat" decimal(3, 2)) 77 | SERVER json_server OPTIONS (filename '@abs_srcdir@/data/data.json'); 78 | 79 | SELECT * FROM test_decimal_range_check; -- ERROR 80 | 81 | -------------------------------------------------------------------------------- /input/customer_reviews.source: -------------------------------------------------------------------------------- 1 | -- 2 | -- Test customer reviews dataset queries. 
3 | -- 4 | 5 | CREATE FOREIGN TABLE customer_reviews 6 | ( 7 | customer_id TEXT not null, 8 | "review.date" DATE not null, 9 | "review.rating" INTEGER not null, 10 | "review.votes" INTEGER, 11 | "review.helpful_votes" INTEGER, 12 | "product.id" CHAR(10) not null, 13 | "product.title" TEXT not null, 14 | "product.sales_rank" BIGINT, 15 | "product.group" TEXT, 16 | "product.category" TEXT, 17 | "product.subcategory" TEXT, 18 | similar_product_ids CHAR(10)[] 19 | ) 20 | SERVER json_server 21 | OPTIONS(filename '@abs_srcdir@/data/customer_reviews_1998.1000.json.gz'); 22 | 23 | 24 | -- How people rate your products? 25 | 26 | SELECT 27 | extract(month from "review.date") AS review_month, 28 | round(avg("review.rating"), 2), 29 | count(*) 30 | FROM 31 | customer_reviews 32 | GROUP BY 33 | review_month 34 | ORDER BY 35 | review_month; 36 | 37 | -- Do we have a correlation between a book's title's length and its review ratings? 38 | 39 | SELECT 40 | width_bucket(length("product.title"), 1, 50, 5) title_length_bucket, 41 | round(avg("review.rating"), 2) AS review_average, 42 | count(*) 43 | FROM 44 | customer_reviews 45 | WHERE 46 | "product.group" = 'Book' 47 | GROUP BY 48 | title_length_bucket 49 | ORDER BY 50 | title_length_bucket; 51 | 52 | -- Does the average review rating change by product category? 53 | 54 | SELECT 55 | "product.category", 56 | round(avg("review.rating"), 2), 57 | count(*) 58 | FROM 59 | customer_reviews 60 | GROUP BY 61 | "product.category" 62 | ORDER BY 63 | count(*) DESC, "product.category" 64 | LIMIT 20; -------------------------------------------------------------------------------- /input/hdfs_block.source: -------------------------------------------------------------------------------- 1 | -- 2 | -- Test customer reviews dataset which is stored as a HDFS block. 
3 | -- 4 | 5 | CREATE FOREIGN TABLE customer_reviews_hdfs_block 6 | ( 7 | customer_id TEXT not null, 8 | "review.date" DATE not null, 9 | "review.rating" INTEGER not null, 10 | "review.votes" INTEGER, 11 | "review.helpful_votes" INTEGER, 12 | "product.id" CHAR(10) not null, 13 | "product.title" TEXT not null, 14 | "product.sales_rank" BIGINT, 15 | "product.group" TEXT, 16 | "product.category" TEXT, 17 | "product.subcategory" TEXT, 18 | similar_product_ids CHAR(10)[] 19 | ) 20 | SERVER json_server 21 | OPTIONS(filename '@abs_srcdir@/data/blk_-729487577044220672', 22 | max_error_count '2'); 23 | 24 | -- Does the average review rating change by product category? 25 | SELECT 26 | "product.category", 27 | round(avg("review.rating"), 2), 28 | count(*) 29 | FROM 30 | customer_reviews_hdfs_block 31 | GROUP BY 32 | "product.category" 33 | ORDER BY 34 | count(*) DESC, "product.category" 35 | LIMIT 20; -------------------------------------------------------------------------------- /input/invalid_gz_file.source: -------------------------------------------------------------------------------- 1 | -- 2 | -- Test that we handle invalid gzip files properly. 
3 | -- 4 | 5 | \set VERBOSITY terse 6 | 7 | CREATE FOREIGN TABLE invalid_gz_file_table 8 | ( 9 | customer_id TEXT not null, 10 | "review.date" DATE not null, 11 | "review.rating" INTEGER not null, 12 | "review.votes" INTEGER, 13 | "review.helpful_votes" INTEGER, 14 | "product.id" CHAR(10) not null, 15 | "product.title" TEXT not null, 16 | "product.sales_rank" BIGINT, 17 | "product.group" TEXT, 18 | "product.category" TEXT, 19 | "product.subcategory" TEXT, 20 | similar_product_ids CHAR(10)[] 21 | ) 22 | SERVER json_server 23 | OPTIONS(filename '@abs_srcdir@/data/invalid_gz_file.json.gz'); 24 | 25 | select count(*) from invalid_gz_file_table; 26 | 27 | \set VERBOSITY default -------------------------------------------------------------------------------- /json_fdw--1.0.sql: -------------------------------------------------------------------------------- 1 | /* contrib/json_fdw/json_fdw--1.0.sql */ 2 | 3 | -- complain if script is sourced in psql, rather than via CREATE EXTENSION 4 | \echo Use "CREATE EXTENSION json_fdw" to load this file. \quit 5 | 6 | CREATE FUNCTION json_fdw_handler() 7 | RETURNS fdw_handler 8 | AS 'MODULE_PATHNAME' 9 | LANGUAGE C STRICT; 10 | 11 | CREATE FUNCTION json_fdw_validator(text[], oid) 12 | RETURNS void 13 | AS 'MODULE_PATHNAME' 14 | LANGUAGE C STRICT; 15 | 16 | CREATE FOREIGN DATA WRAPPER json_fdw 17 | HANDLER json_fdw_handler 18 | VALIDATOR json_fdw_validator; 19 | -------------------------------------------------------------------------------- /json_fdw.c: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------- 2 | * 3 | * json_fdw.c 4 | * 5 | * Function definitions for JSON foreign data wrapper. 6 | * 7 | * Copyright (c) 2013, Citus Data, Inc. 
8 | * 9 | * $Id$ 10 | * 11 | *------------------------------------------------------------------------- 12 | */ 13 | 14 | 15 | // http://wiki.postgresql.org/images/6/67/Pg-fdw.pdf 16 | 17 | #include 18 | #include 19 | #include 20 | 21 | #include "postgres.h" 22 | #include "json_fdw.h" 23 | 24 | #include 25 | #include 26 | 27 | #include 28 | 29 | #include "access/reloptions.h" 30 | #include "catalog/pg_foreign_table.h" 31 | #include "catalog/pg_type.h" 32 | #include "commands/defrem.h" 33 | #include "commands/explain.h" 34 | #include "commands/vacuum.h" 35 | #include "foreign/fdwapi.h" 36 | #include "foreign/foreign.h" 37 | #include "miscadmin.h" 38 | #include "nodes/makefuncs.h" 39 | #include "optimizer/cost.h" 40 | #include "optimizer/plancat.h" 41 | #include "optimizer/pathnode.h" 42 | #include "optimizer/planmain.h" 43 | #include "optimizer/restrictinfo.h" 44 | #include "optimizer/var.h" 45 | #include "port.h" 46 | #include "storage/fd.h" 47 | #include "utils/array.h" 48 | #include "utils/builtins.h" 49 | #include "utils/date.h" 50 | #include "utils/datetime.h" 51 | #include "utils/int8.h" 52 | #include "utils/timestamp.h" 53 | #include "utils/hsearch.h" 54 | #include "utils/lsyscache.h" 55 | #include "utils/memutils.h" 56 | #include "utils/rel.h" 57 | #include "parser/parsetree.h" 58 | #include "nodes/relation.h" 59 | 60 | #if PG_VERSION_NUM >= 90300 61 | #include "access/htup_details.h" 62 | #endif 63 | 64 | #include "curlapi.h" 65 | #include "rciapi.h" 66 | 67 | 68 | #define ELog(elevel, ...) 
\ 69 | do { \ 70 | elog_start(__FILE__, __LINE__, PG_FUNCNAME_MACRO); \ 71 | elog_finish(elevel, __VA_ARGS__); \ 72 | if (__builtin_constant_p(elevel) && (elevel) >= ERROR) \ 73 | pg_unreachable(); \ 74 | } while(0) 75 | 76 | 77 | // Local functions forward declarations 78 | static StringInfo OptionNamesString(Oid currentContextId); 79 | static void JsonGetForeignRelSize(PlannerInfo *root, RelOptInfo *baserel, 80 | Oid foreignTableId); 81 | static void JsonGetForeignPaths(PlannerInfo *root, RelOptInfo *baserel, 82 | Oid foreignTableId); 83 | static ForeignScan * JsonGetForeignPlan(PlannerInfo *root, RelOptInfo *baserel, 84 | Oid foreignTableId, ForeignPath *bestPath, 85 | List *targetList, List *scanClauses); 86 | static void JsonExplainForeignScan(ForeignScanState *scanState, 87 | ExplainState *explainState); 88 | static void JsonBeginForeignScan(ForeignScanState *scanState, int executorFlags); 89 | static TupleTableSlot * JsonIterateForeignScan(ForeignScanState *scanState); 90 | static void JsonReScanForeignScan(ForeignScanState *scanState); 91 | static void JsonEndForeignScan(ForeignScanState *scanState); 92 | static JsonFdwOptions * JsonGetOptions(Oid foreignTableId); 93 | static char * JsonGetOptionValue(Oid foreignTableId, const char *optionName); 94 | static double TupleCount(RelOptInfo *baserel, const char *filename); 95 | static BlockNumber PageCount(const char *filename); 96 | static List * ColumnList(RelOptInfo *baserel); 97 | static HTAB * ColumnMappingHash(Oid foreignTableId, List *columnList); 98 | static bool GzipFilename(const char *filename); 99 | static bool HdfsBlockName(const char *filename); 100 | static StringInfo ReadLineFromFile(FILE *filePointer); 101 | static StringInfo ReadLineFromGzipFile(gzFile gzFilePointer); 102 | static void FillTupleSlot(const yajl_val jsonObject, const char *jsonObjectKey, 103 | HTAB *columnMappingHash, Datum *columnValues, 104 | bool *columnNulls); 105 | static bool ColumnTypesCompatible(yajl_val jsonValue, Oid 
columnTypeId); 106 | static bool ValidDateTimeFormat(const char *dateTimeString); 107 | static Datum ColumnValueArray(yajl_val jsonArray, Oid valueTypeId, Oid valueTypeMod); 108 | static Datum ColumnValue(yajl_val jsonValue, Oid columnTypeId, int32 columnTypeMod); 109 | static bool JsonAnalyzeForeignTable(Relation relation, 110 | AcquireSampleRowsFunc *acquireSampleRowsFunc, 111 | BlockNumber *totalPageCount); 112 | static int JsonAcquireSampleRows(Relation relation, int logLevel, 113 | HeapTuple *sampleRows, int targetRowCount, 114 | double *totalRowCount, double *totalDeadRowCount); 115 | 116 | static List *JsonPlanForeignModify(PlannerInfo *root, ModifyTable *plan, Index resultRelation, int subplan_index); 117 | static void JsonBeginForeignModify( ModifyTableState *mtstate, ResultRelInfo *resultRelInfo, List *fdw_private, int subplan_index, int eflags); 118 | static TupleTableSlot *JsonExecForeignInsert( EState *estate, ResultRelInfo *resultRelInfo, TupleTableSlot *slot, TupleTableSlot *planSlot); 119 | static void JsonAddForeignUpdateTargets(Query *parsetree, RangeTblEntry *target_rte, Relation target_relation); 120 | static TupleTableSlot * JsonExecForeignUpdate( EState *estate, ResultRelInfo *resultRelInfo, TupleTableSlot *slot, TupleTableSlot *planSlot); 121 | static void JsonEndForeignModify(EState *estate, ResultRelInfo *resultRelInfo); 122 | 123 | 124 | // Array of options that are valid for json_fdw 125 | static const JsonValidOption ValidOptionArray[] = 126 | { 127 | // foreign table options 128 | { OPTION_NAME_FILENAME, ForeignTableRelationId }, 129 | { OPTION_NAME_MAX_ERROR_COUNT, ForeignTableRelationId }, 130 | { OPTION_NAME_HTTP_POST_VARS, ForeignTableRelationId }, 131 | { OPTION_NAME_ROM_URL, ForeignTableRelationId }, 132 | { OPTION_NAME_ROM_PATH, ForeignTableRelationId }, 133 | }; 134 | // Never maintain by hand, what the compiler could do for you 135 | static const uint32 ValidOptionCount = (sizeof(ValidOptionArray)/sizeof(ValidOptionArray[0])); 
136 | 137 | 138 | // Declarations for dynamic loading 139 | PG_MODULE_MAGIC; 140 | 141 | PG_FUNCTION_INFO_V1(json_fdw_handler); 142 | PG_FUNCTION_INFO_V1(json_fdw_validator); 143 | 144 | 145 | /* 146 | * json_fdw_handler creates and returns a struct with pointers to foreign table 147 | * callback functions. 148 | */ 149 | Datum 150 | json_fdw_handler(PG_FUNCTION_ARGS) 151 | { 152 | FdwRoutine *fdwRoutine = makeNode(FdwRoutine); 153 | 154 | fdwRoutine->GetForeignRelSize = JsonGetForeignRelSize; 155 | fdwRoutine->GetForeignPaths = JsonGetForeignPaths; 156 | fdwRoutine->GetForeignPlan = JsonGetForeignPlan; 157 | fdwRoutine->ExplainForeignScan = JsonExplainForeignScan; 158 | fdwRoutine->BeginForeignScan = JsonBeginForeignScan; 159 | fdwRoutine->IterateForeignScan = JsonIterateForeignScan; 160 | fdwRoutine->ReScanForeignScan = JsonReScanForeignScan; 161 | fdwRoutine->EndForeignScan = JsonEndForeignScan; 162 | fdwRoutine->AnalyzeForeignTable = JsonAnalyzeForeignTable; 163 | 164 | fdwRoutine->PlanForeignModify = JsonPlanForeignModify; 165 | fdwRoutine->BeginForeignModify = JsonBeginForeignModify; 166 | fdwRoutine->AddForeignUpdateTargets = JsonAddForeignUpdateTargets; // update and delete 167 | fdwRoutine->ExecForeignInsert = JsonExecForeignInsert; 168 | fdwRoutine->ExecForeignUpdate = JsonExecForeignUpdate; 169 | //fdwRoutine->ExecForeignDelete = JsonExecForeignDelete; 170 | fdwRoutine->EndForeignModify = JsonEndForeignModify; 171 | 172 | PG_RETURN_POINTER(fdwRoutine); 173 | } 174 | 175 | 176 | /* 177 | * json_fdw_validator validates options given to one of the following commands: 178 | * foreign data wrapper, server, user mapping, or foreign table. This function 179 | * errors out if the given option name or its value is considered invalid. The 180 | * filename option is required by the foreign table, so we error out if it is 181 | * not provided. 
182 | */ 183 | Datum 184 | json_fdw_validator(PG_FUNCTION_ARGS) 185 | { 186 | Datum optionArray = PG_GETARG_DATUM(0); 187 | Oid optionContextId = PG_GETARG_OID(1); 188 | List *optionList = untransformRelOptions(optionArray); 189 | ListCell *optionCell = NULL; 190 | int filenameFound = 0; 191 | int romUrlFound = 0; 192 | int romPathFound = 0; 193 | 194 | foreach(optionCell, optionList) 195 | { 196 | DefElem *optionDef = (DefElem *) lfirst(optionCell); 197 | char *optionName = optionDef->defname; 198 | bool optionValid = false; 199 | 200 | int32 optionIndex = 0; 201 | for (optionIndex = 0; optionIndex < ValidOptionCount; optionIndex++) 202 | { 203 | const JsonValidOption *validOption = &(ValidOptionArray[optionIndex]); 204 | 205 | if ((optionContextId == validOption->optionContextId) && 206 | (strncmp(optionName, validOption->optionName, NAMEDATALEN) == 0)) 207 | { 208 | optionValid = true; 209 | break; 210 | } 211 | } 212 | 213 | // if invalid option, display an informative error message 214 | if (!optionValid) 215 | { 216 | StringInfo optionNamesString = OptionNamesString(optionContextId); 217 | 218 | ereport(ERROR, (errcode(ERRCODE_FDW_INVALID_OPTION_NAME), 219 | errmsg("invalid option \"%s\"", optionName), 220 | errhint("Valid options in this context are: %s", 221 | optionNamesString->data))); 222 | } 223 | else // test for particular option existence 224 | { 225 | filenameFound |= (strncmp(optionName, OPTION_NAME_FILENAME, NAMEDATALEN) == 0); 226 | romUrlFound |= (strncmp(optionName, OPTION_NAME_ROM_URL, NAMEDATALEN) == 0); 227 | romPathFound |= (strncmp(optionName, OPTION_NAME_ROM_PATH, NAMEDATALEN) == 0); 228 | } 229 | } 230 | 231 | if (optionContextId == ForeignTableRelationId) 232 | { 233 | // make sure either filename or rom_url and rom_path, not both 234 | if( !(filenameFound || (romUrlFound && romPathFound))) 235 | { 236 | ereport(ERROR, (errcode(ERRCODE_FDW_DYNAMIC_PARAMETER_VALUE_NEEDED), 237 | errmsg("Either the ``filename'' or the ``rom_url'' and 
``rom_path'' options are required for foreign tables"))); 238 | } 239 | else if(filenameFound && (romUrlFound || romPathFound)) 240 | { 241 | ereport(ERROR, (errcode(ERRCODE_FDW_DYNAMIC_PARAMETER_VALUE_NEEDED), 242 | errmsg("Do not mix the ``filename'' option with the ``rom_url'' and ``rom_path'' options for foreign tables"))); 243 | } 244 | } 245 | 246 | PG_RETURN_VOID(); 247 | } 248 | 249 | 250 | /* 251 | * OptionNamesString finds all options that are valid for the current context, 252 | * and concatenates these option names in a comma separated string. The function 253 | * is unchanged from mongo_fdw. 254 | */ 255 | static StringInfo 256 | OptionNamesString(Oid currentContextId) 257 | { 258 | StringInfo optionNamesString = makeStringInfo(); 259 | bool firstOptionAppended = false; 260 | 261 | int32 optionIndex = 0; 262 | for (optionIndex = 0; optionIndex < ValidOptionCount; optionIndex++) 263 | { 264 | const JsonValidOption *validOption = &(ValidOptionArray[optionIndex]); 265 | 266 | // if option belongs to current context, append option name 267 | if (currentContextId == validOption->optionContextId) 268 | { 269 | if (firstOptionAppended) 270 | { 271 | appendStringInfoString(optionNamesString, ", "); 272 | } 273 | 274 | appendStringInfoString(optionNamesString, validOption->optionName); 275 | firstOptionAppended = true; 276 | } 277 | } 278 | 279 | return optionNamesString; 280 | } 281 | 282 | 283 | /* 284 | * JsonGetForeignRelSize obtains relation size estimates for a foreign table and 285 | * puts its estimate for row count into baserel->rows. 
286 | */ 287 | static void 288 | JsonGetForeignRelSize(PlannerInfo *root, RelOptInfo *baserel, Oid foreignTableId) 289 | { 290 | JsonFdwOptions *options = JsonGetOptions(foreignTableId); 291 | 292 | double tupleCount = TupleCount(baserel, options->filename); 293 | double rowSelectivity = clauselist_selectivity(root, baserel->baserestrictinfo, 294 | 0, JOIN_INNER, NULL); 295 | 296 | double outputRowCount = clamp_row_est(tupleCount * rowSelectivity); 297 | baserel->rows = outputRowCount; 298 | //ELog(DEBUG1, "%s:%d", __func__, __LINE__); 299 | } 300 | 301 | 302 | /* 303 | * JsonGetForeignPaths creates possible access paths for a scan on the foreign 304 | * table. Currently we only have one possible access path, which simply returns 305 | * all records in the order they appear in the underlying file. 306 | */ 307 | static void 308 | JsonGetForeignPaths(PlannerInfo *root, RelOptInfo *baserel, Oid foreignTableId) 309 | { 310 | Path *foreignScanPath = NULL; 311 | JsonFdwOptions *options = JsonGetOptions(foreignTableId); 312 | 313 | BlockNumber pageCount = PageCount(options->filename); 314 | double tupleCount = TupleCount(baserel, options->filename); 315 | 316 | /* 317 | * We estimate costs almost the same way as cost_seqscan(), thus assuming 318 | * that I/O costs are equivalent to a regular table file of the same size. 319 | * However, we take per-tuple CPU costs as 10x of a seqscan to account for 320 | * the cost of parsing records. 
321 | */ 322 | double tupleParseCost = cpu_tuple_cost * JSON_TUPLE_COST_MULTIPLIER; 323 | double tupleFilterCost = baserel->baserestrictcost.per_tuple; 324 | double cpuCostPerTuple = tupleParseCost + tupleFilterCost; 325 | double executionCost = (seq_page_cost * pageCount) + (cpuCostPerTuple * tupleCount); 326 | 327 | double startupCost = baserel->baserestrictcost.startup; 328 | double totalCost = startupCost + executionCost; 329 | 330 | // create a foreign path node and add it as the only possible path 331 | foreignScanPath = (Path *) create_foreignscan_path(root, baserel, baserel->rows, 332 | startupCost, totalCost, 333 | NIL, // no known ordering 334 | NULL, // not parameterized 335 | NIL); // no fdw_private 336 | 337 | add_path(baserel, foreignScanPath); 338 | //ELog(DEBUG1, "%s:%d", __func__, __LINE__); 339 | } 340 | 341 | 342 | /* 343 | * JsonGetForeignPlan creates a ForeignScan plan node for scanning the foreign 344 | * table. We also add the query column list to scan nodes private list, because 345 | * we need it later for mapping columns. 346 | */ 347 | static ForeignScan * 348 | JsonGetForeignPlan(PlannerInfo *root, RelOptInfo *baserel, Oid foreignTableId, 349 | ForeignPath *bestPath, List *targetList, List *scanClauses) 350 | { 351 | ForeignScan *foreignScan = NULL; 352 | List *columnList = NULL; 353 | List *foreignPrivateList = NIL; 354 | 355 | /* 356 | * We have no native ability to evaluate restriction clauses, so we just 357 | * put all the scanClauses into the plan node's qual list for the executor 358 | * to check. 359 | */ 360 | scanClauses = extract_actual_clauses(scanClauses, false); 361 | 362 | /* 363 | * As an optimization, we only add columns that are present in the query to 364 | * the column mapping hash. To find these columns, we need baserel. We don't 365 | * have access to baserel in executor's callback functions, so we get the 366 | * column list here and put it into foreign scan node's private list. 
367 | */ 368 | columnList = ColumnList(baserel); 369 | foreignPrivateList = list_make1(columnList); 370 | 371 | // create the foreign scan node 372 | foreignScan = make_foreignscan( 373 | targetList, scanClauses, baserel->relid 374 | , NIL // no expressions to evaluate 375 | , foreignPrivateList 376 | #if PG_VERSION_NUM >= 90500 377 | ,NIL // no fdw_scan_tlist 378 | #endif 379 | ); 380 | 381 | //ELog(DEBUG1, "%s:%d", __func__, __LINE__); 382 | return foreignScan; 383 | } 384 | 385 | 386 | // JsonExplainForeignScan produces extra output for the Explain command. 387 | static void 388 | JsonExplainForeignScan(ForeignScanState *scanState, ExplainState *explainState) 389 | { 390 | Oid foreignTableId = RelationGetRelid(scanState->ss.ss_currentRelation); 391 | JsonFdwOptions *options = JsonGetOptions(foreignTableId); 392 | 393 | ExplainPropertyText("Json File", options->filename, explainState); 394 | ExplainPropertyText("HTTP Post Vars", options->pHttpPostVars, explainState); 395 | ExplainPropertyText("Rom URL", options->pRomUrl, explainState); 396 | ExplainPropertyText("Rom PATH", options->pRomPath, explainState); 397 | 398 | // supress file size if we're not showing cost details 399 | if (explainState->costs) 400 | { 401 | struct stat statBuffer; 402 | 403 | int statResult = stat(options->filename, &statBuffer); 404 | if (statResult == 0) 405 | { 406 | ExplainPropertyLong("Json File Size", (long) statBuffer.st_size, 407 | explainState); 408 | } 409 | } 410 | //ELog(DEBUG1, "%s:%d", __func__, __LINE__); 411 | } 412 | 413 | static int rciMethod(rci_t *pRci, char const *pMethod, char const *pRomUrl, char const *pRomPath) 414 | { int methodOk = 0; 415 | 416 | if(pRci != NULL) 417 | { 418 | if(pRci->pMethod != NULL) 419 | { 420 | // special case, (pMethod == NULL) means any method 421 | methodOk = (pMethod != NULL ? 
(strcasecmp(pRci->pMethod, pMethod) == 0) : 1); 422 | if(!methodOk) 423 | { 424 | ereport(ERROR, (errmsg("Method not supported."), 425 | errhint("URL '%s' path '%s' operation '%s' method '%s'" 426 | , pRomUrl 427 | , pRomPath 428 | , pRci->pAction 429 | , pRci->pMethod 430 | ))); 431 | } 432 | } 433 | else 434 | { 435 | ereport(ERROR, (errmsg("Method not specified for ROM path operation."), 436 | errhint("URL '%s' path '%s' operation '%s'", pRomUrl, pRomPath, pRci->pAction))); 437 | } 438 | } 439 | 440 | return methodOk; 441 | } 442 | 443 | static int rciError(rci_t *pRci, char const *pRomUrl, char const *pRomPath) 444 | { int error = 1; 445 | 446 | if(pRci != NULL) 447 | { 448 | if(pRci->romRoot != NULL) 449 | { 450 | if(pRci->romRootAction != NULL) 451 | error = 0; 452 | else 453 | { 454 | ereport(ERROR, (errmsg("Path does not support operation."), 455 | errhint("URL '%s' path '%s' operation '%s'", pRomUrl, pRomPath, pRci->pAction))); 456 | } 457 | } 458 | else 459 | { 460 | ereport(ERROR, (errmsg("Invalid rom_path."), 461 | errhint("URL '%s' path '%s'", pRomUrl, pRomPath))); 462 | } 463 | } 464 | else 465 | { 466 | ereport(ERROR, (errmsg("Unable to access ROM."), 467 | errhint("URL '%s' path '%s'", pRomUrl, pRomPath))); 468 | } 469 | 470 | return error; 471 | } 472 | 473 | /* 474 | * JsonBeginForeignScan opens the underlying json file for reading. The function 475 | * also creates a hash table that maps referenced column names to column index 476 | * and type information. 
477 | */ 478 | static void 479 | JsonBeginForeignScan(ForeignScanState *scanState, int executorFlags) 480 | { 481 | JsonFdwExecState *execState = NULL; 482 | ForeignScan *foreignScan = NULL; 483 | List *foreignPrivateList = NULL; 484 | Oid foreignTableId = InvalidOid; 485 | JsonFdwOptions *options = NULL; 486 | List *columnList = NULL; 487 | HTAB *columnMappingHash = NULL; 488 | bool gzipFile = false; 489 | bool hdfsBlock = false; 490 | FILE *filePointer = NULL; 491 | gzFile gzFilePointer = NULL; 492 | bool openError = false; 493 | const char *filename = NULL; 494 | const char *postVars = NULL; 495 | cfr_t *pCfr = NULL; 496 | 497 | //ELog(DEBUG1, "%s:%d", __func__, __LINE__); 498 | 499 | // if Explain with no Analyze, do nothing 500 | if (executorFlags & EXEC_FLAG_EXPLAIN_ONLY) 501 | { 502 | return; 503 | } 504 | 505 | foreignTableId = RelationGetRelid(scanState->ss.ss_currentRelation); 506 | options = JsonGetOptions(foreignTableId); 507 | 508 | foreignScan = (ForeignScan *) scanState->ss.ps.plan; 509 | foreignPrivateList = (List *) foreignScan->fdw_private; 510 | 511 | columnList = (List *) linitial(foreignPrivateList); 512 | columnMappingHash = ColumnMappingHash(foreignTableId, columnList); 513 | 514 | filename = options->filename; 515 | postVars = options->pHttpPostVars; 516 | 517 | // if a ROM is specified, get/build an off box url 518 | if(options->pRomUrl != NULL && *options->pRomUrl 519 | && options->pRomPath != NULL && *options->pRomPath 520 | ) 521 | { 522 | rci_t *pRci = rciFetch(options->pRomUrl, options->pRomPath, RCI_ACTION_SELECT); 523 | 524 | //ELog(DEBUG1, "%s:%d", __func__, __LINE__); 525 | if(!rciError(pRci, options->pRomUrl, options->pRomPath) 526 | && rciMethod(pRci, "get", options->pRomUrl, options->pRomPath) 527 | ) 528 | { 529 | filename = pstrdup(pRci->pUrl); // dupe the url 530 | postVars = NULL; 531 | } 532 | rciFree(pRci); 533 | } 534 | 535 | // See if this is an off box url, and try to fetch it 536 | // and then pass it off to one of the 
native file handlers 537 | if(filename != NULL && *filename) 538 | pCfr = curlFetchFile(filename, postVars); 539 | else 540 | openError = 1; 541 | 542 | // if fetched 543 | if(pCfr != NULL) 544 | { 545 | openError = !pCfr->bFileFetched; 546 | if(!openError) 547 | // replace the url with the on box filename of the file that we just 548 | // downloaded so that the existing file handlers can just use a file 549 | filename = pCfr->ccf.pFileName; 550 | /* 551 | ELog(DEBUG1, "%s:%u fetched %u, took %lu ms, http response %lu, content type '%s'" 552 | , __func__, __LINE__ 553 | , pCfr->bFileFetched 554 | , pCfr->queryDuration 555 | , pCfr->httpResponseCode 556 | , pCfr->pContentType 557 | ); 558 | */ 559 | } 560 | 561 | if(!openError && filename != NULL && *filename) 562 | { 563 | gzipFile = GzipFilename(filename); 564 | hdfsBlock = HdfsBlockName(filename); 565 | 566 | if (gzipFile || hdfsBlock) 567 | { 568 | gzFilePointer = gzopen(filename, PG_BINARY_R); 569 | openError = (gzFilePointer == NULL); 570 | } 571 | else 572 | { 573 | filePointer = AllocateFile(filename, PG_BINARY_R); 574 | openError = (filePointer == NULL); 575 | } 576 | } 577 | 578 | if(openError || filename == NULL || !*filename) 579 | { 580 | ereport(ERROR, (errcode_for_file_access(), 581 | errmsg("could not open file \"%s\" for reading: %m", 582 | filename))); 583 | curlCfrFree(pCfr); 584 | pCfr = NULL; 585 | } 586 | 587 | execState = (JsonFdwExecState *) palloc(sizeof(JsonFdwExecState)); 588 | execState->filename = filename; 589 | execState->filePointer = filePointer; 590 | execState->gzFilePointer = gzFilePointer; 591 | execState->columnMappingHash = columnMappingHash; 592 | execState->maxErrorCount = options->maxErrorCount; 593 | execState->errorCount = 0; 594 | execState->currentLineNumber = 0; 595 | // we pass this off to EndForeignScan to manage 596 | execState->pCfr = pCfr; 597 | 598 | scanState->fdw_state = (void *) execState; 599 | } 600 | 601 | 602 | /* 603 | * JsonIterateForeignScan reads the 
next record from the data file, converts it 604 | * to PostgreSQL tuple, and stores the converted tuple into the ScanTupleSlot as 605 | * a virtual tuple. 606 | */ 607 | static TupleTableSlot * 608 | JsonIterateForeignScan(ForeignScanState *scanState) 609 | { 610 | JsonFdwExecState *execState = (JsonFdwExecState *) scanState->fdw_state; 611 | TupleTableSlot *tupleSlot = scanState->ss.ss_ScanTupleSlot; 612 | HTAB *columnMappingHash = execState->columnMappingHash; 613 | char errorBuffer[ERROR_BUFFER_SIZE]; 614 | yajl_val jsonValue = NULL; 615 | bool endOfFile = false; 616 | bool jsonObjectValid = false; 617 | bool errorCountExceeded = false; 618 | 619 | TupleDesc tupleDescriptor = tupleSlot->tts_tupleDescriptor; 620 | Datum *columnValues = tupleSlot->tts_values; 621 | bool *columnNulls = tupleSlot->tts_isnull; 622 | int columnCount = tupleDescriptor->natts; 623 | 624 | //ELog(DEBUG1, "%s:%d", __func__, __LINE__); 625 | // initialize all values for this row to null 626 | memset(columnValues, 0, columnCount * sizeof(Datum)); 627 | memset(columnNulls, true, columnCount * sizeof(bool)); 628 | 629 | ExecClearTuple(tupleSlot); 630 | 631 | /* 632 | * Loop until we reach the end of file, or we read a line that parses to be 633 | * a valid json object, or we exceed the maximum allowed error count. 
634 | */ 635 | while (!(endOfFile || jsonObjectValid || errorCountExceeded)) 636 | { 637 | StringInfo lineData = NULL; 638 | if (execState->gzFilePointer != NULL) 639 | lineData = ReadLineFromGzipFile(execState->gzFilePointer); 640 | else 641 | lineData = ReadLineFromFile(execState->filePointer); 642 | 643 | if (lineData->len == 0) 644 | endOfFile = true; 645 | else 646 | { 647 | execState->currentLineNumber++; 648 | 649 | jsonValue = yajl_tree_parse(lineData->data, errorBuffer, sizeof(errorBuffer)); 650 | 651 | jsonObjectValid = YAJL_IS_OBJECT(jsonValue); 652 | if (!jsonObjectValid) 653 | { 654 | yajl_tree_free(jsonValue); 655 | 656 | execState->errorCount++; 657 | } 658 | 659 | if (execState->errorCount > execState->maxErrorCount) 660 | errorCountExceeded = true; 661 | } 662 | } 663 | 664 | if (jsonObjectValid) 665 | { 666 | FillTupleSlot(jsonValue, NULL, columnMappingHash, columnValues, columnNulls); 667 | ExecStoreVirtualTuple(tupleSlot); 668 | 669 | yajl_tree_free(jsonValue); 670 | } 671 | else if (errorCountExceeded) 672 | { 673 | ereport(ERROR, (errmsg("could not parse %u json objects", execState->errorCount), 674 | errhint("Last error message at line: %u: %s", 675 | execState->currentLineNumber, errorBuffer))); 676 | } 677 | 678 | return tupleSlot; 679 | } 680 | 681 | 682 | // JsonReScanForeignScan rescans the foreign table. 683 | static void 684 | JsonReScanForeignScan(ForeignScanState *scanState) 685 | { 686 | //ELog(DEBUG1, "%s:%d", __func__, __LINE__); 687 | JsonEndForeignScan(scanState); 688 | JsonBeginForeignScan(scanState, 0); 689 | } 690 | 691 | 692 | /* 693 | * JsonEndForeignScan finishes scanning the foreign table, and frees the acquired 694 | * resources. 
695 | */ 696 | static void 697 | JsonEndForeignScan(ForeignScanState *scanState) 698 | { 699 | JsonFdwExecState *executionState = (JsonFdwExecState *) scanState->fdw_state; 700 | //ELog(DEBUG1, "%s:%d", __func__, __LINE__); 701 | if (executionState == NULL) 702 | { 703 | return; 704 | } 705 | 706 | if (executionState->filePointer != NULL) 707 | { 708 | int closeStatus = FreeFile(executionState->filePointer); 709 | if (closeStatus != 0) 710 | { 711 | ereport(ERROR, (errcode_for_file_access(), 712 | errmsg("could not close file \"%s\": %m", 713 | executionState->filename))); 714 | } 715 | } 716 | 717 | if (executionState->gzFilePointer != NULL) 718 | { 719 | int closeStatus = gzclose(executionState->gzFilePointer); 720 | if (closeStatus != Z_OK) 721 | { 722 | ereport(ERROR, (errcode_for_file_access(), 723 | errmsg("could not close file \"%s\": %m", 724 | executionState->filename))); 725 | } 726 | } 727 | 728 | if (executionState->columnMappingHash != NULL) 729 | { 730 | hash_destroy(executionState->columnMappingHash); 731 | } 732 | 733 | curlCfrFree(executionState->pCfr); 734 | 735 | pfree(executionState); 736 | } 737 | 738 | 739 | /* 740 | * JsonGetOptions returns the option values to be used when reading and parsing 741 | * the json file. To resolve these values, the function checks options for the 742 | * foreign table, and if not present, falls back to default values. 743 | */ 744 | static JsonFdwOptions * 745 | JsonGetOptions(Oid foreignTableId) 746 | { 747 | JsonFdwOptions *jsonFdwOptions = (JsonFdwOptions *) palloc0(sizeof(JsonFdwOptions)); 748 | //ELog(DEBUG1, "%s:%d", __func__, __LINE__); 749 | 750 | if(jsonFdwOptions != NULL) 751 | { char *maxErrorCountString = JsonGetOptionValue(foreignTableId, OPTION_NAME_MAX_ERROR_COUNT); 752 | 753 | jsonFdwOptions->maxErrorCount = (maxErrorCountString != NULL 754 | ? 
pg_atoi(maxErrorCountString, sizeof(int32), 0) 755 | : DEFAULT_MAX_ERROR_COUNT 756 | ); 757 | jsonFdwOptions->filename = JsonGetOptionValue(foreignTableId, OPTION_NAME_FILENAME); 758 | jsonFdwOptions->pHttpPostVars = JsonGetOptionValue(foreignTableId, OPTION_NAME_HTTP_POST_VARS); 759 | jsonFdwOptions->pRomUrl = JsonGetOptionValue(foreignTableId, OPTION_NAME_ROM_URL); 760 | jsonFdwOptions->pRomPath = JsonGetOptionValue(foreignTableId, OPTION_NAME_ROM_PATH); 761 | } 762 | 763 | return jsonFdwOptions; 764 | } 765 | 766 | 767 | /* 768 | * Json GetOptionValue walks over foreign table and foreign server options, and 769 | * looks for the option with the given name. If found, the function returns the 770 | * option's value. This function is unchanged from mongo_fdw. 771 | */ 772 | static char * 773 | JsonGetOptionValue(Oid foreignTableId, const char *optionName) 774 | { 775 | ForeignTable *foreignTable = NULL; 776 | ForeignServer *foreignServer = NULL; 777 | List *optionList = NIL; 778 | ListCell *optionCell = NULL; 779 | char *optionValue = NULL; 780 | 781 | //ELog(DEBUG1, "%s:%d", __func__, __LINE__); 782 | foreignTable = GetForeignTable(foreignTableId); 783 | foreignServer = GetForeignServer(foreignTable->serverid); 784 | 785 | optionList = list_concat(optionList, foreignTable->options); 786 | optionList = list_concat(optionList, foreignServer->options); 787 | 788 | foreach(optionCell, optionList) 789 | { 790 | DefElem *optionDef = (DefElem *) lfirst(optionCell); 791 | char *optionDefName = optionDef->defname; 792 | 793 | if (strncmp(optionDefName, optionName, NAMEDATALEN) == 0) 794 | { 795 | optionValue = defGetString(optionDef); 796 | break; 797 | } 798 | } 799 | 800 | return optionValue; 801 | } 802 | 803 | 804 | // TupleCount estimates the number of base relation tuples in the given file. 
805 | static double 806 | TupleCount(RelOptInfo *baserel, const char *filename) 807 | { 808 | double tupleCount = 0.0; 809 | 810 | BlockNumber pageCountEstimate = baserel->pages; 811 | if (pageCountEstimate > 0) 812 | { 813 | /* 814 | * We have number of pages and number of tuples from pg_class (from a 815 | * previous Analyze), so compute a tuples-per-page estimate and scale 816 | * that by the current file size. 817 | */ 818 | double density = baserel->tuples / (double) pageCountEstimate; 819 | BlockNumber pageCount = PageCount(filename); 820 | 821 | tupleCount = clamp_row_est(density * (double) pageCount); 822 | } 823 | else 824 | { 825 | /* 826 | * Otherwise we have to fake it. We back into this estimate using the 827 | * planner's idea of relation width, which may be inaccurate. For better 828 | * estimates, users need to run Analyze. 829 | */ 830 | struct stat statBuffer; 831 | int tupleWidth = 0; 832 | 833 | int statResult = stat(filename, &statBuffer); 834 | if (statResult < 0) 835 | { 836 | // file may not be there at plan time, so use a default estimate 837 | statBuffer.st_size = 10 * BLCKSZ; 838 | } 839 | 840 | tupleWidth = MAXALIGN(baserel->width) + MAXALIGN(sizeof(HeapTupleHeaderData)); 841 | tupleCount = clamp_row_est((double) statBuffer.st_size / (double) tupleWidth); 842 | } 843 | 844 | return tupleCount; 845 | } 846 | 847 | 848 | // PageCount calculates and returns the number of pages in a file. 
849 | static BlockNumber 850 | PageCount(const char *filename) 851 | { 852 | BlockNumber pageCount = 0; 853 | struct stat statBuffer; 854 | 855 | // if file doesn't exist at plan time, use default estimate for its size 856 | int statResult = stat(filename, &statBuffer); 857 | if (statResult < 0) 858 | { 859 | statBuffer.st_size = 10 * BLCKSZ; 860 | } 861 | 862 | pageCount = (statBuffer.st_size + (BLCKSZ - 1)) / BLCKSZ; 863 | if (pageCount < 1) 864 | { 865 | pageCount = 1; 866 | } 867 | 868 | return pageCount; 869 | } 870 | 871 | 872 | /* 873 | * ColumnList takes in the planner's information about this foreign table. The 874 | * function then finds all columns needed for query execution, including those 875 | * used in projections, joins, and filter clauses, de-duplicates these columns, 876 | * and returns them in a new list. This function is unchanged from mongo_fdw. 877 | */ 878 | static List * 879 | ColumnList(RelOptInfo *baserel) 880 | { 881 | List *columnList = NIL; 882 | List *neededColumnList = NIL; 883 | AttrNumber columnIndex = 1; 884 | AttrNumber columnCount = baserel->max_attr; 885 | List *targetColumnList = baserel->reltargetlist; 886 | List *restrictInfoList = baserel->baserestrictinfo; 887 | ListCell *restrictInfoCell = NULL; 888 | 889 | // first add the columns used in joins and projections 890 | neededColumnList = list_copy(targetColumnList); 891 | 892 | // then walk over all restriction clauses, and pull up any used columns 893 | foreach(restrictInfoCell, restrictInfoList) 894 | { 895 | RestrictInfo *restrictInfo = (RestrictInfo *) lfirst(restrictInfoCell); 896 | Node *restrictClause = (Node *) restrictInfo->clause; 897 | List *clauseColumnList = NIL; 898 | 899 | // recursively pull up any columns used in the restriction clause 900 | clauseColumnList = pull_var_clause(restrictClause, 901 | PVC_RECURSE_AGGREGATES, 902 | PVC_RECURSE_PLACEHOLDERS); 903 | 904 | neededColumnList = list_union(neededColumnList, clauseColumnList); 905 | } 906 | 907 | // 
walk over all column definitions, and de-duplicate column list 908 | for (columnIndex = 1; columnIndex <= columnCount; columnIndex++) 909 | { 910 | ListCell *neededColumnCell = NULL; 911 | Var *column = NULL; 912 | 913 | // look for this column in the needed column list 914 | foreach(neededColumnCell, neededColumnList) 915 | { 916 | Var *neededColumn = (Var *) lfirst(neededColumnCell); 917 | if (neededColumn->varattno == columnIndex) 918 | { 919 | column = neededColumn; 920 | break; 921 | } 922 | } 923 | 924 | if (column != NULL) 925 | { 926 | columnList = lappend(columnList, column); 927 | } 928 | } 929 | 930 | return columnList; 931 | } 932 | 933 | 934 | /* 935 | * ColumnMappingHash creates a hash table that maps column names to column index 936 | * and types. This table helps us quickly translate JSON document key/values to 937 | * corresponding PostgreSQL columns. This function is unchanged from mongo_fdw. 938 | */ 939 | static HTAB * 940 | ColumnMappingHash(Oid foreignTableId, List *columnList) 941 | { 942 | HTAB *columnMappingHash = NULL; 943 | ListCell *columnCell = NULL; 944 | const long hashTableSize = 2048; 945 | 946 | // create hash table 947 | HASHCTL hashInfo; 948 | memset(&hashInfo, 0, sizeof(hashInfo)); 949 | hashInfo.keysize = NAMEDATALEN; 950 | hashInfo.entrysize = sizeof(ColumnMapping); 951 | hashInfo.hash = string_hash; 952 | hashInfo.hcxt = CurrentMemoryContext; 953 | 954 | columnMappingHash = hash_create("Column Mapping Hash", hashTableSize, &hashInfo, 955 | (HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT)); 956 | Assert(columnMappingHash != NULL); 957 | 958 | foreach(columnCell, columnList) 959 | { 960 | Var *column = (Var *) lfirst(columnCell); 961 | AttrNumber columnId = column->varattno; 962 | 963 | ColumnMapping *columnMapping = NULL; 964 | char *columnName = NULL; 965 | bool handleFound = false; 966 | void *hashKey = NULL; 967 | 968 | columnName = get_relid_attribute_name(foreignTableId, columnId); 969 | hashKey = (void *) columnName; 970 | 971 
| columnMapping = (ColumnMapping *) hash_search(columnMappingHash, hashKey, 972 | HASH_ENTER, &handleFound); 973 | Assert(columnMapping != NULL); 974 | 975 | columnMapping->columnIndex = columnId - 1; 976 | columnMapping->columnTypeId = column->vartype; 977 | columnMapping->columnTypeMod = column->vartypmod; 978 | columnMapping->columnArrayTypeId = get_element_type(column->vartype); 979 | } 980 | 981 | return columnMappingHash; 982 | } 983 | 984 | 985 | // GzipFilename returns true if the filename ends with a gzip file extension. 986 | static bool 987 | GzipFilename(const char *filename) 988 | { 989 | bool gzipFile = false; 990 | const char *extension = NULL; 991 | 992 | extension = strrchr(filename, '.'); 993 | if (extension != NULL) 994 | { 995 | if (strncmp(extension, GZIP_FILE_EXTENSION, MAXPGPATH) == 0) 996 | { 997 | gzipFile = true; 998 | } 999 | } 1000 | 1001 | return gzipFile; 1002 | } 1003 | 1004 | 1005 | // HdfsBlockName returns true if filename belongs to a hdfs block. 1006 | static bool 1007 | HdfsBlockName(const char *filename) 1008 | { 1009 | bool hdfsBlock = false; 1010 | const char *basename = NULL; 1011 | 1012 | const char *lastDirSeparator = last_dir_separator(filename); 1013 | if (lastDirSeparator == NULL) 1014 | { 1015 | basename = filename; 1016 | } 1017 | else 1018 | { 1019 | basename = lastDirSeparator + 1; 1020 | } 1021 | 1022 | if (strncmp(basename, HDFS_BLOCK_PREFIX, HDFS_BLOCK_PREFIX_LENGTH) == 0) 1023 | { 1024 | hdfsBlock = true; 1025 | } 1026 | 1027 | return hdfsBlock; 1028 | } 1029 | 1030 | 1031 | /* 1032 | * ReadLineFromFile reads and returns the next line in the file. If the function 1033 | * reaches the end of file without reading input, it returns an empty string. 
1034 | */ 1035 | static StringInfo 1036 | ReadLineFromFile(FILE *filePointer) 1037 | { 1038 | StringInfo lineData = makeStringInfo(); 1039 | bool endOfFile = false; 1040 | bool endOfLine = false; 1041 | char buffer[READ_BUFFER_SIZE]; 1042 | 1043 | // read from file until either we reach end of file or end of line 1044 | while (!endOfFile && !endOfLine) 1045 | { 1046 | char *fgetsResult; 1047 | 1048 | memset(buffer, 0, sizeof(buffer)); 1049 | fgetsResult = fgets(buffer, sizeof(buffer), filePointer); 1050 | if (fgetsResult == NULL) 1051 | { 1052 | int errorResult = ferror(filePointer); 1053 | if (errorResult != 0) 1054 | { 1055 | ereport(ERROR, (errcode_for_file_access(), 1056 | errmsg("could not read from json file: %m"))); 1057 | } 1058 | 1059 | endOfFile = true; 1060 | } 1061 | else 1062 | { 1063 | // check if we read a new line 1064 | endOfLine = (buffer[strlen(buffer) - 1] == '\n'); 1065 | 1066 | appendStringInfoString(lineData, buffer); 1067 | } 1068 | } 1069 | 1070 | return lineData; 1071 | } 1072 | 1073 | 1074 | /* 1075 | * ReadLineFromFile reads and returns the next line in the file. If the function 1076 | * reaches the end of file without reading input, it returns an empty string. 
1077 | */ 1078 | static StringInfo 1079 | ReadLineFromGzipFile(gzFile gzFilePointer) 1080 | { 1081 | StringInfo lineData = makeStringInfo(); 1082 | bool endOfFile = false; 1083 | bool endOfLine = false; 1084 | char buffer[READ_BUFFER_SIZE]; 1085 | 1086 | // read from file until either we reach end of file or end of line 1087 | while (!endOfFile && !endOfLine) 1088 | { 1089 | char *getsResult = gzgets(gzFilePointer, buffer, sizeof(buffer)); 1090 | if (getsResult == NULL) 1091 | { 1092 | int errorResult = 0; 1093 | const char *message = gzerror(gzFilePointer, &errorResult); 1094 | if (errorResult != Z_OK && errorResult != Z_STREAM_END) 1095 | { 1096 | ereport(ERROR, (errmsg("could not read from json file"), 1097 | errhint("%s", message))); 1098 | } 1099 | 1100 | endOfFile = true; 1101 | } 1102 | else 1103 | { 1104 | // check if we read a new line 1105 | endOfLine = (buffer[strlen(buffer) - 1] == '\n'); 1106 | 1107 | appendStringInfoString(lineData, buffer); 1108 | } 1109 | } 1110 | 1111 | return lineData; 1112 | } 1113 | 1114 | 1115 | /* 1116 | * FillTupleSlot walks over all key/value pairs in the given document. For each 1117 | * pair, the function checks if the key appears in the column mapping hash, and 1118 | * if the value type is compatible with the one specified for the column. If so 1119 | * the function converts the value and fills the corresponding tuple position. 1120 | * The jsonObjectKey parameter is used for recursion, and should always be 1121 | * passed as NULL. This function is based on the function with the same name in 1122 | * mongo_fdw. 
1123 | */ 1124 | static void 1125 | FillTupleSlot(const yajl_val jsonObject, const char *jsonObjectKey, 1126 | HTAB *columnMappingHash, Datum *columnValues, bool *columnNulls) 1127 | { 1128 | uint32 jsonKeyCount = jsonObject->u.object.len; 1129 | const char **jsonKeyArray = jsonObject->u.object.keys; 1130 | yajl_val *jsonValueArray = jsonObject->u.object.values; 1131 | uint32 jsonKeyIndex = 0; 1132 | 1133 | // loop over key/value pairs of the json object 1134 | for (jsonKeyIndex = 0; jsonKeyIndex < jsonKeyCount; jsonKeyIndex++) 1135 | { 1136 | const char *jsonKey = jsonKeyArray[jsonKeyIndex]; 1137 | yajl_val jsonValue = jsonValueArray[jsonKeyIndex]; 1138 | 1139 | ColumnMapping *columnMapping = NULL; 1140 | Oid columnTypeId = InvalidOid; 1141 | Oid columnArrayTypeId = InvalidOid; 1142 | Oid columnTypeMod = InvalidOid; 1143 | bool compatibleTypes = false; 1144 | bool handleFound = false; 1145 | const char *jsonFullKey = NULL; 1146 | void *hashKey = NULL; 1147 | 1148 | if (jsonObjectKey != NULL) 1149 | { 1150 | /* 1151 | * For fields in nested json objects, we use fully qualified field 1152 | * name to check the column mapping. 
1153 | */ 1154 | StringInfo jsonFullKeyString = makeStringInfo(); 1155 | appendStringInfo(jsonFullKeyString, "%s.%s", jsonObjectKey, jsonKey); 1156 | jsonFullKey = jsonFullKeyString->data; 1157 | } 1158 | else 1159 | { 1160 | jsonFullKey = jsonKey; 1161 | } 1162 | 1163 | // recurse into nested objects 1164 | if (YAJL_IS_OBJECT(jsonValue)) 1165 | { 1166 | FillTupleSlot(jsonValue, jsonFullKey, columnMappingHash, 1167 | columnValues, columnNulls); 1168 | continue; 1169 | } 1170 | 1171 | // look up the corresponding column for this json key 1172 | hashKey = (void *) jsonFullKey; 1173 | columnMapping = (ColumnMapping *) hash_search(columnMappingHash, hashKey, 1174 | HASH_FIND, &handleFound); 1175 | 1176 | // if no corresponding column or null json value, continue 1177 | if (columnMapping == NULL || YAJL_IS_NULL(jsonValue)) 1178 | { 1179 | continue; 1180 | } 1181 | 1182 | // check if columns have compatible types 1183 | columnTypeId = columnMapping->columnTypeId; 1184 | columnArrayTypeId = columnMapping->columnArrayTypeId; 1185 | columnTypeMod = columnMapping->columnTypeMod; 1186 | 1187 | if (OidIsValid(columnArrayTypeId)) 1188 | { 1189 | compatibleTypes = YAJL_IS_ARRAY(jsonValue); 1190 | } 1191 | else 1192 | { 1193 | compatibleTypes = ColumnTypesCompatible(jsonValue, columnTypeId); 1194 | } 1195 | 1196 | // if types are incompatible, leave this column null 1197 | if (!compatibleTypes) 1198 | { 1199 | continue; 1200 | } 1201 | 1202 | // fill in corresponding column value and null flag 1203 | if (OidIsValid(columnArrayTypeId)) 1204 | { 1205 | uint32 columnIndex = columnMapping->columnIndex; 1206 | columnValues[columnIndex] = ColumnValueArray(jsonValue, columnArrayTypeId, 1207 | columnTypeMod); 1208 | columnNulls[columnIndex] = false; 1209 | } 1210 | else 1211 | { 1212 | uint32 columnIndex = columnMapping->columnIndex; 1213 | columnValues[columnIndex] = ColumnValue(jsonValue, columnTypeId, 1214 | columnTypeMod); 1215 | columnNulls[columnIndex] = false; 1216 | } 1217 | } 
1218 | } 1219 | 1220 | 1221 | /* 1222 | * ColumnTypesCompatible checks if the given json value can be converted to the 1223 | * given PostgreSQL type. 1224 | */ 1225 | static bool 1226 | ColumnTypesCompatible(yajl_val jsonValue, Oid columnTypeId) 1227 | { 1228 | bool compatibleTypes = false; 1229 | 1230 | // we consider the PostgreSQL column type as authoritative 1231 | switch(columnTypeId) 1232 | { 1233 | case INT2OID: case INT4OID: 1234 | case INT8OID: case FLOAT4OID: 1235 | case FLOAT8OID: case NUMERICOID: 1236 | { 1237 | if (YAJL_IS_NUMBER(jsonValue)) 1238 | { 1239 | compatibleTypes = true; 1240 | } 1241 | break; 1242 | } 1243 | case BOOLOID: 1244 | { 1245 | if (YAJL_IS_TRUE(jsonValue) || YAJL_IS_FALSE(jsonValue)) 1246 | { 1247 | compatibleTypes = true; 1248 | } 1249 | break; 1250 | } 1251 | case BPCHAROID: 1252 | case VARCHAROID: 1253 | case TEXTOID: 1254 | { 1255 | if (YAJL_IS_STRING(jsonValue)) 1256 | { 1257 | compatibleTypes = true; 1258 | } 1259 | break; 1260 | } 1261 | case DATEOID: 1262 | case TIMESTAMPOID: 1263 | case TIMESTAMPTZOID: 1264 | { 1265 | if (YAJL_IS_STRING(jsonValue)) 1266 | { 1267 | const char *stringValue = (char *) YAJL_GET_STRING(jsonValue); 1268 | 1269 | bool validDateTimeFormat = ValidDateTimeFormat(stringValue); 1270 | if (validDateTimeFormat) 1271 | { 1272 | compatibleTypes = true; 1273 | } 1274 | } 1275 | break; 1276 | } 1277 | default: 1278 | { 1279 | /* 1280 | * We currently error out on other data types. Some types such as 1281 | * byte arrays are easy to add, but they need testing. Other types 1282 | * such as money or inet, do not have equivalents in JSON. 
1283 | */ 1284 | ereport(ERROR, (errcode(ERRCODE_FDW_INVALID_DATA_TYPE), 1285 | errmsg("cannot convert json type to column type"), 1286 | errhint("column type: %u", (uint32) columnTypeId))); 1287 | break; 1288 | } 1289 | } 1290 | 1291 | return compatibleTypes; 1292 | } 1293 | 1294 | 1295 | /* 1296 | * ValidDateTimeFormat checks if the given dateTimeString can be parsed and decoded 1297 | * as a date/timestamp. The algorithm used here is based on date_in, timestamp_in, 1298 | * and timestamptz_in functions. 1299 | */ 1300 | static bool 1301 | ValidDateTimeFormat(const char *dateTimeString) 1302 | { 1303 | bool validDateTimeFormat = false; 1304 | char workBuffer[MAXDATELEN + 1]; 1305 | char *fieldArray[MAXDATEFIELDS]; 1306 | int fieldTypeArray[MAXDATEFIELDS]; 1307 | int fieldCount = 0; 1308 | 1309 | int parseError = ParseDateTime(dateTimeString, workBuffer, sizeof(workBuffer), 1310 | fieldArray, fieldTypeArray, MAXDATEFIELDS, 1311 | &fieldCount); 1312 | 1313 | if (parseError == 0) 1314 | { 1315 | int dateType = 0; 1316 | struct pg_tm dateTime; 1317 | fsec_t fractionalSecond = 0; 1318 | int timezone = 0; 1319 | 1320 | int decodeError = DecodeDateTime(fieldArray, fieldTypeArray, fieldCount, 1321 | &dateType, &dateTime, &fractionalSecond, 1322 | &timezone); 1323 | if (decodeError == 0) 1324 | { 1325 | /* 1326 | * We only accept DTK_DATE, DTK_EPOCH, DTK_LATE, and DTK_EARLY date 1327 | * types. For other date types, input functions raise an error. 
1328 | */ 1329 | if (dateType == DTK_DATE || dateType == DTK_EPOCH || 1330 | dateType == DTK_LATE || dateType == DTK_EARLY) 1331 | { 1332 | validDateTimeFormat = true; 1333 | } 1334 | #ifdef DEBUG 1335 | else 1336 | ereport(DEBUG1, (errmsg("%s:%s:%u invlalid format", __FILE__, __func__, __LINE__))); 1337 | #endif 1338 | } 1339 | #ifdef DEBUG 1340 | else 1341 | ereport(DEBUG1, (errmsg("%s:%s:%u decode error", __FILE__, __func__, __LINE__))); 1342 | #endif 1343 | } 1344 | #ifdef DEBUG 1345 | else 1346 | ereport(DEBUG1, (errmsg("%s:%s:%u parse error", __FILE__, __func__, __LINE__))); 1347 | #endif 1348 | 1349 | return validDateTimeFormat; 1350 | } 1351 | 1352 | 1353 | /* 1354 | * ColumnValueArray uses array element type id to read the current array pointed 1355 | * to by the jsonArray, and converts each array element with matching type to 1356 | * the corresponding PostgreSQL datum. Then, the function constructs an array 1357 | * datum from element datums, and returns the array datum. This function ignores 1358 | * values that aren't type compatible with valueTypeId. 
1359 | */ 1360 | static Datum 1361 | ColumnValueArray(yajl_val jsonArray, Oid valueTypeId, Oid valueTypeMod) 1362 | { 1363 | Datum columnValueDatum = 0; 1364 | ArrayType *columnValueObject = NULL; 1365 | bool typeByValue = false; 1366 | char typeAlignment = 0; 1367 | int16 typeLength = 0; 1368 | 1369 | uint32 jsonValueCount = jsonArray->u.array.len; 1370 | yajl_val *jsonValueArray = jsonArray->u.array.values; 1371 | 1372 | // allocate enough room for datum array's maximum possible size 1373 | Datum *datumArray = palloc0(jsonValueCount * sizeof(Datum)); 1374 | uint32 datumArraySize = 0; 1375 | 1376 | uint32 jsonValueIndex = 0; 1377 | for (jsonValueIndex = 0; jsonValueIndex < jsonValueCount; jsonValueIndex++) 1378 | { 1379 | yajl_val jsonValue = jsonValueArray[jsonValueIndex]; 1380 | 1381 | bool compatibleTypes = ColumnTypesCompatible(jsonValue, valueTypeId); 1382 | if (compatibleTypes) 1383 | { 1384 | datumArray[datumArraySize] = ColumnValue(jsonValue, valueTypeId, 1385 | valueTypeMod); 1386 | datumArraySize++; 1387 | } 1388 | } 1389 | 1390 | get_typlenbyvalalign(valueTypeId, &typeLength, &typeByValue, &typeAlignment); 1391 | columnValueObject = construct_array(datumArray, datumArraySize, valueTypeId, 1392 | typeLength, typeByValue, typeAlignment); 1393 | 1394 | columnValueDatum = PointerGetDatum(columnValueObject); 1395 | return columnValueDatum; 1396 | } 1397 | 1398 | 1399 | /* 1400 | * ColumnValue uses column type information to read the current value pointed to 1401 | * by jsonValue, and converts this value to the corresponding PostgreSQL datum. 1402 | * The function then returns this datum. 
1403 | */ 1404 | static Datum 1405 | ColumnValue(yajl_val jsonValue, Oid columnTypeId, int32 columnTypeMod) 1406 | { 1407 | Datum columnValue = 0; 1408 | 1409 | switch(columnTypeId) 1410 | { 1411 | case INT2OID: 1412 | { 1413 | const char *value = YAJL_GET_NUMBER(jsonValue); 1414 | columnValue = DirectFunctionCall1(int2in, CStringGetDatum(value)); 1415 | break; 1416 | } 1417 | case INT4OID: 1418 | { 1419 | const char *value = YAJL_GET_NUMBER(jsonValue); 1420 | columnValue = DirectFunctionCall1(int4in, CStringGetDatum(value)); 1421 | break; 1422 | } 1423 | case INT8OID: 1424 | { 1425 | const char *value = YAJL_GET_NUMBER(jsonValue); 1426 | columnValue = DirectFunctionCall1(int8in, CStringGetDatum(value)); 1427 | break; 1428 | } 1429 | case FLOAT4OID: 1430 | { 1431 | const char *value = YAJL_GET_NUMBER(jsonValue); 1432 | columnValue = DirectFunctionCall1(float4in, CStringGetDatum(value)); 1433 | break; 1434 | } 1435 | case FLOAT8OID: 1436 | { 1437 | const char *value = YAJL_GET_NUMBER(jsonValue); 1438 | columnValue = DirectFunctionCall1(float8in, CStringGetDatum(value)); 1439 | break; 1440 | } 1441 | case NUMERICOID: 1442 | { 1443 | const char *value = YAJL_GET_NUMBER(jsonValue); 1444 | columnValue = DirectFunctionCall3(numeric_in, CStringGetDatum(value), 1445 | ObjectIdGetDatum(InvalidOid), 1446 | Int32GetDatum(columnTypeMod)); 1447 | break; 1448 | } 1449 | case BOOLOID: 1450 | { 1451 | bool value = YAJL_IS_TRUE(jsonValue); 1452 | columnValue = BoolGetDatum(value); 1453 | break; 1454 | } 1455 | case BPCHAROID: 1456 | { 1457 | const char *value = YAJL_GET_STRING(jsonValue); 1458 | columnValue = DirectFunctionCall3(bpcharin, CStringGetDatum(value), 1459 | ObjectIdGetDatum(InvalidOid), 1460 | Int32GetDatum(columnTypeMod)); 1461 | break; 1462 | } 1463 | case VARCHAROID: 1464 | { 1465 | const char *value = YAJL_GET_STRING(jsonValue); 1466 | columnValue = DirectFunctionCall3(varcharin, CStringGetDatum(value), 1467 | ObjectIdGetDatum(InvalidOid), 1468 | 
Int32GetDatum(columnTypeMod)); 1469 | break; 1470 | } 1471 | case TEXTOID: 1472 | { 1473 | const char *value = YAJL_GET_STRING(jsonValue); 1474 | columnValue = CStringGetTextDatum(value); 1475 | break; 1476 | } 1477 | case DATEOID: 1478 | { 1479 | const char *value = YAJL_GET_STRING(jsonValue); 1480 | columnValue = DirectFunctionCall1(date_in, CStringGetDatum(value)); 1481 | break; 1482 | } 1483 | case TIMESTAMPOID: 1484 | { 1485 | const char *value = YAJL_GET_STRING(jsonValue); 1486 | columnValue = DirectFunctionCall3(timestamp_in, CStringGetDatum(value), 1487 | ObjectIdGetDatum(InvalidOid), 1488 | Int32GetDatum(columnTypeMod)); 1489 | break; 1490 | } 1491 | case TIMESTAMPTZOID: 1492 | { 1493 | const char *value = YAJL_GET_STRING(jsonValue); 1494 | columnValue = DirectFunctionCall3(timestamptz_in, CStringGetDatum(value), 1495 | ObjectIdGetDatum(InvalidOid), 1496 | Int32GetDatum(columnTypeMod)); 1497 | break; 1498 | } 1499 | default: 1500 | { 1501 | ereport(ERROR, (errcode(ERRCODE_FDW_INVALID_DATA_TYPE), 1502 | errmsg("cannot convert json type to column type"), 1503 | errhint("column type: %u", (uint32) columnTypeId))); 1504 | break; 1505 | } 1506 | } 1507 | 1508 | return columnValue; 1509 | } 1510 | 1511 | 1512 | /* 1513 | * JsonAnalyzeForeignTable sets the total page count and the function pointer 1514 | * used to acquire a random sample of rows from the foreign file. 
1515 | */ 1516 | static bool 1517 | JsonAnalyzeForeignTable(Relation relation, 1518 | AcquireSampleRowsFunc *acquireSampleRowsFunc, 1519 | BlockNumber *totalPageCount) 1520 | { 1521 | Oid foreignTableId = RelationGetRelid(relation); 1522 | JsonFdwOptions *options = JsonGetOptions(foreignTableId); 1523 | BlockNumber pageCount = 0; 1524 | struct stat statBuffer; 1525 | 1526 | int statResult = stat(options->filename, &statBuffer); 1527 | //ELog(DEBUG1, "%s:%d", __func__, __LINE__); 1528 | if (statResult < 0) 1529 | { 1530 | ereport(ERROR, (errcode_for_file_access(), 1531 | errmsg("could not stat file \"%s\": %m", 1532 | options->filename))); 1533 | } 1534 | 1535 | /* 1536 | * Our estimate should return at least 1 so that we can tell later on that 1537 | * pg_class.relpages is not default. 1538 | */ 1539 | pageCount = (statBuffer.st_size + (BLCKSZ - 1)) / BLCKSZ; 1540 | if (pageCount < 1) 1541 | { 1542 | pageCount = 1; 1543 | } 1544 | 1545 | (*totalPageCount) = pageCount; 1546 | (*acquireSampleRowsFunc) = JsonAcquireSampleRows; 1547 | 1548 | return true; 1549 | } 1550 | 1551 | 1552 | /* 1553 | * JsonAcquireSampleRows acquires a random sample of rows from the foreign 1554 | * table. Selected rows are returned in the caller allocated sampleRows array, 1555 | * which must have at least target row count entries. The actual number of rows 1556 | * selected is returned as the function result. We also count the number of rows 1557 | * in the collection and return it in total row count. We also always set dead 1558 | * row count to zero. 1559 | * 1560 | * Note that the returned list of rows does not always follow their actual order 1561 | * in the JSON file. Therefore, correlation estimates derived later could be 1562 | * inaccurate, but that's OK. We currently don't use correlation estimates (the 1563 | * planner only pays attention to correlation for index scans). 
1564 | */ 1565 | static int 1566 | JsonAcquireSampleRows(Relation relation, int logLevel, 1567 | HeapTuple *sampleRows, int targetRowCount, 1568 | double *totalRowCount, double *totalDeadRowCount) 1569 | { 1570 | int sampleRowCount = 0; 1571 | double rowCount = 0.0; 1572 | double rowCountToSkip = -1; // -1 means not set yet 1573 | double selectionState = 0; 1574 | MemoryContext oldContext = CurrentMemoryContext; 1575 | MemoryContext tupleContext = NULL; 1576 | Datum *columnValues = NULL; 1577 | bool *columnNulls = NULL; 1578 | TupleTableSlot *scanTupleSlot = NULL; 1579 | List *columnList = NIL; 1580 | List *foreignPrivateList = NULL; 1581 | ForeignScanState *scanState = NULL; 1582 | ForeignScan *foreignScan = NULL; 1583 | char *relationName = NULL; 1584 | int executorFlags = 0; 1585 | 1586 | TupleDesc tupleDescriptor = RelationGetDescr(relation); 1587 | int columnCount = tupleDescriptor->natts; 1588 | Form_pg_attribute *attributes = tupleDescriptor->attrs; 1589 | 1590 | // create list of columns of the relation 1591 | int columnIndex = 0; 1592 | //ELog(DEBUG1, "%s:%d", __func__, __LINE__); 1593 | for (columnIndex = 0; columnIndex < columnCount; columnIndex++) 1594 | { 1595 | Var *column = (Var *) palloc0(sizeof(Var)); 1596 | 1597 | // only assign required fields for column mapping hash 1598 | column->varattno = columnIndex + 1; 1599 | column->vartype = attributes[columnIndex]->atttypid; 1600 | column->vartypmod = attributes[columnIndex]->atttypmod; 1601 | 1602 | columnList = lappend(columnList, column); 1603 | } 1604 | 1605 | // setup foreign scan plan node 1606 | foreignPrivateList = list_make1(columnList); 1607 | foreignScan = makeNode(ForeignScan); 1608 | foreignScan->fdw_private = foreignPrivateList; 1609 | 1610 | // setup tuple slot 1611 | columnValues = (Datum *) palloc0(columnCount * sizeof(Datum)); 1612 | columnNulls = (bool *) palloc0(columnCount * sizeof(bool)); 1613 | scanTupleSlot = MakeTupleTableSlot(); 1614 | scanTupleSlot->tts_tupleDescriptor = 
tupleDescriptor; 1615 | scanTupleSlot->tts_values = columnValues; 1616 | scanTupleSlot->tts_isnull = columnNulls; 1617 | 1618 | // setup scan state 1619 | scanState = makeNode(ForeignScanState); 1620 | scanState->ss.ss_currentRelation = relation; 1621 | scanState->ss.ps.plan = (Plan *) foreignScan; 1622 | scanState->ss.ss_ScanTupleSlot = scanTupleSlot; 1623 | 1624 | JsonBeginForeignScan(scanState, executorFlags); 1625 | 1626 | /* 1627 | * Use per-tuple memory context to prevent leak of memory used to read and 1628 | * parse rows from the file using ReadLineFromFile and FillTupleSlot. 1629 | */ 1630 | tupleContext = AllocSetContextCreate(CurrentMemoryContext, 1631 | "json_fdw temporary context", 1632 | ALLOCSET_DEFAULT_MINSIZE, 1633 | ALLOCSET_DEFAULT_INITSIZE, 1634 | ALLOCSET_DEFAULT_MAXSIZE); 1635 | 1636 | // prepare for sampling rows 1637 | selectionState = anl_init_selection_state(targetRowCount); 1638 | 1639 | for (;;) 1640 | { 1641 | // check for user-requested abort or sleep 1642 | vacuum_delay_point(); 1643 | 1644 | memset(columnValues, 0, columnCount * sizeof(Datum)); 1645 | memset(columnNulls, true, columnCount * sizeof(bool)); 1646 | 1647 | MemoryContextReset(tupleContext); 1648 | MemoryContextSwitchTo(tupleContext); 1649 | 1650 | // read the next record 1651 | JsonIterateForeignScan(scanState); 1652 | 1653 | MemoryContextSwitchTo(oldContext); 1654 | 1655 | // if there are no more records to read, break 1656 | if (scanTupleSlot->tts_isempty) 1657 | { 1658 | break; 1659 | } 1660 | 1661 | /* 1662 | * The first targetRowCount sample rows are simply copied into the 1663 | * reservoir. Then we start replacing tuples in the sample until we 1664 | * reach the end of the relation. This algorithm is from Jeff Vitter's 1665 | * paper (see more info in commands/analyze.c). 
1666 | */ 1667 | if (sampleRowCount < targetRowCount) 1668 | { 1669 | sampleRows[sampleRowCount++] = heap_form_tuple(tupleDescriptor, 1670 | columnValues, 1671 | columnNulls); 1672 | } 1673 | else 1674 | { 1675 | /* 1676 | * t in Vitter's paper is the number of records already processed. 1677 | * If we need to compute a new S value, we must use the "not yet 1678 | * incremented" value of rowCount as t. 1679 | */ 1680 | if (rowCountToSkip < 0) 1681 | { 1682 | rowCountToSkip = anl_get_next_S(rowCount, targetRowCount, &selectionState); 1683 | } 1684 | 1685 | if (rowCountToSkip <= 0) 1686 | { 1687 | /* 1688 | * Found a suitable tuple, so save it, replacing one old tuple 1689 | * at random. 1690 | */ 1691 | int rowIndex = (int) (targetRowCount * anl_random_fract()); 1692 | Assert(rowIndex >= 0); 1693 | Assert(rowIndex < targetRowCount); 1694 | 1695 | heap_freetuple(sampleRows[rowIndex]); 1696 | sampleRows[rowIndex] = heap_form_tuple(tupleDescriptor, columnValues, columnNulls); 1697 | } 1698 | 1699 | rowCountToSkip -= 1; 1700 | } 1701 | 1702 | rowCount += 1; 1703 | } 1704 | 1705 | // clean up 1706 | MemoryContextDelete(tupleContext); 1707 | pfree(columnValues); 1708 | pfree(columnNulls); 1709 | 1710 | JsonEndForeignScan(scanState); 1711 | 1712 | // emit some interesting relation info 1713 | relationName = RelationGetRelationName(relation); 1714 | ereport(logLevel, (errmsg("\"%s\": file contains %.0f rows; %d rows in sample", 1715 | relationName, rowCount, sampleRowCount))); 1716 | 1717 | (*totalRowCount) = rowCount; 1718 | (*totalDeadRowCount) = 0; 1719 | 1720 | return sampleRowCount; 1721 | } 1722 | 1723 | // *** All the stuff below here, was broken by Neal Horman ;) 1724 | static char *JsonAttributeNameGet(int varno, int varattno, PlannerInfo *root) 1725 | { 1726 | RangeTblEntry *rte = planner_rt_fetch(varno, root); 1727 | List *options = GetForeignColumnOptions(rte->relid, varattno); 1728 | char *colname = NULL; 1729 | ListCell *lc; 1730 | 1731 | foreach(lc, options) 
1732 | { 1733 | DefElem *def = (DefElem *) lfirst(lc); 1734 | 1735 | if (strcmp(def->defname, "column_name") == 0) 1736 | { 1737 | colname = defGetString(def); 1738 | break; 1739 | } 1740 | } 1741 | 1742 | if(colname == NULL) 1743 | colname = get_relid_attribute_name(rte->relid, varattno); 1744 | 1745 | return colname; 1746 | } 1747 | 1748 | /* 1749 | * An insert operation consists of 1750 | * PlanForeignModify 1751 | * BeginForeignModify 1752 | * ExecForeignInsert 1753 | * EndForeignModify 1754 | */ 1755 | 1756 | static List *JsonPlanForeignModify(PlannerInfo *root, ModifyTable *plan, Index resultRelation, int subplan_index) 1757 | { 1758 | CmdType operation = plan->operation; 1759 | RangeTblEntry *rte = planner_rt_fetch(resultRelation, root); 1760 | Relation rel = heap_open(rte->relid, NoLock); 1761 | ForeignTable *table = GetForeignTable(RelationGetRelid(rel)); 1762 | char *tableName = RelationGetRelationName(rel); 1763 | List *targetAttrs = NULL; 1764 | List *targetNames = NULL; 1765 | ListCell *lc; 1766 | char const *pRomUrl = NULL; 1767 | char const *pRomPath = NULL; 1768 | rci_t *pRci = NULL; 1769 | StringInfoData strUrl; 1770 | 1771 | initStringInfo(&strUrl); 1772 | 1773 | // find the ROM url and path options 1774 | foreach(lc, table->options) 1775 | { 1776 | DefElem *def = (DefElem *) lfirst(lc); 1777 | const char *str = defGetString(def); 1778 | 1779 | //ELog(DEBUG1, "%s:%d '%s' --> '%s'", __func__, __LINE__, def->defname, str); 1780 | if(strcasecmp(def->defname, OPTION_NAME_ROM_URL) == 0) 1781 | pRomUrl = str; 1782 | else if(strcasecmp(def->defname, OPTION_NAME_ROM_PATH) == 0) 1783 | pRomPath = str; 1784 | } 1785 | 1786 | //ELog(DEBUG1, "%s:%d table name '%s'", __func__, __LINE__, tableName); 1787 | 1788 | // fetch the ROM 1789 | pRci = rciFetch(pRomUrl, pRomPath, 1790 | ( 1791 | operation == CMD_INSERT ? RCI_ACTION_INSERT : 1792 | operation == CMD_UPDATE ? RCI_ACTION_UPDATE : 1793 | //operation == CMD_DELETE ? 
RCI_ACTION_DELETE : 1794 | RCI_ACTION_NONE 1795 | ) 1796 | ); 1797 | 1798 | if(!rciError(pRci, pRomUrl, pRomPath) 1799 | && rciMethod(pRci, "put", pRomUrl, pRomPath) 1800 | ) 1801 | { 1802 | appendStringInfoString(&strUrl, pRci->pUrl); 1803 | //ELog(DEBUG1, "%s:%d url '%s'", __func__, __LINE__, strUrl.data); 1804 | } 1805 | rciFree(pRci); 1806 | 1807 | switch (operation) 1808 | { 1809 | case CMD_INSERT: 1810 | case CMD_UPDATE: 1811 | { 1812 | TupleDesc tupdesc = RelationGetDescr(rel); 1813 | int attnum; 1814 | 1815 | // collect relation information 1816 | for (attnum = 1; attnum <= tupdesc->natts; attnum++) 1817 | { 1818 | Form_pg_attribute attr = tupdesc->attrs[attnum - 1]; 1819 | 1820 | if (!attr->attisdropped) 1821 | { 1822 | // collect the name of the attribute 1823 | char *colname = JsonAttributeNameGet(resultRelation, attnum, root); 1824 | targetNames = lappend(targetNames, colname); 1825 | 1826 | // collect the index of the attribute 1827 | targetAttrs = lappend_int(targetAttrs, attnum); 1828 | 1829 | //ELog(DEBUG1, "%s:%d %s", __func__, __LINE__, colname); 1830 | } 1831 | } 1832 | } 1833 | break; 1834 | default: 1835 | break; 1836 | } 1837 | 1838 | heap_close(rel, NoLock); 1839 | return list_make3(targetNames, targetAttrs, strUrl.data); 1840 | } 1841 | 1842 | static void JsonBeginForeignModify( 1843 | ModifyTableState *mtstate, 1844 | ResultRelInfo *resultRelInfo, 1845 | List *fdw_private, 1846 | int subplan_index, 1847 | int eflags 1848 | ) 1849 | { 1850 | //ELog(DEBUG1, "%s:%d", __func__, __LINE__); 1851 | 1852 | if(!(eflags & EXEC_FLAG_EXPLAIN_ONLY)) 1853 | { 1854 | AttrNumber n_params = 0; 1855 | Oid typefnoid = InvalidOid; 1856 | bool isvarlena = false; 1857 | ListCell *lc = NULL; 1858 | EState *estate = mtstate->ps.state; 1859 | Relation rel = resultRelInfo->ri_RelationDesc; 1860 | Oid foreignTableId = RelationGetRelid(rel); 1861 | ForeignTable *table = GetForeignTable(foreignTableId); 1862 | jfmes_t *pJfmes = (jfmes_t *) palloc0(sizeof(jfmes_t)); 
1863 | 1864 | //ELog(DEBUG1, "%s:%d", __func__, __LINE__); 1865 | if(pJfmes != NULL) 1866 | { 1867 | pJfmes->rel = rel; 1868 | 1869 | pJfmes->retrieved_names = (List *) list_nth(fdw_private, 0); 1870 | pJfmes->retrieved_attrs = (List *) list_nth(fdw_private, 1); 1871 | pJfmes->pUrl = (char const *) list_nth(fdw_private, 2); 1872 | pJfmes->table_options = table->options; 1873 | 1874 | n_params = list_length(pJfmes->retrieved_attrs) + 1; 1875 | pJfmes->p_flinfo = (FmgrInfo *) palloc0(sizeof(FmgrInfo) * n_params); 1876 | pJfmes->p_nums = 0; 1877 | 1878 | pJfmes->temp_cxt = AllocSetContextCreate( 1879 | estate->es_query_cxt, 1880 | "json_fdw temporary data", 1881 | ALLOCSET_SMALL_MINSIZE, 1882 | ALLOCSET_SMALL_INITSIZE, 1883 | ALLOCSET_SMALL_MAXSIZE 1884 | ); 1885 | 1886 | //ELog(DEBUG1, "%s:%d put url '%s'", __func__, __LINE__, pJfmes->pUrl); 1887 | // collect accessor functions for each attribute 1888 | foreach(lc, pJfmes->retrieved_attrs) 1889 | { 1890 | int attnum = lfirst_int(lc); 1891 | Form_pg_attribute attr = RelationGetDescr(rel)->attrs[attnum - 1]; 1892 | 1893 | Assert(!attr->attisdropped); 1894 | 1895 | getTypeOutputInfo(attr->atttypid, &typefnoid, &isvarlena); 1896 | fmgr_info(typefnoid, &pJfmes->p_flinfo[pJfmes->p_nums]); 1897 | pJfmes->p_nums++; 1898 | //ELog(DEBUG1, "%s:%d", __func__, __LINE__); 1899 | } 1900 | Assert(pJfmes->p_nums <= n_params); 1901 | } 1902 | 1903 | resultRelInfo->ri_FdwState = pJfmes; 1904 | } 1905 | } 1906 | 1907 | static int JsonPg2Json(StringInfo Str, Oid type, Datum value, const char *name, bool *isnull); 1908 | 1909 | static TupleTableSlot *JsonExecForeignInsert( 1910 | EState *estate, 1911 | ResultRelInfo *resultRelInfo, 1912 | TupleTableSlot *slot, 1913 | TupleTableSlot *planSlot 1914 | ) 1915 | { 1916 | jfmes_t *pJfmes = (jfmes_t *) resultRelInfo->ri_FdwState; 1917 | MemoryContext oldContext = MemoryContextSwitchTo(pJfmes->temp_cxt); 1918 | int nParams = list_length(pJfmes->retrieved_attrs); 1919 | 1920 | if(nParams == 
list_length(pJfmes->retrieved_names)) 1921 | { 1922 | bool *isnull = (bool*) palloc0(sizeof(bool) * nParams); 1923 | ListCell *lcAttrs = NULL; 1924 | ListCell *lcNames = list_head(pJfmes->retrieved_names); 1925 | StringInfoData str; 1926 | int paramNum = 0; 1927 | int paramCount = 0; 1928 | int ok = 0; 1929 | 1930 | // count the number of non-null attributes 1931 | foreach(lcAttrs, pJfmes->retrieved_attrs) 1932 | { 1933 | bool bIsNull = true; 1934 | slot_getattr(slot, lfirst_int(lcAttrs), &bIsNull); 1935 | paramCount += (!bIsNull); 1936 | } 1937 | 1938 | // build json object document string 1939 | initStringInfo(&str); 1940 | appendStringInfoString(&str, "{ "); 1941 | foreach(lcAttrs, pJfmes->retrieved_attrs) 1942 | { 1943 | int attnum = lfirst_int(lcAttrs) - 1; 1944 | Datum value = slot_getattr(slot, attnum + 1, &isnull[attnum]); 1945 | Oid type = slot->tts_tupleDescriptor->attrs[attnum]->atttypid; 1946 | 1947 | //ELog(DEBUG1, "%s:%d %u/%u %s %u", __func__, __LINE__, attnum, nParams, lfirst(lcNames), isnull[attnum]); 1948 | if(JsonPg2Json(&str, type, value, lfirst(lcNames), &isnull[attnum]) 1949 | // if not last attribute 1950 | && paramNum < paramCount - 1 1951 | && !isnull[attnum] 1952 | ) 1953 | { 1954 | appendStringInfoString(&str, ", "); 1955 | paramNum++; 1956 | } 1957 | 1958 | lcNames = lnext(lcNames); 1959 | } 1960 | appendStringInfoString(&str, " }"); 1961 | 1962 | // send the json object to the remote server 1963 | ok = curlPut(pJfmes->pUrl, str.data, strlen(str.data), "application/json"); 1964 | 1965 | //ELog(DEBUG1, "%s:%d '%s' --> %s %s", __func__, __LINE__, str.data, pJfmes->pUrl, ok ? 
"OK" : "FAIL"); 1966 | } 1967 | 1968 | MemoryContextSwitchTo(oldContext); 1969 | MemoryContextReset(pJfmes->temp_cxt); 1970 | 1971 | return slot; 1972 | } 1973 | 1974 | 1975 | /* 1976 | * 1977 | * An update operation consists of 1978 | * AddForeignUpdateTargets 1979 | * 1980 | * GetForeignRelSize 1981 | * GetForeignPaths 1982 | * GetForeignPlan 1983 | * 1984 | * PlanForeignModify 1985 | * BeginForeignScan 1986 | * BeginForeignModify 1987 | * EndForeignModify 1988 | * 1989 | * EndForeignScan 1990 | */ 1991 | 1992 | static void JsonAddForeignUpdateTargets(Query *parsetree, RangeTblEntry *target_rte, Relation target_relation) 1993 | { 1994 | // What we need is the rowid which is the first column 1995 | Form_pg_attribute attr = RelationGetDescr(target_relation)->attrs[0]; 1996 | // Make a Var representing the desired value 1997 | Var *var = makeVar(parsetree->resultRelation, 1, attr->atttypid, attr->atttypmod, InvalidOid, 0); 1998 | // Wrap it in a TLE with the right name ... 1999 | const char *attrname = NameStr(attr->attname); 2000 | 2001 | TargetEntry *tle = makeTargetEntry((Expr *) var, 2002 | list_length(parsetree->targetList) + 1, 2003 | pstrdup(attrname), 2004 | true 2005 | ); 2006 | 2007 | //ELog(DEBUG1, "%s:%d", __func__, __LINE__); 2008 | // ... 
and add it to the query's targetlist 2009 | parsetree->targetList = lappend(parsetree->targetList, tle); 2010 | } 2011 | 2012 | static TupleTableSlot * JsonExecForeignUpdate( 2013 | EState *estate, 2014 | ResultRelInfo *resultRelInfo, 2015 | TupleTableSlot *slot, 2016 | TupleTableSlot *planSlot 2017 | ) 2018 | { 2019 | jfmes_t *pJfmes = (jfmes_t *) resultRelInfo->ri_FdwState; 2020 | int nParams = list_length(pJfmes->retrieved_attrs); 2021 | 2022 | if(nParams == list_length(pJfmes->retrieved_names)) 2023 | { 2024 | bool *isnull = palloc0(sizeof(bool) * nParams); 2025 | ListCell *lcAttrs = NULL; 2026 | ListCell *lcNames = list_head(pJfmes->retrieved_names); 2027 | StringInfoData str; 2028 | int paramNum = 0; 2029 | int ok = 0; 2030 | 2031 | //ELog(DEBUG1, "%s:%d nparams %u", __func__, __LINE__, nParams); 2032 | 2033 | // build json object document string 2034 | initStringInfo(&str); 2035 | appendStringInfoString(&str, "{ "); 2036 | foreach(lcAttrs, pJfmes->retrieved_attrs) 2037 | { 2038 | Datum value = 0; 2039 | int attnum = lfirst_int(lcAttrs) - 1; 2040 | Oid type; 2041 | 2042 | type = slot->tts_tupleDescriptor->attrs[attnum]->atttypid; 2043 | value = slot_getattr(slot, attnum + 1, (bool*)(&isnull[attnum])); 2044 | 2045 | if(JsonPg2Json(&str, type, value, lfirst(lcNames), &isnull[attnum]) 2046 | // if not last attribute 2047 | && paramNum < nParams -1 2048 | && !isnull[attnum] 2049 | ) 2050 | { 2051 | appendStringInfoString(&str, ", "); 2052 | } 2053 | lcNames = lnext(lcNames); 2054 | paramNum ++; 2055 | } 2056 | appendStringInfoString(&str, " }"); 2057 | 2058 | // send the json object to the remote server 2059 | ok = curlPut(pJfmes->pUrl, str.data, strlen(str.data), "application/json"); 2060 | //ELog(DEBUG1, "%s:%d '%s' --> %s %s", __func__, __LINE__, str.data, pJfmes->pUrl, ok ? 
"OK" : "FAIL"); 2061 | } 2062 | 2063 | return slot; 2064 | } 2065 | 2066 | static void JsonEndForeignModify(EState *estate, ResultRelInfo *resultRelInfo) 2067 | { 2068 | jfmes_t *pJfmes = (jfmes_t *) resultRelInfo->ri_FdwState; 2069 | 2070 | //ELog(DEBUG1, "%s:%d", __func__, __LINE__); 2071 | } 2072 | 2073 | // Transmute a postgres text array into a json text array 2074 | enum 2075 | { 2076 | SM_UNQUOTED, 2077 | SM_QUOTED, 2078 | SM_NEEDQUOTE, 2079 | }; 2080 | 2081 | // Append to the "outStr" string, an array of data that is Text, 2082 | // while dealing with quoting conversion from sql to json 2083 | static void JsonPgTextArray2Json(StringInfo outStr, const char *inStr, int len) 2084 | { int state = SM_UNQUOTED; 2085 | int i; 2086 | 2087 | if(len) 2088 | appendStringInfoCharMacro(outStr, '['); 2089 | 2090 | for(i=0; ilen; 2139 | 2140 | if(!*isnull) 2141 | { 2142 | switch(type) 2143 | { 2144 | case INT2OID: appendStringInfo(str, "\"%s\": %u", name, (int16)DatumGetInt16(value)); break; 2145 | case INT4OID: appendStringInfo(str, "\"%s\": %u", name, (int32)DatumGetInt32(value)); break; 2146 | case INT8OID: appendStringInfo(str, "\"%s\": %lu", name, (int64)DatumGetInt64(value)); break; 2147 | case FLOAT4OID: appendStringInfo(str, "\"%s\": %f", name, (float4)DatumGetFloat4(value)); break; 2148 | case FLOAT8OID: appendStringInfo(str, "\"%s\": %f", name, (float8)DatumGetFloat8(value)); break; 2149 | 2150 | case NUMERICOID: 2151 | { Datum valueDatum = DirectFunctionCall1(numeric_float8, value); 2152 | 2153 | appendStringInfo(str, "\"%s\": %f", name, (float8)DatumGetFloat8(valueDatum)); 2154 | } 2155 | break; 2156 | 2157 | //case BOOLOID: appendStringInfo(str, "\"%s\": %s", name, (((int32)DatumGetInt32(value))) ? 
"true", "false"); break; 2158 | case BOOLOID: appendStringInfo(str, "\"%s\": %u", name, (int32)DatumGetInt32(value)); break; 2159 | 2160 | case BPCHAROID: 2161 | case VARCHAROID: 2162 | case TEXTOID: 2163 | case NAMEOID: 2164 | { char *outputString = NULL; 2165 | Oid outputFunctionId = InvalidOid; 2166 | bool typeVarLength = false; 2167 | 2168 | getTypeOutputInfo(type, &outputFunctionId, &typeVarLength); 2169 | outputString = OidOutputFunctionCall(outputFunctionId, value); 2170 | 2171 | appendStringInfo(str, "\"%s\": \"%s\"", name, outputString); 2172 | } 2173 | break; 2174 | 2175 | case DATEOID: 2176 | case TIMEOID: 2177 | case TIMESTAMPOID: 2178 | case TIMESTAMPTZOID: 2179 | { 2180 | int pgtz; 2181 | struct pg_tm pgtm; 2182 | fsec_t fsec; 2183 | const char *pgtzn; 2184 | struct tm tm; 2185 | char buffer [128]; 2186 | Timestamp valueTimestamp; 2187 | 2188 | // get pg time 2189 | if(type == DATEOID) 2190 | { Datum valueDatum = DirectFunctionCall1(date_timestamp, value); 2191 | 2192 | valueTimestamp = DatumGetTimestamp(valueDatum); 2193 | } 2194 | else 2195 | valueTimestamp = DatumGetTimestamp(value); 2196 | 2197 | // extract pg time 2198 | timestamp2tm(valueTimestamp, &pgtz, &pgtm, &fsec, &pgtzn, pg_tzset("UTC")); 2199 | 2200 | // map to unix time 2201 | tm.tm_sec = pgtm.tm_sec; 2202 | tm.tm_min = pgtm.tm_min; 2203 | tm.tm_hour = pgtm.tm_hour; 2204 | tm.tm_mday = pgtm.tm_mday; 2205 | tm.tm_mon = pgtm.tm_mon - 1; 2206 | tm.tm_year = pgtm.tm_year - 1900; 2207 | tm.tm_wday = pgtm.tm_wday; 2208 | tm.tm_yday = pgtm.tm_yday; 2209 | tm.tm_isdst = pgtm.tm_isdst; 2210 | tm.tm_gmtoff = pgtm.tm_gmtoff; 2211 | tm.tm_zone = (char *)pgtm.tm_zone; 2212 | 2213 | memset(buffer, 0, sizeof(buffer)); 2214 | // convert to string in ISO format 2215 | strftime(buffer, sizeof(buffer)-1, "%Y-%m-%d %H:%M:%S %Z", &tm); 2216 | 2217 | appendStringInfo(str, "\"%s\": \"%s\"", name, buffer); 2218 | } 2219 | break; 2220 | /* 2221 | case BITOID: 2222 | { 2223 | int32 dat; 2224 | int32 *bufptr = 
palloc0(sizeof(int32)); 2225 | char *outputString = NULL; 2226 | Oid outputFunctionId = InvalidOid; 2227 | bool typeVarLength = false; 2228 | getTypeOutputInfo(type, &outputFunctionId, &typeVarLength); 2229 | outputString = OidOutputFunctionCall(outputFunctionId, value); 2230 | 2231 | dat = bin_dec(atoi(outputString)); 2232 | memcpy(bufptr, (char*)&dat, sizeof(int32)); 2233 | binds[attnum].buffer = bufptr; 2234 | break; 2235 | } 2236 | */ 2237 | case INT4ARRAYOID: 2238 | case INT2ARRAYOID: 2239 | case FLOAT4ARRAYOID: 2240 | case TEXTARRAYOID: 2241 | { Oid outputFunctionId; 2242 | bool typeVarLength = false; 2243 | char *outputString = NULL; 2244 | int l; 2245 | 2246 | getTypeOutputInfo(type, &outputFunctionId, &typeVarLength); 2247 | outputString = OidOutputFunctionCall(outputFunctionId, value); 2248 | 2249 | // trim left and right curly braces 2250 | outputString++; 2251 | l = strlen(outputString) - 1; 2252 | 2253 | if(type != TEXTARRAYOID) 2254 | appendStringInfo(str, "\"%s\": [%*.*s]", name, l, l, outputString); 2255 | else 2256 | { 2257 | appendStringInfo(str, "\"%s\": ", name); 2258 | JsonPgTextArray2Json(str, outputString, l); 2259 | } 2260 | } 2261 | break; 2262 | 2263 | //case OIDARRAYOID: 2264 | default: 2265 | { 2266 | ereport(ERROR, (errcode(ERRCODE_FDW_INVALID_DATA_TYPE), 2267 | errmsg("cannot convert constant value to JSON value"), 2268 | errhint("Constant value data type: %u", type))); 2269 | break; 2270 | } 2271 | } 2272 | } 2273 | 2274 | return (str->len > oldLen); // we appended new characters 2275 | } 2276 | -------------------------------------------------------------------------------- /json_fdw.control: -------------------------------------------------------------------------------- 1 | # json_fdw extension 2 | comment = 'foreign-data wrapper for json file access' 3 | default_version = '1.0' 4 | module_pathname = '$libdir/json_fdw' 5 | relocatable = true 6 | -------------------------------------------------------------------------------- 
/json_fdw.h: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------- 2 | * 3 | * json_fdw.h 4 | * 5 | * Type and function declarations for JSON foreign data wrapper. 6 | * 7 | * Copyright (c) 2013, Citus Data, Inc. 8 | * 9 | * $Id$ 10 | * 11 | *------------------------------------------------------------------------- 12 | */ 13 | 14 | #ifndef JSON_FDW_H 15 | #define JSON_FDW_H 16 | 17 | #include "fmgr.h" 18 | #include "catalog/pg_foreign_server.h" 19 | #include "catalog/pg_foreign_table.h" 20 | #include "utils/hsearch.h" 21 | #include "nodes/pg_list.h" 22 | #include "utils/rel.h" 23 | 24 | #include "curlapi.h" 25 | 26 | 27 | /* Defines for valid option names and default values */ 28 | #define OPTION_NAME_FILENAME "filename" 29 | #define OPTION_NAME_MAX_ERROR_COUNT "max_error_count" 30 | #define DEFAULT_MAX_ERROR_COUNT 0 31 | 32 | #define OPTION_NAME_HTTP_POST_VARS "http_post_vars" 33 | #define OPTION_NAME_ROM_URL "rom_url" 34 | #define OPTION_NAME_ROM_PATH "rom_path" 35 | 36 | #define JSON_TUPLE_COST_MULTIPLIER 10 37 | #define ERROR_BUFFER_SIZE 1024 38 | #define READ_BUFFER_SIZE 4096 39 | #define GZIP_FILE_EXTENSION ".gz" 40 | #define HDFS_BLOCK_PREFIX "blk_" 41 | #define HDFS_BLOCK_PREFIX_LENGTH 4 42 | 43 | 44 | /* 45 | * JsonValidOption keeps an option name and a context. When an option is passed 46 | * into json_fdw objects (server and foreign table), we compare this option's 47 | * name and context against those of valid options. 48 | */ 49 | typedef struct JsonValidOption 50 | { 51 | const char *optionName; 52 | Oid optionContextId; 53 | 54 | } JsonValidOption; 55 | 56 | 57 | /* 58 | * JsonFdwOptions holds the option values to be used when reading and parsing 59 | * the json file. To resolve these values, we first check foreign table's 60 | * options, and if not present, we then fall back to the default values 61 | * specified above. 
62 | */ 63 | typedef struct JsonFdwOptions 64 | { 65 | char const *filename; 66 | int32 maxErrorCount; 67 | char const *pHttpPostVars; 68 | char const *pRomUrl; 69 | char const *pRomPath; 70 | } JsonFdwOptions; 71 | 72 | 73 | /* 74 | * JsonFdwExecState keeps foreign data wrapper specific execution state that we 75 | * create and hold onto when executing the query. 76 | */ 77 | typedef struct JsonFdwExecState 78 | { 79 | char const *filename; // on disk file name of json content 80 | FILE *filePointer; // file pointer to on disk content 81 | void *gzFilePointer; // gz file pointer to on disk content 82 | 83 | uint32 maxErrorCount; 84 | uint32 errorCount; 85 | uint32 currentLineNumber; 86 | HTAB *columnMappingHash; 87 | 88 | cfr_t *pCfr; // curl fetch result 89 | } JsonFdwExecState; 90 | 91 | typedef struct _jfmes_t 92 | { 93 | Relation rel; // relcache entry for the foreign table 94 | int p_nums; // number of parameters to transmit 95 | FmgrInfo *p_flinfo; // output conversion functions for them 96 | 97 | List *retrieved_attrs; // list of target attribute members 98 | List *retrieved_names; // list of target attribute names 99 | List *table_options; 100 | char const *pUrl; // put url 101 | 102 | MemoryContext temp_cxt; // context for per-tuple temp data 103 | 104 | } jfmes_t; // Json Fdw Modify Exec State Type 105 | 106 | 107 | /* 108 | * ColumnMapping represents a hash table entry that maps a column name to column 109 | * related information. We construct these hash table entries to speed up the 110 | * conversion from JSON documents to PostgreSQL tuples; and each hash entry maps 111 | * the column name to the column's tuple index and its type-related information.
112 | */ 113 | typedef struct ColumnMapping 114 | { 115 | char columnName[NAMEDATALEN]; 116 | uint32 columnIndex; 117 | Oid columnTypeId; 118 | int32 columnTypeMod; 119 | Oid columnArrayTypeId; 120 | 121 | } ColumnMapping; 122 | 123 | 124 | /* Function declarations for foreign data wrapper */ 125 | extern Datum json_fdw_handler(PG_FUNCTION_ARGS); 126 | extern Datum json_fdw_validator(PG_FUNCTION_ARGS); 127 | 128 | 129 | #endif /* JSON_FDW_H */ 130 | -------------------------------------------------------------------------------- /output/basic_tests.source: -------------------------------------------------------------------------------- 1 | -- 2 | -- Test json foreign data wrapper. 3 | -- 4 | -- Settings to make the result deterministic 5 | SET datestyle = "ISO, YMD"; 6 | -- Install json_fdw 7 | CREATE EXTENSION json_fdw; 8 | CREATE SERVER json_server FOREIGN DATA WRAPPER json_fdw; 9 | -- validator tests 10 | CREATE FOREIGN TABLE test_validator_filename_missing () 11 | SERVER json_server; -- ERROR 12 | ERROR: filename is required for json_fdw foreign tables 13 | CREATE FOREIGN TABLE test_validator_invalid_option () 14 | SERVER json_server 15 | OPTIONS(filename 'data.json', bad_option_name '1'); -- ERROR 16 | ERROR: invalid option "bad_option_name" 17 | HINT: Valid options in this context are: filename, max_error_count, hdfs_directory_path 18 | -- data conversion tests 19 | CREATE FOREIGN TABLE json_data (id int8, type char(20), name text, 20 | birthdate date, actions int[], "position.lat" float, "position.lon" float, 21 | "position.address.country" varchar(50), last_update timestamp, 22 | last_update_tz timestamp with time zone 23 | ) SERVER json_server OPTIONS(filename '@abs_srcdir@/data/data.json'); 24 | SELECT id, type, name FROM json_data ORDER BY id; 25 | id | type | name 26 | ----------------------+----------------------+-------------------- 27 | -9223372036854775808 | | 28 | 1 | person | Beatus Henk 29 | 2 | person | Lugos Alfons 30 | 3 | person | Temür Essa 
31 | 4 | resturaunt | Mingus Kitchen 32 | 5 | resturaunt | Café Utopia Lounge 33 | 6 | invalid_record | 34 | 9223372036854775807 | | 35 | (8 rows) 36 | 37 | SELECT id, name, birthdate FROM json_data WHERE type = 'person' ORDER BY id; 38 | id | name | birthdate 39 | ----+--------------+------------ 40 | 1 | Beatus Henk | 1973-06-24 41 | 2 | Lugos Alfons | 1961-08-30 42 | 3 | Temür Essa | 1995-07-28 43 | (3 rows) 44 | 45 | SELECT id, "position.lat" AS lat, "position.lon" AS lon, 46 | "position.address.country" AS country, last_update 47 | FROM json_data WHERE type = 'resturaunt' ORDER BY id; 48 | id | lat | lon | country | last_update 49 | ----+----------+-----------+-----------+--------------------- 50 | 4 | -48.3798 | -65.43274 | Argentina | 2013-01-02 12:05:01 51 | 5 | 42.97208 | 143.39097 | | 52 | (2 rows) 53 | 54 | SELECT id, type, birthdate, last_update, "position.lon" as lon 55 | FROM json_data WHERE type = 'invalid_record' ORDER BY id; 56 | id | type | birthdate | last_update | lon 57 | ----+----------------------+-----------+-------------+----- 58 | 6 | invalid_record | | | 59 | (1 row) 60 | 61 | SELECT last_update_tz AT TIME ZONE 'UTC' FROM json_data 62 | WHERE last_update_tz IS NOT NULL; 63 | timezone 64 | --------------------- 65 | 2013-01-02 17:05:01 66 | (1 row) 67 | 68 | -- max error count test 69 | CREATE FOREIGN TABLE test_skip_broken_on (a integer, b integer) 70 | SERVER json_server 71 | OPTIONS (filename '@abs_srcdir@/data/data_broken.json', max_error_count '1'); 72 | SELECT * FROM test_skip_broken_on ORDER BY a; 73 | a | b 74 | ---+--- 75 | 1 | 2 76 | 2 | 3 77 | 3 | 4 78 | (3 rows) 79 | 80 | CREATE FOREIGN TABLE test_skip_broken_off (a integer, b integer) 81 | SERVER json_server 82 | OPTIONS (filename '@abs_srcdir@/data/data_broken.json', max_error_count '0'); 83 | SELECT * FROM test_skip_broken_off; -- ERROR 84 | ERROR: could not parse 1 json objects 85 | HINT: Last error message at line: 4: parse error: premature EOF 86 | {"a": 3, 87 | (right 
here) ------^ 88 | 89 | -- error scenarios 90 | CREATE FOREIGN TABLE test_missing_file () SERVER json_server 91 | OPTIONS (filename '@abs_srcdir@/data/missing_file.json'); 92 | SELECT * FROM test_missing_file; -- ERROR 93 | ERROR: could not open file "@abs_srcdir@/data/missing_file.json" for reading: No such file or directory 94 | CREATE FOREIGN TABLE test_string_length_check (type CHAR(6)) SERVER json_server 95 | OPTIONS (filename '@abs_srcdir@/data/data.json'); 96 | SELECT * FROM test_string_length_check; -- ERROR 97 | ERROR: value too long for type character(6) 98 | CREATE FOREIGN TABLE test_int_range_check (id int4) SERVER json_server 99 | OPTIONS (filename '@abs_srcdir@/data/data.json'); 100 | SELECT * FROM test_int_range_check; -- ERROR 101 | ERROR: value "9223372036854775807" is out of range for type integer 102 | CREATE FOREIGN TABLE test_decimal_range_check ("position.lat" decimal(3, 2)) 103 | SERVER json_server OPTIONS (filename '@abs_srcdir@/data/data.json'); 104 | SELECT * FROM test_decimal_range_check; -- ERROR 105 | ERROR: numeric field overflow 106 | DETAIL: A field with precision 3, scale 2 must round to an absolute value less than 10^1. 107 | -------------------------------------------------------------------------------- /output/customer_reviews.source: -------------------------------------------------------------------------------- 1 | -- 2 | -- Test customer reviews dataset queries. 
3 | -- 4 | CREATE FOREIGN TABLE customer_reviews 5 | ( 6 | customer_id TEXT not null, 7 | "review.date" DATE not null, 8 | "review.rating" INTEGER not null, 9 | "review.votes" INTEGER, 10 | "review.helpful_votes" INTEGER, 11 | "product.id" CHAR(10) not null, 12 | "product.title" TEXT not null, 13 | "product.sales_rank" BIGINT, 14 | "product.group" TEXT, 15 | "product.category" TEXT, 16 | "product.subcategory" TEXT, 17 | similar_product_ids CHAR(10)[] 18 | ) 19 | SERVER json_server 20 | OPTIONS(filename '@abs_srcdir@/data/customer_reviews_1998.1000.json.gz'); 21 | -- How people rate your products? 22 | SELECT 23 | extract(month from "review.date") AS review_month, 24 | round(avg("review.rating"), 2), 25 | count(*) 26 | FROM 27 | customer_reviews 28 | GROUP BY 29 | review_month 30 | ORDER BY 31 | review_month; 32 | review_month | round | count 33 | --------------+-------+------- 34 | 1 | 4.48 | 224 35 | 2 | 4.42 | 149 36 | 6 | 4.50 | 2 37 | 7 | 4.63 | 71 38 | 8 | 4.61 | 75 39 | 9 | 4.57 | 101 40 | 10 | 4.42 | 130 41 | 11 | 4.59 | 143 42 | 12 | 4.54 | 105 43 | (9 rows) 44 | 45 | -- Do we have a correlation between a book's title's length and its review ratings? 46 | SELECT 47 | width_bucket(length("product.title"), 1, 50, 5) title_length_bucket, 48 | round(avg("review.rating"), 2) AS review_average, 49 | count(*) 50 | FROM 51 | customer_reviews 52 | WHERE 53 | "product.group" = 'Book' 54 | GROUP BY 55 | title_length_bucket 56 | ORDER BY 57 | title_length_bucket; 58 | title_length_bucket | review_average | count 59 | ---------------------+----------------+------- 60 | 1 | 4.50 | 135 61 | 2 | 4.48 | 364 62 | 3 | 4.53 | 190 63 | 4 | 4.52 | 151 64 | 5 | 4.60 | 99 65 | 6 | 4.62 | 55 66 | (6 rows) 67 | 68 | -- Does the average review rating change by product category? 
69 | SELECT 70 | "product.category", 71 | round(avg("review.rating"), 2), 72 | count(*) 73 | FROM 74 | customer_reviews 75 | GROUP BY 76 | "product.category" 77 | ORDER BY 78 | count(*) DESC, "product.category" 79 | LIMIT 20; 80 | product.category | round | count 81 | ---------------------------+-------+------- 82 | Science Fiction & Fantasy | 4.44 | 189 83 | Literature & Fiction | 4.62 | 149 84 | Mystery & Thrillers | 3.79 | 71 85 | Books on Tape | 4.75 | 65 86 | Children's Books | 4.49 | 65 87 | Nonfiction | 4.56 | 57 88 | Religion & Spirituality | 4.67 | 52 89 | Science | 4.47 | 36 90 | Health, Mind & Body | 4.97 | 30 91 | Computers & Internet | 4.50 | 26 92 | Horror | 4.32 | 25 93 | Business & Investing | 4.57 | 21 94 | Biographies & Memoirs | 4.55 | 20 95 | History | 4.45 | 20 96 | Teens | 4.26 | 19 97 | Entertainment | 4.89 | 18 98 | Home & Garden | 4.24 | 17 99 | Reference | 4.71 | 14 100 | Romance | 4.77 | 13 101 | Sports | 4.50 | 10 102 | (20 rows) 103 | 104 | -------------------------------------------------------------------------------- /output/hdfs_block.source: -------------------------------------------------------------------------------- 1 | -- 2 | -- Test customer reviews dataset which is stored as a HDFS block. 3 | -- 4 | CREATE FOREIGN TABLE customer_reviews_hdfs_block 5 | ( 6 | customer_id TEXT not null, 7 | "review.date" DATE not null, 8 | "review.rating" INTEGER not null, 9 | "review.votes" INTEGER, 10 | "review.helpful_votes" INTEGER, 11 | "product.id" CHAR(10) not null, 12 | "product.title" TEXT not null, 13 | "product.sales_rank" BIGINT, 14 | "product.group" TEXT, 15 | "product.category" TEXT, 16 | "product.subcategory" TEXT, 17 | similar_product_ids CHAR(10)[] 18 | ) 19 | SERVER json_server 20 | OPTIONS(filename '@abs_srcdir@/data/blk_-729487577044220672', 21 | max_error_count '2'); 22 | -- Does the average review rating change by product category? 
23 | SELECT 24 | "product.category", 25 | round(avg("review.rating"), 2), 26 | count(*) 27 | FROM 28 | customer_reviews_hdfs_block 29 | GROUP BY 30 | "product.category" 31 | ORDER BY 32 | count(*) DESC, "product.category" 33 | LIMIT 20; 34 | product.category | round | count 35 | ---------------------------+-------+------- 36 | Science Fiction & Fantasy | 4.44 | 189 37 | Literature & Fiction | 4.62 | 149 38 | Mystery & Thrillers | 3.79 | 71 39 | Books on Tape | 4.75 | 65 40 | Children's Books | 4.49 | 65 41 | Nonfiction | 4.56 | 57 42 | Religion & Spirituality | 4.67 | 52 43 | Science | 4.47 | 36 44 | Health, Mind & Body | 4.97 | 30 45 | Computers & Internet | 4.50 | 26 46 | Horror | 4.32 | 25 47 | Business & Investing | 4.57 | 21 48 | Biographies & Memoirs | 4.55 | 20 49 | History | 4.45 | 20 50 | Teens | 4.26 | 19 51 | Entertainment | 4.89 | 18 52 | Home & Garden | 4.24 | 17 53 | Reference | 4.71 | 14 54 | Romance | 4.77 | 13 55 | Sports | 4.50 | 10 56 | (20 rows) 57 | 58 | -------------------------------------------------------------------------------- /output/invalid_gz_file.source: -------------------------------------------------------------------------------- 1 | -- 2 | -- Test that we handle invalid gzip files properly. 
3 | -- 4 | \set VERBOSITY terse 5 | CREATE FOREIGN TABLE invalid_gz_file_table 6 | ( 7 | customer_id TEXT not null, 8 | "review.date" DATE not null, 9 | "review.rating" INTEGER not null, 10 | "review.votes" INTEGER, 11 | "review.helpful_votes" INTEGER, 12 | "product.id" CHAR(10) not null, 13 | "product.title" TEXT not null, 14 | "product.sales_rank" BIGINT, 15 | "product.group" TEXT, 16 | "product.category" TEXT, 17 | "product.subcategory" TEXT, 18 | similar_product_ids CHAR(10)[] 19 | ) 20 | SERVER json_server 21 | OPTIONS(filename '@abs_srcdir@/data/invalid_gz_file.json.gz'); 22 | select count(*) from invalid_gz_file_table; 23 | ERROR: could not read from json file 24 | \set VERBOSITY default 25 | -------------------------------------------------------------------------------- /rciapi.c: -------------------------------------------------------------------------------- 1 | /*--------------------------------------------------------------------* 2 | * 3 | * Developed by; 4 | * Neal Horman - http://www.wanlink.com 5 | * Copyright (c) 2015 Neal Horman. All Rights Reserved 6 | * 7 | * This "source code" is free software: you can redistribute it and/or modify 8 | * it under the terms of the GNU General Public License as published by 9 | * the Free Software Foundation, either version 3 of the License, or 10 | * (at your option) any later version. 11 | * 12 | * This "source code" is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU General Public License for more details. 16 | * 17 | * You should have received a copy of the GNU General Public License 18 | * along with this "source code". If not, see . 
19 | * 20 | * RCSID: $Id$ 21 | * 22 | *--------------------------------------------------------------------*/ 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | #include 30 | #include 31 | 32 | // same as in json_fdw.h 33 | #define ERROR_BUFFER_SIZE 1024 34 | #define READ_BUFFER_SIZE 4096 35 | 36 | #include "curlapi.h" 37 | #include "regexapi.h" 38 | #include "regexapi_helper.h" 39 | #include "rciapi.h" 40 | 41 | static yajl_val romRootFetch(char const *pRomUrl, char const *pRomPath) 42 | { yajl_val root = NULL; 43 | 44 | if(pRomUrl != NULL && pRomPath != NULL && *pRomUrl && *pRomPath) 45 | { cfr_t *pCfr = curlFetchFile(pRomUrl, NULL); 46 | 47 | if(pCfr != NULL && pCfr->bFileFetched) 48 | { FILE *fin = fopen(pCfr->ccf.pFileName, "r"); 49 | 50 | if(fin != NULL) 51 | { char errorBuffer[ERROR_BUFFER_SIZE]; 52 | 53 | root = yajl_tree_parse_file(fin, READ_BUFFER_SIZE, NULL, errorBuffer, sizeof(errorBuffer)); 54 | fclose(fin); 55 | } 56 | 57 | // must be an object with schema 2, 58 | // else, not the rom we are looking for 59 | if( 60 | !YAJL_IS_OBJECT(root) 61 | || atoi( ytp_get(root, "romschema", NULL)) != 2 62 | ) 63 | { 64 | // free and null, if failure 65 | yajl_tree_free(root); 66 | root = NULL; 67 | } 68 | } 69 | curlCfrFree(pCfr); 70 | } 71 | 72 | return root; 73 | } 74 | 75 | void rciFree(rci_t *pRci) 76 | { 77 | if(pRci != NULL) 78 | { 79 | if(pRci->pUrl != NULL) 80 | free(pRci->pUrl); 81 | if(pRci->pQuery != NULL) 82 | free(pRci->pQuery); 83 | if(pRci->pAction != NULL) 84 | free(pRci->pAction); 85 | if(pRci->romRoot != NULL) 86 | yajl_tree_free(pRci->romRoot); 87 | free(pRci); 88 | } 89 | } 90 | 91 | // strcat but dst is realloc'd to add src 92 | static char *strcatr(char *dst, char const *src) 93 | { 94 | if(src != NULL && *src) 95 | dst = strcat(realloc(dst, (dst != NULL ? 
strlen(dst) : 0) + strlen(src) + 1), src); 96 | 97 | return dst; 98 | } 99 | 100 | // special case strcatr, don't concat "/blah/" and "/" into "/blah//" 101 | static char *strcatrurl(char *dst, char const *src) 102 | { 103 | // won't be "/blah//" ? 104 | if( !(dst != NULL && dst[strlen(dst)-1] == '/' && src != NULL && *src == '/' && src[1] == 0)) 105 | dst = strcatr(dst, src); // so concat the two 106 | 107 | return dst; 108 | } 109 | 110 | rci_t *rciFetch(char const *pRomUrl, char const *pRomPath, int action) 111 | { rci_t *pRci = calloc(1, sizeof(rci_t)); 112 | 113 | if(pRci != NULL) 114 | { 115 | pRci->romRoot = romRootFetch(pRomUrl, pRomPath); 116 | // the schema has already been validated 117 | if(pRci->romRoot != NULL && action != RCI_ACTION_NONE) 118 | { 119 | yajl_val rootTable = ytp_get(pRci->romRoot, pRomPath, NULL); 120 | yajl_val rootQuery; 121 | 122 | pRci->pAction = strdup(action == RCI_ACTION_INSERT ? "insert" 123 | : action == RCI_ACTION_UPDATE ? "update" 124 | : action == RCI_ACTION_DELETE ? "delete" 125 | : "select" 126 | ); 127 | 128 | pRci->romRootAction = ytp_get(rootTable, pRci->pAction, NULL); 129 | pRci->pMethod = ytp_get(pRci->romRootAction, "method", NULL); 130 | pRci->pUrl = strcatr(NULL, ytp_GetPath(pRci->romRoot, "$.host")); 131 | 132 | rootQuery = ytp_get(pRci->romRootAction, "query"); 133 | 134 | // If no host specified in ROM, use the 135 | // host specification of the ROM url 136 | if(pRci->pUrl == NULL || !*pRci->pUrl) 137 | { // split the ROM url into pieces 138 | regexapi_t *pRat = regexapi_url(pRomUrl); 139 | 140 | // use the pieces ? 
141 | if(pRat != NULL) 142 | { int regexNSubs = regexapi_nsubs(pRat, 0); 143 | 144 | if(regexNSubs >= 2) 145 | { 146 | if(pRci->pUrl != NULL) 147 | free(pRci->pUrl); 148 | asprintf(&pRci->pUrl, "%s://%s" 149 | , regexapi_sub(pRat, 0, 0) // protocol specification 150 | , regexapi_sub(pRat, 0, 1) // host specification 151 | ); 152 | } 153 | regexapi_free(pRat); 154 | } 155 | } 156 | 157 | // concat / build the url based on the path selected 158 | pRci->pUrl = strcatrurl(pRci->pUrl, ytp_GetPath(pRci->romRoot, "$.url")); 159 | pRci->pUrl = strcatrurl(pRci->pUrl, ytp_get(rootTable, "url", NULL)); 160 | pRci->pUrl = strcatrurl(pRci->pUrl, ytp_GetPath(pRci->romRootAction, "$.url")); 161 | 162 | // use the query array objects to build a set 163 | // of url named parameters with values ? 164 | if(YAJL_IS_ARRAY(rootQuery)) 165 | { int i,q,first=1; 166 | char const *pStrName; 167 | char const *pStrValue; 168 | 169 | // each query object 170 | for(i=0,q=rootQuery->u.array.len; ipUrl = strcatr(pRci->pUrl, "?"); 187 | first = 0; 188 | } 189 | else 190 | pRci->pUrl = strcatr(pRci->pUrl, "&"); 191 | 192 | pRci->pUrl = strcatr(pRci->pUrl, pStrName); 193 | pRci->pUrl = strcatr(pRci->pUrl, "="); 194 | pRci->pUrl = strcatr(pRci->pUrl, pStrValue); 195 | } 196 | } 197 | } 198 | } 199 | else if(pRci->romRoot == NULL) 200 | { 201 | rciFree(pRci); 202 | pRci = NULL; 203 | } 204 | } 205 | 206 | return pRci; 207 | } 208 | 209 | 210 | #ifdef _UNIT_TEST_RCI 211 | 212 | void test1(int argc, char **argv) 213 | { 214 | char const *pRomUrl = "http://127.0.0.1:9734/files/rom.json"; 215 | char const *pRomPath = "devicestate"; 216 | rci_t *pRci = NULL; 217 | 218 | pRci = rciFetch(pRomUrl, pRomPath, RCI_ACTION_SELECT); 219 | 220 | if(pRci != NULL) 221 | printf("url '%s' method '%s'\n", pRci->pUrl, pRci->pMethod); 222 | else 223 | printf("rciFetch failed\n"); 224 | 225 | rciFree(pRci); 226 | } 227 | 228 | int main(int argc, char **argv) 229 | { 230 | test1(argc, argv); 231 | 232 | return 0; 233 | } 234 
| #endif 235 | -------------------------------------------------------------------------------- /rciapi.h: -------------------------------------------------------------------------------- 1 | /*--------------------------------------------------------------------* 2 | * 3 | * Developed by; 4 | * Neal Horman - http://www.wanlink.com 5 | * Copyright (c) 2015 Neal Horman. All Rights Reserved 6 | * 7 | * This "source code" is free software: you can redistribute it and/or modify 8 | * it under the terms of the GNU General Public License as published by 9 | * the Free Software Foundation, either version 3 of the License, or 10 | * (at your option) any later version. 11 | * 12 | * This "source code" is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU General Public License for more details. 16 | * 17 | * You should have received a copy of the GNU General Public License 18 | * along with this "source code". If not, see . 19 | * 20 | * RCSID: $Id$ 21 | * 22 | *--------------------------------------------------------------------*/ 23 | 24 | #ifndef _RCIAPI_H_ 25 | #define _RCIAPI_H_ 26 | 27 | /* 28 | * Remote Operations Map - is inspired by, but not Swagger 29 | * 30 | * It specifies how to do sql type operations on remote 31 | * json objects via a server based api. 32 | * 33 | * The ROM is itself a json object. 34 | * 35 | * This is a codified helper interface to that ROM 36 | * 37 | * It is expected that the ROM resides remotely, and may be 38 | * cached locally in json form, but is read in, and retained 39 | * in memory for later use.
40 | * 41 | */ 42 | 43 | /* An example ROM, supporting select and insert operations 44 | * of a local table schema of at least; 45 | * create foreign table sometable 46 | * (t integer, st integer, id integer, data integer[]) 47 | * server json_server 48 | * options 49 | * (rom_url 'http://server.example.com/rom.json', rom_path 'devicestate') 50 | * where the remote data could be at least '{ "t":3, "st":2, "id":4, "data":[ 1, 2, 3] }' 51 | 52 | { 53 | "romschema": "2", 54 | "host": "", 55 | "url": "/omsgsql", 56 | 57 | "devicestate": 58 | { 59 | "url": "/devices", 60 | "select":{ 61 | "method": "get", 62 | "url": "/", 63 | "query": [ { "name":"st", "type":"integer"}, { "name":"id", "type":"integer"} ] 64 | }, 65 | "insert":{ 66 | "method": "put", 67 | "url": "/", 68 | "query": [ { "name":"st", "type":"integer"}, { "name":"id", "type":"integer"}, {"name":"data", "type":"integer[]"} ] 69 | }, 70 | "delete":{ "method": "", "url": "", "schema": [ ] }, 71 | "update":{ "method": "", "url": "", "schema": [ ] } 72 | } 73 | } 74 | 75 | */ 76 | 77 | #include 78 | 79 | typedef struct _rci_t 80 | { 81 | char *pUrl; // must be free()'d 82 | char *pQuery; // must be free()'d 83 | char const *pMethod; 84 | char const *pAction; // must be free()'d 85 | yajl_val romRoot; // must be yajl_tree_free()'d 86 | yajl_val romRootAction; // do not yajl_tree_free(), is subnode of romRoot 87 | } rci_t; // Rom Context Info Type; 88 | 89 | enum { RCI_ACTION_NONE, RCI_ACTION_SELECT, RCI_ACTION_INSERT, RCI_ACTION_UPDATE, RCI_ACTION_DELETE }; 90 | 91 | void rciFree(rci_t *pRci); 92 | rci_t *rciFetch(char const *pRomUrl, char const *pRomPath, int action); 93 | 94 | #endif 95 | -------------------------------------------------------------------------------- /regexapi.c: -------------------------------------------------------------------------------- 1 | /*--------------------------------------------------------------------* 2 | * 3 | * This "source code" is part of Spamilter - http://www.spamilter.org 4 | *
Additionally, this "source code" is herby, also incorporated as part json_fdw 5 | * 6 | * Developed by; 7 | * Neal Horman - http://www.wanlink.com 8 | * Copyright (c) 2010-2015 Neal Horman. All Rights Reserved 9 | * 10 | * Redistribution and use in source and binary forms, with or without 11 | * modification, are permitted provided that the following conditions 12 | * are met; 13 | * 14 | * 1. Redistributions of source code must retain the above copyright 15 | * notice, this list of conditions and the following disclaimer. 16 | * 2. Redistributions in binary form must reproduce the above copyright 17 | * notice, this list of conditions and the following disclaimer in the 18 | * documentation and/or other materials provided with the distribution. 19 | * 3. All advertising materials mentioning features or use of this software 20 | * must display the following acknowledgement: 21 | * This product includes software developed by Neal Horman. 22 | * 4. Neither the name Neal Horman nor the names of any contributors 23 | * may be used to endorse or promote products derived from this software 24 | * without specific prior written permission. 25 | * 26 | * THIS SOFTWARE IS PROVIDED BY NEAL HORMAN AND ANY CONTRIBUTORS ``AS IS'' AND 27 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 | * ARE DISCLAIMED. IN NO EVENT SHALL NEAL HORMAN OR ANY CONTRIBUTORS BE LIABLE 30 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 | * SUCH DAMAGE. 
37 | * 38 | * Alternately; 39 | * 40 | * This "source code" is free software: you can redistribute it and/or modify 41 | * it under the terms of the GNU General Public License as published by 42 | * the Free Software Foundation, either version 3 of the License, or 43 | * (at your option) any later version. 44 | * 45 | * This "source code" is distributed in the hope that it will be useful, 46 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 47 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 48 | * GNU General Public License for more details. 49 | * 50 | * You should have received a copy of the GNU General Public License 51 | * along with this "source code". If not, see . 52 | * 53 | * RCSID: $Id$ 54 | * 55 | *--------------------------------------------------------------------*/ 56 | 57 | static char const rcsid[] = "@(#)$Id$"; 58 | 59 | #include 60 | #include 61 | #include 62 | #include 63 | 64 | #define _IS_REGEXAPI_ 65 | #include "regexapi.h" 66 | 67 | void regexapi_free(regexapi_t *prat) 68 | { 69 | if(prat != NULL) 70 | { unsigned int i,j; 71 | 72 | for(i=0; imatches; i++) 73 | { regexapimatch_t *pmatch = prat->pmatches+i; 74 | 75 | if(pmatch->nsubs && pmatch->ppsubs != NULL) 76 | { 77 | for(j=0; jnsubs; j++) 78 | free(*(pmatch->ppsubs+j)); 79 | free(pmatch->ppsubs); 80 | } 81 | } 82 | if(prat->matches && prat->pmatches != NULL) 83 | free(prat->pmatches); 84 | regfree(&prat->re); 85 | if(prat->preerr != NULL) 86 | free(prat->preerr); 87 | free(prat); 88 | } 89 | } 90 | 91 | const char *regexapi_sub(regexapi_t *prat, size_t match, size_t nsub) 92 | { 93 | return (prat != NULL && match < prat->matches && nsub <= (prat->pmatches+match)->nsubs ? *((prat->pmatches+match)->ppsubs+nsub) : NULL); 94 | } 95 | 96 | int regexapi_nsubs(regexapi_t *prat, size_t match) 97 | { 98 | return (prat != NULL && match < prat->matches ? 
(prat->pmatches+match)->nsubs : 0); 99 | } 100 | 101 | int regexapi_matches(regexapi_t *prat) 102 | { 103 | return (prat != NULL ? prat->matches : 0); 104 | } 105 | 106 | int regexapi_err(regexapi_t *prat) 107 | { 108 | return (prat != NULL ? prat->rerc : 0); 109 | } 110 | 111 | const char *regexapi_errStr(regexapi_t *prat) 112 | { 113 | return (prat != NULL && prat->preerr != NULL ? prat->preerr : ""); 114 | } 115 | 116 | static void regexapi_buildErrStr(regexapi_t *prat) 117 | { 118 | if(prat != NULL) 119 | { char errbuf[1024]; 120 | 121 | memset(&errbuf,0,sizeof(errbuf)); 122 | regerror(prat->rerc,&prat->re,errbuf,sizeof(errbuf)); 123 | prat->preerr = strdup(errbuf); 124 | } 125 | } 126 | 127 | regexapi_t *regexapi_exec(const char *pstr, const char *pregex, unsigned int cflags, unsigned int findCount) 128 | { regexapi_t *prat = calloc(sizeof(regexapi_t),1); 129 | 130 | #ifdef _REGEX_DEBUG 131 | printf("%s:%d - pstr '%s' pregex '%s' cflags 0x%04X findCount %u\n" , __func__, __LINE__ , pstr, pregex, cflags, findCount); 132 | #endif 133 | if(prat != NULL) 134 | { 135 | prat->rerc = regcomp(&prat->re,pregex,cflags); 136 | 137 | #ifdef _REGEX_DEBUG 138 | if(prat->rerc == 0) 139 | printf("%s:%d - regcomp() = %d, nsub = %d\n", __func__, __LINE__, prat->rerc, prat->re.re_nsub); 140 | #endif 141 | if(prat->rerc == 0) 142 | { size_t i; 143 | char *pdst = NULL; 144 | regmatch_t *presubs = (regmatch_t *)calloc(sizeof(regmatch_t),prat->re.re_nsub+1); 145 | regexapimatch_t *pmatch = NULL; 146 | size_t last = 0; 147 | 148 | // don't allow iteration for more subs than actually exist 149 | if(prat->re.re_nsub < findCount) 150 | findCount = prat->re.re_nsub; 151 | 152 | while((prat->rerc = regexec(&prat->re,pstr+last,prat->re.re_nsub+1,presubs,0)) == 0 && findCount != 0) 153 | { 154 | findCount --; 155 | prat->matches ++; 156 | prat->pmatches = realloc(prat->pmatches,sizeof(regexapimatch_t)*prat->matches); 157 | pmatch = prat->pmatches+(prat->matches-1); 158 | #ifdef _REGEX_DEBUG 
159 | printf("%s:%d - regexec() = %d\n", __func__, __LINE__, prat->rerc); 160 | #endif 161 | pmatch->nsubs = 0; 162 | pmatch->ppsubs = (char **)calloc(1,sizeof(char *)*prat->re.re_nsub); 163 | 164 | if(pmatch->ppsubs != NULL) 165 | { 166 | for(i=1; ire.re_nsub+1; i++) 167 | { size_t so = (presubs+i)->rm_so + last; 168 | size_t eo = (presubs+i)->rm_eo + last; 169 | size_t qo = (eo - so); 170 | 171 | pmatch->nsubs++; 172 | pdst = *(pmatch->ppsubs+(i-1)) = (char *)calloc(qo+1,1); 173 | strncpy(pdst,(pstr+so),qo); 174 | *(pdst+qo) = 0; 175 | 176 | if(i == prat->re.re_nsub) 177 | last = eo; 178 | #ifdef _REGEX_DEBUG 179 | printf("%s:%d - sub %d: so %d eo %d qo %d - '%*.*s'\n", __func__, __LINE__, i, so, eo, qo, qo, qo, pdst); 180 | #endif 181 | } 182 | } 183 | } 184 | 185 | if(presubs != NULL) 186 | free(presubs); 187 | 188 | if(prat->matches > 0 && prat->rerc == 1) 189 | prat->rerc = 0; 190 | } 191 | } 192 | 193 | if(prat != NULL && prat->rerc) 194 | { 195 | regexapi_buildErrStr(prat); 196 | #ifdef _REGEX_DEBUG 197 | printf("regex error: %d/'%s'\n",prat->rerc,regexapi_errStr(prat)); 198 | #endif 199 | } 200 | 201 | return prat; 202 | } 203 | 204 | int regexapi(const char *pstr, const char *pregex, int cflags) 205 | { regexapi_t *prat = regexapi_exec(pstr,pregex,cflags,1); 206 | int rc = regexapi_matches(prat) != 0; 207 | 208 | if(prat != NULL) 209 | regexapi_free(prat); 210 | 211 | return rc; 212 | } 213 | 214 | 215 | #ifdef _REGEX_UNIT_TEST 216 | int main(int argc, char **argv) 217 | { 218 | if(argc == 3) 219 | { int i = 1; 220 | regexapi_t *prat = regexapi_exec(argv[i],argv[i+1],REGEX_DEFAULT_CFLAGS,REGEX_FIND_ALL); 221 | 222 | printf("%s: '%s' %c= '%s'\n", argv[0], argv[i], (regexapi_matches(prat) ? '=' : '!'), argv[i+1]); 223 | if(regexapi_matches(prat)) 224 | { int q; 225 | 226 | for(i=0,q=regexapi_nsubs(prat,0); i. 
#ifndef _REGEXAPI_H_
#define _REGEXAPI_H_

#ifdef __cplusplus
extern "C" {
#endif

#include <regex.h>

// Default regcomp() flags used by callers of regexapi_exec()
#define REGEX_DEFAULT_CFLAGS ( REG_EXTENDED | REG_ICASE )
// findCount value meaning "as many matches as possible". Parenthesized and
// unsigned so it matches the `unsigned int findCount` parameter and cannot
// be altered by operator precedence at the point of use.
#define REGEX_FIND_ALL (~0u)

#ifdef _IS_REGEXAPI_
// Captured sub-expressions of a single match
typedef struct _regexapimatch_t
{
	size_t nsubs;	// number of entries in ppsubs
	char **ppsubs;	// heap-allocated copies of the captured substrings
}regexapimatch_t;

// Result of regexapi_exec(); only regexapi.c sees the layout
typedef struct _regexapi_t
{
	regex_t re;	// compiled expression
	int rerc;	// last regcomp()/regexec() status
	char *preerr;	// regerror() text for rerc, or NULL

	unsigned int matches;	// number of entries in pmatches
	regexapimatch_t *pmatches;
}regexapi_t;
#else
// opaque outside of regexapi.c; use the accessors below
typedef struct _regexapi_t regexapi_t;
#endif

void regexapi_free(regexapi_t *prat);
const char *regexapi_sub(regexapi_t *prat, size_t match, size_t nsub);
int regexapi_nsubs(regexapi_t *prat, size_t match);
int regexapi_matches(regexapi_t *prat);
int regexapi_err(regexapi_t *prat);
const char *regexapi_errStr(regexapi_t *prat);
regexapi_t *regexapi_exec(const char *pstr, const char *pregex, unsigned int cflags, unsigned int findCount);

// for simplicity
int regexapi(const char *pstr, const char *pregex, int cflags);

#ifdef __cplusplus
}
#endif

#endif
All Rights Reserved 6 | * 7 | * This "source code" is free software: you can redistribute it and/or modify 8 | * it under the terms of the GNU General Public License as published by 9 | * the Free Software Foundation, either version 3 of the License, or 10 | * (at your option) any later version. 11 | * 12 | * This "source code" is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU General Public License for more details. 16 | * 17 | * You should have received a copy of the GNU General Public License 18 | * along with this "source code". If not, see . 19 | * 20 | * RCSID: $Id$ 21 | * 22 | *--------------------------------------------------------------------*/ 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | #include "regexapi.h" 29 | #include "regexapi_helper.h" 30 | 31 | // URL validation support 32 | typedef struct _regexapilist_t 33 | { 34 | const char *pattern; 35 | int flags; 36 | int findCount; 37 | }regexapilist_t; 38 | 39 | #define URLHOSTNAME "([a-z0-9][a-z0-9._-]*[.][a-z]{2,})" 40 | #define URLHOSTIPV4 "([0-9]{1,3}[.][0-9]{1,3}[.][0.9]{1,3}[.][0-9]{1,3})" 41 | #define URLHOSTLOCAL "(localhost)" 42 | #define URLHOST "(" URLHOSTNAME "|" URLHOSTLOCAL "|" URLHOSTIPV4 ")" 43 | #define URLPORT "(:[0-9]+)*" 44 | #define URLSPEC URLHOST URLPORT 45 | #define URISPEC "/.*" 46 | 47 | // http[s]?://([a-z0-9][a-z0-9._-]*[.][a-z]{2,}(:[0-9]+)*)(.*) 48 | // http[s]?://((([a-z0-9][a-z0-9._-]*[.][a-z]{2,})|(localhost)|([0-9]{1,3}[.][0-9]{1,3}[.][0.9]{1,3}[.][0-9]{1,3}))(:[0-9]+)*)(/.{0,}) 49 | 50 | // List of valid URL regexes that CURL supports 51 | static regexapilist_t const regexUrls[] = 52 | { 53 | { "(http[s]?)://(" URLSPEC ")(" URISPEC ")", ( REG_EXTENDED | REG_ICASE ), 2 }, 54 | { NULL, 0, 0 }, 55 | }; 56 | 57 | // Supported URL regex validation iterator 58 | static regexapi_t *regexapi_exec_list(const char 
*subject, regexapilist_t const *pRegexList) 59 | { regexapi_t *pRat = NULL; 60 | 61 | while(pRat == NULL && pRegexList->pattern != NULL) 62 | pRat = regexapi_exec(subject, pRegexList->pattern, pRegexList->flags, pRegexList->findCount); 63 | 64 | return pRat; 65 | } 66 | 67 | regexapi_t *regexapi_url(char const *subject) 68 | { 69 | return regexapi_exec_list(subject, regexUrls); 70 | } 71 | -------------------------------------------------------------------------------- /regexapi_helper.h: -------------------------------------------------------------------------------- 1 | /*--------------------------------------------------------------------* 2 | * 3 | * Developed by; 4 | * Neal Horman - http://www.wanlink.com 5 | * Copyright (c) 2015 Neal Horman. All Rights Reserved 6 | * 7 | * This "source code" is free software: you can redistribute it and/or modify 8 | * it under the terms of the GNU General Public License as published by 9 | * the Free Software Foundation, either version 3 of the License, or 10 | * (at your option) any later version. 11 | * 12 | * This "source code" is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU General Public License for more details. 16 | * 17 | * You should have received a copy of the GNU General Public License 18 | * along with this "source code". If not, see . 
19 | * 20 | * RCSID: $Id$ 21 | * 22 | *--------------------------------------------------------------------*/ 23 | 24 | #ifndef _REGEXAPI_HELPER_H_ 25 | #define _REGEXAPI_HELPER_H_ 26 | 27 | regexapi_t *regexapi_url(char const *subject); 28 | #endif 29 | -------------------------------------------------------------------------------- /sql/.gitignore: -------------------------------------------------------------------------------- 1 | # This directory will be populated when testing from input directory 2 | # Ignore everything in this directory 3 | * 4 | # Except this file 5 | !.gitignore 6 | --------------------------------------------------------------------------------