├── README.md ├── cloudfs_fdw ├── __init__.py └── cloudfs_fdw.py └── setup.py
/README.md:
--------------------------------------------------------------------------------
# cloudfs_fdw

A foreign data wrapper for accessing CSV, JSON, EXCEL and ODF files on cloud filesystems.

## Installation

`CREATE EXTENSION multicorn;`

`CREATE SERVER <server> FOREIGN DATA WRAPPER multicorn OPTIONS (wrapper 'cloudfs_fdw.cloudfs_fdw.cloudfs_fdw');`

`CREATE USER MAPPING FOR <user> SERVER <server>;`

### Create table

`CREATE FOREIGN TABLE <schema>.<table> (...) SERVER <server> OPTIONS (<options>);`

See [this](https://www.postgresql.org/docs/11/ddl-foreign-data.html) for general information on how SQL/MED in PostgreSQL works, and [this](https://www.postgresql.org/docs/11/sql-createforeigntable.html) for the CREATE FOREIGN TABLE syntax.

### Available Options

#### S3 compatible

* source 's3'
* bucket <bucket>
* filepath <filename>
* host <host>, default: localhost
* port <port>, default: 443
* region <region>
* aws_access_key <access_key>
* aws_secret_key <secret_key>

#### HTTP / HTTPS

* source 'http/https'
* url <URL>

#### Filesystem

* source 'file'
* filepath <filepath>

#### JSON

* format 'json'
* json_path <json_path>

#### CSV

* format 'csv'
* header <true/false>
* delimiter <delimiter>, default: ,
* quote_char <quote_char>, default: "

#### EXCEL (.xls, .xlsx)

* format 'xls' / 'xlsx'
* header <true/false>
* sheet <sheet_name>, default: first sheet

#### ODF (.ods)

* format 'odf'
* header <true/false>
* sheet <sheet_name>, default: first sheet

#### Compression

'.gz' and '.zip' compressed files are decompressed automagically.
69 | -------------------------------------------------------------------------------- /cloudfs_fdw/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ergo70/cloudfs_fdw/828fe65c91f709beaca4a375296451fcca288206/cloudfs_fdw/__init__.py -------------------------------------------------------------------------------- /cloudfs_fdw/cloudfs_fdw.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import smart_open 3 | import ijson 4 | import pandas 5 | from multicorn import ForeignDataWrapper 6 | from multicorn.utils import log_to_postgres, ERROR, WARNING, DEBUG 7 | 8 | 9 | class cloudfs_fdw(ForeignDataWrapper): 10 | """A foreign data wrapper for accessing CSV, JSON, EXCEL and ODF files on cloud filesystems. 11 | 12 | Valid options: 13 | - source : the data source (S3, HTTP/S, file, FTP, SCP, HDFS) 14 | Default: "S3" 15 | - format : the file format (CSV, JSON, EXCEL, ODF, plain or compressed) 16 | Default : CSV 17 | - host : hostname 18 | - port : port 19 | - region : the AWS region (S3 only) 20 | - aws_access_key : AWS access keys (S3 only) 21 | - aws_secret_key : AWS secret keys (S3 only) 22 | - bucket : bucket (S3 only) 23 | - filepath : full path to the csv file, which must be readable 24 | - delimiter : the delimiter used between fields (CSV only) 25 | Default : "," 26 | - quote_char : quote character (CSV only) 27 | Default : """"""" 28 | - header : skip header line (CSV only) 29 | Default : false 30 | - sheet : EXCEL sheet (EXCEL / ODF only) 31 | Default : 1st sheet 32 | """ 33 | 34 | def __init__(self, fdw_options, fdw_columns): 35 | super(cloudfs_fdw, self).__init__(fdw_options, fdw_columns) 36 | self.source = fdw_options.get("source", 's3').lower() 37 | if self.source is None: 38 | log_to_postgres("Please set the source", ERROR) 39 | 40 | self.format = fdw_options.get("format", 'csv').lower() 41 | if self.format is None: 42 | 
log_to_postgres("Please set the file format", ERROR) 43 | 44 | self.region = fdw_options.get("region") 45 | # if self.region is None: 46 | # log_to_postgres("Please set the AWS region", ERROR) 47 | 48 | self.host = fdw_options.get("host", "localhost") 49 | # if self.host is None: 50 | # log_to_postgres("Please set the endpoint host", ERROR) 51 | 52 | self.port = int(fdw_options.get("port", "443")) 53 | # if self.port is None: 54 | # log_to_postgres("Please set the endpoint port", ERROR) 55 | 56 | self.http_url = fdw_options.get("url") 57 | 58 | self.filepath = fdw_options.get("filepath") 59 | # if self.filename is None: 60 | # log_to_postgres("Please set the filename", ERROR) 61 | 62 | self.bucket = fdw_options.get('bucket') 63 | # if self.bucket is None: 64 | # log_to_postgres("Please set the bucket", ERROR) 65 | 66 | self.aws_access_key = fdw_options.get('aws_access_key') 67 | # if self.aws_access_key is None: 68 | # log_to_postgres("Please set the AWS access key", ERROR) 69 | 70 | self.aws_secret_key = fdw_options.get('aws_secret_key') 71 | # if self.aws_secret_key is None: 72 | # log_to_postgres("Please set the AWS secret key", ERROR) 73 | 74 | self.delimiter = fdw_options.get("delimiter", ",") 75 | 76 | self.quotechar = fdw_options.get("quote_char", '"') 77 | 78 | self.skip_header = ('true' == fdw_options.get( 79 | 'header', 'false').lower()) 80 | 81 | self.json_path = fdw_options.get('json_path', 'item') 82 | if 'item' != self.json_path: 83 | self.json_path = self.json_path + '.item' 84 | 85 | self.sheet = fdw_options.get('sheet', 0) 86 | 87 | self.columns = fdw_columns 88 | 89 | def can_sort(self, sortkeys): 90 | can_sort = [] 91 | 92 | if self.format in ['xls', 'xlsx', 'odf']: 93 | return sortkeys 94 | 95 | return can_sort 96 | 97 | def execute(self, quals, columns, sortkeys=None): 98 | if 's3' == self.source: 99 | url = 's3://{}:{}@{}:{}@{}/{}'.format( 100 | self.aws_access_key, self.aws_secret_key, self.host, self.port, self.bucket, self.filepath) 101 | 
elif 'file' == self.source: 102 | url = 'file://{}'.format(self.filepath) 103 | elif 'http/https' == self.source: 104 | url = self.http_url 105 | else: 106 | log_to_postgres("Source {} not supported".format(self.source)) 107 | 108 | data_stream = smart_open.open(url, 'rb') 109 | 110 | if 'csv' == self.format: 111 | for row in self._render_csv(data_stream): 112 | yield row 113 | 114 | elif 'json' == self.format: 115 | for row in self._render_json(data_stream): 116 | yield row 117 | 118 | elif self.format in ['xls', 'xlsx', 'odf']: 119 | for row in self._render_excel_or_odf(data_stream, quals, sortkeys): 120 | yield row 121 | 122 | else: 123 | log_to_postgres("Format {} not supported".format(self.format)) 124 | 125 | def _render_csv(self, data_stream): 126 | object_stream = csv.reader(data_stream, delimiter=self.delimiter, 127 | quotechar=self.quotechar) 128 | 129 | if (self.skip_header): 130 | for _ in object_stream: 131 | break 132 | 133 | for obj in object_stream: 134 | yield obj[:len(self.columns)] 135 | 136 | def _render_json(self, data_stream): 137 | object_stream = ijson.items(data_stream, self.json_path) 138 | 139 | for obj in object_stream: 140 | yield obj.values()[:len(self.columns)] 141 | 142 | def _render_excel_or_odf(self, data_stream, quals, sortkeys): 143 | engine = 'xlrd' 144 | 145 | if self.format == 'odf': 146 | engine = 'odf' 147 | 148 | object_stream = pandas.read_excel( 149 | data_stream, sheet_name=self.sheet, header=0 if self.skip_header else None, engine=engine) 150 | 151 | if quals or sortkeys: 152 | object_stream.columns = [column.replace( 153 | " ", "_").replace(":", "_") for column in object_stream.columns] 154 | df_columns = object_stream.columns.values 155 | 156 | if quals: 157 | query = '' 158 | column_names = list(self.columns.keys()) 159 | 160 | for qual in quals: 161 | column_type = self.columns[qual.field_name].base_type_name 162 | column_index = column_names.index(qual.field_name) 163 | if qual.operator in ['<', '<=', '=', '>', 
'>=']: 164 | query += df_columns[column_index] + ('==' if qual.operator == '=' else qual.operator) + ( 165 | ('"' + str(qual.value) + '"') if column_type in ['text', 'varchar', 'character varying', 'char', 'character'] else str(qual.value)) + ' and ' 166 | 167 | if query: 168 | object_stream.query(expr=query[:-5], inplace=True) 169 | 170 | if sortkeys and len(object_stream.index) > 1: 171 | sort_columns = [] 172 | sort_orders = [] 173 | 174 | for sortkey in sortkeys: 175 | sort_columns.append(df_columns[sortkey.attnum - 1]) 176 | sort_orders.append(not sortkey.is_reversed) 177 | 178 | object_stream.sort_values( 179 | by=sort_columns, axis=0, ascending=sort_orders, inplace=True) 180 | 181 | for row in object_stream.iterrows(): 182 | yield row[1].values[:len(self.columns)] 183 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | import os 3 | 4 | 5 | def read(fname): 6 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 7 | 8 | 9 | setup( 10 | name='cloudfs_fdw', 11 | version='1.0.0', 12 | author='Ernst-Georg Schmid', 13 | author_email='pgchem@tuschehund.de', 14 | packages=['cloudfs_fdw'], 15 | url='https://github.com/ergo70/cloudfs_fdw', 16 | license='LICENSE.txt', 17 | description='A foreign data wrapper for accessing csv and JSON files on cloud filesystems.', 18 | install_requires=["smart_open","ijson","multicorn", "pandas>=1.0.1", "xlrd", "odfpy"], 19 | ) 20 | --------------------------------------------------------------------------------