├── README.md ├── cloudfs_fdw ├── __init__.py └── cloudfs_fdw.py └── setup.py
/README.md:
--------------------------------------------------------------------------------
# cloudfs_fdw

A foreign data wrapper for accessing CSV, JSON, EXCEL and ODF files on cloud filesystems.

## Installation

`CREATE EXTENSION multicorn;`

`CREATE SERVER <server> FOREIGN DATA WRAPPER multicorn OPTIONS (wrapper 'cloudfs_fdw.cloudfs_fdw.cloudfs_fdw');`

`CREATE USER MAPPING FOR <user> SERVER <server>;`

### Create table

`CREATE FOREIGN TABLE <schema>.<table> (...) SERVER <server> OPTIONS (<options>);`

See [this](https://www.postgresql.org/docs/11/ddl-foreign-data.html) for general information on how SQL/MED in PostgreSQL works, and [this](https://www.postgresql.org/docs/11/sql-createforeigntable.html) for the CREATE FOREIGN TABLE syntax.

### Available Options

#### S3 compatible

* source 's3'
* bucket <bucket>
* filepath <filename>
* host <host>, default: localhost
* port <port>, default: 443
* region <region>
* aws_access_key <access_key>
* aws_secret_key <secret_key>

#### HTTP / HTTPS

* source 'http/https'
* url <URL>

#### Filesystem

* source 'file'
* filepath <filepath>

#### JSON

* format 'json'
* json_path <json_path>

#### CSV

* format 'csv'
* header <true/false>
* delimiter <delimiter>, default: ,
* quote_char <quote_char>, default: "

#### EXCEL (.xls, .xlsx)

* format 'xls' / 'xlsx'
* header <true/false>
* sheet <sheet_name>, default: first sheet

#### ODF (.ods)

* format 'odf'
* header <true/false>
* sheet <sheet_name>, default: first sheet

#### Compression

'.gz' and '.zip' compressed files are decompressed automagically.
69 | -------------------------------------------------------------------------------- /cloudfs_fdw/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ergo70/cloudfs_fdw/828fe65c91f709beaca4a375296451fcca288206/cloudfs_fdw/__init__.py -------------------------------------------------------------------------------- /cloudfs_fdw/cloudfs_fdw.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import smart_open 3 | import ijson 4 | import pandas 5 | from multicorn import ForeignDataWrapper 6 | from multicorn.utils import log_to_postgres, ERROR, WARNING, DEBUG 7 | 8 | 9 | class cloudfs_fdw(ForeignDataWrapper): 10 | """A foreign data wrapper for accessing CSV, JSON, EXCEL and ODF files on cloud filesystems. 11 | 12 | Valid options: 13 | - source : the data source (S3, HTTP/S, file, FTP, SCP, HDFS) 14 | Default: "S3" 15 | - format : the file format (CSV, JSON, EXCEL, ODF, plain or compressed) 16 | Default : CSV 17 | - host : hostname 18 | - port : port 19 | - region : the AWS region (S3 only) 20 | - aws_access_key : AWS access keys (S3 only) 21 | - aws_secret_key : AWS secret keys (S3 only) 22 | - bucket : bucket (S3 only) 23 | - filepath : full path to the csv file, which must be readable 24 | - delimiter : the delimiter used between fields (CSV only) 25 | Default : "," 26 | - quote_char : quote character (CSV only) 27 | Default : """"""" 28 | - header : skip header line (CSV only) 29 | Default : false 30 | - sheet : EXCEL sheet (EXCEL / ODF only) 31 | Default : 1st sheet 32 | """ 33 | 34 | def __init__(self, fdw_options, fdw_columns): 35 | super(cloudfs_fdw, self).__init__(fdw_options, fdw_columns) 36 | self.source = fdw_options.get("source", 's3').lower() 37 | if self.source is None: 38 | log_to_postgres("Please set the source", ERROR) 39 | 40 | self.format = fdw_options.get("format", 'csv').lower() 41 | if self.format is None: 42 | 
log_to_postgres("Please set the file format", ERROR) 43 | 44 | self.region = fdw_options.get("region") 45 | # if self.region is None: 46 | # log_to_postgres("Please set the AWS region", ERROR) 47 | 48 | self.host = fdw_options.get("host", "localhost") 49 | # if self.host is None: 50 | # log_to_postgres("Please set the endpoint host", ERROR) 51 | 52 | self.port = int(fdw_options.get("port", "443")) 53 | # if self.port is None: 54 | # log_to_postgres("Please set the endpoint port", ERROR) 55 | 56 | self.http_url = fdw_options.get("url") 57 | 58 | self.filepath = fdw_options.get("filepath") 59 | # if self.filename is None: 60 | # log_to_postgres("Please set the filename", ERROR) 61 | 62 | self.bucket = fdw_options.get('bucket') 63 | # if self.bucket is None: 64 | # log_to_postgres("Please set the bucket", ERROR) 65 | 66 | self.aws_access_key = fdw_options.get('aws_access_key') 67 | # if self.aws_access_key is None: 68 | # log_to_postgres("Please set the AWS access key", ERROR) 69 | 70 | self.aws_secret_key = fdw_options.get('aws_secret_key') 71 | # if self.aws_secret_key is None: 72 | # log_to_postgres("Please set the AWS secret key", ERROR) 73 | 74 | self.delimiter = fdw_options.get("delimiter", ",") 75 | 76 | self.quotechar = fdw_options.get("quote_char", '"') 77 | 78 | self.skip_header = ('true' == fdw_options.get( 79 | 'header', 'false').lower()) 80 | 81 | self.json_path = fdw_options.get('json_path', 'item') 82 | if 'item' != self.json_path: 83 | self.json_path = self.json_path + '.item' 84 | 85 | self.sheet = fdw_options.get('sheet', 0) 86 | 87 | self.columns = fdw_columns 88 | 89 | def can_sort(self, sortkeys): 90 | can_sort = [] 91 | 92 | if self.format in ['xls', 'xlsx', 'odf']: 93 | return sortkeys 94 | 95 | return can_sort 96 | 97 | def execute(self, quals, columns, sortkeys=None): 98 | if 's3' == self.source: 99 | url = 's3://{}:{}@{}:{}@{}/{}'.format( 100 | self.aws_access_key, self.aws_secret_key, self.host, self.port, self.bucket, self.filepath) 101 | 
elif 'file' == self.source: 102 | url = 'file://{}'.format(self.filepath) 103 | elif 'http/https' == self.source: 104 | url = self.http_url 105 | else: 106 | log_to_postgres("Source {} not supported".format(self.source)) 107 | 108 | data_stream = smart_open.open(url, 'rb') 109 | 110 | if 'csv' == self.format: 111 | for row in self._render_csv(data_stream): 112 | yield row 113 | 114 | elif 'json' == self.format: 115 | for row in self._render_json(data_stream): 116 | yield row 117 | 118 | elif self.format in ['xls', 'xlsx', 'odf']: 119 | for row in self._render_excel_or_odf(data_stream, quals, sortkeys): 120 | yield row 121 | 122 | else: 123 | log_to_postgres("Format {} not supported".format(self.format)) 124 | 125 | def _render_csv(self, data_stream): 126 | object_stream = csv.reader(data_stream, delimiter=self.delimiter, 127 | quotechar=self.quotechar) 128 | 129 | if (self.skip_header): 130 | for _ in object_stream: 131 | break 132 | 133 | for obj in object_stream: 134 | yield obj[:len(self.columns)] 135 | 136 | def _render_json(self, data_stream): 137 | object_stream = ijson.items(data_stream, self.json_path) 138 | 139 | for obj in object_stream: 140 | yield obj.values()[:len(self.columns)] 141 | 142 | def _render_excel_or_odf(self, data_stream, quals, sortkeys): 143 | engine = 'xlrd' 144 | 145 | if self.format == 'odf': 146 | engine = 'odf' 147 | 148 | object_stream = pandas.read_excel( 149 | data_stream, sheet_name=self.sheet, header=0 if self.skip_header else None, engine=engine) 150 | 151 | if quals or sortkeys: 152 | object_stream.columns = [column.replace( 153 | " ", "_").replace(":", "_") for column in object_stream.columns] 154 | df_columns = object_stream.columns.values 155 | 156 | if quals: 157 | query = '' 158 | column_names = list(self.columns.keys()) 159 | 160 | for qual in quals: 161 | column_type = self.columns[qual.field_name].base_type_name 162 | column_index = column_names.index(qual.field_name) 163 | if qual.operator in ['<', '<=', '=', '>', 
'>=']: 164 | query += df_columns[column_index] + ('==' if qual.operator == '=' else qual.operator) + ( 165 | ('"' + str(qual.value) + '"') if column_type in ['text', 'varchar', 'character varying', 'char', 'character'] else str(qual.value)) + ' and ' 166 | 167 | if query: 168 | object_stream.query(expr=query[:-5], inplace=True) 169 | 170 | if sortkeys and len(object_stream.index) > 1: 171 | sort_columns = [] 172 | sort_orders = [] 173 | 174 | for sortkey in sortkeys: 175 | sort_columns.append(df_columns[sortkey.attnum - 1]) 176 | sort_orders.append(not sortkey.is_reversed) 177 | 178 | object_stream.sort_values( 179 | by=sort_columns, axis=0, ascending=sort_orders, inplace=True) 180 | 181 | for row in object_stream.iterrows(): 182 | yield row[1].values[:len(self.columns)] 183 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | import os 3 | 4 | 5 | def read(fname): 6 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 7 | 8 | 9 | setup( 10 | name='cloudfs_fdw', 11 | version='1.0.0', 12 | author='Ernst-Georg Schmid', 13 | author_email='pgchem@tuschehund.de', 14 | packages=['cloudfs_fdw'], 15 | url='https://github.com/ergo70/cloudfs_fdw', 16 | license='LICENSE.txt', 17 | description='A foreign data wrapper for accessing csv and JSON files on cloud filesystems.', 18 | install_requires=["smart_open","ijson","multicorn", "pandas>=1.0.1", "xlrd", "odfpy"], 19 | ) 20 | --------------------------------------------------------------------------------