├── .env.template
├── duck.bat
├── requirements.txt
├── query2.sql
├── query.sql
├── LICENSE
├── README.md
├── .gitignore
└── main.py

/.env.template:
--------------------------------------------------------------------------------
DB_DSN=postgresql://user:pass@host/db
BATCH_SIZE=10000
COMPRESSION=SNAPPY
LOGLEVEL=INFO
--------------------------------------------------------------------------------
/duck.bat:
--------------------------------------------------------------------------------
duckdb -c "select count(*) from output.parquet"

duckdb -c "DESCRIBE select * from output.parquet"
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
adbc-driver-postgresql==1.0.0
black==24.4.2
duckdb==1.0.0
psycopg==3.1.19
psycopg-binary==3.1.19
pyarrow==16.1.0
python-dotenv==1.0.1
--------------------------------------------------------------------------------
/query2.sql:
--------------------------------------------------------------------------------
SELECT p.id,
    p.departureport,
    dp.portname as port_from,
    p.arrivalport,
    ap.portname as port_to,
    s.depdate, s.firstportdepdate,
    s.id as sid, p.price,
    sh.name as shipname, sh.shipcode, p.licnumber,
    p.discount, p.cancelticket, p.confirm
FROM vehicle p
LEFT JOIN sailing s on p.sailing_fk = s.id
LEFT JOIN port dp ON dp.portcode = p.departureport
LEFT JOIN port ap ON ap.portcode = p.arrivalport
LEFT JOIN "Ship" sh ON sh.shipcode = s.shipcode
WHERE p.id > %s
order by p.id
--------------------------------------------------------------------------------
/query.sql:
--------------------------------------------------------------------------------
SELECT p.id,
    p.departureport,
    dp.portname as port_from,
    p.arrivalport,
    ap.portname as port_to,
    s.depdate, s.firstportdepdate,
    s.id as sid, p.price,
    sh.name as shipname, sh.shipcode, p.passengername,
    p.discount, p.cancelticket, p.confirm
FROM passenger p
LEFT JOIN sailing s on p.sailing_fk = s.id
LEFT JOIN port dp ON dp.portcode = p.departureport
LEFT JOIN port ap ON ap.portcode = p.arrivalport
LEFT JOIN "Ship" sh ON sh.shipcode = s.shipcode
WHERE p.id > %s
order by p.id
--------------------------------------------------------------------------------
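
A note on the `WHERE p.id > %s` clause that both example queries end with: `%s` is a psycopg parameter placeholder, and `main.py` binds the last exported id to it so that repeated runs only fetch new rows. A minimal standalone sketch of how such a parameterized query is executed (hypothetical snippet, not part of the repository; the DSN is the placeholder one from `.env.template`):

```python
import psycopg

# assumed placeholder DSN, as in .env.template
dsn = "postgresql://user:pass@host/db"

with psycopg.connect(dsn) as conn:
    with conn.cursor() as cur:
        # the list supplies the value bound to the %s placeholder
        cur.execute("SELECT p.id, p.price FROM passenger p WHERE p.id > %s ORDER BY p.id", [0])
        print(cur.fetchmany(5))
```
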
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Serafeim Papastefanos

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Postgres to parquet with python

Export a query from postgres to [parquet](https://parquet.apache.org/) with python. Apache parquet is
an open source, column-oriented data file format designed for efficient data storage and retrieval. The
parquet file can then be used with a columnar database or even queried directly using something like
[duckdb](https://duckdb.org/docs/data/parquet/overview.html).

## Requirements

This project works with supported versions of python 3. All requirements are listed in the `requirements.txt` file.
It uses the following dependencies:

* pyarrow: the parquet file is created with [apache arrow](https://arrow.apache.org/docs/index.html); pyarrow is the python bindings for it
* adbc-driver-postgresql: the [Arrow Database Connectivity (ADBC)](https://arrow.apache.org/docs/format/ADBC.html) driver for postgresql; it is used to retrieve the types of the query's columns so they can be re-used for the parquet file
* psycopg: the library to query postgresql
* duckdb: used to read the largest already-exported id from any existing parquet files in the output folder so repeated runs only export newer rows
* python-dotenv: to load configuration from a `.env` file

## Installation

Create a virtualenv and install the requirements:

```bash
$ python -m venv venv
$ source venv/bin/activate
$ pip install -r requirements.txt
```

or in windows

```plain
> py -3 -m venv venv
> venv\Scripts\activate.bat
> pip install -r requirements.txt
```


## Usage

Copy over `.env.template` to `.env` and set up your database dsn and any other options you want. You can also use environment variables instead of the `.env` file.

Then run `python main.py query_file.sql output_folder` where `query_file.sql` contains the SQL query whose results you want to export and `output_folder` is the directory the parquet file will be written to. See the file `query.sql` for an example; the query must contain a `%s` placeholder, which is bound to the largest id already exported to that folder (or 0 on the first run) so that re-running the script only exports the newer rows. The output file will be named `output-<timestamp>.parquet`.

## Configuration

You can set the `LOGLEVEL` to `DEBUG` to see more messages including timings or to `ERROR` to see only error messages. The default is `INFO`. You can also set the `COMPRESSION` to `SNAPPY` (most common), `GZIP`, `BROTLI`, `LZ4` or `ZSTD`. The default is `NONE`. The `BATCH_SIZE` is the number of rows to fetch at a time from the database. The default is `10000`. Finally, the `DB_DSN` must have the format `DB_DSN=postgresql://user:pass@host/db` with correct values for user, pass, host and db name.

## Why?

After you've created the parquet file of your data you can import it into a columnar database or even query it directly using something like
[duckdb](https://duckdb.org/docs/data/parquet/overview.html). Duckdb [binaries](https://duckdb.org/docs/installation/) are available for most systems or you can use a library to query the parquet file from within your app. Clickhouse db can also [query or import parquet files](https://clickhouse.com/docs/en/integrations/data-formats/parquet).

For example run something like `duckdb -c "select count(*) from output.parquet"`.
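
Or query it from python with the duckdb library; a rough sketch (it assumes the parquet file was produced from the example `query.sql`, so columns like `port_from`, `port_to` and `price` exist):

```python
import duckdb

# total number of exported rows
print(duckdb.sql("SELECT count(*) FROM 'output.parquet'").fetchone())

# an example aggregate: tickets and revenue per route
duckdb.sql("""
    SELECT port_from, port_to, count(*) AS tickets, sum(price) AS revenue
    FROM 'output.parquet'
    GROUP BY port_from, port_to
    ORDER BY tickets DESC
""").show()
```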

To give you an example of the timing differences: I had a table with ~ 150M rows. It took ~ 45 minutes to create the parquet file for this table, resulting in a 1.3GB file (with SNAPPY compression). Then I could run aggregates on this data (group by, sum, count, etc) in seconds.

The same aggregates on the original table took hours. To put the difference in perspective: running a `count(*)` on the original table needs more than **10 minutes**(!), and a simple group by on two columns with a count takes **around 18 minutes**. The `count(*)` query on the parquet file takes **half a second** and the group by query **3 seconds**.

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

*.arrow
*.parquet*

*.sql
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import sys, os, time, logging, datetime
import pyarrow as pa
import pyarrow.parquet as pq
import psycopg
from dotenv import load_dotenv
from collections import OrderedDict
import adbc_driver_postgresql.dbapi
import duckdb

_query = None


tt = time.time()
gt = time.time()


def debug_w_time(msg):
    global tt
    logging.debug(
        f"{msg} ~~~ time: {time.time() - tt :.2f}, accumulated time: {time.time() - gt :.2f} ~~~"
    )
    tt = time.time()


def get_connection_dsn():
    return os.getenv("DB_DSN", "postgresql://postgres@127.0.0.1/postgres")


def write_batch(writer, name_types, schema):
    # debug_w_time(str(name_types))
    batch = pa.record_batch(
        [
            pa.array(name_types[k]["values"], type=name_types[k]["type"])
            for k in name_types.keys()
        ],
        schema,
    )
    writer.write_batch(batch)


def get_query(query_file):
    global _query
    if _query:
        return _query
    with open(query_file, "r", encoding="utf-8") as f:
        _query = f.read()
    return _query


def get_query_with_limit(query_file):
    # Drop the line containing the "%s" placeholder and add a LIMIT 1 so the
    # query can be run once just to discover the column names and types.
    query = get_query(query_file)

    querylines = query.splitlines()
    query = "\n".join([x for x in querylines if '%s' not in x])

    if "limit" not in query.lower():
        return query + " limit 1"
    else:
        lidx = query.lower().index("limit") + 6
        q = query[:lidx] + "1"
        return q


if __name__ == "__main__":
    load_dotenv()

    loglevel = os.getenv("LOGLEVEL", "INFO")
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(message)s",
        level=getattr(logging, loglevel),
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    debug_w_time("Starting with debug")

    if len(sys.argv) != 3:
        print("Usage: python main.py <query_file.sql> <output_folder>")
        sys.exit(1)
    query_file = sys.argv[1]
    output_folder = sys.argv[2]

    try:
        res = duckdb.sql(
            f"SELECT max(id) FROM read_parquet('{output_folder}/**/*.parquet')"
        )
        min_id = res.fetchone()[0]
    except duckdb.duckdb.IOException:
        min_id = 0

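    # min_id is the largest id found in any parquet files already present in the
    # output folder (0 if none could be read); it is bound to the query's "%s"
    # placeholder below, so a re-run continues from where the previous export stopped.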
    debug_w_time(f"Existing max_id = {min_id}")

    batch_size = int(os.getenv("BATCH_SIZE", "10000"))
    logging.info("Starting with batch size of {}".format(batch_size))

    debug_w_time("Trying to connect to db and read query column types")
    # Run the LIMIT 1 version of the query through ADBC to get the arrow type of
    # every column; these types are then used to build the parquet schema.
    with adbc_driver_postgresql.dbapi.connect(get_connection_dsn()) as conn:
        with conn.cursor() as cur:
            cur.execute(get_query_with_limit(query_file))
            name_types = OrderedDict(
                {x[0]: {"type": x[1], "values": []} for x in cur.description}
            )

    name_type_keys = list(name_types.keys())

    schema = pa.schema([(x[0], x[1]["type"]) for x in name_types.items()])
    debug_w_time(schema)

    compression = os.getenv("COMPRESSION", "NONE")
    now = datetime.datetime.now()
    now_str = now.strftime("%Y%m%dT%H%M%S")

    output_file = os.path.join(output_folder, f"output-{now_str}.parquet")
    with pq.ParquetWriter(
        output_file, schema=schema, compression=compression
    ) as writer:
        with psycopg.connect(get_connection_dsn()) as conn:
            debug_w_time("Connected, starting to execute query...")
            # Named (server-side) cursor so rows are streamed from postgres in
            # chunks of batch_size instead of being fetched all at once.
            with conn.cursor("pg-parquet-cursor") as cur:
                cur.itersize = batch_size
                cur.execute(get_query(query_file), [min_id])

                debug_w_time("Query executed...")
                for idx, record in enumerate(cur):
                    for ridx, value in enumerate(record):
                        name_types[name_type_keys[ridx]]["values"].append(value)

                    if idx % batch_size == 0:
                        debug_w_time("Writing batch {}...".format(idx // batch_size))
                        write_batch(writer, name_types, schema)

                        for x in name_type_keys:
                            name_types[x]["values"] = []

                # Write whatever is left over from the last (partial) batch.
                write_batch(writer, name_types, schema)

    logging.info("DONE!")

--------------------------------------------------------------------------------
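
To sanity check an export afterwards you can point duckdb at the output folder; a minimal sketch (not part of the repository), assuming `out` is the folder that was passed to `main.py` as its second argument:

```python
import duckdb

# read every timestamped parquet file that main.py wrote into the folder
rel = duckdb.sql("SELECT count(*) AS rows, max(id) AS max_id FROM read_parquet('out/**/*.parquet')")
print(rel.fetchone())
```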