├── .env.template
├── duck.bat
├── requirements.txt
├── query2.sql
├── query.sql
├── LICENSE
├── README.md
├── .gitignore
└── main.py

/.env.template:
--------------------------------------------------------------------------------
DB_DSN=postgresql://user:pass@host/db
BATCH_SIZE=10000
COMPRESSION=SNAPPY
LOGLEVEL=INFO
--------------------------------------------------------------------------------
/duck.bat:
--------------------------------------------------------------------------------
duckdb -c "select count(*) from output.parquet"

duckdb -c "DESCRIBE select * from output.parquet"
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
adbc-driver-postgresql==1.0.0
black==24.4.2
duckdb==1.0.0
psycopg==3.1.19
psycopg-binary==3.1.19
pyarrow==16.1.0
python-dotenv==1.0.1
--------------------------------------------------------------------------------
/query2.sql:
--------------------------------------------------------------------------------
SELECT p.id,
    p.departureport,
    dp.portname as port_from,
    p.arrivalport,
    ap.portname as port_to,
    s.depdate, s.firstportdepdate,
    s.id as sid, p.price,
    sh.name as shipname, sh.shipcode, p.licnumber,
    p.discount, p.cancelticket, p.confirm
FROM vehicle p
LEFT JOIN sailing s on p.sailing_fk = s.id
LEFT JOIN port dp ON dp.portcode = p.departureport
LEFT JOIN port ap ON ap.portcode = p.arrivalport
LEFT JOIN "Ship" sh ON sh.shipcode = s.shipcode
WHERE p.id > %s
order by p.id
--------------------------------------------------------------------------------
/query.sql:
--------------------------------------------------------------------------------
SELECT p.id,
    p.departureport,
    dp.portname as port_from,
    p.arrivalport,
    ap.portname as port_to,
    s.depdate, s.firstportdepdate,
    s.id as sid, p.price,
    sh.name as shipname, sh.shipcode, p.passengername,
    p.discount, p.cancelticket, p.confirm
FROM passenger p
LEFT JOIN sailing s on p.sailing_fk = s.id
LEFT JOIN port dp ON dp.portcode = p.departureport
LEFT JOIN port ap ON ap.portcode = p.arrivalport
LEFT JOIN "Ship" sh ON sh.shipcode = s.shipcode
WHERE p.id > %s
order by p.id
--------------------------------------------------------------------------------
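
A note on the `WHERE p.id > %s` clause that both example queries end with: `%s` is a psycopg parameter placeholder, and `main.py` binds the last exported id to it so that repeated runs only fetch new rows. A minimal standalone sketch of how such a parameterized query is executed (hypothetical snippet, not part of the repository; the DSN is the placeholder one from `.env.template`):

```python
import psycopg

# assumed placeholder DSN, as in .env.template
dsn = "postgresql://user:pass@host/db"

with psycopg.connect(dsn) as conn:
    with conn.cursor() as cur:
        # the list supplies the value bound to the %s placeholder
        cur.execute("SELECT p.id, p.price FROM passenger p WHERE p.id > %s ORDER BY p.id", [0])
        print(cur.fetchmany(5))
```
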
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Serafeim Papastefanos

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Postgres to parquet with python

Export a query from postgres to [parquet](https://parquet.apache.org/) with python. Apache parquet is
an open source, column-oriented data file format designed for efficient data storage and retrieval. The
parquet file can then be used with a columnar database or even queried directly using something like
[duckdb](https://duckdb.org/docs/data/parquet/overview.html).

## Requirements

This project works with supported versions of python 3. All requirements are listed in the `requirements.txt` file.
It uses the following dependencies:

* pyarrow: the parquet file is created with [apache arrow](https://arrow.apache.org/docs/index.html); pyarrow is the python bindings for it
* adbc-driver-postgresql: the [Arrow Database Connectivity (ADBC)](https://arrow.apache.org/docs/format/ADBC.html) driver for postgresql; it is used to retrieve the types of the query's columns so they can be re-used for the parquet file
* psycopg: the library to query postgresql
* duckdb: used to read the largest already-exported id from any existing parquet files in the output folder so repeated runs only export newer rows
* python-dotenv: to load configuration from a `.env` file

## Installation

Create a virtualenv and install the requirements:

```bash
$ python -m venv venv
$ source venv/bin/activate
$ pip install -r requirements.txt
```

or in windows

```plain
> py -3 -m venv venv
> venv\Scripts\activate.bat
> pip install -r requirements.txt
```


## Usage

Copy over `.env.template` to `.env` and set up your database dsn and any other options you want. You can also use environment variables instead of the `.env` file.

Then run `python main.py query_file.sql output_folder` where `query_file.sql` contains the SQL query whose results you want to export and `output_folder` is the directory the parquet file will be written to. See the file `query.sql` for an example; the query must contain a `%s` placeholder, which is bound to the largest id already exported to that folder (or 0 on the first run) so that re-running the script only exports the newer rows. The output file will be named `output-<timestamp>.parquet`.

## Configuration

You can set the `LOGLEVEL` to `DEBUG` to see more messages including timings or to `ERROR` to see only error messages. The default is `INFO`. You can also set the `COMPRESSION` to `SNAPPY` (most common), `GZIP`, `BROTLI`, `LZ4` or `ZSTD`. The default is `NONE`. The `BATCH_SIZE` is the number of rows to fetch at a time from the database. The default is `10000`. Finally, the `DB_DSN` must have the format `DB_DSN=postgresql://user:pass@host/db` with correct values for user, pass, host and db name.

## Why?

After you've created the parquet file of your data you can import it into a columnar database or even query it directly using something like
[duckdb](https://duckdb.org/docs/data/parquet/overview.html). Duckdb [binaries](https://duckdb.org/docs/installation/) are available for most systems or you can use a library to query the parquet file from within your app. Clickhouse db can also [query or import parquet files](https://clickhouse.com/docs/en/integrations/data-formats/parquet).

For example run something like `duckdb -c "select count(*) from output.parquet"`.
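
Or query it from python with the duckdb library; a rough sketch (it assumes the parquet file was produced from the example `query.sql`, so columns like `port_from`, `port_to` and `price` exist):

```python
import duckdb

# total number of exported rows
print(duckdb.sql("SELECT count(*) FROM 'output.parquet'").fetchone())

# an example aggregate: tickets and revenue per route
duckdb.sql("""
    SELECT port_from, port_to, count(*) AS tickets, sum(price) AS revenue
    FROM 'output.parquet'
    GROUP BY port_from, port_to
    ORDER BY tickets DESC
""").show()
```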

To give you an example of the timing differences: I had a table with ~ 150M rows. It took ~ 45 minutes to create the parquet file for this table, resulting in a 1.3GB file (with SNAPPY compression). Then I could run aggregates on this data (group by, sum, count, etc) in seconds.

The same aggregates on the original table took hours. To put the difference in perspective: running a `count(*)` on the original table needs more than **10 minutes**(!), and a simple group by on two columns with a count takes **around 18 minutes**. The `count(*)` query on the parquet file takes **half a second** and the group by query **3 seconds**.

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

*.arrow
*.parquet*

*.sql
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import sys, os, time, logging, datetime
import pyarrow as pa
import pyarrow.parquet as pq
import psycopg
from dotenv import load_dotenv
from collections import OrderedDict
import adbc_driver_postgresql.dbapi
import duckdb

_query = None


tt = time.time()
gt = time.time()


def debug_w_time(msg):
    global tt
    logging.debug(
        f"{msg} ~~~ time: {time.time() - tt :.2f}, accumulated time: {time.time() - gt :.2f} ~~~"
    )
    tt = time.time()


def get_connection_dsn():
    return os.getenv("DB_DSN", "postgresql://postgres@127.0.0.1/postgres")


def write_batch(writer, name_types, schema):
    # debug_w_time(str(name_types))
    batch = pa.record_batch(
        [
            pa.array(name_types[k]["values"], type=name_types[k]["type"])
            for k in name_types.keys()
        ],
        schema,
    )
    writer.write_batch(batch)


def get_query(query_file):
    global _query
    if _query:
        return _query
    with open(query_file, "r", encoding="utf-8") as f:
        _query = f.read()
    return _query


def get_query_with_limit(query_file):
    # Drop the line containing the "%s" placeholder and add a LIMIT 1 so the
    # query can be run once just to discover the column names and types.
    query = get_query(query_file)

    querylines = query.splitlines()
    query = "\n".join([x for x in querylines if '%s' not in x])

    if "limit" not in query.lower():
        return query + " limit 1"
    else:
        lidx = query.lower().index("limit") + 6
        q = query[:lidx] + "1"
        return q


if __name__ == "__main__":
    load_dotenv()

    loglevel = os.getenv("LOGLEVEL", "INFO")
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(message)s",
        level=getattr(logging, loglevel),
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    debug_w_time("Starting with debug")

    if len(sys.argv) != 3:
        print("Usage: python main.py <query_file.sql> <output_folder>")
        sys.exit(1)
    query_file = sys.argv[1]
    output_folder = sys.argv[2]

    try:
        res = duckdb.sql(
            f"SELECT max(id) FROM read_parquet('{output_folder}/**/*.parquet')"
        )
        min_id = res.fetchone()[0]
    except duckdb.duckdb.IOException:
        min_id = 0

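    # min_id is the largest id found in any parquet files already present in the
    # output folder (0 if none could be read); it is bound to the query's "%s"
    # placeholder below, so a re-run continues from where the previous export stopped.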
    debug_w_time(f"Existing max_id = {min_id}")

    batch_size = int(os.getenv("BATCH_SIZE", "10000"))
    logging.info("Starting with batch size of {}".format(batch_size))

    debug_w_time("Trying to connect to db and read query column types")
    # Run the LIMIT 1 version of the query through ADBC to get the arrow type of
    # every column; these types are then used to build the parquet schema.
    with adbc_driver_postgresql.dbapi.connect(get_connection_dsn()) as conn:
        with conn.cursor() as cur:
            cur.execute(get_query_with_limit(query_file))
            name_types = OrderedDict(
                {x[0]: {"type": x[1], "values": []} for x in cur.description}
            )

    name_type_keys = list(name_types.keys())

    schema = pa.schema([(x[0], x[1]["type"]) for x in name_types.items()])
    debug_w_time(schema)

    compression = os.getenv("COMPRESSION", "NONE")
    now = datetime.datetime.now()
    now_str = now.strftime("%Y%m%dT%H%M%S")

    output_file = os.path.join(output_folder, f"output-{now_str}.parquet")
    with pq.ParquetWriter(
        output_file, schema=schema, compression=compression
    ) as writer:
        with psycopg.connect(get_connection_dsn()) as conn:
            debug_w_time("Connected, starting to execute query...")
            # Named (server-side) cursor so rows are streamed from postgres in
            # chunks of batch_size instead of being fetched all at once.
            with conn.cursor("pg-parquet-cursor") as cur:
                cur.itersize = batch_size
                cur.execute(get_query(query_file), [min_id])

                debug_w_time("Query executed...")
                for idx, record in enumerate(cur):
                    for ridx, value in enumerate(record):
                        name_types[name_type_keys[ridx]]["values"].append(value)

                    if idx % batch_size == 0:
                        debug_w_time("Writing batch {}...".format(idx // batch_size))
                        write_batch(writer, name_types, schema)

                        for x in name_type_keys:
                            name_types[x]["values"] = []

                # Write whatever is left over from the last (partial) batch.
                write_batch(writer, name_types, schema)

    logging.info("DONE!")

--------------------------------------------------------------------------------
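
To sanity check an export afterwards you can point duckdb at the output folder; a minimal sketch (not part of the repository), assuming `out` is the folder that was passed to `main.py` as its second argument:

```python
import duckdb

# read every timestamped parquet file that main.py wrote into the folder
rel = duckdb.sql("SELECT count(*) AS rows, max(id) AS max_id FROM read_parquet('out/**/*.parquet')")
print(rel.fetchone())
```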