├── .flake8 ├── .github └── workflows │ └── fmt-test-lint.yaml ├── .gitignore ├── Makefile ├── README.md ├── build.requirements.txt ├── dask_polars ├── __init__.py ├── __version.py ├── core.py └── tests │ ├── __init__.py │ └── test_core.py ├── pyproject.toml └── setup.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # just stop shouting as black decides line lengths. 3 | max-line-length = 180 4 | # E203, W503: due to black fmt 5 | ignore = E203,W503 6 | exclude = .venv 7 | 8 | -------------------------------------------------------------------------------- /.github/workflows/fmt-test-lint.yaml: -------------------------------------------------------------------------------- 1 | name: format, lint and test 2 | 3 | on: 4 | - pull_request 5 | jobs: 6 | all: 7 | name: format, lint and test 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: [ "3.7", "3.10" ] 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up Python 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: ${{ matrix.python-version }} 18 | - name: Install dependencies 19 | run: | 20 | make .venv 21 | - name: Run formatting checks 22 | run: | 23 | .venv/bin/black --check . && .venv/bin/isort --check . 24 | - name: Run linting 25 | run: | 26 | .venv/bin/flake8 27 | - name: Run tests 28 | run: | 29 | make test 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .venv/ 2 | .pytest_cache/ 3 | .idea/ 4 | *.egg-info 5 | __pycache__/ 6 | .mypy_cache/ 7 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | .PHONY: clean fmt lint test 3 | 4 | .venv: 5 | @python -m venv .venv 6 | @.venv/bin/pip install -U pip 7 | @.venv/bin/pip install -r build.requirements.txt 8 | 9 | test: .venv 10 | @.venv/bin/pytest dask_polars/tests 11 | 12 | clean: 13 | @rm -r .venv 14 | 15 | fmt: .venv 16 | @.venv/bin/isort . 17 | @.venv/bin/black . 18 | 19 | lint: .venv 20 | @.venv/bin/flake8 . 21 | @.venv/bin/mypy 22 | 23 | pre-commit: fmt lint 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dask-polars 2 | 3 | ## Development guide 4 | 5 | We've got some make rules to make it easy to start: 6 | 7 | * `$ make fmt` 8 | * `$ make lint` 9 | * `$ make pre-commit: fmt lint` 10 | * `$ make test` 11 | * `$ make clean` 12 | 13 | -------------------------------------------------------------------------------- /build.requirements.txt: -------------------------------------------------------------------------------- 1 | # dependencies 2 | polars 3 | dask 4 | 5 | # tooling 6 | pytest==6.2.5 7 | black==21.6b0 8 | isort~=5.9.2 9 | flake8~=4.0.1 10 | mypy==0.931 -------------------------------------------------------------------------------- /dask_polars/__init__.py: -------------------------------------------------------------------------------- 1 | from .__version import __version__ 2 | from .core import from_dataframe 3 | 4 | __all__ = ["__version__", "from_dataframe"] 5 | -------------------------------------------------------------------------------- /dask_polars/__version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1" 2 | -------------------------------------------------------------------------------- /dask_polars/core.py: -------------------------------------------------------------------------------- 1 | import numbers 2 | import operator 3 | 4 | import dask 5 | import polars as pl 6 | from dask.utils import apply, funcname 7 | 8 | 9 | def create_empty_df(df: pl.DataFrame) -> pl.DataFrame: 10 | """ 11 | Create an empty polars DataFrame without increasing the reference count 12 | 13 | Parameters 14 | ---------- 15 | df 16 | DataFrame to create an empty from 17 | """ 18 | return pl.DataFrame( 19 | [pl.Series(name, [], dtype=dtype) for name, dtype in zip(df.columns, df.dtypes)] 20 | ) 21 | 22 | 23 | class DataFrame(dask.base.DaskMethodsMixin): 24 | def __init__(self, name: str, graph: dict, meta: pl.DataFrame, npartitions: int): 25 | self._name = name 26 | self._graph = graph 27 | # also used as identity in folds 28 | self._meta = meta 29 | self.npartitions = npartitions 30 | 31 | def __dask_graph__(self): 32 | return self._graph 33 | 34 | def __dask_keys__(self): 35 | return [(self._name, i) for i in range(self.npartitions)] 36 | 37 | @staticmethod 38 | def __dask_optimize__(graph, keys): 39 | return graph 40 | 41 | __dask_scheduler__ = staticmethod(dask.threaded.get) 42 | 43 | def __dask_postcompute__(self): 44 | return pl.concat, () 45 | 46 | def __dask_tokenize__(self): 47 | return self._name 48 | 49 | def map_partitions(self, func, *args, **kwargs): 50 | name = funcname(func) + "-" + dask.base.tokenize(self, func, **kwargs) 51 | graph = { 52 | (name, i): (apply, func, [key] + list(args), kwargs) 53 | for i, key in enumerate(self.__dask_keys__()) 54 | } 55 | meta = func(self._meta, *args, **kwargs) 56 | return DataFrame(name, {**self._graph, **graph}, meta, self.npartitions) 57 | 58 | def __add__(self, other): 59 | if not isinstance(other, numbers.Number): 60 | return NotImplemented 61 | return self.map_partitions(operator.add, other) 62 | 63 | def head(self, length: int = 5): 64 | name = "head-" + dask.base.tokenize(self, length) 65 | graph = {(name, 0): (pl.DataFrame.head, self._graph[(self._name, 0)], length)} 66 | return DataFrame(name, {**self._graph, **graph}, self._meta, 1) 67 | 68 | def sum(self): 69 | tmp = self.map_partitions(pl.DataFrame.sum) 70 | name = "sum-" + dask.base.tokenize(tmp) 71 | graph = {(name, 0): (pl.DataFrame.sum, (pl.concat, tmp.__dask_keys__()))} 72 | return DataFrame(name, {**tmp._graph, **graph}, self._meta.sum(), 1) 73 | 74 | def __repr__(self): 75 | return self.head().compute().__repr__() 76 | 77 | 78 | def from_dataframe(df: pl.DataFrame, npartitions: int = 1) -> DataFrame: 79 | assert npartitions == 1 80 | name = "from-dataframe-" + dask.base.tokenize(df) 81 | graph = {(name, 0): df} 82 | 83 | return DataFrame(name, graph, create_empty_df(df), npartitions) 84 | -------------------------------------------------------------------------------- /dask_polars/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pola-rs/dask-polars/8ec05a836c93c1cd2b9ddc484cbcbadbe5abd251/dask_polars/tests/__init__.py -------------------------------------------------------------------------------- /dask_polars/tests/test_core.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | 3 | import dask_polars as dp 4 | 5 | df = pl.DataFrame({"a": [1, 2, 3, 4], "b": [1.0, 2.0, 3.0, 4.0]}) 6 | ddf = dp.from_dataframe(df) 7 | 8 | 9 | def test_basic(): 10 | assert str(ddf.compute()) == str(df) 11 | 12 | 13 | def test_meta(): 14 | assert list(ddf._meta.schema.keys()) == df.columns 15 | assert list(ddf._meta.schema.values()) == df.dtypes 16 | 17 | 18 | def test_sum(): 19 | assert str(ddf.sum().compute()) == str(df.sum()) 20 | 21 | 22 | def test_add(): 23 | assert str((ddf + 2).compute()) == str(df + 2) 24 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "dask_polars" 3 | license = "BSD" 4 | 5 | dependencies = [ 6 | "polars", 7 | "dask" 8 | ] 9 | 10 | requires-python = ">=3.7" 11 | 12 | [tool.isort] 13 | profile = "black" 14 | 15 | [tool.mypy] 16 | warn_unused_ignores = true 17 | show_error_codes = true 18 | files = ["dask_polars"] 19 | 20 | [[tool.mypy.overrides]] 21 | module = ["dask.*"] 22 | ignore_missing_imports = true 23 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from os.path import exists 4 | 5 | from setuptools import setup 6 | 7 | import dask_polars 8 | 9 | setup( 10 | name="dask_polars", 11 | version=dask_polars.__version__, 12 | license="BSD", 13 | packages=["dask_polars"], 14 | long_description=(open("README.rst").read() if exists("README.rst") else ""), 15 | zip_safe=False, 16 | python_requires=">=3.7", 17 | ) 18 | --------------------------------------------------------------------------------