├── .github ├── CODEOWNERS └── workflows │ ├── python-app.yml │ └── python-publish.yml ├── tests ├── __init__.py ├── test_pickle.py ├── test_place.py ├── test_partitioned.py ├── test_iterate.py ├── test_divide.py ├── test_access.py ├── test_filesystem.py ├── test_assemble.py └── conftest.py ├── docs ├── row.png ├── both.png ├── blocks.gif ├── column.png ├── filtered.png ├── row_iter.png ├── blocks_layout.png ├── column_iter.png ├── requirements.txt ├── core.rst ├── filesystem.rst ├── Makefile ├── index.rst ├── examples.rst ├── conf.py └── quickstart.rst ├── MANIFEST.in ├── blocks ├── filesystem │ ├── __init__.py │ ├── gcs_filesystem.py │ ├── gcs_native_filesystem.py │ └── base.py ├── __init__.py ├── utils.py ├── dfio.py └── core.py ├── .readthedocs.yaml ├── .pre-commit-config.yaml ├── setup.cfg ├── setup.py ├── .gitignore ├── README.md ├── CHANGELOG.md └── LICENSE.txt /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @baxen 2 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /docs/row.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/blocks/HEAD/docs/row.png -------------------------------------------------------------------------------- /docs/both.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/blocks/HEAD/docs/both.png -------------------------------------------------------------------------------- /docs/blocks.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/blocks/HEAD/docs/blocks.gif -------------------------------------------------------------------------------- /docs/column.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/blocks/HEAD/docs/column.png -------------------------------------------------------------------------------- /docs/filtered.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/blocks/HEAD/docs/filtered.png -------------------------------------------------------------------------------- /docs/row_iter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/blocks/HEAD/docs/row_iter.png -------------------------------------------------------------------------------- /docs/blocks_layout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/blocks/HEAD/docs/blocks_layout.png -------------------------------------------------------------------------------- /docs/column_iter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/blocks/HEAD/docs/column_iter.png -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | 3 | recursive-include tests * 4 | recursive-exclude * __pycache__ 5 | recursive-exclude * *.py[co] 6 | 
-------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | # Frozen versions of fsspec and gcsfs to avoid conflict in read the docs 2 | fsspec==2021.05.0 3 | gcsfs==2021.05.0 4 | -------------------------------------------------------------------------------- /docs/core.rst: -------------------------------------------------------------------------------- 1 | ==== 2 | Core 3 | ==== 4 | 5 | .. automodule:: blocks.core 6 | :members: 7 | :undoc-members: 8 | :show-inheritance: 9 | 10 | -------------------------------------------------------------------------------- /docs/filesystem.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Filesystem 3 | ========== 4 | 5 | 6 | .. automodule:: blocks.filesystem.base 7 | :members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /blocks/filesystem/__init__.py: -------------------------------------------------------------------------------- 1 | from blocks.filesystem.gcs_filesystem import GCSFileSystem # NOQA 2 | from blocks.filesystem.gcs_native_filesystem import GCSNativeFileSystem # NOQA 3 | from blocks.filesystem.base import FileSystem # NOQA 4 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sphinx: 4 | configuration: docs/conf.py 5 | 6 | formats: 7 | - pdf 8 | 9 | python: 10 | version: 3.7 11 | install: 12 | - requirements: docs/requirements.txt 13 | - method: pip 14 | path: . 15 | extra_requirements: 16 | - doc 17 | -------------------------------------------------------------------------------- /tests/test_pickle.py: -------------------------------------------------------------------------------- 1 | import os 2 | import blocks 3 | 4 | 5 | def test_pickle(fs, temp): 6 | original = {"example": 0.0} 7 | path = os.path.join(temp, "test.pkl") 8 | blocks.pickle(original, path, filesystem=fs) 9 | unpickled = blocks.unpickle(path, filesystem=fs) 10 | assert original == unpickled 11 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v2.3.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | - repo: https://github.com/psf/black 9 | rev: 19.10b0 10 | hooks: 11 | - id: black 12 | -------------------------------------------------------------------------------- /blocks/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = """Bradley Axen""" 4 | __email__ = "baxen@squareup.com" 5 | __version__ = "0.9.4" 6 | 7 | 8 | from blocks.core import ( 9 | assemble, 10 | iterate, 11 | partitioned, 12 | divide, 13 | place, 14 | pickle, 15 | unpickle, 16 | ) # NOQA 17 | -------------------------------------------------------------------------------- /tests/test_place.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import blocks 4 | 5 | 6 | def test_place(datadir, randomdata, fs): 7 | blocks.place(randomdata, os.path.join(datadir, 
"example.parquet"), filesystem=fs) 8 | assert len(fs.ls(datadir)) == 1 9 | df = blocks.assemble(os.path.join(datadir, "*"), filesystem=fs) 10 | assert np.isclose(df, randomdata).all().all() 11 | -------------------------------------------------------------------------------- /blocks/filesystem/gcs_filesystem.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from blocks.filesystem.base import FileSystem 4 | 5 | 6 | class GCSFileSystem(FileSystem): 7 | """Deprecated, see FileSystem""" 8 | 9 | def __init__(self, *args, **kwargs): 10 | warnings.warn( 11 | "This class is deprecated, use blocks.filesystem.FileSystem", 12 | DeprecationWarning, 13 | ) 14 | super().__init__(*args, **kwargs) 15 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.9.4 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:setup.py] 7 | search = version="{current_version}" 8 | replace = version="{new_version}" 9 | 10 | [bumpversion:file:blocks/__init__.py] 11 | search = __version__ = "{current_version}" 12 | replace = __version__ = "{new_version}" 13 | 14 | [bumpversion:file:docs/conf.py] 15 | 16 | [flake8] 17 | max-line-length = 100 18 | exclude = docs 19 | -------------------------------------------------------------------------------- /blocks/filesystem/gcs_native_filesystem.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from blocks.filesystem.base import FileSystem 4 | 5 | 6 | class GCSNativeFileSystem(FileSystem): 7 | """Deprecated, see FileSystem""" 8 | 9 | def __init__(self, *args, **kwargs): 10 | warnings.warn( 11 | "This class is deprecated, use blocks.filesystem.FileSystem", 12 | DeprecationWarning, 13 | ) 14 | super().__init__(*args, **kwargs) 15 | -------------------------------------------------------------------------------- /tests/test_partitioned.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import pytest 4 | from blocks import core 5 | 6 | 7 | def test_partitioned(populated): 8 | pytest.importorskip("dask.dataframe") 9 | df = core.partitioned(populated).compute() 10 | assert df.shape == (40, 41) 11 | expected = ["f{}_{}".format(i, j) for i in range(4) for j in range(10)] 12 | expected.append("key") 13 | assert set(df.columns) == set(expected) 14 | -------------------------------------------------------------------------------- /blocks/utils.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | import tempfile 3 | 4 | import wrapt 5 | 6 | 7 | @wrapt.decorator 8 | def with_function_tmpdir(wrapped, instance, args, kwargs): 9 | with tempfile.TemporaryDirectory() as tmpdir: 10 | kwargs["tmpdir"] = tmpdir 11 | return wrapped(*args, **kwargs) 12 | 13 | 14 | @wrapt.decorator 15 | def with_session_tmpdir(wrapped, instance, args, kwargs): 16 | tmpdir = tempfile.TemporaryDirectory() 17 | kwargs["tmpdir"] = tmpdir.name 18 | atexit.register(tmpdir.cleanup) 19 | return wrapped(*args, **kwargs) 20 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | name: Python application 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | 
pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ubuntu-latest 13 | strategy: 14 | matrix: 15 | python-version: [3.6, 3.7, 3.8] 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up ${{ matrix.python-version }} 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install --upgrade .[dev] 27 | - name: Test 28 | run: | 29 | pytest 30 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = Blocks 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | coverage: Makefile 18 | rm -rf htmlcov && pipenv run pytest --cov=blocks --cov-report html ../tests 19 | 20 | # Catch-all target: route all unknown targets to Sphinx using the new 21 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 22 | %: Makefile 23 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 24 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up Python 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: '3.x' 18 | - name: Install dependencies 19 | run: | 20 | python -m pip install --upgrade pip 21 | pip install setuptools twine 22 | - name: Build 23 | run: | 24 | python setup.py sdist bdist_wheel 25 | - name: Publish 26 | env: 27 | TWINE_REPOSITORY_URL: https://upload.pypi.org/legacy/ 28 | TWINE_USERNAME: __token__ 29 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} 30 | run: | 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /tests/test_iterate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from blocks import core 4 | 5 | 6 | def test_iterate(populated): 7 | dfs = core.iterate(populated) 8 | for i in range(4): 9 | for j in range(4): 10 | cname, rname, df = next(dfs) 11 | assert cname == "c{}".format(i) 12 | assert rname == "part.{}.csv".format(j) 13 | assert df.shape == (10, 11) 14 | 15 | 16 | def test_iterate_ordered(populated): 17 | order = ["c3", "c1", "c2", "c0"] 18 | dfs = core.iterate(populated, cgroups=order) 19 | for i in range(4): 20 | for j in range(4): 21 | cname, rname, df = next(dfs) 22 | assert cname == order[i] 23 | assert rname == "part.{}.csv".format(j) 24 | assert df.shape == (10, 11) 25 | 26 | 27 | def test_iterate_axis0(populated): 28 | dfs = core.iterate(populated, axis=0) 29 | for i in range(4): 30 | rname, df = next(dfs) 31 | assert rname == "part.{}.csv".format(i) 32 | assert df.shape == (10, 41) 33 | 34 | 35 | def test_iterate_axis1(populated): 36 | dfs = core.iterate(populated, axis=1) 37 
| for i in range(4): 38 | cname, df = next(dfs) 39 | assert cname == "c{}".format(i) 40 | assert df.shape == (40, 11) 41 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from setuptools import setup, find_packages 5 | 6 | with open("README.md") as readme_file: 7 | readme = readme_file.read() 8 | 9 | requirements = [ 10 | "pandas", 11 | "fsspec>=2021.7.0", 12 | "gcsfs>=2021.7.0", 13 | "s3fs", 14 | "pyarrow", 15 | "fastavro", 16 | "wrapt", 17 | ] 18 | 19 | extras_require = { 20 | "tests": ["pytest", "pytest-cov", "delegator.py", "flake8"], 21 | "doc": ["sphinx", "numpydoc", "furo", "sphinx-copybutton"], 22 | "format": ["pre-commit"], 23 | } 24 | extras_require["dev"] = set(sum(extras_require.values(), [])) 25 | 26 | setup( 27 | name="sq-blocks", 28 | version="0.9.4", 29 | description=( 30 | "Blocks provides a simple interface to read, organize, and manipulate structured data" 31 | " in files on local and cloud storage" 32 | ), 33 | long_description=readme, 34 | long_description_content_type="text/markdown", 35 | author="Bradley Axen", 36 | author_email="baxen@squareup.com", 37 | packages=find_packages(), 38 | include_package_data=True, 39 | install_requires=requirements, 40 | extras_require=extras_require, 41 | zip_safe=False, 42 | keywords="blocks", 43 | classifiers=[ 44 | "Intended Audience :: Developers", 45 | "Natural Language :: English", 46 | "Programming Language :: Python :: 3", 47 | ], 48 | ) 49 | -------------------------------------------------------------------------------- /tests/test_divide.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import blocks 3 | 4 | 5 | def test_divide(datadir, randomdata, fs): 6 | blocks.divide(randomdata, datadir, 10, extension=".csv", filesystem=fs) 7 | assert len(fs.ls(datadir)) == 10 8 | df = blocks.assemble(datadir) 9 | assert np.isclose(df, randomdata).all().all() 10 | 11 | 12 | def test_divide_offset(datadir, randomdata, fs): 13 | blocks.divide(randomdata, datadir, 10, extension=".csv", filesystem=fs) 14 | blocks.divide(randomdata, datadir, 10, 10, extension=".csv", filesystem=fs) 15 | assert len(fs.ls(datadir)) == 20 16 | 17 | df = blocks.assemble(datadir) 18 | expected = randomdata.append(randomdata) 19 | assert np.isclose(df, expected).all().all() 20 | 21 | 22 | def test_divide_cgroups(datadir, randomdata, fs): 23 | randomdata.insert(0, "key", list(range(10))) 24 | cgroups_columns = { 25 | "cgroup1": ["key", "f0", "f1", "f2"], 26 | "cgroup2": ["key", "f3", "f4", "f5"], 27 | "cgroup3": ["key", "f6", "f7", "f8", "f9"], 28 | } 29 | 30 | blocks.divide( 31 | randomdata, 32 | datadir, 33 | 10, 34 | cgroup_columns=cgroups_columns, 35 | extension=".csv", 36 | filesystem=fs, 37 | ) 38 | assert len(fs.ls(datadir)) == 3 39 | for subdir in fs.ls(datadir): 40 | assert len(fs.ls(subdir)) == 10 41 | df = blocks.assemble(datadir) 42 | assert np.isclose(df, randomdata).all().all() 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | 
lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | .vscode/ 107 | -------------------------------------------------------------------------------- /tests/test_access.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | from blocks import core 5 | 6 | 7 | def test_expand(populated, fs): 8 | expected = [ 9 | os.path.join(populated, "c{}/part.{}.csv".format(i, j)) 10 | for i in range(4) 11 | for j in range(4) 12 | ] 13 | 14 | # All of these patterns should expand into the same set of files 15 | for ex in ["", "*/*", "**"]: 16 | ex = os.path.join(populated, ex) 17 | paths = fs.ls(ex) 18 | expanded = sorted(core._expand(paths, fs)) 19 | assert expanded == expected 20 | 21 | 22 | def test_expand_pattern(populated, fs): 23 | expected = [ 24 | os.path.join(populated, "c{}/part.{}.csv".format(i, j)) 25 | for i in range(2) 26 | for j in range(4) 27 | ] 28 | 29 | # All of these patterns should expand into the same set of files 30 | for ex in ["c[01]**", "c[01]/*"]: 31 | ex = os.path.join(populated, ex) 32 | paths = fs.ls(ex) 33 | expanded = sorted(core._expand(paths, fs)) 34 | assert expanded == expected 35 | 36 | 37 | def test_cgroups(): 38 | expanded = ["base/c{}/part.{}.csv".format(i, j) for i in range(4) for j in range(4)] 39 | cgroups = core._cgroups(expanded) 40 | for i in range(4): 41 | key = "c{}".format(i) 42 | assert key in cgroups 43 | assert cgroups[key] == ["base/c{}/part.{}.csv".format(i, j) for j in range(4)] 44 | 45 | 46 | def test_access(populated, fs, tmpdir): 47 | tmpdir = str(tmpdir) 48 | paths = fs.ls(populated) 49 | expanded = core._expand(paths, fs) 50 | cgroups = core._cgroups(expanded) 51 | cgroups = core._access(cgroups, fs, tmpdir) 52 | assert len(cgroups) == 4 53 | for c, paths in cgroups.items(): 54 | assert len(paths) == 4 55 | for path in paths: 56 | with open(path, "r") as f: 57 | assert f.read() 58 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ====== 2 | 
Blocks 3 | ====== 4 | 5 | .. image:: blocks.gif 6 | 7 | Blocks provides a simple interface to read, organize, and manipulate structured data in files 8 | on local and cloud storage 9 | 10 | Install 11 | ------------- 12 | .. code-block:: bash 13 | 14 | pip install sq-blocks 15 | 16 | Features 17 | -------- 18 | 19 | .. code-block:: python 20 | 21 | import blocks 22 | 23 | # Load one or more files with the same interface 24 | df = blocks.assemble('data.csv') 25 | train = blocks.assemble('data/*[0-7].csv') 26 | test = blocks.assemble('data/*[89].csv') 27 | 28 | # With direct support for files on GCS 29 | df = blocks.assemble('gs://mybucket/data.csv') 30 | df = blocks.assemble('gs://mybucket/data/*.csv') 31 | 32 | The interface emulates the tools you're used to from the command line, with full support for globbing and pattern 33 | matching. And blocks can handle more complicated structures as your data grows in complexity: 34 | 35 | ======================= ===================================================================== 36 | Layout Recipe 37 | ======================= ===================================================================== 38 | .. image:: both.png .. code-block:: python 39 | 40 | blocks.assemble('data/**') 41 | 42 | .. image:: column.png .. code-block:: python 43 | 44 | blocks.assemble('data/g1/*') 45 | 46 | .. image:: row.png .. code-block:: python 47 | 48 | blocks.assemble('data/*/part_01.pq') 49 | 50 | .. image:: filtered.png .. code-block:: python 51 | 52 | blocks.assemble('data/g[124]/part_01.pq') 53 | 54 | ======================= ===================================================================== 55 | 56 | 57 | 58 | .. toctree:: 59 | :hidden: 60 | 61 | quickstart 62 | examples 63 | core 64 | filesystem 65 | 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Blocks 2 | 3 | Blocks provides a simple interface to read, organize, and manipulate structured data in files 4 | on local and cloud storage. See the [documentation](https://sq-blocks.readthedocs.io) for more 5 | information. 6 | 7 | pip install sq-blocks 8 | 9 | ![blocks](docs/blocks.gif) 10 | 11 | ## Development 12 | 13 | ### Setup 14 | 15 | To install all dependencies for local development and testing, you can do 16 | 17 | pip install -e .[dev] 18 | 19 | ### Tests 20 | 21 | * `pytest` runs the unit tests 22 | 23 | To run them locally: 24 | 25 | pytest 26 | 27 | ### Continuous Integrations 28 | 29 | CI is handled through GitHub Actions, and will run non-GCS tests on 3.6, 3.7, 3.8. 30 | We may add cloud storage tests to CI soon, but for now tests should also be 31 | run locally to confirm that functionality works as well. 32 | 33 | ### Versions and Tags 34 | 35 | Use bumpversion to update the version of the package 36 | 37 | bumpversion [major|minor|patch] 38 | 39 | This will increment the version and update it both in `setup.py` and `blocks/__init__.py`. 40 | It will also automatically commit a tag with the corresponding version. 
You can push this to the repo 41 | with 42 | 43 | git push --tags 44 | 45 | ### Formatting 46 | 47 | We use pre-commit to ensure consistent formatting, to make sure you run the 48 | hooks: 49 | 50 | pre-commit install 51 | 52 | ### Docs 53 | 54 | The docs are generated from the code with 55 | [sphinx](https://www.sphinx-doc.org/en/master/), and can be tested locally: 56 | 57 | cd docs 58 | make html 59 | 60 | ## License 61 | 62 | Copyright 2018 Square, Inc. 63 | 64 | Licensed under the Apache License, Version 2.0 (the "License"); 65 | you may not use this file except in compliance with the License. 66 | You may obtain a copy of the License at 67 | 68 | http://www.apache.org/licenses/LICENSE-2.0 69 | 70 | Unless required by applicable law or agreed to in writing, software 71 | distributed under the License is distributed on an "AS IS" BASIS, 72 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 73 | See the License for the specific language governing permissions and 74 | limitations under the License. 75 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## [0.9.0] - 2021-02-25 2 | 3 | ### Removed 4 | 5 | - GCS(Native)FileSystem no longer provides store/access 6 | - GCSFileSystem are now backwards compatibility wrappers for FileSystem and will 7 | be removed in 1.0.0 8 | - No more explicit compression support, compression may still be possible 9 | through read/write args 10 | 11 | ### Added 12 | 13 | - New generic FileSystem backed by fsspec 14 | - rather than using fsspec directly we use this wrapper for better backwards 15 | compatibility and more automatic protocol handling 16 | - In theory any fsspec implementation is supported but only local and gcsfs 17 | are tested so far 18 | 19 | ### Changed 20 | - We now use paths (rather than file objects) in pandas io methods for better 21 | compatibility 22 | - All GCS operations are handled through gcsfs, which has much better 23 | performance with large numbers of files and has been more robust to connection 24 | errors 25 | - Globbing must now expand to match patterns to literal files, not directories 26 | 27 | ## [0.8.0] - 2020-10-14 28 | 29 | ### Removed 30 | 31 | - Dropped Python 2 support. 32 | - Compression on write no longer supported by Pandas 33 | 34 | ### Added 35 | 36 | - Typehints for Python 3 37 | - some missing abstract methods to the base FileSystem class definition. 38 | 39 | ## [0.7.1] - 2020-08-20 40 | 41 | ### Added 42 | 43 | `blocks.pickle` and `blocks.unpickle` utilities to save and load pickle files. 44 | 45 | ## [0.7.0] - 2020-07-22 46 | 47 | This release has minor backwards incompatible for anyone that directly used 48 | datafiles. The top level and filesystem APIs (assemble, iterate, partitioned, 49 | etc) are unchanged. 
50 | 51 | ### Added 52 | 53 | - LocalDataFile that implements datafile for local paths 54 | - GCSNativeDataFile that implements datafile for GCS paths using GCS python blob API 55 | 56 | ### Changed 57 | 58 | - The old datafile namedtuple is now an abstract base class 59 | - Datafiles now use a contextmanager for handle, which yields a file handle 60 | - Datafiles are only opened one at a time just before the data is loaded into 61 | memory 62 | - This should prevent exceeding the os open file limit with large directories 63 | - Also sets the stage for better multithreading support 64 | -------------------------------------------------------------------------------- /tests/test_filesystem.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | from uuid import uuid4 5 | 6 | TEST_STRING = b"test" 7 | 8 | 9 | def test_ls_directory(populated, fs): 10 | ex = os.path.join(populated, "") 11 | expected = [os.path.join(populated, "c{}".format(i)) for i in range(4)] 12 | 13 | # one of the fixtures has an extra file 14 | found = fs.ls(ex) 15 | if len(found) > 4: 16 | found = found[:-1] 17 | assert found == expected 18 | 19 | 20 | def test_ls_wildcard(populated, fs): 21 | ex = os.path.join(populated, "*/part.1.csv") 22 | expected = [os.path.join(populated, "c{}/part.1.csv".format(i)) for i in range(4)] 23 | assert fs.ls(ex) == expected 24 | 25 | 26 | def test_ls_double_wildcard(populated, fs): 27 | ex = os.path.join(populated, "**") 28 | expected = { 29 | os.path.join(populated, "c{}/part.{}.csv".format(i, j)) 30 | for i in range(4) 31 | for j in range(4) 32 | } 33 | assert expected.issubset(set(fs.ls(ex))) 34 | 35 | 36 | def test_ls_pattern(populated, fs): 37 | ex = os.path.join(populated, "*/part.[01].csv") 38 | expected = [ 39 | os.path.join(populated, "c{}/part.{}.csv".format(i, j)) 40 | for i in range(4) 41 | for j in range(2) 42 | ] 43 | assert fs.ls(ex) == expected 44 | 45 | 46 | def test_open_read(populated, fs): 47 | with fs.open(os.path.join(populated, "c0/part.0.csv"), "r") as f: 48 | assert f.readline() == "f0_0,f0_1,f0_2,f0_3,f0_4,f0_5,f0_6,f0_7,f0_8,f0_9,key\n" 49 | 50 | 51 | def test_open_write(temp, fs): 52 | content = str(uuid4()) 53 | path = os.path.join(temp, "content") 54 | with fs.open(path, "w") as f: 55 | f.write(content) 56 | 57 | with fs.open(path, "r") as f: 58 | assert f.read() == content 59 | 60 | 61 | def test_copy_recursive_to_local(populated, tmpdir, fs): 62 | dest = str(tmpdir) 63 | fs.cp(populated, dest, recursive=True) 64 | source = [p.replace(populated, "") for p in fs.ls(populated + "/**")] 65 | copy = fs.ls(dest + "/**") 66 | assert (s in c for s, c in zip(source, copy)) 67 | 68 | 69 | def test_copy_recursive_matched(populated, fs): 70 | dest = populated.replace("data", "copy") 71 | try: 72 | fs.cp(populated, dest, recursive=True) 73 | source = [p.replace(populated, "") for p in fs.ls(populated + "/**")] 74 | copy = fs.ls(dest + "/**") 75 | assert (s in c for s, c in zip(source, copy)) 76 | finally: 77 | fs.rm(dest, recursive=True) 78 | 79 | 80 | def test_rm(populated, fs): 81 | dest = populated.replace("data", "copy") 82 | fs.cp([os.path.join(populated, "")], dest, recursive=True) 83 | assert fs.ls(dest) 84 | fs.rm([dest], recursive=True) 85 | assert fs.ls(dest) == [] 86 | -------------------------------------------------------------------------------- /tests/test_assemble.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | from blocks import core 5 | 6 | 7 | def test_assemble(populated, fs): 8 | df = core.assemble(populated, filesystem=fs) 9 | assert df.shape == (40, 41) 10 | expected = ["f{}_{}".format(i, j) for i in range(4) for j in range(10)] 11 | expected.append("key") 12 | assert set(df.columns) == set(expected) 13 | 14 | 15 | def test_assemble_single(populated, fs): 16 | df = core.assemble(os.path.join(populated, "c0", "part.0.csv")) 17 | assert df.shape == (10, 11) 18 | 19 | 20 | def test_assemble_flat(populated, fs): 21 | df = core.assemble(os.path.join(populated, "c0"), filesystem=fs) 22 | assert df.shape == (40, 11) 23 | expected = ["f{}_{}".format(i, j) for i in range(1) for j in range(10)] 24 | expected.append("key") 25 | assert set(df.columns) == set(expected) 26 | 27 | 28 | def test_assemble_ordered(populated, fs): 29 | order = ["c2", "c1", "c3", "c0"] 30 | df = core.assemble(populated, cgroups=order, filesystem=fs) 31 | assert df.shape == (40, 41) 32 | expected = ["f{}_{}".format(order[i][1], j) for i in range(4) for j in range(10)] 33 | expected.append("key") 34 | assert set(df.columns) == set(expected) 35 | # Check the features are in the right order 36 | assert [c for c in df.columns if c != "key"] == expected[:-1] 37 | 38 | 39 | # Various options do not depend on filesystem so we can just test locally 40 | def test_assemble_filtered_cgroup(populated_local, keys): 41 | df = core.assemble(populated_local, cgroups=["c0", "c3"]) 42 | assert df.shape == (40, 21) 43 | expected = ["f{}_{}".format(i, j) for i in [0, 3] for j in range(10)] 44 | expected.append("key") 45 | assert set(df.columns) == set(expected) 46 | assert (df.key == keys).all() 47 | 48 | 49 | def test_assemble_filtered_rgroup(populated_local, keys): 50 | df = core.assemble(populated_local, rgroups=["part.0.csv", "part.1.csv"]) 51 | assert df.shape == (20, 41) 52 | expected = ["f{}_{}".format(i, j) for i in range(4) for j in range(10)] 53 | expected.append("key") 54 | assert set(df.columns) == set(expected) 55 | assert (df.key == keys[:20].reset_index(drop=True)).all() 56 | 57 | 58 | def test_assemble_read_args(populated_local, keys): 59 | read_args = {"dtype": str} 60 | df = core.assemble(populated_local, read_args=read_args) 61 | assert df.shape == (40, 41) 62 | expected = ["f{}_{}".format(i, j) for i in range(4) for j in range(10)] 63 | expected.append("key") 64 | assert set(df.columns) == set(expected) 65 | assert (df.key == keys).all() 66 | assert (df.dtypes == "object").all() 67 | 68 | 69 | def test_assemble_cgroup_args(populated_local, keys): 70 | cgroup_args = {"c0": {"dtype": str}} 71 | df = core.assemble(populated_local, cgroup_args=cgroup_args) 72 | assert df.shape == (40, 41) 73 | expected = ["f{}_{}".format(i, j) for i in range(4) for j in range(10)] 74 | expected.append("key") 75 | assert set(df.columns) == set(expected) 76 | assert (df.key == keys).all() 77 | for col in ["f0_{}".format(i) for i in range(10)]: 78 | assert df.dtypes[col] == "object" 79 | -------------------------------------------------------------------------------- /docs/examples.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Examples 3 | ======== 4 | 5 | Inspect Data 6 | ------------ 7 | 8 | You can use assemble to grab a small subset of your data 9 | 10 | .. 
code-block:: python 11 | 12 | import blocks 13 | 14 | df = blocks.assemble('data/*/part_00.pq') 15 | df.describe() 16 | 17 | 18 | This works great when dealing with data staged on GCS: 19 | 20 | .. code-block:: python 21 | 22 | import blocks 23 | 24 | df = blocks.assemble('gs://bucket/*/part_00.pq') 25 | df.describe() 26 | 27 | 28 | Large Datasets 29 | -------------- 30 | 31 | It's common to end up with a dataset that won't easily fit into memory. But you often still need to calculate 32 | aggregate statistics on that data. For example, you might need to get a unique list of categories in one of your fields. 33 | 34 | Iterate makes this easy: 35 | 36 | .. code-block:: python 37 | 38 | import blocks 39 | 40 | uniques = set() 41 | for _, _, block in blocks.iterate('data/'): 42 | uniques |= set(block['feature']) 43 | 44 | 45 | Or maybe you want to parallelize the process: 46 | 47 | .. code-block:: python 48 | 49 | import blocks 50 | from multiprocessing import Pool 51 | 52 | def unique_f1(block): 53 | return set(block[-1]['feature']) 54 | 55 | uniques_per_block = Pool(4).map(unique_f1, blocks.iterate('data/')) 56 | uniques = set().union(*uniques_per_block) 57 | 58 | 59 | And if you have dask installed, the parallelization is even easier: 60 | 61 | .. code-block:: python 62 | 63 | import blocks 64 | 65 | uniques = blocks.partitioned('data')['feature'].unique().compute() 66 | 67 | 68 | Batch Training 69 | -------------- 70 | 71 | If you're working with a tool like Keras, you might want to train a model on an iterator of batches 72 | without ever loading more than one partition into memory: 73 | 74 | .. code-block:: python 75 | 76 | import blocks 77 | 78 | def batch_generator(path, nbatch=256): 79 | for _, df in blocks.iterate(path, axis=0): 80 | while df.shape[0] >= nbatch: 81 | # Grab a sample and drop it from the original 82 | sub = df.sample(nbatch) 83 | df.drop(sub.index, inplace=True) 84 | yield sub.values 85 | 86 | model.fit_generator( 87 | generator=batch_generator('train/'), 88 | validation_data=batch_generator('validate/'), 89 | ) 90 | 91 | If you use an efficient file format like ``parquet``, this simple code will be surprisingly fast. You should make 92 | sure that you don't use multiple cgroups in a situation like this, however, because merging can slow 93 | down the process. 94 | 95 | 96 | Combining 97 | --------- 98 | 99 | If you end up with a dataset with multiple column groups, say because you grabbed your data from multiple sources, 100 | you may want to merge across those groups. However, it is expensive to do this by loading the whole dataset into memory. 101 | If you use the blocks structure you can merge each row partition separately and then save to new files. You can 102 | even subdivide those files into smaller row groups to ensure that they don't grow too large: 103 | 104 | 105 | .. code-block:: python 106 | 107 | import blocks 108 | 109 | offset = 0 110 | for _, df in blocks.iterate(path, axis=0): 111 | blocks.divide(df, 'combined/', n_rgroup=10, rgroup_offset=offset) 112 | offset += 10 113 | 114 | 115 | Filesystem 116 | ---------- 117 | 118 | Blocks provides a default filesystem that supports local files and GCS files. If you need additional functionality, 119 | you can create a custom filesystem instance: 120 | 121 | 122 | .. 
code-block:: python 123 | 124 | import blocks 125 | from blocks.filesystem import GCSFileSystem 126 | 127 | fs = GCSFileSystem() 128 | df = blocks.assemble('gs://bucket/data/', filesystem=fs) 129 | 130 | 131 | The default filesystem has support for GCS, and you can implement your own FileSystem class by 132 | inheriting from ``blocks.filesystem.FileSystem``. This can be used to extend blocks to additional 133 | cloud platforms, to support encryption/decryption, etc... 134 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import pandas as pd 4 | import numpy as np 5 | import uuid 6 | 7 | from blocks.filesystem import FileSystem 8 | from delegator import run 9 | 10 | BUCKET_GCS = "gs://blocks-example" 11 | BUCKET_S3 = "s3://blocks-example" 12 | 13 | if os.environ.get("CI"): 14 | inputs = ["local"] 15 | outputs = ["local"] 16 | temps = ["local"] 17 | else: 18 | inputs = ["local", "gcs", "gcs_extra", "s3"] 19 | outputs = ["local", "gcs", "s3"] 20 | temps = ["local", "gcs", "s3"] 21 | 22 | 23 | @pytest.fixture 24 | def fs(request): 25 | return FileSystem() 26 | 27 | 28 | @pytest.fixture(scope="function", params=temps) 29 | def temp(request, tmpdir_factory): 30 | if request.param == "local": 31 | path = str(tmpdir_factory.mktemp("temp")) 32 | yield path 33 | 34 | if request.param == "gcs": 35 | path = os.path.join(BUCKET_GCS, "temp") 36 | yield path 37 | run("gsutil rm -r {}".format(path)) 38 | 39 | if request.param == "s3": 40 | path = os.path.join(BUCKET_S3, "temp") 41 | yield path 42 | run("aws s3 rm --recursive {}".format(path)) 43 | 44 | 45 | @pytest.fixture(scope="session") 46 | def populated_local(request, tmpdir_factory): 47 | tmpdir = str(tmpdir_factory.mktemp("data")) 48 | _populate(tmpdir) 49 | return tmpdir 50 | 51 | 52 | # This is the same directory structure as above but parametrized on different file systems 53 | @pytest.fixture(scope="session", params=inputs) 54 | def populated(request, populated_local): 55 | if request.param == "local": 56 | yield populated_local 57 | 58 | if request.param == "gcs": 59 | path = os.path.join(BUCKET_GCS, "data1") 60 | run("gsutil cp -r {} {}".format(populated_local, path)) 61 | yield path 62 | run("gsutil rm -r {}".format(path)) 63 | 64 | if request.param == "gcs_extra": 65 | path = os.path.join(BUCKET_GCS, "data2") 66 | run("gsutil cp -r {} {}".format(populated_local, path)) 67 | # Also add an extra file 68 | run("touch extra") 69 | run("gsutil cp extra {}".format(path)) 70 | os.remove("extra") 71 | yield path 72 | run("gsutil rm -r {}".format(path)) 73 | 74 | if request.param == "s3": 75 | path = os.path.join(BUCKET_S3, "data1") 76 | run("aws s3 cp --recursive {} {}".format(populated_local, path)) 77 | yield path 78 | run("aws s3 rm --recursive {}".format(path)) 79 | 80 | 81 | @pytest.fixture(scope="session") 82 | def keys(): 83 | return pd.Series(["key{:02d}".format(i) for i in range(40)]) 84 | 85 | 86 | @pytest.fixture() 87 | def randomdata(): 88 | df = pd.DataFrame( 89 | np.random.rand(10, 10), columns=["f{}".format(i) for i in range(10)] 90 | ) 91 | return df 92 | 93 | 94 | @pytest.fixture() 95 | def datadir_local(request, tmpdir_factory): 96 | return str(tmpdir_factory.mktemp("data")) 97 | 98 | 99 | @pytest.fixture(params=outputs) 100 | def datadir(request, tmpdir_factory): 101 | output = str(uuid.uuid4()).replace("-", "") 102 | if request.param == "local": 103 | tmpdir = 
str(tmpdir_factory.mktemp("data")) 104 | yield tmpdir 105 | 106 | if request.param == "gcs": 107 | path = os.path.join(BUCKET_GCS, output) 108 | yield path 109 | run("gsutil rm -r {}".format(path)) 110 | 111 | if request.param == "s3": 112 | path = os.path.join(BUCKET_S3, output) 113 | yield path 114 | run("aws s3 rm --recursive {}".format(path)) 115 | 116 | 117 | def _populate(tmpdir): 118 | """Create a directory of blocks with 4 cgroups and 4 rgroups""" 119 | for c in range(4): 120 | cgroup = os.path.join(tmpdir, "c{}".format(c)) 121 | if not os.path.exists(cgroup): 122 | os.makedirs(cgroup) 123 | for r in range(4): 124 | df = pd.DataFrame( 125 | np.random.rand(10, 10), 126 | index=list(range(r * 10, (r + 1) * 10)), 127 | columns=["f{}_{}".format(c, i) for i in range(10)], 128 | ) 129 | df["key"] = [ 130 | "key{:02d}".format(i) for i in df.index 131 | ] # common key for merges 132 | df.to_csv(os.path.join(cgroup, "part.{}.csv".format(r)), index=False) 133 | -------------------------------------------------------------------------------- /blocks/dfio.py: -------------------------------------------------------------------------------- 1 | import os 2 | import warnings 3 | import pandas as pd 4 | 5 | try: 6 | import fastavro as avro 7 | 8 | avro_imported = True 9 | except ImportError: 10 | avro_imported = False 11 | 12 | 13 | def read_df(path, **read_args): 14 | """Read a dataframe path based on the file extension 15 | parquet, avro, csv, pickle, json 16 | 17 | Parameters 18 | ---------- 19 | path: str 20 | The path to the file holding data 21 | read_args : optional 22 | All keyword args are passed to the read function 23 | 24 | Returns 25 | ------- 26 | data : pd.DataFrame 27 | 28 | Notes 29 | ----- 30 | The read functions are taken from pandas, e.g. pd.read_csv 31 | Check the pandas doc for more information on the supported arguments 32 | """ 33 | filetype = _get_extension(path) 34 | reader = _readers[filetype] 35 | if reader == pd.read_json: 36 | # Default json file is newline delimited json records, but can be overwritten 37 | defaults = {"lines": True, "orient": "records"} 38 | defaults.update(read_args) 39 | read_args = defaults 40 | 41 | return reader(path, **read_args) 42 | 43 | 44 | def write_df(df, path, **write_args): 45 | """Write a dataframe to file based on the file extension 46 | 47 | The following formats are supported: 48 | parquet, avro, csv, pickle, json 49 | 50 | Parameters 51 | ---------- 52 | df : pd.DataFrame 53 | The dataframe to write to disk 54 | datafile : DataFile 55 | Datafile instance with the path and file handle 56 | write_args : optional 57 | All keyword args are passed to the write function 58 | 59 | Notes 60 | ----- 61 | The write functions are taken from pandas, e.g. pd.to_csv 62 | Check the pandas doc for more information on the supported arguments 63 | """ 64 | extension = _get_extension(path) 65 | write_name = _writers[extension] 66 | 67 | # Some customizations for different file types 68 | if write_name == "to_avro": 69 | return _write_avro(df, path, **write_args) 70 | 71 | if write_name == "to_parquet" and not pd.Series(df.columns).map(type).eq(str).all(): 72 | warnings.warn( 73 | "Dataframe contains non-string column names, which cannot be saved in parquet.\n" 74 | "Blocks will attempt to convert them to strings." 
75 | ) 76 | df.columns = df.columns.astype("str") 77 | 78 | if write_name == "to_json": 79 | defaults = {"lines": True, "orient": "records"} 80 | defaults.update(write_args) 81 | write_args = defaults 82 | 83 | if write_name == "to_csv": 84 | # make index=False the default for similar behaviour to other formats 85 | write_args["index"] = write_args.get("index", False) 86 | 87 | write_fn = getattr(df, write_name) 88 | write_fn(path, **write_args) 89 | 90 | 91 | def _read_avro(path, **read_args): 92 | if not avro_imported: 93 | raise ImportError( 94 | "Avro support requires fastavro.\n" 95 | "Install blocks with the [avro] option or `pip install fastavro`" 96 | ) 97 | records = [] 98 | with open(path, "rb") as f: 99 | avro_reader = avro.reader(f) 100 | for record in avro_reader: 101 | records.append(record) 102 | return pd.DataFrame.from_dict(records) 103 | 104 | 105 | def _write_avro(df, path, **write_args): 106 | if not avro_imported: 107 | raise ImportError( 108 | "Avro support requires fastavro.\n" 109 | "Install blocks with the [avro] option or `pip install fastavro`" 110 | ) 111 | schema = None 112 | schema_path = None 113 | try: 114 | schema = write_args["schema"] 115 | except KeyError: 116 | try: 117 | schema_path = write_args["schema_path"] 118 | except KeyError: 119 | raise Exception( 120 | "You must provide a schema or schema path when writing to Avro" 121 | ) 122 | if schema is None: 123 | schema = avro.schema.load_schema(schema_path) 124 | records = df.to_dict("records") 125 | with open(path, "wb") as f: 126 | avro.writer(f, schema, records) 127 | 128 | 129 | def _get_extension(path): 130 | name, ext = os.path.splitext(path) 131 | return ext 132 | 133 | 134 | _readers = { 135 | ".pq": pd.read_parquet, 136 | ".parquet": pd.read_parquet, 137 | ".csv": pd.read_csv, 138 | ".pkl": pd.read_pickle, 139 | ".avro": _read_avro, 140 | ".json": pd.read_json, 141 | } 142 | 143 | 144 | _writers = { 145 | ".pq": "to_parquet", 146 | ".parquet": "to_parquet", 147 | ".csv": "to_csv", 148 | ".pkl": "to_pickle", 149 | ".avro": "to_avro", 150 | ".json": "to_json", 151 | } 152 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Blocks documentation build configuration file, created by 4 | # sphinx-quickstart on Mon Nov 27 14:57:51 2017. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # 19 | import os 20 | import sys 21 | 22 | sys.path.insert(0, os.path.abspath("../..")) 23 | 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 
34 | extensions = [ 35 | "sphinx.ext.autodoc", 36 | "sphinx.ext.autosummary", 37 | "sphinx.ext.mathjax", 38 | "sphinx.ext.viewcode", 39 | "sphinx_copybutton", 40 | "numpydoc", 41 | ] 42 | autodoc_typehints = "none" 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ["_templates"] 46 | 47 | # The suffix(es) of source filenames. 48 | # You can specify multiple suffix as a list of string: 49 | # 50 | 51 | # source_suffix = ['.rst', '.md'] 52 | source_suffix = ".rst" 53 | 54 | # The master toctree document. 55 | master_doc = "index" 56 | 57 | # General information about the project. 58 | project = u"Blocks" 59 | copyright = u"2018 Square, Inc." 60 | author = u"Bradley Axen" 61 | 62 | # The version info for the project you're documenting, acts as replacement for 63 | # |version| and |release|, also used in various other places throughout the 64 | # built documents. 65 | # 66 | # The short X.Y version. 67 | version = u"0.9.4" 68 | # The full version, including alpha/beta/rc tags. 69 | release = u"0.9.4" 70 | 71 | # The language for content autogenerated by Sphinx. Refer to documentation 72 | # for a list of supported languages. 73 | # 74 | # This is also used if you do content translation via gettext catalogs. 75 | # Usually you set "language" from the command line for these cases. 76 | language = None 77 | 78 | # List of patterns, relative to source directory, that match files and 79 | # directories to ignore when looking for source files. 80 | # This patterns also effect to html_static_path and html_extra_path 81 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 82 | 83 | # The name of the Pygments (syntax highlighting) style to use. 84 | pygments_style = "sphinx" 85 | 86 | 87 | # -- Options for HTML output ---------------------------------------------- 88 | 89 | # The theme to use for HTML and HTML Help pages. See the documentation for 90 | # a list of builtin themes. 91 | # 92 | 93 | html_theme = "furo" 94 | html_title = project 95 | html_last_updated_fmt = "%b %d, %Y" 96 | 97 | # Add any paths that contain custom static files (such as style sheets) here, 98 | # relative to this directory. They are copied after the builtin static files, 99 | # so a file named "default.css" will overwrite the builtin "default.css". 100 | html_static_path = [] 101 | 102 | 103 | # Theme options are theme-specific and customize the look and feel of a theme 104 | # further. For a list of options available for each theme, see the 105 | # documentation. 106 | # 107 | # html_theme_options = {} 108 | 109 | # Add any paths that contain custom static files (such as style sheets) here, 110 | # relative to this directory. They are copied after the builtin static files, 111 | # so a file named "default.css" will overwrite the builtin "default.css". 112 | # html_static_path = ["_static"] 113 | 114 | 115 | # -- Options for HTMLHelp output ------------------------------------------ 116 | 117 | # Output file base name for HTML help builder. 118 | htmlhelp_basename = "blocksdoc" 119 | 120 | 121 | # -- Options for LaTeX output --------------------------------------------- 122 | 123 | latex_elements = { 124 | # The paper size ('letterpaper' or 'a4paper'). 125 | # 126 | # 'papersize': 'letterpaper', 127 | # The font size ('10pt', '11pt' or '12pt'). 128 | # 129 | # 'pointsize': '10pt', 130 | # Additional stuff for the LaTeX preamble. 
131 | # 132 | # 'preamble': '', 133 | # Latex figure (float) alignment 134 | # 135 | # 'figure_align': 'htbp', 136 | } 137 | 138 | # Grouping the document tree into LaTeX files. List of tuples 139 | # (source start file, target name, title, 140 | # author, documentclass [howto, manual, or own class]). 141 | latex_documents = [ 142 | (master_doc, "Blocks.tex", u"Blocks Documentation", u"Bradley Axen", "manual"), 143 | ] 144 | 145 | 146 | # -- Options for manual page output --------------------------------------- 147 | 148 | # One entry per manual page. List of tuples 149 | # (source start file, name, description, authors, manual section). 150 | man_pages = [(master_doc, "blocks", u"Blocks Documentation", [author], 1)] 151 | 152 | 153 | # -- Options for Texinfo output ------------------------------------------- 154 | 155 | # Grouping the document tree into Texinfo files. List of tuples 156 | # (source start file, target name, title, author, 157 | # dir menu entry, description, category) 158 | texinfo_documents = [ 159 | ( 160 | master_doc, 161 | "Blocks", 162 | u"Blocks Documentation", 163 | author, 164 | "Blocks", 165 | "One line description of project.", 166 | "Miscellaneous", 167 | ), 168 | ] 169 | 170 | 171 | # -- Autodoc -- 172 | # autodoc_typehints = "description" 173 | -------------------------------------------------------------------------------- /blocks/filesystem/base.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Sequence, Tuple, List 2 | from collections import defaultdict 3 | from fsspec.core import split_protocol, get_filesystem_class, has_magic 4 | 5 | 6 | class FileSystem: 7 | """Filesystem for manipulating files in the cloud 8 | 9 | This supports operations on local files and any other protocol supported by fsspec. 10 | This is a wrapper to fsspec which provides backwards compatibility for blocks filesystems 11 | and a simplified interface. 12 | 13 | Parameters 14 | ---------- 15 | storage_options: Mapping[str, Mapping[str, Any]] 16 | Additional options passed to each filesystem for each protocol 17 | e.g. 
{'gs': {'project': 'example'}} to set the gs filesytem project to example 18 | """ 19 | 20 | def __init__(self, **storage_options): 21 | self.storage_options = defaultdict(dict) 22 | self.storage_options.update(storage_options) 23 | self.storage_options[None]["auto_mkdir"] = True 24 | self.filesystems = {} 25 | 26 | def _get_protocol_path(self, urlpath) -> Tuple[str, List[str]]: 27 | if isinstance(urlpath, str): 28 | return split_protocol(urlpath) 29 | 30 | protocols, paths = zip(*map(split_protocol, urlpath)) 31 | assert ( 32 | len(set(protocols)) == 1 33 | ), "Cannot mix file protocols in a single operation" 34 | return protocols[0], list(paths) 35 | 36 | def _get_filesystem(self, protocol): 37 | if protocol not in self.filesystems: 38 | self.filesystems[protocol] = get_filesystem_class(protocol)( 39 | **self.storage_options[protocol] 40 | ) 41 | return self.filesystems[protocol] 42 | 43 | def ls(self, path: str) -> Sequence[str]: 44 | """List files correspond to path, including glob wildcards 45 | 46 | Parameters 47 | ---------- 48 | path : str 49 | The path to the file or directory to list; supports wildcards 50 | """ 51 | protocol, path = self._get_protocol_path(path) 52 | fs = self._get_filesystem(protocol) 53 | try: 54 | if has_magic(path): 55 | output = fs.glob(path) 56 | else: 57 | output = fs.ls(path) 58 | # TODO fix in base 59 | except FileNotFoundError: 60 | return [] 61 | except NotADirectoryError: 62 | return [path] 63 | 64 | if protocol is not None: 65 | output = ["://".join([protocol, path]) for path in output] 66 | return sorted(output) 67 | 68 | def copy( 69 | self, 70 | sources: Union[str, Sequence[str]], 71 | dest: Union[str, Sequence[str]], 72 | recursive=False, 73 | ): 74 | """Copy the files in sources to dest 75 | 76 | Parameters 77 | ---------- 78 | sources : list of str 79 | The list of paths to copy 80 | dest : str 81 | The destination(s) for the copy of source(s) 82 | recursive : bool 83 | If true, recursively copy any directories 84 | """ 85 | if isinstance(sources, str): 86 | sources = [sources] 87 | 88 | protocol_source, sources = self._get_protocol_path(sources) 89 | protocol_dest, dest = self._get_protocol_path(dest) 90 | 91 | if protocol_source == protocol_dest: 92 | fs = self._get_filesystem(protocol_source) 93 | 94 | # Temporary workaround for a bug in gcsfs 95 | if protocol_source == "gs" and recursive: 96 | sources = fs.expand_path(sources, recursive=True) 97 | sources = ["gs://" + s for s in sources if not fs.isdir(s)] 98 | return self.copy(sources, "gs://" + dest, recursive=False) 99 | 100 | fs.copy(sources, dest, recursive=recursive) 101 | 102 | elif protocol_source is None: 103 | fs = self._get_filesystem(protocol_dest) 104 | fs.put(sources, dest, recursive=recursive) 105 | 106 | elif protocol_dest is None: 107 | fs = self._get_filesystem(protocol_source) 108 | fs.get(sources, dest, recursive=recursive) 109 | 110 | elif protocol_dest is not None and protocol_source is not None: 111 | raise NotImplementedError( 112 | "Cannot do direct copy between two different cloud filesystems" 113 | ) 114 | 115 | if protocol_dest == "gs": 116 | # Make sure we invalidate the gcsfs cache since we have added new files 117 | if isinstance(dest, str): 118 | fs.invalidate_cache(dest) 119 | else: 120 | for d in dest: 121 | fs.invalidate_cache(d) 122 | 123 | def remove(self, paths: Union[str, List[str]], recursive: bool = False): 124 | """Remove the files at paths 125 | 126 | Parameters 127 | ---------- 128 | paths : list of str 129 | The paths to remove 130 | recursive : 
bool, default False 131 | If true, recursively remove any directories 132 | """ 133 | protocol, paths = self._get_protocol_path(paths) 134 | fs = self._get_filesystem(protocol) 135 | 136 | if protocol is None and not isinstance(paths, str): 137 | # TODO should local not just handle this? 138 | for path in paths: 139 | fs.rm(path, recursive=recursive) 140 | else: 141 | return fs.rm(paths, recursive=recursive) 142 | 143 | def open(self, path: str, mode="rb", **kwargs): 144 | """Return a file-like object from the filesystem 145 | 146 | The resultant instance must function correctly in a context ``with`` 147 | block. 148 | 149 | Parameters 150 | ---------- 151 | path: str 152 | Target file 153 | mode: str like 'rb', 'w' 154 | See builtin ``open()`` 155 | kwargs: 156 | Forwarded to the filesystem implementation 157 | """ 158 | protocol, path = self._get_protocol_path(path) 159 | fs = self._get_filesystem(protocol) 160 | return fs.open(path, mode, **kwargs) 161 | 162 | def isdir(self, path: str): 163 | """Check if the path is a directory""" 164 | protocol, path = self._get_protocol_path(path) 165 | fs = self._get_filesystem(protocol) 166 | return fs.isdir(path) 167 | 168 | def mkdir(self, path: str): 169 | """Make directory at path""" 170 | protocol, path = self._get_protocol_path(path) 171 | fs = self._get_filesystem(protocol) 172 | return fs.mkdir(path) 173 | 174 | # Aliases 175 | cp = copy 176 | rm = remove 177 | -------------------------------------------------------------------------------- /docs/quickstart.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Quickstart 3 | ========== 4 | 5 | Layout 6 | ------ 7 | 8 | In the simplest case, you might want to read your data from a single file. This is pretty easy in 9 | pandas, but blocks adds additional support for inferring file types and support cloud storage: 10 | 11 | .. code-block:: python 12 | 13 | import pandas as pd 14 | import blocks 15 | df = blocks.assemble('data.pkl') # same as pd.read_pickle 16 | df = blocks.assemble('gs://mybucket/data.parquet') 17 | 18 | Many projects need to combine data stored in several files. To support this, blocks makes a few 19 | assumptions about your data. You've split it up into blocks, either into groups of columns (cgroups) 20 | or groups of rows (rgroups). You can read all this data into a single dataframe in memory with one 21 | command: 22 | 23 | .. code-block:: python 24 | 25 | import blocks 26 | blocks.assemble('data/') 27 | 28 | If all of your files are in one directory, then the rows will be concatenated: 29 | 30 | :: 31 | 32 | data 33 | ├── part.00.pq 34 | ├── part.01.pq 35 | └── part.02.pq 36 | 37 | 38 | 39 | If your files actually contain the same rows but store different columns, you should place them in different folders with corresponding names: 40 | 41 | :: 42 | 43 | data 44 | ├── g0 45 | │   └── part.00.pq 46 | ├── g1 47 | │   └── part.00.pq 48 | └── g2 49 | └── part.00.pq 50 | 51 | 52 | In the most general case you can do both, laying out your data in multiple cgroups and rgroups - where each rgroup should contain the same 53 | logical rows (e.g. 
different attributes of the same event) 54 | 55 | :: 56 | 57 | ─ data 58 | ├── g0 59 | │   ├── part.00.pq 60 | │   ├── part.01.pq 61 | │   ├── part.02.pq 62 | │   └── part.03.pq 63 | ├── g1 64 | │   ├── part.00.pq 65 | │   ├── part.01.pq 66 | │   ├── part.02.pq 67 | │   └── part.03.pq 68 | ├── g2 69 | │   ├── part.00.pq 70 | │   ├── part.01.pq 71 | │   ├── part.02.pq 72 | │   └── part.03.pq 73 | └── g3 74 | ├── part.00.pq 75 | ├── part.01.pq 76 | ├── part.02.pq 77 | └── part.03.pq 78 | 79 | This corresponds to the following dataframe structure: 80 | 81 | .. image:: blocks_layout.png 82 | 83 | 84 | This pattern generalizes well once you start collecting data from multiple sources, or once the 85 | dataset grows too large to fit comfortably into memory at once. 86 | 87 | Blocks supports multiple data formats, including ``csv``, ``hdf5``, ``pickle``, and ``parquet``. Reads from these files 88 | are handled by ``pandas`` libraries, so they support all of the options you expect like headers, index columns, etc. 89 | All of the ``blocks`` interfaces below support passing keyword args to the read functions for the files (see the docstrings). 90 | The files can be local (referenced by normal paths) or on GCS (referenced by paths like ``gs://bucket``). 91 | 92 | **Note that rgroups are combined by simple concatenation, and cgroups are combined by a "natural join" (an inner merge by default): 93 | any shared columns are considered join keys.** Key-based merging only makes sense with named columns, so make sure 94 | any CSVs you use have a column header if you want to join cgroups. 95 | 96 | Read 97 | ---- 98 | 99 | Assemble 100 | ======== 101 | 102 | Assemble is the primary data reading command, and can handle any of the layouts above. You can select subsets of the data 103 | using glob patterns or the ``cgroups`` and ``rgroups`` arguments: 104 | 105 | 106 | ======================= ===================================================================== 107 | Layout Recipe 108 | ======================= ===================================================================== 109 | .. image:: both.png .. code-block:: python 110 | 111 | blocks.assemble('data/') 112 | 113 | .. image:: column.png .. code-block:: python 114 | 115 | blocks.assemble('data/g1/*') 116 | # or 117 | blocks.assemble('data/', cgroups=['g1']) 118 | 119 | .. image:: row.png .. code-block:: python 120 | 121 | blocks.assemble('data/*/part.01.pq') 122 | # or 123 | blocks.assemble('data/', rgroups=['part.01.pq']) 124 | 125 | .. image:: filtered.png .. code-block:: python 126 | 127 | blocks.assemble('data/*/part.01.pq', cgroups=['g0', 'g1', 'g3']) 128 | # or 129 | blocks.assemble( 130 | 'data/', 131 | rgroups=['part.01.pq'], 132 | cgroups=['g0', 'g1', 'g3'] 133 | ) 134 | 135 | ======================= ===================================================================== 136 | 137 | Iterate 138 | ======= 139 | 140 | Blocks also has an iterative option for performing operations on each of the blocks without loading them all 141 | into memory at once: 142 | 143 | .. code-block:: python 144 | 145 | import blocks 146 | 147 | for cgroup, rgroup, df in blocks.iterate('data/'): 148 | print(df.shape) 149 | 150 | 151 | ``iterate`` supports the same syntax and features as ``assemble`` above, but instead of returning a merged dataframe, 152 | it returns an iterator of ``(cgroup, rgroup, dataframe)``, where ``cgroup`` and ``rgroup`` are the names of the 153 | groups (``'g0'`` and ``'part.00.pq'`` from above).
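For example, you can compute a summary per block without ever loading the full dataset, and filter to specific groups at the same time (a small sketch; it assumes the ``data/`` layout shown above):

.. code-block:: python

    import blocks

    # shape of every block in two of the cgroups, holding one block in memory at a time
    shapes = {}
    for cgroup, rgroup, df in blocks.iterate('data/', cgroups=['g0', 'g1']):
        shapes[(cgroup, rgroup)] = df.shape

    print(shapes)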
154 | 155 | 156 | ``iterate`` can also operate on multiple axes - the default is to iterate over every block separately. But if you 157 | specify ``axis=0``, then iterate will combine cgroups and iterate over rgroups, and for ``axis=1`` it will iterate 158 | over the cgroups while combining any rgroups. 159 | 160 | ========================== ===================================================================== 161 | Direction Recipe 162 | ========================== ===================================================================== 163 | .. image:: row_iter.png .. code-block:: python 164 | 165 | # iterate over one dataframe per rgroup 166 | for rgroup, df in blocks.iterate('gs://path/to/data', axis=0): 167 | print(df.shape) 168 | .. image:: column_iter.png .. code-block:: python 169 | 170 | # iterate over one dataframe per cgroup 171 | for cgroup, df in blocks.iterate('gs://path/to/data', axis=1): 172 | print(df.shape) 173 | ========================== ===================================================================== 174 | 175 | 176 | Partitioned 177 | =========== 178 | 179 | Dask_ provides a great interface to a partitioned dataframe, and you can use blocks' simple syntax to 180 | build a ``dask.dataframe``. Check out the dask documentation for details on how to use the resulting object. 181 | 182 | .. code-block:: python 183 | 184 | import blocks 185 | 186 | # requires dask to be installed separately 187 | dask_df = blocks.partitioned('data/*/part_0[1-4].pq') 188 | 189 | dask_df.groupby('category').mean().compute() 190 | 191 | 192 | Write 193 | ----- 194 | 195 | Place 196 | ===== 197 | 198 | If you want to put a dataframe into a single file, use ``place``: 199 | 200 | .. code-block:: python 201 | 202 | import blocks 203 | 204 | blocks.place(df, 'data/part_00.pq') 205 | blocks.place(df, 'gs://mybucket/data/part_00.pq') 206 | 207 | Like with ``assemble`` for a single file, this is easy in ``pandas``, but ``blocks`` infers the file 208 | type and has support for cloud storage. 209 | 210 | Divide 211 | ====== 212 | 213 | For partitioning your data, blocks also has a divide function. You'd use this to split up a single large dataframe 214 | in memory into many rgroups and/or cgroups on disk, to help with parallelizing analysis. By default the blocks are 215 | written as ``parquet`` files, but you can specify other extensions including ``.hdf5``, ``.csv``, and ``.pkl``. 216 | 217 | .. code-block:: python 218 | 219 | import blocks 220 | 221 | # divide into just row groups 222 | blocks.divide(df, 'data/', n_rgroup=3) 223 | 224 | :: 225 | 226 | data 227 | ├── part_00.pq 228 | ├── part_01.pq 229 | └── part_02.pq 230 | 231 | Divide can also handle column groups: 232 | 233 | ..
code-block:: python 234 | 235 | # split into 10 rgroups and specific cgroups 236 | cgroup_columns = { 237 | 'g0': ['id', 'timestamp', 'metadata'], 238 | 'g1': ['id', 'timestamp', 'feature0', 'feature1'], 239 | 'g2': ['id', 'timestamp', 'feature2', 'feature3'], 240 | 'g3': ['id', 'timestamp', 'feature4', 'feature5', 'feature6'], 241 | } 242 | blocks.divide(df, 'data/', 4, cgroup_columns=cgroup_columns) 243 | 244 | :: 245 | 246 | ─ data 247 | ├── g0 248 | │   ├── part.00.pq 249 | │   ├── part.01.pq 250 | │   ├── part.02.pq 251 | │   └── part.03.pq 252 | ├── g1 253 | │   ├── part.00.pq 254 | │   ├── part.01.pq 255 | │   ├── part.02.pq 256 | │   └── part.03.pq 257 | ├── g2 258 | │   ├── part.00.pq 259 | │   ├── part.01.pq 260 | │   ├── part.02.pq 261 | │   └── part.03.pq 262 | └── g3 263 | ├── part.00.pq 264 | ├── part.01.pq 265 | ├── part.02.pq 266 | └── part.03.pq 267 | 268 | 269 | .. _Dask: http://dask.pydata.org/en/latest/ 270 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | -------------------------------------------------------------------------------- /blocks/core.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | import warnings 6 | import pickle as _pickle 7 | 8 | from functools import reduce 9 | from collections import defaultdict, OrderedDict 10 | from typing import ( 11 | Optional, 12 | Sequence, 13 | Any, 14 | Dict, 15 | Iterator, 16 | Tuple, 17 | DefaultDict, 18 | List, 19 | Iterable, 20 | Union, 21 | ) 22 | 23 | from blocks.filesystem import FileSystem 24 | from blocks.utils import with_function_tmpdir, with_session_tmpdir 25 | from blocks.dfio import read_df, write_df 26 | 27 | cgroup = str 28 | rgroup = str 29 | 30 | 31 | @with_function_tmpdir 32 | def assemble( 33 | path: str, 34 | cgroups: Optional[Sequence[cgroup]] = None, 35 | rgroups: Optional[Sequence[rgroup]] = None, 36 | read_args: Any = {}, 37 | cgroup_args: Dict[cgroup, Any] = {}, 38 | merge: str = "inner", 39 | filesystem: FileSystem = FileSystem(), 40 | tmpdir: str = None, 41 | ) -> pd.DataFrame: 42 | """Assemble multiple dataframe blocks into a single frame 43 | 44 | Each file included in the path (or subdirs of that path) is combined into 45 | a single dataframe by first concatenating over row groups and then merging 46 | over column groups. A row group is a subset of rows of the data stored in 47 | different files. A column group is a subset of columns of the data stored in 48 | different folders. The merges are performed in the order of listed cgroups if 49 | provided, otherwise in alphabetic order. Files are opened by a method inferred 50 | from their extension. 51 | 52 | Parameters 53 | ---------- 54 | path : str 55 | The glob-able path to all data files to assemble into a frame 56 | e.g. 
gs://example/*/*, gs://example/*/part.0.pq, gs://example/c[1-2]/* 57 | See the README for a more detailed explanation 58 | cgroups : list of str, optional 59 | The list of cgroups (folder names) to include from the glob path 60 | rgroups : list of str, optional 61 | The list of rgroups (file names) to include from the glob path 62 | read_args : dict, optional 63 | Any additional keyword args to pass to the read function 64 | cgroup_args : {cgroup: kwargs}, optional 65 | Any cgroup specific read arguments, where each key is the name 66 | of the cgroup and each value is a dictionary of keyword args 67 | merge : one of 'left', 'right', 'outer', 'inner', default 'inner' 68 | The merge strategy to pass to pandas.merge 69 | filesystem : blocks.filesystem.FileSystem or similar 70 | A filesystem object that implements the blocks.FileSystem API 71 | 72 | Returns 73 | ------- 74 | data : pd.DataFrame 75 | The combined dataframe from all the blocks 76 | 77 | """ 78 | grouped = _collect(path, cgroups, rgroups, filesystem, tmpdir) 79 | 80 | # ---------------------------------------- 81 | # Concatenate all rgroups 82 | # ---------------------------------------- 83 | frames = [] 84 | 85 | for group in grouped: 86 | files = grouped[group] 87 | args = read_args.copy() 88 | if group in cgroup_args: 89 | args.update(cgroup_args[group]) 90 | frames.append(pd.concat(read_df(f, **args) for f in files)) 91 | 92 | # ---------------------------------------- 93 | # Merge all cgroups 94 | # ---------------------------------------- 95 | df = _merge_all(frames, merge=merge) 96 | return df 97 | 98 | 99 | @with_function_tmpdir 100 | def iterate( 101 | path: str, 102 | axis: int = -1, 103 | cgroups: Optional[Sequence[cgroup]] = None, 104 | rgroups: Optional[Sequence[rgroup]] = None, 105 | read_args: Any = {}, 106 | cgroup_args: Dict[cgroup, Any] = {}, 107 | merge: str = "inner", 108 | filesystem: FileSystem = FileSystem(), 109 | tmpdir: str = None, 110 | ) -> Union[ 111 | Iterator[Tuple[cgroup, rgroup, pd.DataFrame]], Iterator[Tuple[str, pd.DataFrame]] 112 | ]: 113 | """Iterate over dataframe blocks 114 | 115 | Each file included in the path (or subdirs of that path) is opened as a 116 | dataframe and returned in a generator of (cname, rname, dataframe). 117 | Files are opened by a method inferred from their extension. 118 | 119 | Parameters 120 | ---------- 121 | path : str 122 | The glob-able path to all files to assemble into a frame 123 | e.g.
gs://example/*/*, gs://example/*/part.0.pq, gs://example/c[1-2]/* 124 | See the README for a more detailed explanation 125 | axis : int, default -1 126 | The axis to iterate along 127 | If -1 (the default), iterate over both columns and rows 128 | If 0, iterate over the rgroups, combining any cgroups 129 | If 1, iterate over the cgroups, combining any rgroups 130 | cgroups : list of str, or {str: args} optional 131 | The list of cgroups (folder names) to include from the glob path 132 | rgroups : list of str, optional 133 | The list of rgroups (file names) to include from the glob path 134 | read_args : dict, optional 135 | Any additional keyword args to pass to the read function 136 | cgroup_args : {cgroup: kwargs}, optional 137 | Any cgroup specific read arguments, where each key is the name 138 | of the cgroup and each value is a dictionary of keyword args 139 | merge : one of 'left', 'right', 'outer', 'inner', default 'inner' 140 | The merge strategy to pass to pandas.merge, only used when axis=0 141 | filesystem : blocks.filesystem.FileSystem or similar 142 | A filesystem object that implements the blocks.FileSystem API 143 | 144 | Returns 145 | ------- 146 | data : generator 147 | A generator of (cname, rname, dataframe) for each collected path 148 | If axis=0, yields (rname, dataframe) 149 | If axis=1, yields (cname, dataframe) 150 | 151 | """ 152 | grouped = _collect(path, cgroups, rgroups, filesystem, tmpdir) 153 | 154 | if axis == -1: 155 | for cgroup in grouped: 156 | args = read_args.copy() 157 | if cgroup in cgroup_args: 158 | args.update(cgroup_args[cgroup]) 159 | for path in grouped[cgroup]: 160 | yield _cname(path), _rname(path), read_df(path, **args) 161 | 162 | elif axis == 0: 163 | # find the shared files among all subfolders 164 | rgroups = _shared_rgroups(grouped) 165 | 166 | for rgroup in sorted(rgroups): 167 | frames = [] 168 | for cgroup in grouped: 169 | path = next(d for d in grouped[cgroup] if _rname(d) == rgroup) 170 | 171 | args = read_args.copy() 172 | if cgroup in cgroup_args: 173 | args.update(cgroup_args[cgroup]) 174 | frames.append(read_df(path, **args)) 175 | yield rgroup, _merge_all(frames, merge=merge) 176 | 177 | elif axis == 1: 178 | for cgroup in grouped: 179 | files = grouped[cgroup] 180 | args = read_args.copy() 181 | if cgroup in cgroup_args: 182 | args.update(cgroup_args[cgroup]) 183 | yield cgroup, pd.concat(read_df(path, **args) for path in files) 184 | 185 | else: 186 | raise ValueError("Invalid choice for axis, options are -1, 0, 1") 187 | 188 | 189 | @with_session_tmpdir 190 | def partitioned( 191 | path: str, 192 | cgroups: Sequence[cgroup] = None, 193 | rgroups: Sequence[rgroup] = None, 194 | read_args: Any = {}, 195 | cgroup_args: Dict[cgroup, Any] = {}, 196 | merge: str = "inner", 197 | filesystem: FileSystem = FileSystem(), 198 | tmpdir: str = None, 199 | ): 200 | """Return a partitioned dask dataframe, where each partition is a row group 201 | 202 | The results are the same as iterate with axis=0, except that it returns a dask dataframe 203 | instead of a generator. Note that this requires dask to be installed 204 | 205 | Parameters 206 | ---------- 207 | path : str 208 | The glob-able path to all files to assemble into a frame 209 | e.g. 
gs://example/*/*, gs://example/*/part.0.pq, gs://example/c[1-2]/* 210 | See the README for a more detailed explanation 211 | cgroups : list of str, or {str: args} optional 212 | The list of cgroups (folder names) to include from the glob path 213 | rgroups : list of str, optional 214 | The list of rgroups (file names) to include from the glob path 215 | read_args : dict, optional 216 | Any additional keyword args to pass to the read function 217 | cgroup_args : {cgroup: kwargs}, optional 218 | Any cgroup specific read arguments, where each key is the name 219 | of the cgroup and each value is a dictionary of keyword args 220 | merge : one of 'left', 'right', 'outer', 'inner', default 'inner' 221 | The merge strategy to pass to pandas.merge, only used when axis=0 222 | filesystem : blocks.filesystem.FileSystem or similar 223 | A filesystem object that implements the blocks.FileSystem API 224 | 225 | Returns 226 | ------- 227 | data : dask.dataframe 228 | A dask dataframe partitioned by row groups, with all cgroups merged 229 | 230 | """ 231 | try: 232 | import dask 233 | import dask.dataframe as dd 234 | except ImportError: 235 | raise ImportError("Partitioned requires dask[dataframe] to be installed") 236 | 237 | grouped = _collect(path, cgroups, rgroups, filesystem, tmpdir) 238 | blocks = [] 239 | 240 | @dask.delayed() 241 | def merged(rgroup): 242 | frames = [] 243 | for cgroup in grouped: 244 | p = next(p for p in grouped[cgroup] if os.path.basename(p) == rgroup) 245 | args = read_args.copy() 246 | if cgroup in cgroup_args: 247 | args.update(cgroup_args[cgroup]) 248 | frames.append(read_df(p, **args)) 249 | return _merge_all(frames, merge=merge) 250 | 251 | for rgroup in _shared_rgroups(grouped): 252 | blocks.append(merged(rgroup)) 253 | return dd.from_delayed(blocks) 254 | 255 | 256 | @with_function_tmpdir 257 | def place( 258 | df: pd.DataFrame, 259 | path: str, 260 | filesystem: FileSystem = FileSystem(), 261 | tmpdir: str = None, 262 | **write_args, 263 | ) -> None: 264 | """Place a dataframe block onto the filesystem at the specified path 265 | 266 | Parameters 267 | ---------- 268 | df : pd.DataFrame 269 | The data to place 270 | path : str 271 | Path to the directory (possibly on GCS) in which to place the columns 272 | write_args : dict 273 | Any additional args to pass to the write function 274 | filesystem : blocks.filesystem.FileSystem or similar 275 | A filesystem object that implements the blocks.FileSystem API 276 | 277 | """ 278 | fname = os.path.basename(path) 279 | tmp = os.path.join(tmpdir, fname) 280 | write_df(df, tmp, **write_args) 281 | filesystem.copy(tmp, path) 282 | 283 | 284 | @with_function_tmpdir 285 | def divide( 286 | df: pd.DataFrame, 287 | path: str, 288 | n_rgroup: int = 1, 289 | rgroup_offset: int = 0, 290 | cgroup_columns: Optional[Dict[Optional[cgroup], Sequence[str]]] = None, 291 | extension: str = ".pq", 292 | convert: bool = False, 293 | filesystem: FileSystem = FileSystem(), 294 | prefix=None, 295 | tmpdir: str = None, 296 | **write_args, 297 | ) -> None: 298 | """Split a dataframe into rgroups/cgroups and save to disk 299 | 300 | Note that this splitting does not preserve the original index, so make sure 301 | to have another column to track values 302 | 303 | Parameters 304 | ---------- 305 | df : pd.DataFrame 306 | The data to divide 307 | path : str 308 | Path to the directory (possibly on GCS) in which to place the columns 309 | n_rgroup : int, default 1 310 | The number of row groups to partition the data into 311 | The rgroups will have 
approximately equal sizes 312 | rgroup_offset : int, default 0 313 | The index to start from in the name of file parts 314 | e.g. If rgroup_offset=10 then the first file will be `part_00010.pq` 315 | cgroup_columns : {cgroup: list of column names} 316 | The column lists to form cgroups; if None, do not make cgroups 317 | Each key is the name of the cgroup, and each value is the list of columns to include 318 | To reassemble later make sure to include join keys for each cgroup 319 | extension : str, default .pq 320 | The file extension for the dataframe (file type is inferred from this extension) 321 | convert : bool, default False 322 | If true, attempt to coerce types to numeric. This can avoid issues with ambiguous 323 | object columns but requires additional time 324 | filesystem : blocks.filesystem.FileSystem or similar 325 | A filesystem object that implements the blocks.FileSystem API 326 | prefix : str 327 | Prefix to add to written filenames 328 | write_args : dict 329 | Any additional args to pass to the write function 330 | 331 | """ 332 | # Use a single dummy cgroup if None wanted 333 | if cgroup_columns is None: 334 | cgroup_columns = {None: df.columns} 335 | 336 | # Add leading dot if not in extension 337 | if extension[0] != ".": 338 | extension = "." + extension 339 | 340 | if convert: 341 | for col in df.columns: 342 | df[col] = pd.to_numeric(df[col], errors="ignore") 343 | 344 | files = [] 345 | for cname, columns in cgroup_columns.items(): 346 | cgroup = df[columns] 347 | 348 | bucket = os.path.join(path, cname) if cname else path 349 | tmp_cgroup = os.path.join(tmpdir, cname) if cname else tmpdir 350 | 351 | if not filesystem.isdir(tmp_cgroup): 352 | filesystem.mkdir(tmp_cgroup) 353 | 354 | rnames = [ 355 | "part_{:05d}{}".format(i + rgroup_offset, extension) 356 | for i in range(n_rgroup) 357 | ] 358 | if prefix is not None: 359 | rnames = [prefix + "_" + rn for rn in rnames] 360 | 361 | for rgroup, rname in zip(np.array_split(cgroup, n_rgroup), rnames): 362 | tmp = os.path.join(tmp_cgroup, rname) 363 | write_df(rgroup.reset_index(drop=True), tmp, **write_args) 364 | files.append((cname, rname) if cname else (rname,)) 365 | 366 | filesystem.copy( 367 | [os.path.join(tmpdir, *f) for f in files], 368 | [os.path.join(path, *f) for f in files], 369 | ) 370 | 371 | 372 | def pickle(obj: Any, path: str, filesystem: FileSystem = FileSystem()): 373 | """Save a pickle of obj at the specified path 374 | 375 | Parameters 376 | ---------- 377 | obj : Object 378 | Any pickle-compatible object 379 | path : str 380 | The path to the location to save the pickle file, supports GCS paths 381 | filesystem : blocks.filesystem.FileSystem or similar 382 | A filesystem object that implements the blocks.FileSystem API 383 | """ 384 | with filesystem.open(path, "wb") as f: 385 | _pickle.dump(obj, f) 386 | 387 | 388 | def unpickle(path: str, filesystem: FileSystem = FileSystem()): 389 | """Load an object from the pickle file at path 390 | 391 | Parameters 392 | ---------- 393 | path : str 394 | The path to the location of the saved pickle file, supports GCS paths 395 | filesystem : blocks.filesystem.FileSystem or similar 396 | A filesystem object that implements the blocks.FileSystem API 397 | 398 | Returns the unpickled object 399 | """ 400 | with filesystem.open(path, "rb") as f: 401 | return _pickle.load(f) 402 | 403 | 404 | def _collect( 405 | path: str, 406 | cgroups: Optional[Sequence[cgroup]], 407 | rgroups: Optional[Sequence[rgroup]], 408 | filesystem: FileSystem, 409 | tmpdir: str,
410 | ) -> Dict[cgroup, Sequence[str]]: 411 | """Collect paths into cgroups and download any GCS files for local access 412 | 413 | Parameters 414 | ---------- 415 | path : str 416 | The glob-able path to all files to assemble into a frame 417 | e.g. gs://example/*/*, gs://example/*/part.0.pq, gs://example/c[1-2]/* 418 | See the README for a more detailed explanation 419 | cgroups : list of str, optional 420 | The list of cgroups (folder names) to include from the glob path 421 | rgroups : list of str, optional 422 | The list of rgroups (file names) to include from the glob path 423 | filesystem : blocks.filesystem.FileSystem or similar 424 | A filesystem object that implements the blocks.FileSystem API 425 | tmpdir : str 426 | The path of a temporary directory to use for copies of files 427 | 428 | Returns 429 | ------- 430 | grouped : {cgroup: list of paths} 431 | Paths to local copies of the data, grouped by cgroup 432 | 433 | """ 434 | # ---------------------------------------- 435 | # Collect paths into cgroups 436 | # ---------------------------------------- 437 | paths = filesystem.ls(path) 438 | if not paths: 439 | raise ValueError(f"Did not find any files at the path: {path}") 440 | expanded = _expand(paths, filesystem) 441 | filtered = _filter(expanded, cgroups, rgroups) 442 | grouped = _cgroups(filtered) 443 | accessed = _access(grouped, filesystem, tmpdir) 444 | 445 | # Go in specified cgroup order, or alphabetical if not specified 446 | if cgroups is None: 447 | cgroups = sorted(grouped.keys()) 448 | 449 | return OrderedDict((k, accessed[k]) for k in cgroups) 450 | 451 | 452 | def _has_ext(path: str) -> bool: 453 | return os.path.splitext(path)[1] != "" 454 | 455 | 456 | def _expand(paths: Sequence[str], filesystem: FileSystem) -> List[str]: 457 | """For any directories in paths, expand into all the contained files""" 458 | expanded = [] 459 | for path in paths: 460 | if _has_ext(path): 461 | # Has an extension so treat it as a file 462 | expanded.append(path) 463 | else: 464 | # Otherwise try to read it like a directory 465 | expanded += filesystem.ls(path + "**") 466 | # Some cases might result in duplicates, so we convert to set and back 467 | return sorted(set(p for p in expanded if _has_ext(p))) 468 | 469 | 470 | def _filter( 471 | paths: Sequence[str], 472 | cgroups: Optional[Sequence[cgroup]], 473 | rgroups: Optional[Sequence[rgroup]], 474 | ) -> List[str]: 475 | """Keep only paths with the appropriate cgroups and/or rgroups""" 476 | kept = [] 477 | for path in paths: 478 | valid_cgroup = cgroups is None or _cname(path) in cgroups 479 | valid_rgroup = rgroups is None or _rname(path) in rgroups 480 | if valid_cgroup and valid_rgroup: 481 | kept.append(path) 482 | return kept 483 | 484 | 485 | def _base(path: str) -> str: 486 | """Get base from path (name of the top level folder)""" 487 | return os.path.dirname(os.path.dirname(path)) 488 | 489 | 490 | def _cname(path: str) -> cgroup: 491 | """Get cname from path (name of the parent folder)""" 492 | return os.path.basename(os.path.dirname(path)) 493 | 494 | 495 | def _rname(path: str) -> rgroup: 496 | """Get rname from path (name of the file)""" 497 | return os.path.basename(path) 498 | 499 | 500 | def _cgroups(paths: Sequence[str]) -> DefaultDict[cgroup, List[str]]: 501 | """Group paths by cgroup (the parent folder)""" 502 | cgroups = defaultdict(list) 503 | for path in paths: 504 | cgroups[_cname(path)].append(path) 505 | return cgroups 506 | 507 | 508 | def _access(cgroups, filesystem: FileSystem, tmpdir: str) ->
Dict[cgroup, List[str]]: 509 | """Access potentially cloud stored files, preserving cgroups""" 510 | updated = {} 511 | 512 | for cgroup, paths in cgroups.items(): 513 | if filesystem._get_protocol_path(paths)[0] is None: 514 | updated[cgroup] = paths 515 | else: 516 | tmp_cgroup = os.path.join(tmpdir, cgroup, "") 517 | filesystem.copy(paths, tmp_cgroup) 518 | updated[cgroup] = filesystem.ls(tmp_cgroup) 519 | return updated 520 | 521 | 522 | def _safe_merge(df1: pd.DataFrame, df2: pd.DataFrame, merge="inner") -> pd.DataFrame: 523 | """Merge two dataframes, warning of any shape differences""" 524 | s1, s2 = df1.shape[0], df2.shape[0] 525 | if s1 != s2: 526 | warnings.warn( 527 | f"The two cgroups have a different number of rows: {s1} versus {s2}" 528 | ) 529 | return pd.merge(df1, df2, how=merge) 530 | 531 | 532 | def _merge_all(frames: Sequence[pd.DataFrame], merge="inner") -> pd.DataFrame: 533 | """Merge a list of dataframes with safe merge""" 534 | result = frames[0] 535 | for frame in frames[1:]: 536 | result = _safe_merge(result, frame, merge) 537 | return result 538 | 539 | 540 | def _shared_rgroups(grouped) -> Iterable[rgroup]: 541 | rgroups = [[_rname(path) for path in group] for group in grouped.values()] 542 | return reduce(lambda a, b: set(a) & set(b), rgroups) 543 | --------------------------------------------------------------------------------
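As a quick end-to-end illustration of the ``divide``/``assemble`` round trip documented in ``docs/quickstart.rst`` above, here is a minimal sketch; the dataframe, paths, and column names are invented for the example, and it assumes ``blocks`` and a parquet engine such as ``pyarrow`` are installed:

.. code-block:: python

    import pandas as pd
    import blocks

    # a small frame with a shared 'id' key and two groups of feature columns
    df = pd.DataFrame({
        'id': range(6),
        'feature0': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
        'feature1': [1, 2, 3, 4, 5, 6],
    })

    # write two rgroups x two cgroups under data/; both cgroups keep the 'id' join key
    blocks.divide(
        df,
        'data/',
        n_rgroup=2,
        cgroup_columns={'g0': ['id', 'feature0'], 'g1': ['id', 'feature1']},
    )

    # read it back: rgroups are concatenated, cgroups are merged on the shared 'id'
    roundtrip = blocks.assemble('data/')
    assert sorted(roundtrip.columns) == ['feature0', 'feature1', 'id']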