├── .github ├── CODEOWNERS └── workflows │ ├── python-app.yml │ └── python-publish.yml ├── tests ├── __init__.py ├── test_pickle.py ├── test_place.py ├── test_partitioned.py ├── test_iterate.py ├── test_divide.py ├── test_access.py ├── test_filesystem.py ├── test_assemble.py └── conftest.py ├── docs ├── row.png ├── both.png ├── blocks.gif ├── column.png ├── filtered.png ├── row_iter.png ├── blocks_layout.png ├── column_iter.png ├── requirements.txt ├── core.rst ├── filesystem.rst ├── Makefile ├── index.rst ├── examples.rst ├── conf.py └── quickstart.rst ├── MANIFEST.in ├── blocks ├── filesystem │ ├── __init__.py │ ├── gcs_filesystem.py │ ├── gcs_native_filesystem.py │ └── base.py ├── __init__.py ├── utils.py ├── dfio.py └── core.py ├── .readthedocs.yaml ├── .pre-commit-config.yaml ├── setup.cfg ├── setup.py ├── .gitignore ├── README.md ├── CHANGELOG.md └── LICENSE.txt /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @baxen 2 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /docs/row.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/blocks/HEAD/docs/row.png -------------------------------------------------------------------------------- /docs/both.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/blocks/HEAD/docs/both.png -------------------------------------------------------------------------------- /docs/blocks.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/blocks/HEAD/docs/blocks.gif -------------------------------------------------------------------------------- /docs/column.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/blocks/HEAD/docs/column.png -------------------------------------------------------------------------------- /docs/filtered.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/blocks/HEAD/docs/filtered.png -------------------------------------------------------------------------------- /docs/row_iter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/blocks/HEAD/docs/row_iter.png -------------------------------------------------------------------------------- /docs/blocks_layout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/blocks/HEAD/docs/blocks_layout.png -------------------------------------------------------------------------------- /docs/column_iter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/blocks/HEAD/docs/column_iter.png -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | 3 | recursive-include tests * 4 | recursive-exclude * __pycache__ 5 | recursive-exclude * *.py[co] 6 | 
-------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | # Frozen versions of fsspec and gcsfs to avoid conflict in read the docs 2 | fsspec==2021.05.0 3 | gcsfs==2021.05.0 4 | -------------------------------------------------------------------------------- /docs/core.rst: -------------------------------------------------------------------------------- 1 | ==== 2 | Core 3 | ==== 4 | 5 | .. automodule:: blocks.core 6 | :members: 7 | :undoc-members: 8 | :show-inheritance: 9 | 10 | -------------------------------------------------------------------------------- /docs/filesystem.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Filesystem 3 | ========== 4 | 5 | 6 | .. automodule:: blocks.filesystem.base 7 | :members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /blocks/filesystem/__init__.py: -------------------------------------------------------------------------------- 1 | from blocks.filesystem.gcs_filesystem import GCSFileSystem # NOQA 2 | from blocks.filesystem.gcs_native_filesystem import GCSNativeFileSystem # NOQA 3 | from blocks.filesystem.base import FileSystem # NOQA 4 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sphinx: 4 | configuration: docs/conf.py 5 | 6 | formats: 7 | - pdf 8 | 9 | python: 10 | version: 3.7 11 | install: 12 | - requirements: docs/requirements.txt 13 | - method: pip 14 | path: . 15 | extra_requirements: 16 | - doc 17 | -------------------------------------------------------------------------------- /tests/test_pickle.py: -------------------------------------------------------------------------------- 1 | import os 2 | import blocks 3 | 4 | 5 | def test_pickle(fs, temp): 6 | original = {"example": 0.0} 7 | path = os.path.join(temp, "test.pkl") 8 | blocks.pickle(original, path, filesystem=fs) 9 | unpickled = blocks.unpickle(path, filesystem=fs) 10 | assert original == unpickled 11 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v2.3.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | - repo: https://github.com/psf/black 9 | rev: 19.10b0 10 | hooks: 11 | - id: black 12 | -------------------------------------------------------------------------------- /blocks/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = """Bradley Axen""" 4 | __email__ = "baxen@squareup.com" 5 | __version__ = "0.9.4" 6 | 7 | 8 | from blocks.core import ( 9 | assemble, 10 | iterate, 11 | partitioned, 12 | divide, 13 | place, 14 | pickle, 15 | unpickle, 16 | ) # NOQA 17 | -------------------------------------------------------------------------------- /tests/test_place.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import blocks 4 | 5 | 6 | def test_place(datadir, randomdata, fs): 7 | blocks.place(randomdata, os.path.join(datadir, 
"example.parquet"), filesystem=fs) 8 | assert len(fs.ls(datadir)) == 1 9 | df = blocks.assemble(os.path.join(datadir, "*"), filesystem=fs) 10 | assert np.isclose(df, randomdata).all().all() 11 | -------------------------------------------------------------------------------- /blocks/filesystem/gcs_filesystem.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from blocks.filesystem.base import FileSystem 4 | 5 | 6 | class GCSFileSystem(FileSystem): 7 | """Deprecated, see FileSystem""" 8 | 9 | def __init__(self, *args, **kwargs): 10 | warnings.warn( 11 | "This class is deprecated, use blocks.filesystem.FileSystem", 12 | DeprecationWarning, 13 | ) 14 | super().__init__(*args, **kwargs) 15 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.9.4 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:setup.py] 7 | search = version="{current_version}" 8 | replace = version="{new_version}" 9 | 10 | [bumpversion:file:blocks/__init__.py] 11 | search = __version__ = "{current_version}" 12 | replace = __version__ = "{new_version}" 13 | 14 | [bumpversion:file:docs/conf.py] 15 | 16 | [flake8] 17 | max-line-length = 100 18 | exclude = docs 19 | -------------------------------------------------------------------------------- /blocks/filesystem/gcs_native_filesystem.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from blocks.filesystem.base import FileSystem 4 | 5 | 6 | class GCSNativeFileSystem(FileSystem): 7 | """Deprecated, see FileSystem""" 8 | 9 | def __init__(self, *args, **kwargs): 10 | warnings.warn( 11 | "This class is deprecated, use blocks.filesystem.FileSystem", 12 | DeprecationWarning, 13 | ) 14 | super().__init__(*args, **kwargs) 15 | -------------------------------------------------------------------------------- /tests/test_partitioned.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import pytest 4 | from blocks import core 5 | 6 | 7 | def test_partitioned(populated): 8 | pytest.importorskip("dask.dataframe") 9 | df = core.partitioned(populated).compute() 10 | assert df.shape == (40, 41) 11 | expected = ["f{}_{}".format(i, j) for i in range(4) for j in range(10)] 12 | expected.append("key") 13 | assert set(df.columns) == set(expected) 14 | -------------------------------------------------------------------------------- /blocks/utils.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | import tempfile 3 | 4 | import wrapt 5 | 6 | 7 | @wrapt.decorator 8 | def with_function_tmpdir(wrapped, instance, args, kwargs): 9 | with tempfile.TemporaryDirectory() as tmpdir: 10 | kwargs["tmpdir"] = tmpdir 11 | return wrapped(*args, **kwargs) 12 | 13 | 14 | @wrapt.decorator 15 | def with_session_tmpdir(wrapped, instance, args, kwargs): 16 | tmpdir = tempfile.TemporaryDirectory() 17 | kwargs["tmpdir"] = tmpdir.name 18 | atexit.register(tmpdir.cleanup) 19 | return wrapped(*args, **kwargs) 20 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | name: Python application 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | 
pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ubuntu-latest 13 | strategy: 14 | matrix: 15 | python-version: [3.6, 3.7, 3.8] 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up ${{ matrix.python-version }} 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install --upgrade .[dev] 27 | - name: Test 28 | run: | 29 | pytest 30 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = Blocks 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | coverage: Makefile 18 | rm -rf htmlcov && pipenv run pytest --cov=blocks --cov-report html ../tests 19 | 20 | # Catch-all target: route all unknown targets to Sphinx using the new 21 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 22 | %: Makefile 23 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 24 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up Python 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: '3.x' 18 | - name: Install dependencies 19 | run: | 20 | python -m pip install --upgrade pip 21 | pip install setuptools twine 22 | - name: Build 23 | run: | 24 | python setup.py sdist bdist_wheel 25 | - name: Publish 26 | env: 27 | TWINE_REPOSITORY_URL: https://upload.pypi.org/legacy/ 28 | TWINE_USERNAME: __token__ 29 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} 30 | run: | 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /tests/test_iterate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from blocks import core 4 | 5 | 6 | def test_iterate(populated): 7 | dfs = core.iterate(populated) 8 | for i in range(4): 9 | for j in range(4): 10 | cname, rname, df = next(dfs) 11 | assert cname == "c{}".format(i) 12 | assert rname == "part.{}.csv".format(j) 13 | assert df.shape == (10, 11) 14 | 15 | 16 | def test_iterate_ordered(populated): 17 | order = ["c3", "c1", "c2", "c0"] 18 | dfs = core.iterate(populated, cgroups=order) 19 | for i in range(4): 20 | for j in range(4): 21 | cname, rname, df = next(dfs) 22 | assert cname == order[i] 23 | assert rname == "part.{}.csv".format(j) 24 | assert df.shape == (10, 11) 25 | 26 | 27 | def test_iterate_axis0(populated): 28 | dfs = core.iterate(populated, axis=0) 29 | for i in range(4): 30 | rname, df = next(dfs) 31 | assert rname == "part.{}.csv".format(i) 32 | assert df.shape == (10, 41) 33 | 34 | 35 | def test_iterate_axis1(populated): 36 | dfs = core.iterate(populated, axis=1) 37 
| for i in range(4): 38 | cname, df = next(dfs) 39 | assert cname == "c{}".format(i) 40 | assert df.shape == (40, 11) 41 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from setuptools import setup, find_packages 5 | 6 | with open("README.md") as readme_file: 7 | readme = readme_file.read() 8 | 9 | requirements = [ 10 | "pandas", 11 | "fsspec>=2021.7.0", 12 | "gcsfs>=2021.7.0", 13 | "s3fs", 14 | "pyarrow", 15 | "fastavro", 16 | "wrapt", 17 | ] 18 | 19 | extras_require = { 20 | "tests": ["pytest", "pytest-cov", "delegator.py", "flake8"], 21 | "doc": ["sphinx", "numpydoc", "furo", "sphinx-copybutton"], 22 | "format": ["pre-commit"], 23 | } 24 | extras_require["dev"] = set(sum(extras_require.values(), [])) 25 | 26 | setup( 27 | name="sq-blocks", 28 | version="0.9.4", 29 | description=( 30 | "Blocks provides a simple interface to read, organize, and manipulate structured data" 31 | " in files on local and cloud storage" 32 | ), 33 | long_description=readme, 34 | long_description_content_type="text/markdown", 35 | author="Bradley Axen", 36 | author_email="baxen@squareup.com", 37 | packages=find_packages(), 38 | include_package_data=True, 39 | install_requires=requirements, 40 | extras_require=extras_require, 41 | zip_safe=False, 42 | keywords="blocks", 43 | classifiers=[ 44 | "Intended Audience :: Developers", 45 | "Natural Language :: English", 46 | "Programming Language :: Python :: 3", 47 | ], 48 | ) 49 | -------------------------------------------------------------------------------- /tests/test_divide.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import blocks 3 | 4 | 5 | def test_divide(datadir, randomdata, fs): 6 | blocks.divide(randomdata, datadir, 10, extension=".csv", filesystem=fs) 7 | assert len(fs.ls(datadir)) == 10 8 | df = blocks.assemble(datadir) 9 | assert np.isclose(df, randomdata).all().all() 10 | 11 | 12 | def test_divide_offset(datadir, randomdata, fs): 13 | blocks.divide(randomdata, datadir, 10, extension=".csv", filesystem=fs) 14 | blocks.divide(randomdata, datadir, 10, 10, extension=".csv", filesystem=fs) 15 | assert len(fs.ls(datadir)) == 20 16 | 17 | df = blocks.assemble(datadir) 18 | expected = randomdata.append(randomdata) 19 | assert np.isclose(df, expected).all().all() 20 | 21 | 22 | def test_divide_cgroups(datadir, randomdata, fs): 23 | randomdata.insert(0, "key", list(range(10))) 24 | cgroups_columns = { 25 | "cgroup1": ["key", "f0", "f1", "f2"], 26 | "cgroup2": ["key", "f3", "f4", "f5"], 27 | "cgroup3": ["key", "f6", "f7", "f8", "f9"], 28 | } 29 | 30 | blocks.divide( 31 | randomdata, 32 | datadir, 33 | 10, 34 | cgroup_columns=cgroups_columns, 35 | extension=".csv", 36 | filesystem=fs, 37 | ) 38 | assert len(fs.ls(datadir)) == 3 39 | for subdir in fs.ls(datadir): 40 | assert len(fs.ls(subdir)) == 10 41 | df = blocks.assemble(datadir) 42 | assert np.isclose(df, randomdata).all().all() 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | 
lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | .vscode/ 107 | -------------------------------------------------------------------------------- /tests/test_access.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | from blocks import core 5 | 6 | 7 | def test_expand(populated, fs): 8 | expected = [ 9 | os.path.join(populated, "c{}/part.{}.csv".format(i, j)) 10 | for i in range(4) 11 | for j in range(4) 12 | ] 13 | 14 | # All of these patterns should expand into the same set of files 15 | for ex in ["", "*/*", "**"]: 16 | ex = os.path.join(populated, ex) 17 | paths = fs.ls(ex) 18 | expanded = sorted(core._expand(paths, fs)) 19 | assert expanded == expected 20 | 21 | 22 | def test_expand_pattern(populated, fs): 23 | expected = [ 24 | os.path.join(populated, "c{}/part.{}.csv".format(i, j)) 25 | for i in range(2) 26 | for j in range(4) 27 | ] 28 | 29 | # All of these patterns should expand into the same set of files 30 | for ex in ["c[01]**", "c[01]/*"]: 31 | ex = os.path.join(populated, ex) 32 | paths = fs.ls(ex) 33 | expanded = sorted(core._expand(paths, fs)) 34 | assert expanded == expected 35 | 36 | 37 | def test_cgroups(): 38 | expanded = ["base/c{}/part.{}.csv".format(i, j) for i in range(4) for j in range(4)] 39 | cgroups = core._cgroups(expanded) 40 | for i in range(4): 41 | key = "c{}".format(i) 42 | assert key in cgroups 43 | assert cgroups[key] == ["base/c{}/part.{}.csv".format(i, j) for j in range(4)] 44 | 45 | 46 | def test_access(populated, fs, tmpdir): 47 | tmpdir = str(tmpdir) 48 | paths = fs.ls(populated) 49 | expanded = core._expand(paths, fs) 50 | cgroups = core._cgroups(expanded) 51 | cgroups = core._access(cgroups, fs, tmpdir) 52 | assert len(cgroups) == 4 53 | for c, paths in cgroups.items(): 54 | assert len(paths) == 4 55 | for path in paths: 56 | with open(path, "r") as f: 57 | assert f.read() 58 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ====== 2 | 
Blocks 3 | ====== 4 | 5 | .. image:: blocks.gif 6 | 7 | Blocks provides a simple interface to read, organize, and manipulate structured data in files 8 | on local and cloud storage 9 | 10 | Install 11 | ------------- 12 | .. code-block:: bash 13 | 14 | pip install sq-blocks 15 | 16 | Features 17 | -------- 18 | 19 | .. code-block:: python 20 | 21 | import blocks 22 | 23 | # Load one or more files with the same interface 24 | df = blocks.assemble('data.csv') 25 | train = blocks.assemble('data/*[0-7].csv') 26 | test = blocks.assemble('data/*[89].csv') 27 | 28 | # With direct support for files on GCS 29 | df = blocks.assemble('gs://mybucket/data.csv') 30 | df = blocks.assemble('gs://mybucket/data/*.csv') 31 | 32 | The interface emulates the tools you're used to from the command line, with full support for globbing and pattern 33 | matching. And blocks can handle more complicated structures as your data grows in complexity: 34 | 35 | ======================= ===================================================================== 36 | Layout Recipe 37 | ======================= ===================================================================== 38 | .. image:: both.png .. code-block:: python 39 | 40 | blocks.assemble('data/**') 41 | 42 | .. image:: column.png .. code-block:: python 43 | 44 | blocks.assemble('data/g1/*') 45 | 46 | .. image:: row.png .. code-block:: python 47 | 48 | blocks.assemble('data/*/part_01.pq') 49 | 50 | .. image:: filtered.png .. code-block:: python 51 | 52 | blocks.assemble('data/g[124]/part_01.pq') 53 | 54 | ======================= ===================================================================== 55 | 56 | 57 | 58 | .. toctree:: 59 | :hidden: 60 | 61 | quickstart 62 | examples 63 | core 64 | filesystem 65 | 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Blocks 2 | 3 | Blocks provides a simple interface to read, organize, and manipulate structured data in files 4 | on local and cloud storage. See the [documentation](https://sq-blocks.readthedocs.io) for more 5 | information. 6 | 7 | pip install sq-blocks 8 | 9 | ![blocks](docs/blocks.gif) 10 | 11 | ## Development 12 | 13 | ### Setup 14 | 15 | To install all dependencies for local development and testing, you can do 16 | 17 | pip install -e .[dev] 18 | 19 | ### Tests 20 | 21 | * `pytest` runs the unit tests 22 | 23 | To run them locally: 24 | 25 | pytest 26 | 27 | ### Continuous Integrations 28 | 29 | CI is handled through GitHub Actions, and will run non-GCS tests on 3.6, 3.7, 3.8. 30 | We may add cloud storage tests to CI soon, but for now tests should also be 31 | run locally to confirm that functionality works as well. 32 | 33 | ### Versions and Tags 34 | 35 | Use bumpversion to update the version of the package 36 | 37 | bumpversion [major|minor|patch] 38 | 39 | This will increment the version and update it both in `setup.py` and `blocks/__init__.py`. 40 | It will also automatically commit a tag with the corresponding version. 
You can push this to the repo 41 | with 42 | 43 | git push --tags 44 | 45 | ### Formatting 46 | 47 | We use pre-commit to ensure consistent formatting, to make sure you run the 48 | hooks: 49 | 50 | pre-commit install 51 | 52 | ### Docs 53 | 54 | The docs are generated from the code with 55 | [sphinx](https://www.sphinx-doc.org/en/master/), and can be tested locally: 56 | 57 | cd docs 58 | make html 59 | 60 | ## License 61 | 62 | Copyright 2018 Square, Inc. 63 | 64 | Licensed under the Apache License, Version 2.0 (the "License"); 65 | you may not use this file except in compliance with the License. 66 | You may obtain a copy of the License at 67 | 68 | http://www.apache.org/licenses/LICENSE-2.0 69 | 70 | Unless required by applicable law or agreed to in writing, software 71 | distributed under the License is distributed on an "AS IS" BASIS, 72 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 73 | See the License for the specific language governing permissions and 74 | limitations under the License. 75 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## [0.9.0] - 2021-02-25 2 | 3 | ### Removed 4 | 5 | - GCS(Native)FileSystem no longer provides store/access 6 | - GCSFileSystem are now backwards compatibility wrappers for FileSystem and will 7 | be removed in 1.0.0 8 | - No more explicit compression support, compression may still be possible 9 | through read/write args 10 | 11 | ### Added 12 | 13 | - New generic FileSystem backed by fsspec 14 | - rather than using fsspec directly we use this wrapper for better backwards 15 | compatibility and more automatic protocol handling 16 | - In theory any fsspec implementation is supported but only local and gcsfs 17 | are tested so far 18 | 19 | ### Changed 20 | - We now use paths (rather than file objects) in pandas io methods for better 21 | compatibility 22 | - All GCS operations are handled through gcsfs, which has much better 23 | performance with large numbers of files and has been more robust to connection 24 | errors 25 | - Globbing must now expand to match patterns to literal files, not directories 26 | 27 | ## [0.8.0] - 2020-10-14 28 | 29 | ### Removed 30 | 31 | - Dropped Python 2 support. 32 | - Compression on write no longer supported by Pandas 33 | 34 | ### Added 35 | 36 | - Typehints for Python 3 37 | - some missing abstract methods to the base FileSystem class definition. 38 | 39 | ## [0.7.1] - 2020-08-20 40 | 41 | ### Added 42 | 43 | `blocks.pickle` and `blocks.unpickle` utilities to save and load pickle files. 44 | 45 | ## [0.7.0] - 2020-07-22 46 | 47 | This release has minor backwards incompatible for anyone that directly used 48 | datafiles. The top level and filesystem APIs (assemble, iterate, partitioned, 49 | etc) are unchanged. 
50 | 51 | ### Added 52 | 53 | - LocalDataFile that implements datafile for local paths 54 | - GCSNativeDataFile that implements datafile for GCS paths using GCS python blob API 55 | 56 | ### Changed 57 | 58 | - The old datafile namedtuple is now an abstract base class 59 | - Datafiles now use a contextmanager for handle, which yields a file handle 60 | - Datafiles are only opened one at a time just before the data is loaded into 61 | memory 62 | - This should prevent exceeding the os open file limit with large directories 63 | - Also sets the stage for better multithreading support 64 | -------------------------------------------------------------------------------- /tests/test_filesystem.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | from uuid import uuid4 5 | 6 | TEST_STRING = b"test" 7 | 8 | 9 | def test_ls_directory(populated, fs): 10 | ex = os.path.join(populated, "") 11 | expected = [os.path.join(populated, "c{}".format(i)) for i in range(4)] 12 | 13 | # one of the fixtures has an extra file 14 | found = fs.ls(ex) 15 | if len(found) > 4: 16 | found = found[:-1] 17 | assert found == expected 18 | 19 | 20 | def test_ls_wildcard(populated, fs): 21 | ex = os.path.join(populated, "*/part.1.csv") 22 | expected = [os.path.join(populated, "c{}/part.1.csv".format(i)) for i in range(4)] 23 | assert fs.ls(ex) == expected 24 | 25 | 26 | def test_ls_double_wildcard(populated, fs): 27 | ex = os.path.join(populated, "**") 28 | expected = { 29 | os.path.join(populated, "c{}/part.{}.csv".format(i, j)) 30 | for i in range(4) 31 | for j in range(4) 32 | } 33 | assert expected.issubset(set(fs.ls(ex))) 34 | 35 | 36 | def test_ls_pattern(populated, fs): 37 | ex = os.path.join(populated, "*/part.[01].csv") 38 | expected = [ 39 | os.path.join(populated, "c{}/part.{}.csv".format(i, j)) 40 | for i in range(4) 41 | for j in range(2) 42 | ] 43 | assert fs.ls(ex) == expected 44 | 45 | 46 | def test_open_read(populated, fs): 47 | with fs.open(os.path.join(populated, "c0/part.0.csv"), "r") as f: 48 | assert f.readline() == "f0_0,f0_1,f0_2,f0_3,f0_4,f0_5,f0_6,f0_7,f0_8,f0_9,key\n" 49 | 50 | 51 | def test_open_write(temp, fs): 52 | content = str(uuid4()) 53 | path = os.path.join(temp, "content") 54 | with fs.open(path, "w") as f: 55 | f.write(content) 56 | 57 | with fs.open(path, "r") as f: 58 | assert f.read() == content 59 | 60 | 61 | def test_copy_recursive_to_local(populated, tmpdir, fs): 62 | dest = str(tmpdir) 63 | fs.cp(populated, dest, recursive=True) 64 | source = [p.replace(populated, "") for p in fs.ls(populated + "/**")] 65 | copy = fs.ls(dest + "/**") 66 | assert (s in c for s, c in zip(source, copy)) 67 | 68 | 69 | def test_copy_recursive_matched(populated, fs): 70 | dest = populated.replace("data", "copy") 71 | try: 72 | fs.cp(populated, dest, recursive=True) 73 | source = [p.replace(populated, "") for p in fs.ls(populated + "/**")] 74 | copy = fs.ls(dest + "/**") 75 | assert (s in c for s, c in zip(source, copy)) 76 | finally: 77 | fs.rm(dest, recursive=True) 78 | 79 | 80 | def test_rm(populated, fs): 81 | dest = populated.replace("data", "copy") 82 | fs.cp([os.path.join(populated, "")], dest, recursive=True) 83 | assert fs.ls(dest) 84 | fs.rm([dest], recursive=True) 85 | assert fs.ls(dest) == [] 86 | -------------------------------------------------------------------------------- /tests/test_assemble.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | from blocks import core 5 | 6 | 7 | def test_assemble(populated, fs): 8 | df = core.assemble(populated, filesystem=fs) 9 | assert df.shape == (40, 41) 10 | expected = ["f{}_{}".format(i, j) for i in range(4) for j in range(10)] 11 | expected.append("key") 12 | assert set(df.columns) == set(expected) 13 | 14 | 15 | def test_assemble_single(populated, fs): 16 | df = core.assemble(os.path.join(populated, "c0", "part.0.csv")) 17 | assert df.shape == (10, 11) 18 | 19 | 20 | def test_assemble_flat(populated, fs): 21 | df = core.assemble(os.path.join(populated, "c0"), filesystem=fs) 22 | assert df.shape == (40, 11) 23 | expected = ["f{}_{}".format(i, j) for i in range(1) for j in range(10)] 24 | expected.append("key") 25 | assert set(df.columns) == set(expected) 26 | 27 | 28 | def test_assemble_ordered(populated, fs): 29 | order = ["c2", "c1", "c3", "c0"] 30 | df = core.assemble(populated, cgroups=order, filesystem=fs) 31 | assert df.shape == (40, 41) 32 | expected = ["f{}_{}".format(order[i][1], j) for i in range(4) for j in range(10)] 33 | expected.append("key") 34 | assert set(df.columns) == set(expected) 35 | # Check the features are in the right order 36 | assert [c for c in df.columns if c != "key"] == expected[:-1] 37 | 38 | 39 | # Various options do not depend on filesystem so we can just test locally 40 | def test_assemble_filtered_cgroup(populated_local, keys): 41 | df = core.assemble(populated_local, cgroups=["c0", "c3"]) 42 | assert df.shape == (40, 21) 43 | expected = ["f{}_{}".format(i, j) for i in [0, 3] for j in range(10)] 44 | expected.append("key") 45 | assert set(df.columns) == set(expected) 46 | assert (df.key == keys).all() 47 | 48 | 49 | def test_assemble_filtered_rgroup(populated_local, keys): 50 | df = core.assemble(populated_local, rgroups=["part.0.csv", "part.1.csv"]) 51 | assert df.shape == (20, 41) 52 | expected = ["f{}_{}".format(i, j) for i in range(4) for j in range(10)] 53 | expected.append("key") 54 | assert set(df.columns) == set(expected) 55 | assert (df.key == keys[:20].reset_index(drop=True)).all() 56 | 57 | 58 | def test_assemble_read_args(populated_local, keys): 59 | read_args = {"dtype": str} 60 | df = core.assemble(populated_local, read_args=read_args) 61 | assert df.shape == (40, 41) 62 | expected = ["f{}_{}".format(i, j) for i in range(4) for j in range(10)] 63 | expected.append("key") 64 | assert set(df.columns) == set(expected) 65 | assert (df.key == keys).all() 66 | assert (df.dtypes == "object").all() 67 | 68 | 69 | def test_assemble_cgroup_args(populated_local, keys): 70 | cgroup_args = {"c0": {"dtype": str}} 71 | df = core.assemble(populated_local, cgroup_args=cgroup_args) 72 | assert df.shape == (40, 41) 73 | expected = ["f{}_{}".format(i, j) for i in range(4) for j in range(10)] 74 | expected.append("key") 75 | assert set(df.columns) == set(expected) 76 | assert (df.key == keys).all() 77 | for col in ["f0_{}".format(i) for i in range(10)]: 78 | assert df.dtypes[col] == "object" 79 | -------------------------------------------------------------------------------- /docs/examples.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Examples 3 | ======== 4 | 5 | Inspect Data 6 | ------------ 7 | 8 | You can use assemble to grab a small subset of your data 9 | 10 | .. 
code-block:: python 11 | 12 | import blocks 13 | 14 | df = blocks.assemble('data/*/part_00.pq') 15 | df.describe() 16 | 17 | 18 | This works great when dealing with data staged on GCS: 19 | 20 | .. code-block:: python 21 | 22 | import blocks 23 | 24 | df = blocks.assemble('gs://bucket/*/part_00.pq') 25 | df.describe() 26 | 27 | 28 | Large Datasets 29 | -------------- 30 | 31 | It's common to end up with a dataset that won't easily fit into memory. But you often still need to calculate 32 | aggregate statistics on that data. For example, you might need to get a unique list of categories in one of your fields. 33 | 34 | Iterate makes this easy: 35 | 36 | .. code-block:: python 37 | 38 | import blocks 39 | 40 | uniques = set() 41 | for _, _, block in blocks.iterate('data/'): 42 | uniques |= set(block['feature']) 43 | 44 | 45 | Or maybe you want to parallelize the process: 46 | 47 | .. code-block:: python 48 | 49 | import blocks 50 | from multiprocessing import Pool 51 | 52 | def unique_f1(block): 53 | return set(block[-1]['feature']) 54 | 55 | uniques_per_block = Pool(4).map(unique_f1, blocks.iterate('data/')) 56 | uniques = set().union(*uniques_per_block) 57 | 58 | 59 | And if you have dask installed, the parallelization is even easier: 60 | 61 | .. code-block:: python 62 | 63 | import blocks 64 | 65 | uniques = blocks.partitioned('data')['feature'].unique().compute() 66 | 67 | 68 | Batch Training 69 | -------------- 70 | 71 | If you're working with a tool like Keras, you might want to train a model on an iterator of batches 72 | without ever loading more than one partition into memory: 73 | 74 | .. code-block:: python 75 | 76 | import blocks 77 | 78 | def batch_generator(path, nbatch=256): 79 | for _, df in blocks.iterate(path, axis=0): 80 | while df.shape[0] >= nbatch: 81 | # Grab a sample and drop it from the original 82 | sub = df.sample(nbatch) 83 | df.drop(sub.index, inplace=True) 84 | yield sub.values 85 | 86 | model.fit_generator( 87 | generator=batch_generator('train/'), 88 | validation_data=batch_generator('validate/'), 89 | ) 90 | 91 | If you use an efficient file format like ``parquet``, this simple code will be surprisingly fast. You should make 92 | sure that you don't use multiple cgroups in a situation like this, however, because merging can slow 93 | down the process. 94 | 95 | 96 | Combining 97 | --------- 98 | 99 | If you end up with a dataset with multiple column groups, say because you grabbed your data from multiple sources, 100 | you may want to merge across those groups. However, it is expensive to do this by loading the whole dataset into memory. 101 | If you use the blocks structure you can merge each row partition separately and then save to new files. You can 102 | even subdivide those files into smaller row groups to ensure that they don't grow too large: 103 | 104 | 105 | .. code-block:: python 106 | 107 | import blocks 108 | 109 | offset = 0 110 | for _, df in blocks.iterate(path, axis=0): 111 | blocks.divide(df, 'combined/', n_rgroup=10, rgroup_offset=offset) 112 | offset += 10 113 | 114 | 115 | Filesystem 116 | ---------- 117 | 118 | Blocks provides a default filesystem that supports local files and GCS files. If you need additional functionality, 119 | you can create a custom filesystem instance: 120 | 121 | 122 | .. 
code-block:: python 123 | 124 | import blocks 125 | from blocks.filesystem import GCSFileSystem 126 | 127 | fs = GCSFileSystem() 128 | df = blocks.assemble('gs://bucket/data/', filesystem=fs) 129 | 130 | 131 | The default filesystem has support for GCS, and you can implement your own FileSystem class by 132 | inheriting from ``blocks.filesystem.FileSystem``. This can be used to extend blocks to additional 133 | cloud platforms, to support encryption/decryption, etc... 134 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import pandas as pd 4 | import numpy as np 5 | import uuid 6 | 7 | from blocks.filesystem import FileSystem 8 | from delegator import run 9 | 10 | BUCKET_GCS = "gs://blocks-example" 11 | BUCKET_S3 = "s3://blocks-example" 12 | 13 | if os.environ.get("CI"): 14 | inputs = ["local"] 15 | outputs = ["local"] 16 | temps = ["local"] 17 | else: 18 | inputs = ["local", "gcs", "gcs_extra", "s3"] 19 | outputs = ["local", "gcs", "s3"] 20 | temps = ["local", "gcs", "s3"] 21 | 22 | 23 | @pytest.fixture 24 | def fs(request): 25 | return FileSystem() 26 | 27 | 28 | @pytest.fixture(scope="function", params=temps) 29 | def temp(request, tmpdir_factory): 30 | if request.param == "local": 31 | path = str(tmpdir_factory.mktemp("temp")) 32 | yield path 33 | 34 | if request.param == "gcs": 35 | path = os.path.join(BUCKET_GCS, "temp") 36 | yield path 37 | run("gsutil rm -r {}".format(path)) 38 | 39 | if request.param == "s3": 40 | path = os.path.join(BUCKET_S3, "temp") 41 | yield path 42 | run("aws s3 rm --recursive {}".format(path)) 43 | 44 | 45 | @pytest.fixture(scope="session") 46 | def populated_local(request, tmpdir_factory): 47 | tmpdir = str(tmpdir_factory.mktemp("data")) 48 | _populate(tmpdir) 49 | return tmpdir 50 | 51 | 52 | # This is the same directory structure as above but parametrized on different file systems 53 | @pytest.fixture(scope="session", params=inputs) 54 | def populated(request, populated_local): 55 | if request.param == "local": 56 | yield populated_local 57 | 58 | if request.param == "gcs": 59 | path = os.path.join(BUCKET_GCS, "data1") 60 | run("gsutil cp -r {} {}".format(populated_local, path)) 61 | yield path 62 | run("gsutil rm -r {}".format(path)) 63 | 64 | if request.param == "gcs_extra": 65 | path = os.path.join(BUCKET_GCS, "data2") 66 | run("gsutil cp -r {} {}".format(populated_local, path)) 67 | # Also add an extra file 68 | run("touch extra") 69 | run("gsutil cp extra {}".format(path)) 70 | os.remove("extra") 71 | yield path 72 | run("gsutil rm -r {}".format(path)) 73 | 74 | if request.param == "s3": 75 | path = os.path.join(BUCKET_S3, "data1") 76 | run("aws s3 cp --recursive {} {}".format(populated_local, path)) 77 | yield path 78 | run("aws s3 rm --recursive {}".format(path)) 79 | 80 | 81 | @pytest.fixture(scope="session") 82 | def keys(): 83 | return pd.Series(["key{:02d}".format(i) for i in range(40)]) 84 | 85 | 86 | @pytest.fixture() 87 | def randomdata(): 88 | df = pd.DataFrame( 89 | np.random.rand(10, 10), columns=["f{}".format(i) for i in range(10)] 90 | ) 91 | return df 92 | 93 | 94 | @pytest.fixture() 95 | def datadir_local(request, tmpdir_factory): 96 | return str(tmpdir_factory.mktemp("data")) 97 | 98 | 99 | @pytest.fixture(params=outputs) 100 | def datadir(request, tmpdir_factory): 101 | output = str(uuid.uuid4()).replace("-", "") 102 | if request.param == "local": 103 | tmpdir = 
str(tmpdir_factory.mktemp("data")) 104 | yield tmpdir 105 | 106 | if request.param == "gcs": 107 | path = os.path.join(BUCKET_GCS, output) 108 | yield path 109 | run("gsutil rm -r {}".format(path)) 110 | 111 | if request.param == "s3": 112 | path = os.path.join(BUCKET_S3, output) 113 | yield path 114 | run("aws s3 rm --recursive {}".format(path)) 115 | 116 | 117 | def _populate(tmpdir): 118 | """Create a directory of blocks with 4 cgroups and 4 rgroups""" 119 | for c in range(4): 120 | cgroup = os.path.join(tmpdir, "c{}".format(c)) 121 | if not os.path.exists(cgroup): 122 | os.makedirs(cgroup) 123 | for r in range(4): 124 | df = pd.DataFrame( 125 | np.random.rand(10, 10), 126 | index=list(range(r * 10, (r + 1) * 10)), 127 | columns=["f{}_{}".format(c, i) for i in range(10)], 128 | ) 129 | df["key"] = [ 130 | "key{:02d}".format(i) for i in df.index 131 | ] # common key for merges 132 | df.to_csv(os.path.join(cgroup, "part.{}.csv".format(r)), index=False) 133 | -------------------------------------------------------------------------------- /blocks/dfio.py: -------------------------------------------------------------------------------- 1 | import os 2 | import warnings 3 | import pandas as pd 4 | 5 | try: 6 | import fastavro as avro 7 | 8 | avro_imported = True 9 | except ImportError: 10 | avro_imported = False 11 | 12 | 13 | def read_df(path, **read_args): 14 | """Read a dataframe path based on the file extension 15 | parquet, avro, csv, pickle, json 16 | 17 | Parameters 18 | ---------- 19 | path: str 20 | The path to the file holding data 21 | read_args : optional 22 | All keyword args are passed to the read function 23 | 24 | Returns 25 | ------- 26 | data : pd.DataFrame 27 | 28 | Notes 29 | ----- 30 | The read functions are taken from pandas, e.g. pd.read_csv 31 | Check the pandas doc for more information on the supported arguments 32 | """ 33 | filetype = _get_extension(path) 34 | reader = _readers[filetype] 35 | if reader == pd.read_json: 36 | # Default json file is newline delimited json records, but can be overwritten 37 | defaults = {"lines": True, "orient": "records"} 38 | defaults.update(read_args) 39 | read_args = defaults 40 | 41 | return reader(path, **read_args) 42 | 43 | 44 | def write_df(df, path, **write_args): 45 | """Write a dataframe to file based on the file extension 46 | 47 | The following formats are supported: 48 | parquet, avro, csv, pickle, json 49 | 50 | Parameters 51 | ---------- 52 | df : pd.DataFrame 53 | The dataframe to write to disk 54 | datafile : DataFile 55 | Datafile instance with the path and file handle 56 | write_args : optional 57 | All keyword args are passed to the write function 58 | 59 | Notes 60 | ----- 61 | The write functions are taken from pandas, e.g. pd.to_csv 62 | Check the pandas doc for more information on the supported arguments 63 | """ 64 | extension = _get_extension(path) 65 | write_name = _writers[extension] 66 | 67 | # Some customizations for different file types 68 | if write_name == "to_avro": 69 | return _write_avro(df, path, **write_args) 70 | 71 | if write_name == "to_parquet" and not pd.Series(df.columns).map(type).eq(str).all(): 72 | warnings.warn( 73 | "Dataframe contains non-string column names, which cannot be saved in parquet.\n" 74 | "Blocks will attempt to convert them to strings." 
75 | ) 76 | df.columns = df.columns.astype("str") 77 | 78 | if write_name == "to_json": 79 | defaults = {"lines": True, "orient": "records"} 80 | defaults.update(write_args) 81 | write_args = defaults 82 | 83 | if write_name == "to_csv": 84 | # make index=False the default for similar behaviour to other formats 85 | write_args["index"] = write_args.get("index", False) 86 | 87 | write_fn = getattr(df, write_name) 88 | write_fn(path, **write_args) 89 | 90 | 91 | def _read_avro(path, **read_args): 92 | if not avro_imported: 93 | raise ImportError( 94 | "Avro support requires fastavro.\n" 95 | "Install blocks with the [avro] option or `pip install fastavro`" 96 | ) 97 | records = [] 98 | with open(path, "rb") as f: 99 | avro_reader = avro.reader(f) 100 | for record in avro_reader: 101 | records.append(record) 102 | return pd.DataFrame.from_dict(records) 103 | 104 | 105 | def _write_avro(df, path, **write_args): 106 | if not avro_imported: 107 | raise ImportError( 108 | "Avro support requires fastavro.\n" 109 | "Install blocks with the [avro] option or `pip install fastavro`" 110 | ) 111 | schema = None 112 | schema_path = None 113 | try: 114 | schema = write_args["schema"] 115 | except KeyError: 116 | try: 117 | schema_path = write_args["schema_path"] 118 | except KeyError: 119 | raise Exception( 120 | "You must provide a schema or schema path when writing to Avro" 121 | ) 122 | if schema is None: 123 | schema = avro.schema.load_schema(schema_path) 124 | records = df.to_dict("records") 125 | with open(path, "wb") as f: 126 | avro.writer(f, schema, records) 127 | 128 | 129 | def _get_extension(path): 130 | name, ext = os.path.splitext(path) 131 | return ext 132 | 133 | 134 | _readers = { 135 | ".pq": pd.read_parquet, 136 | ".parquet": pd.read_parquet, 137 | ".csv": pd.read_csv, 138 | ".pkl": pd.read_pickle, 139 | ".avro": _read_avro, 140 | ".json": pd.read_json, 141 | } 142 | 143 | 144 | _writers = { 145 | ".pq": "to_parquet", 146 | ".parquet": "to_parquet", 147 | ".csv": "to_csv", 148 | ".pkl": "to_pickle", 149 | ".avro": "to_avro", 150 | ".json": "to_json", 151 | } 152 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Blocks documentation build configuration file, created by 4 | # sphinx-quickstart on Mon Nov 27 14:57:51 2017. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # 19 | import os 20 | import sys 21 | 22 | sys.path.insert(0, os.path.abspath("../..")) 23 | 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 
34 | extensions = [ 35 | "sphinx.ext.autodoc", 36 | "sphinx.ext.autosummary", 37 | "sphinx.ext.mathjax", 38 | "sphinx.ext.viewcode", 39 | "sphinx_copybutton", 40 | "numpydoc", 41 | ] 42 | autodoc_typehints = "none" 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ["_templates"] 46 | 47 | # The suffix(es) of source filenames. 48 | # You can specify multiple suffix as a list of string: 49 | # 50 | 51 | # source_suffix = ['.rst', '.md'] 52 | source_suffix = ".rst" 53 | 54 | # The master toctree document. 55 | master_doc = "index" 56 | 57 | # General information about the project. 58 | project = u"Blocks" 59 | copyright = u"2018 Square, Inc." 60 | author = u"Bradley Axen" 61 | 62 | # The version info for the project you're documenting, acts as replacement for 63 | # |version| and |release|, also used in various other places throughout the 64 | # built documents. 65 | # 66 | # The short X.Y version. 67 | version = u"0.9.4" 68 | # The full version, including alpha/beta/rc tags. 69 | release = u"0.9.4" 70 | 71 | # The language for content autogenerated by Sphinx. Refer to documentation 72 | # for a list of supported languages. 73 | # 74 | # This is also used if you do content translation via gettext catalogs. 75 | # Usually you set "language" from the command line for these cases. 76 | language = None 77 | 78 | # List of patterns, relative to source directory, that match files and 79 | # directories to ignore when looking for source files. 80 | # This patterns also effect to html_static_path and html_extra_path 81 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 82 | 83 | # The name of the Pygments (syntax highlighting) style to use. 84 | pygments_style = "sphinx" 85 | 86 | 87 | # -- Options for HTML output ---------------------------------------------- 88 | 89 | # The theme to use for HTML and HTML Help pages. See the documentation for 90 | # a list of builtin themes. 91 | # 92 | 93 | html_theme = "furo" 94 | html_title = project 95 | html_last_updated_fmt = "%b %d, %Y" 96 | 97 | # Add any paths that contain custom static files (such as style sheets) here, 98 | # relative to this directory. They are copied after the builtin static files, 99 | # so a file named "default.css" will overwrite the builtin "default.css". 100 | html_static_path = [] 101 | 102 | 103 | # Theme options are theme-specific and customize the look and feel of a theme 104 | # further. For a list of options available for each theme, see the 105 | # documentation. 106 | # 107 | # html_theme_options = {} 108 | 109 | # Add any paths that contain custom static files (such as style sheets) here, 110 | # relative to this directory. They are copied after the builtin static files, 111 | # so a file named "default.css" will overwrite the builtin "default.css". 112 | # html_static_path = ["_static"] 113 | 114 | 115 | # -- Options for HTMLHelp output ------------------------------------------ 116 | 117 | # Output file base name for HTML help builder. 118 | htmlhelp_basename = "blocksdoc" 119 | 120 | 121 | # -- Options for LaTeX output --------------------------------------------- 122 | 123 | latex_elements = { 124 | # The paper size ('letterpaper' or 'a4paper'). 125 | # 126 | # 'papersize': 'letterpaper', 127 | # The font size ('10pt', '11pt' or '12pt'). 128 | # 129 | # 'pointsize': '10pt', 130 | # Additional stuff for the LaTeX preamble. 
131 | # 132 | # 'preamble': '', 133 | # Latex figure (float) alignment 134 | # 135 | # 'figure_align': 'htbp', 136 | } 137 | 138 | # Grouping the document tree into LaTeX files. List of tuples 139 | # (source start file, target name, title, 140 | # author, documentclass [howto, manual, or own class]). 141 | latex_documents = [ 142 | (master_doc, "Blocks.tex", u"Blocks Documentation", u"Bradley Axen", "manual"), 143 | ] 144 | 145 | 146 | # -- Options for manual page output --------------------------------------- 147 | 148 | # One entry per manual page. List of tuples 149 | # (source start file, name, description, authors, manual section). 150 | man_pages = [(master_doc, "blocks", u"Blocks Documentation", [author], 1)] 151 | 152 | 153 | # -- Options for Texinfo output ------------------------------------------- 154 | 155 | # Grouping the document tree into Texinfo files. List of tuples 156 | # (source start file, target name, title, author, 157 | # dir menu entry, description, category) 158 | texinfo_documents = [ 159 | ( 160 | master_doc, 161 | "Blocks", 162 | u"Blocks Documentation", 163 | author, 164 | "Blocks", 165 | "One line description of project.", 166 | "Miscellaneous", 167 | ), 168 | ] 169 | 170 | 171 | # -- Autodoc -- 172 | # autodoc_typehints = "description" 173 | -------------------------------------------------------------------------------- /blocks/filesystem/base.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Sequence, Tuple, List 2 | from collections import defaultdict 3 | from fsspec.core import split_protocol, get_filesystem_class, has_magic 4 | 5 | 6 | class FileSystem: 7 | """Filesystem for manipulating files in the cloud 8 | 9 | This supports operations on local files and any other protocol supported by fsspec. 10 | This is a wrapper to fsspec which provides backwards compatibility for blocks filesystems 11 | and a simplified interface. 12 | 13 | Parameters 14 | ---------- 15 | storage_options: Mapping[str, Mapping[str, Any]] 16 | Additional options passed to each filesystem for each protocol 17 | e.g. 
{'gs': {'project': 'example'}} to set the gs filesytem project to example 18 | """ 19 | 20 | def __init__(self, **storage_options): 21 | self.storage_options = defaultdict(dict) 22 | self.storage_options.update(storage_options) 23 | self.storage_options[None]["auto_mkdir"] = True 24 | self.filesystems = {} 25 | 26 | def _get_protocol_path(self, urlpath) -> Tuple[str, List[str]]: 27 | if isinstance(urlpath, str): 28 | return split_protocol(urlpath) 29 | 30 | protocols, paths = zip(*map(split_protocol, urlpath)) 31 | assert ( 32 | len(set(protocols)) == 1 33 | ), "Cannot mix file protocols in a single operation" 34 | return protocols[0], list(paths) 35 | 36 | def _get_filesystem(self, protocol): 37 | if protocol not in self.filesystems: 38 | self.filesystems[protocol] = get_filesystem_class(protocol)( 39 | **self.storage_options[protocol] 40 | ) 41 | return self.filesystems[protocol] 42 | 43 | def ls(self, path: str) -> Sequence[str]: 44 | """List files correspond to path, including glob wildcards 45 | 46 | Parameters 47 | ---------- 48 | path : str 49 | The path to the file or directory to list; supports wildcards 50 | """ 51 | protocol, path = self._get_protocol_path(path) 52 | fs = self._get_filesystem(protocol) 53 | try: 54 | if has_magic(path): 55 | output = fs.glob(path) 56 | else: 57 | output = fs.ls(path) 58 | # TODO fix in base 59 | except FileNotFoundError: 60 | return [] 61 | except NotADirectoryError: 62 | return [path] 63 | 64 | if protocol is not None: 65 | output = ["://".join([protocol, path]) for path in output] 66 | return sorted(output) 67 | 68 | def copy( 69 | self, 70 | sources: Union[str, Sequence[str]], 71 | dest: Union[str, Sequence[str]], 72 | recursive=False, 73 | ): 74 | """Copy the files in sources to dest 75 | 76 | Parameters 77 | ---------- 78 | sources : list of str 79 | The list of paths to copy 80 | dest : str 81 | The destination(s) for the copy of source(s) 82 | recursive : bool 83 | If true, recursively copy any directories 84 | """ 85 | if isinstance(sources, str): 86 | sources = [sources] 87 | 88 | protocol_source, sources = self._get_protocol_path(sources) 89 | protocol_dest, dest = self._get_protocol_path(dest) 90 | 91 | if protocol_source == protocol_dest: 92 | fs = self._get_filesystem(protocol_source) 93 | 94 | # Temporary workaround for a bug in gcsfs 95 | if protocol_source == "gs" and recursive: 96 | sources = fs.expand_path(sources, recursive=True) 97 | sources = ["gs://" + s for s in sources if not fs.isdir(s)] 98 | return self.copy(sources, "gs://" + dest, recursive=False) 99 | 100 | fs.copy(sources, dest, recursive=recursive) 101 | 102 | elif protocol_source is None: 103 | fs = self._get_filesystem(protocol_dest) 104 | fs.put(sources, dest, recursive=recursive) 105 | 106 | elif protocol_dest is None: 107 | fs = self._get_filesystem(protocol_source) 108 | fs.get(sources, dest, recursive=recursive) 109 | 110 | elif protocol_dest is not None and protocol_source is not None: 111 | raise NotImplementedError( 112 | "Cannot do direct copy between two different cloud filesystems" 113 | ) 114 | 115 | if protocol_dest == "gs": 116 | # Make sure we invalidate the gcsfs cache since we have added new files 117 | if isinstance(dest, str): 118 | fs.invalidate_cache(dest) 119 | else: 120 | for d in dest: 121 | fs.invalidate_cache(d) 122 | 123 | def remove(self, paths: Union[str, List[str]], recursive: bool = False): 124 | """Remove the files at paths 125 | 126 | Parameters 127 | ---------- 128 | paths : list of str 129 | The paths to remove 130 | recursive : 
bool, default False 131 | If true, recursively remove any directories 132 | """ 133 | protocol, paths = self._get_protocol_path(paths) 134 | fs = self._get_filesystem(protocol) 135 | 136 | if protocol is None and not isinstance(paths, str): 137 | # TODO should local not just handle this? 138 | for path in paths: 139 | fs.rm(path, recursive=recursive) 140 | else: 141 | return fs.rm(paths, recursive=recursive) 142 | 143 | def open(self, path: str, mode="rb", **kwargs): 144 | """Return a file-like object from the filesystem 145 | 146 | The resultant instance must function correctly in a context ``with`` 147 | block. 148 | 149 | Parameters 150 | ---------- 151 | path: str 152 | Target file 153 | mode: str like 'rb', 'w' 154 | See builtin ``open()`` 155 | kwargs: 156 | Forwarded to the filesystem implementation 157 | """ 158 | protocol, path = self._get_protocol_path(path) 159 | fs = self._get_filesystem(protocol) 160 | return fs.open(path, mode, **kwargs) 161 | 162 | def isdir(self, path: str): 163 | """Check if the path is a directory""" 164 | protocol, path = self._get_protocol_path(path) 165 | fs = self._get_filesystem(protocol) 166 | return fs.isdir(path) 167 | 168 | def mkdir(self, path: str): 169 | """Make directory at path""" 170 | protocol, path = self._get_protocol_path(path) 171 | fs = self._get_filesystem(protocol) 172 | return fs.mkdir(path) 173 | 174 | # Aliases 175 | cp = copy 176 | rm = remove 177 | -------------------------------------------------------------------------------- /docs/quickstart.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Quickstart 3 | ========== 4 | 5 | Layout 6 | ------ 7 | 8 | In the simplest case, you might want to read your data from a single file. This is pretty easy in 9 | pandas, but blocks adds additional support for inferring file types and support cloud storage: 10 | 11 | .. code-block:: python 12 | 13 | import pandas as pd 14 | import blocks 15 | df = blocks.assemble('data.pkl') # same as pd.read_pickle 16 | df = blocks.assemble('gs://mybucket/data.parquet') 17 | 18 | Many projects need to combine data stored in several files. To support this, blocks makes a few 19 | assumptions about your data. You've split it up into blocks, either into groups of columns (cgroups) 20 | or groups of rows (rgroups). You can read all this data into a single dataframe in memory with one 21 | command: 22 | 23 | .. code-block:: python 24 | 25 | import blocks 26 | blocks.assemble('data/') 27 | 28 | If all of your files are in one directory, then the rows will be concatenated: 29 | 30 | :: 31 | 32 | data 33 | ├── part.00.pq 34 | ├── part.01.pq 35 | └── part.02.pq 36 | 37 | 38 | 39 | If your files actually contain the same rows but store different columns, you should place them in different folders with corresponding names: 40 | 41 | :: 42 | 43 | data 44 | ├── g0 45 | │   └── part.00.pq 46 | ├── g1 47 | │   └── part.00.pq 48 | └── g2 49 | └── part.00.pq 50 | 51 | 52 | In the most general case you can do both, laying out your data in multiple cgroups and rgroups - where each rgroup should contain the same 53 | logical rows (e.g. 
different attributes of the same event) 54 | 55 | :: 56 | 57 | ─ data 58 | ├── g0 59 | │   ├── part.00.pq 60 | │   ├── part.01.pq 61 | │   ├── part.02.pq 62 | │   └── part.03.pq 63 | ├── g1 64 | │   ├── part.00.pq 65 | │   ├── part.01.pq 66 | │   ├── part.02.pq 67 | │   └── part.03.pq 68 | ├── g2 69 | │   ├── part.00.pq 70 | │   ├── part.01.pq 71 | │   ├── part.02.pq 72 | │   └── part.03.pq 73 | └── g3 74 | ├── part.00.pq 75 | ├── part.01.pq 76 | ├── part.02.pq 77 | └── part.03.pq 78 | 79 | This corresponds to the following dataframe structure: 80 | 81 | .. image:: blocks_layout.png 82 | 83 | 84 | This pattern generalizes well once you start collecting data from multiple sources, or once the 85 | dataset grows too large to fit comfortably into memory at once. 86 | 87 | Blocks supports multiple data formats, including ``csv``, ``hdf5``, ``pickle``, and ``parquet``. Reads from these files 88 | are handled by ``pandas`` libraries, so they support all of the options you expect like headers, index columns, etc. 89 | All of the ``blocks`` interfaces below support passing keyword args to the read functions for the files (see the docstrings). 90 | The files can be local (referenced by normal paths) or on GCS (referenced by paths like ``gs://bucket``). 91 | 92 | **Note that rgroups are combined by simple concatenation, and cgroups are combined by a "natural join" (an inner merge by default): 93 | any shared columns are considered join keys.** Key-based merging only makes sense with named columns, so make sure 94 | any CSVs you use have a column header if you want to join cgroups. 95 | 96 | Read 97 | ---- 98 | 99 | Assemble 100 | ======== 101 | 102 | Assemble is the primary data reading command, and can handle any of the layouts above. You can select subsets of the data 103 | using glob patterns or the ``cgroups`` and ``rgroups`` arguments: 104 | 105 | 106 | ======================= ===================================================================== 107 | Layout Recipe 108 | ======================= ===================================================================== 109 | .. image:: both.png .. code-block:: python 110 | 111 | blocks.assemble('data/') 112 | 113 | .. image:: column.png .. code-block:: python 114 | 115 | blocks.assemble('data/g1/*') 116 | # or 117 | blocks.assemble('data/', cgroups=['g1']) 118 | 119 | .. image:: row.png .. code-block:: python 120 | 121 | blocks.assemble('data/*/part.01.pq') 122 | # or 123 | blocks.assemble('data/', rgroups=['part.01.pq']) 124 | 125 | .. image:: filtered.png .. code-block:: python 126 | 127 | blocks.assemble('data/*/part.01.pq', cgroups=['g0', 'g1', 'g3']) 128 | # or 129 | blocks.assemble( 130 | 'data/', 131 | rgroups=['part.01.pq'], 132 | cgroups=['g0', 'g1', 'g3'] 133 | ) 134 | 135 | ======================= ===================================================================== 136 | 137 | Iterate 138 | ======= 139 | 140 | Blocks also has an iterative option for performing operations on each of the blocks without loading them all 141 | into memory at once: 142 | 143 | .. code-block:: python 144 | 145 | import blocks 146 | 147 | for cgroup, rgroup, df in blocks.iterate('data/'): 148 | print(df.shape) 149 | 150 | 151 | ``iterate`` supports the same syntax and features as ``assemble`` above, but instead of returning a merged dataframe, 152 | it returns an iterator of ``(cgroup, rgroup, dataframe)``, where ``cgroup`` and ``rgroup`` are the names of the 153 | groups (``'g0'`` and ``'part.00.pq'`` from above).
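For example, you can compute a summary per block without ever loading the full dataset, and filter to specific groups at the same time (a small sketch; it assumes the ``data/`` layout shown above):

.. code-block:: python

    import blocks

    # shape of every block in two of the cgroups, holding one block in memory at a time
    shapes = {}
    for cgroup, rgroup, df in blocks.iterate('data/', cgroups=['g0', 'g1']):
        shapes[(cgroup, rgroup)] = df.shape

    print(shapes)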
154 | 155 | 156 | ``iterate`` can also operate on multiple axes - the default is to iterate over every block separately. But if you 157 | specify ``axis=0``, then iterate will combine cgroups and iterate over rgroups, and for ``axis=1`` it will iterate 158 | over the cgroups while combining any rgroups. 159 | 160 | ========================== ===================================================================== 161 | Direction Recipe 162 | ========================== ===================================================================== 163 | .. image:: row_iter.png .. code-block:: python 164 | 165 | # iterate over one dataframe per rgroup 166 | for rgroup, df in blocks.iterate('gs://path/to/data', axis=0): 167 | print(df.shape) 168 | .. image:: column_iter.png .. code-block:: python 169 | 170 | # iterate over one dataframe per cgroup 171 | for cgroup, df in blocks.iterate('gs://path/to/data', axis=1): 172 | print(df.shape) 173 | ========================== ===================================================================== 174 | 175 | 176 | Partitioned 177 | =========== 178 | 179 | Dask_ provides a great interface to a partitioned dataframe, and you can use blocks' simple syntax to 180 | build a ``dask.dataframe``. Check out the dask documentation for details on how to use the resulting object. 181 | 182 | .. code-block:: python 183 | 184 | import blocks 185 | 186 | # requires dask to be installed separately 187 | dask_df = blocks.partitioned('data/*/part_0[1-4].pq') 188 | 189 | dask_df.groupby('category').mean().compute() 190 | 191 | 192 | Write 193 | ----- 194 | 195 | Place 196 | ===== 197 | 198 | If you want to put a dataframe into a single file, use ``place``: 199 | 200 | .. code-block:: python 201 | 202 | import blocks 203 | 204 | blocks.place(df, 'data/part_00.pq') 205 | blocks.place(df, 'gs://mybucket/data/part_00.pq') 206 | 207 | Like with ``assemble`` for a single file, this is easy in ``pandas``, but ``blocks`` infers the file 208 | type and has support for cloud storage. 209 | 210 | Divide 211 | ====== 212 | 213 | For partitioning your data, blocks also has a divide function. You'd use this to split up a single large dataframe 214 | in memory into many rgroups and/or cgroups on disk, to help with parallelizing analysis. By default the blocks are 215 | written as ``parquet`` files, but you can specify other extensions including ``.hdf5``, ``.csv``, and ``.pkl``. 216 | 217 | .. code-block:: python 218 | 219 | import blocks 220 | 221 | # divide into just row groups 222 | blocks.divide(df, 'data/', n_rgroup=3) 223 | 224 | :: 225 | 226 | data 227 | ├── part_00.pq 228 | ├── part_01.pq 229 | └── part_02.pq 230 | 231 | Divide can also handle column groups: 232 | 233 | ..
code-block:: python 234 | 235 | # split into 10 rgroups and specific cgroups 236 | cgroup_columns = { 237 | 'g0': ['id', 'timestamp', 'metadata'], 238 | 'g1': ['id', 'timestamp', 'feature0', 'feature1'], 239 | 'g2': ['id', 'timestamp', 'feature2', 'feature3'], 240 | 'g3': ['id', 'timestamp', 'feature4', 'feature5', 'feature6'], 241 | } 242 | blocks.divide(df, 'data/', 4, cgroup_columns=cgroup_columns) 243 | 244 | :: 245 | 246 | ─ data 247 | ├── g0 248 | │   ├── part.00.pq 249 | │   ├── part.01.pq 250 | │   ├── part.02.pq 251 | │   └── part.03.pq 252 | ├── g1 253 | │   ├── part.00.pq 254 | │   ├── part.01.pq 255 | │   ├── part.02.pq 256 | │   └── part.03.pq 257 | ├── g2 258 | │   ├── part.00.pq 259 | │   ├── part.01.pq 260 | │   ├── part.02.pq 261 | │   └── part.03.pq 262 | └── g3 263 | ├── part.00.pq 264 | ├── part.01.pq 265 | ├── part.02.pq 266 | └── part.03.pq 267 | 268 | 269 | .. _Dask: http://dask.pydata.org/en/latest/ 270 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | -------------------------------------------------------------------------------- /blocks/core.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | import warnings 6 | import pickle as _pickle 7 | 8 | from functools import reduce 9 | from collections import defaultdict, OrderedDict 10 | from typing import ( 11 | Optional, 12 | Sequence, 13 | Any, 14 | Dict, 15 | Iterator, 16 | Tuple, 17 | DefaultDict, 18 | List, 19 | Iterable, 20 | Union, 21 | ) 22 | 23 | from blocks.filesystem import FileSystem 24 | from blocks.utils import with_function_tmpdir, with_session_tmpdir 25 | from blocks.dfio import read_df, write_df 26 | 27 | cgroup = str 28 | rgroup = str 29 | 30 | 31 | @with_function_tmpdir 32 | def assemble( 33 | path: str, 34 | cgroups: Optional[Sequence[cgroup]] = None, 35 | rgroups: Optional[Sequence[rgroup]] = None, 36 | read_args: Any = {}, 37 | cgroup_args: Dict[cgroup, Any] = {}, 38 | merge: str = "inner", 39 | filesystem: FileSystem = FileSystem(), 40 | tmpdir: str = None, 41 | ) -> pd.DataFrame: 42 | """Assemble multiple dataframe blocks into a single frame 43 | 44 | Each file included in the path (or subdirs of that path) is combined into 45 | a single dataframe by first concatenating over row groups and then merging 46 | over column groups. A row group is a subset of rows of the data stored in 47 | different files. A column group is a subset of columns of the data stored in 48 | different folders. The merges are performed in the order of listed cgroups if 49 | provided, otherwise in alphabetic order. Files are opened by a method inferred 50 | from their extension. 51 | 52 | Parameters 53 | ---------- 54 | path : str 55 | The glob-able path to all data files to assemble into a frame 56 | e.g. 
gs://example/*/*, gs://example/*/part.0.pq, gs://example/c[1-2]/* 57 | See the README for a more detailed explanation 58 | cgroups : list of str, optional 59 | The list of cgroups (folder names) to include from the glob path 60 | rgroups : list of str, optional 61 | The list of rgroups (file names) to include from the glob path 62 | read_args : dict, optional 63 | Any additional keyword args to pass to the read function 64 | cgroup_args : {cgroup: kwargs}, optional 65 | Any cgroup specific read arguments, where each key is the name 66 | of the cgroup and each value is a dictionary of keyword args 67 | merge : one of 'left', 'right', 'outer', 'inner', default 'inner' 68 | The merge strategy to pass to pandas.merge 69 | filesystem : blocks.filesystem.FileSystem or similar 70 | A filesystem object that implements the blocks.FileSystem API 71 | 72 | Returns 73 | ------- 74 | data : pd.DataFrame 75 | The combined dataframe from all the blocks 76 | 77 | """ 78 | grouped = _collect(path, cgroups, rgroups, filesystem, tmpdir) 79 | 80 | # ---------------------------------------- 81 | # Concatenate all rgroups 82 | # ---------------------------------------- 83 | frames = [] 84 | 85 | for group in grouped: 86 | files = grouped[group] 87 | args = read_args.copy() 88 | if group in cgroup_args: 89 | args.update(cgroup_args[group]) 90 | frames.append(pd.concat(read_df(f, **args) for f in files)) 91 | 92 | # ---------------------------------------- 93 | # Merge all cgroups 94 | # ---------------------------------------- 95 | df = _merge_all(frames, merge=merge) 96 | return df 97 | 98 | 99 | @with_function_tmpdir 100 | def iterate( 101 | path: str, 102 | axis: int = -1, 103 | cgroups: Optional[Sequence[cgroup]] = None, 104 | rgroups: Optional[Sequence[rgroup]] = None, 105 | read_args: Any = {}, 106 | cgroup_args: Dict[cgroup, Any] = {}, 107 | merge: str = "inner", 108 | filesystem: FileSystem = FileSystem(), 109 | tmpdir: str = None, 110 | ) -> Union[ 111 | Iterator[Tuple[cgroup, rgroup, pd.DataFrame]], Iterator[Tuple[str, pd.DataFrame]] 112 | ]: 113 | """Iterate over dataframe blocks 114 | 115 | Each file included in the path (or subdirs of that path) is opened as a 116 | dataframe and returned in a generator of (cname, rname, dataframe). 117 | Files are opened by a method inferred from their extension. 118 | 119 | Parameters 120 | ---------- 121 | path : str 122 | The glob-able path to all files to assemble into a frame 123 | e.g.
gs://example/*/*, gs://example/*/part.0.pq, gs://example/c[1-2]/* 124 | See the README for a more detailed explanation 125 | axis : int, default -1 126 | The axis to iterate along 127 | If -1 (the default), iterate over both columns and rows 128 | If 0, iterate over the rgroups, combining any cgroups 129 | If 1, iterate over the cgroups, combining any rgroups 130 | cgroups : list of str, or {str: args} optional 131 | The list of cgroups (folder names) to include from the glob path 132 | rgroups : list of str, optional 133 | The list of rgroups (file names) to include from the glob path 134 | read_args : dict, optional 135 | Any additional keyword args to pass to the read function 136 | cgroup_args : {cgroup: kwargs}, optional 137 | Any cgroup specific read arguments, where each key is the name 138 | of the cgroup and each value is a dictionary of keyword args 139 | merge : one of 'left', 'right', 'outer', 'inner', default 'inner' 140 | The merge strategy to pass to pandas.merge, only used when axis=0 141 | filesystem : blocks.filesystem.FileSystem or similar 142 | A filesystem object that implements the blocks.FileSystem API 143 | 144 | Returns 145 | ------- 146 | data : generator 147 | A generator of (cname, rname, dataframe) for each collected path 148 | If axis=0, yields (rname, dataframe) 149 | If axis=1, yields (cname, dataframe) 150 | 151 | """ 152 | grouped = _collect(path, cgroups, rgroups, filesystem, tmpdir) 153 | 154 | if axis == -1: 155 | for cgroup in grouped: 156 | args = read_args.copy() 157 | if cgroup in cgroup_args: 158 | args.update(cgroup_args[cgroup]) 159 | for path in grouped[cgroup]: 160 | yield _cname(path), _rname(path), read_df(path, **args) 161 | 162 | elif axis == 0: 163 | # find the shared files among all subfolders 164 | rgroups = _shared_rgroups(grouped) 165 | 166 | for rgroup in sorted(rgroups): 167 | frames = [] 168 | for cgroup in grouped: 169 | path = next(d for d in grouped[cgroup] if _rname(d) == rgroup) 170 | 171 | args = read_args.copy() 172 | if cgroup in cgroup_args: 173 | args.update(cgroup_args[cgroup]) 174 | frames.append(read_df(path, **args)) 175 | yield rgroup, _merge_all(frames, merge=merge) 176 | 177 | elif axis == 1: 178 | for cgroup in grouped: 179 | files = grouped[cgroup] 180 | args = read_args.copy() 181 | if cgroup in cgroup_args: 182 | args.update(cgroup_args[cgroup]) 183 | yield cgroup, pd.concat(read_df(path, **args) for path in files) 184 | 185 | else: 186 | raise ValueError("Invalid choice for axis, options are -1, 0, 1") 187 | 188 | 189 | @with_session_tmpdir 190 | def partitioned( 191 | path: str, 192 | cgroups: Sequence[cgroup] = None, 193 | rgroups: Sequence[rgroup] = None, 194 | read_args: Any = {}, 195 | cgroup_args: Dict[cgroup, Any] = {}, 196 | merge: str = "inner", 197 | filesystem: FileSystem = FileSystem(), 198 | tmpdir: str = None, 199 | ): 200 | """Return a partitioned dask dataframe, where each partition is a row group 201 | 202 | The results are the same as iterate with axis=0, except that it returns a dask dataframe 203 | instead of a generator. Note that this requires dask to be installed 204 | 205 | Parameters 206 | ---------- 207 | path : str 208 | The glob-able path to all files to assemble into a frame 209 | e.g. 
gs://example/*/*, gs://example/*/part.0.pq, gs://example/c[1-2]/* 210 | See the README for a more detailed explanation 211 | cgroups : list of str, or {str: args} optional 212 | The list of cgroups (folder names) to include from the glob path 213 | rgroups : list of str, optional 214 | The list of rgroups (file names) to include from the glob path 215 | read_args : dict, optional 216 | Any additional keyword args to pass to the read function 217 | cgroup_args : {cgroup: kwargs}, optional 218 | Any cgroup specific read arguments, where each key is the name 219 | of the cgroup and each value is a dictionary of keyword args 220 | merge : one of 'left', 'right', 'outer', 'inner', default 'inner' 221 | The merge strategy to pass to pandas.merge, only used when axis=0 222 | filesystem : blocks.filesystem.FileSystem or similar 223 | A filesystem object that implements the blocks.FileSystem API 224 | 225 | Returns 226 | ------- 227 | data : dask.dataframe 228 | A dask dataframe partitioned by row groups, with all cgroups merged 229 | 230 | """ 231 | try: 232 | import dask 233 | import dask.dataframe as dd 234 | except ImportError: 235 | raise ImportError("Partitioned requires dask[dataframe] to be installed") 236 | 237 | grouped = _collect(path, cgroups, rgroups, filesystem, tmpdir) 238 | blocks = [] 239 | 240 | @dask.delayed() 241 | def merged(rgroup): 242 | frames = [] 243 | for cgroup in grouped: 244 | p = next(p for p in grouped[cgroup] if os.path.basename(p) == rgroup) 245 | args = read_args.copy() 246 | if cgroup in cgroup_args: 247 | args.update(cgroup_args[cgroup]) 248 | frames.append(read_df(p, **args)) 249 | return _merge_all(frames, merge=merge) 250 | 251 | for rgroup in _shared_rgroups(grouped): 252 | blocks.append(merged(rgroup)) 253 | return dd.from_delayed(blocks) 254 | 255 | 256 | @with_function_tmpdir 257 | def place( 258 | df: pd.DataFrame, 259 | path: str, 260 | filesystem: FileSystem = FileSystem(), 261 | tmpdir: str = None, 262 | **write_args, 263 | ) -> None: 264 | """Place a dataframe block onto the filesystem at the specified path 265 | 266 | Parameters 267 | ---------- 268 | df : pd.DataFrame 269 | The data to place 270 | path : str 271 | Path to the directory (possibly on GCS) in which to place the columns 272 | write_args : dict 273 | Any additional args to pass to the write function 274 | filesystem : blocks.filesystem.FileSystem or similar 275 | A filesystem object that implements the blocks.FileSystem API 276 | 277 | """ 278 | fname = os.path.basename(path) 279 | tmp = os.path.join(tmpdir, fname) 280 | write_df(df, tmp, **write_args) 281 | filesystem.copy(tmp, path) 282 | 283 | 284 | @with_function_tmpdir 285 | def divide( 286 | df: pd.DataFrame, 287 | path: str, 288 | n_rgroup: int = 1, 289 | rgroup_offset: int = 0, 290 | cgroup_columns: Optional[Dict[Optional[cgroup], Sequence[str]]] = None, 291 | extension: str = ".pq", 292 | convert: bool = False, 293 | filesystem: FileSystem = FileSystem(), 294 | prefix=None, 295 | tmpdir: str = None, 296 | **write_args, 297 | ) -> None: 298 | """Split a dataframe into rgroups/cgroups and save to disk 299 | 300 | Note that this splitting does not preserve the original index, so make sure 301 | to have another column to track values 302 | 303 | Parameters 304 | ---------- 305 | df : pd.DataFrame 306 | The data to divide 307 | path : str 308 | Path to the directory (possibly on GCS) in which to place the columns 309 | n_rgroup : int, default 1 310 | The number of row groups to partition the data into 311 | The rgroups will have 
approximately equal sizes 312 | rgroup_offset : int, default 0 313 | The index to start from in the name of file parts 314 | e.g. If rgroup_offset=10 then the first file will be `part_00010.pq` 315 | cgroup_columns : {cgroup: list of column names} 316 | The column lists to form cgroups; if None, do not make cgroups 317 | Each key is the name of the cgroup, and each value is the list of columns to include 318 | To reassemble later make sure to include join keys for each cgroup 319 | extension : str, default .pq 320 | The file extension for the dataframe (file type is inferred from this extension) 321 | convert : bool, default False 322 | If true, attempt to coerce types to numeric. This can avoid issues with ambiguous 323 | object columns but requires additional time 324 | filesystem : blocks.filesystem.FileSystem or similar 325 | A filesystem object that implements the blocks.FileSystem API 326 | prefix : str 327 | Prefix to add to written filenames 328 | write_args : dict 329 | Any additional args to pass to the write function 330 | 331 | """ 332 | # Use a single dummy cgroup if None wanted 333 | if cgroup_columns is None: 334 | cgroup_columns = {None: df.columns} 335 | 336 | # Add leading dot if not in extension 337 | if extension[0] != ".": 338 | extension = "." + extension 339 | 340 | if convert: 341 | for col in df.columns: 342 | df[col] = pd.to_numeric(df[col], errors="ignore") 343 | 344 | files = [] 345 | for cname, columns in cgroup_columns.items(): 346 | cgroup = df[columns] 347 | 348 | bucket = os.path.join(path, cname) if cname else path 349 | tmp_cgroup = os.path.join(tmpdir, cname) if cname else tmpdir 350 | 351 | if not filesystem.isdir(tmp_cgroup): 352 | filesystem.mkdir(tmp_cgroup) 353 | 354 | rnames = [ 355 | "part_{:05d}{}".format(i + rgroup_offset, extension) 356 | for i in range(n_rgroup) 357 | ] 358 | if prefix is not None: 359 | rnames = [prefix + "_" + rn for rn in rnames] 360 | 361 | for rgroup, rname in zip(np.array_split(cgroup, n_rgroup), rnames): 362 | tmp = os.path.join(tmp_cgroup, rname) 363 | write_df(rgroup.reset_index(drop=True), tmp, **write_args) 364 | files.append((cname, rname) if cname else (rname,)) 365 | 366 | filesystem.copy( 367 | [os.path.join(tmpdir, *f) for f in files], 368 | [os.path.join(path, *f) for f in files], 369 | ) 370 | 371 | 372 | def pickle(obj: Any, path: str, filesystem: FileSystem = FileSystem()): 373 | """Save a pickle of obj at the specified path 374 | 375 | Parameters 376 | ---------- 377 | obj : Object 378 | Any pickle-compatible object 379 | path : str 380 | The path to the location to save the pickle file, supports GCS paths 381 | filesystem : blocks.filesystem.FileSystem or similar 382 | A filesystem object that implements the blocks.FileSystem API 383 | """ 384 | with filesystem.open(path, "wb") as f: 385 | _pickle.dump(obj, f) 386 | 387 | 388 | def unpickle(path: str, filesystem: FileSystem = FileSystem()): 389 | """Load an object from the pickle file at path 390 | 391 | Parameters 392 | ---------- 393 | path : str 394 | The path to the location of the saved pickle file, supports GCS paths 395 | filesystem : blocks.filesystem.FileSystem or similar 396 | A filesystem object that implements the blocks.FileSystem API 397 | 398 | Returns the unpickled object 399 | """ 400 | with filesystem.open(path, "rb") as f: 401 | return _pickle.load(f) 402 | 403 | 404 | def _collect( 405 | path: str, 406 | cgroups: Optional[Sequence[cgroup]], 407 | rgroups: Optional[Sequence[rgroup]], 408 | filesystem: FileSystem, 409 | tmpdir: str,
410 | ) -> Dict[cgroup, Sequence[str]]: 411 | """Collect paths into cgroups and download any GCS files for local access 412 | 413 | Parameters 414 | ---------- 415 | path : str 416 | The glob-able path to all files to assemble into a frame 417 | e.g. gs://example/*/*, gs://example/*/part.0.pq, gs://example/c[1-2]/* 418 | See the README for a more detailed explanation 419 | cgroups : list of str, optional 420 | The list of cgroups (folder names) to include from the glob path 421 | rgroups : list of str, optional 422 | The list of rgroups (file names) to include from the glob path 423 | filesystem : blocks.filesystem.FileSystem or similar 424 | A filesystem object that implements the blocks.FileSystem API 425 | tmpdir : str 426 | The path of a temporary directory to use for copies of files 427 | 428 | Returns 429 | ------- 430 | grouped : {cgroup: list of paths} 431 | Paths to local copies of the data, grouped by cgroup 432 | 433 | """ 434 | # ---------------------------------------- 435 | # Collect paths into cgroups 436 | # ---------------------------------------- 437 | paths = filesystem.ls(path) 438 | if not paths: 439 | raise ValueError(f"Did not find any files at the path: {path}") 440 | expanded = _expand(paths, filesystem) 441 | filtered = _filter(expanded, cgroups, rgroups) 442 | grouped = _cgroups(filtered) 443 | accessed = _access(grouped, filesystem, tmpdir) 444 | 445 | # Go in specified cgroup order, or alphabetical if not specified 446 | if cgroups is None: 447 | cgroups = sorted(grouped.keys()) 448 | 449 | return OrderedDict((k, accessed[k]) for k in cgroups) 450 | 451 | 452 | def _has_ext(path: str) -> bool: 453 | return os.path.splitext(path)[1] != "" 454 | 455 | 456 | def _expand(paths: Sequence[str], filesystem: FileSystem) -> List[str]: 457 | """For any directories in paths, expand into all the contained files""" 458 | expanded = [] 459 | for path in paths: 460 | if _has_ext(path): 461 | # Has an extension so treat it as a file 462 | expanded.append(path) 463 | else: 464 | # Otherwise try to read it like a directory 465 | expanded += filesystem.ls(path + "**") 466 | # Some cases might result in duplicates, so we convert to set and back 467 | return sorted(set(p for p in expanded if _has_ext(p))) 468 | 469 | 470 | def _filter( 471 | paths: Sequence[str], 472 | cgroups: Optional[Sequence[cgroup]], 473 | rgroups: Optional[Sequence[rgroup]], 474 | ) -> List[str]: 475 | """Keep only paths with the appropriate cgroups and/or rgroups""" 476 | kept = [] 477 | for path in paths: 478 | valid_cgroup = cgroups is None or _cname(path) in cgroups 479 | valid_rgroup = rgroups is None or _rname(path) in rgroups 480 | if valid_cgroup and valid_rgroup: 481 | kept.append(path) 482 | return kept 483 | 484 | 485 | def _base(path: str) -> str: 486 | """Get base from path (name of the top level folder)""" 487 | return os.path.dirname(os.path.dirname(path)) 488 | 489 | 490 | def _cname(path: str) -> cgroup: 491 | """Get cname from path (name of the parent folder)""" 492 | return os.path.basename(os.path.dirname(path)) 493 | 494 | 495 | def _rname(path: str) -> rgroup: 496 | """Get rname from path (name of the file)""" 497 | return os.path.basename(path) 498 | 499 | 500 | def _cgroups(paths: Sequence[str]) -> DefaultDict[cgroup, List[str]]: 501 | """Group paths by cgroup (the parent folder)""" 502 | cgroups = defaultdict(list) 503 | for path in paths: 504 | cgroups[_cname(path)].append(path) 505 | return cgroups 506 | 507 | 508 | def _access(cgroups, filesystem: FileSystem, tmpdir: str) ->
Dict[cgroup, List[str]]: 509 | """Access potentially cloud stored files, preserving cgroups""" 510 | updated = {} 511 | 512 | for cgroup, paths in cgroups.items(): 513 | if filesystem._get_protocol_path(paths)[0] is None: 514 | updated[cgroup] = paths 515 | else: 516 | tmp_cgroup = os.path.join(tmpdir, cgroup, "") 517 | filesystem.copy(paths, tmp_cgroup) 518 | updated[cgroup] = filesystem.ls(tmp_cgroup) 519 | return updated 520 | 521 | 522 | def _safe_merge(df1: pd.DataFrame, df2: pd.DataFrame, merge="inner") -> pd.DataFrame: 523 | """Merge two dataframes, warning of any shape differences""" 524 | s1, s2 = df1.shape[0], df2.shape[0] 525 | if s1 != s2: 526 | warnings.warn( 527 | f"The two cgroups have a different number of rows: {s1} versus {s2}" 528 | ) 529 | return pd.merge(df1, df2, how=merge) 530 | 531 | 532 | def _merge_all(frames: Sequence[pd.DataFrame], merge="inner") -> pd.DataFrame: 533 | """Merge a list of dataframes with safe merge""" 534 | result = frames[0] 535 | for frame in frames[1:]: 536 | result = _safe_merge(result, frame, merge) 537 | return result 538 | 539 | 540 | def _shared_rgroups(grouped) -> Iterable[rgroup]: 541 | rgroups = [[_rname(path) for path in group] for group in grouped.values()] 542 | return reduce(lambda a, b: set(a) & set(b), rgroups) 543 | --------------------------------------------------------------------------------
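As a quick end-to-end illustration of the ``divide``/``assemble`` round trip documented in ``docs/quickstart.rst`` above, here is a minimal sketch; the dataframe, paths, and column names are invented for the example, and it assumes ``blocks`` and a parquet engine such as ``pyarrow`` are installed:

.. code-block:: python

    import pandas as pd
    import blocks

    # a small frame with a shared 'id' key and two groups of feature columns
    df = pd.DataFrame({
        'id': range(6),
        'feature0': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
        'feature1': [1, 2, 3, 4, 5, 6],
    })

    # write two rgroups x two cgroups under data/; both cgroups keep the 'id' join key
    blocks.divide(
        df,
        'data/',
        n_rgroup=2,
        cgroup_columns={'g0': ['id', 'feature0'], 'g1': ['id', 'feature1']},
    )

    # read it back: rgroups are concatenated, cgroups are merged on the shared 'id'
    roundtrip = blocks.assemble('data/')
    assert sorted(roundtrip.columns) == ['feature0', 'feature1', 'id']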