├── .gitattributes ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── docs ├── zmq.png └── zmq.svg ├── partd ├── __init__.py ├── _version.py ├── buffer.py ├── compressed.py ├── core.py ├── dict.py ├── encode.py ├── file.py ├── numpy.py ├── pandas.py ├── pickle.py ├── python.py ├── tests │ ├── test_buffer.py │ ├── test_compressed.py │ ├── test_dict.py │ ├── test_encode.py │ ├── test_file.py │ ├── test_numpy.py │ ├── test_pandas.py │ ├── test_partd.py │ ├── test_pickle.py │ ├── test_python.py │ ├── test_utils.py │ └── test_zmq.py ├── utils.py └── zmq.py ├── pyproject.toml ├── requirements.txt └── setup.py /.gitattributes: -------------------------------------------------------------------------------- 1 | partd/_version.py export-subst 2 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | name: Python ${{ matrix.python-version }} 8 | runs-on: ubuntu-latest 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | python-version: ["3.9", "3.10", "3.11", "3.12"] 13 | 14 | steps: 15 | - name: Checkout source 16 | uses: actions/checkout@v2 17 | 18 | - name: Setup Conda Environment 19 | uses: conda-incubator/setup-miniconda@v2.2.0 20 | with: 21 | miniforge-variant: Mambaforge 22 | miniforge-version: latest 23 | use-mamba: true 24 | channel-priority: strict 25 | python-version: ${{ matrix.python-version }} 26 | auto-activate-base: false 27 | 28 | - name: Install dependencies 29 | shell: bash -l {0} 30 | run: mamba install pytest locket numpy toolz pandas blosc pyzmq pyarrow -c conda-forge 31 | 32 | - name: Install 33 | shell: bash -l {0} 34 | run: pip install . 35 | 36 | - name: Run Tests 37 | shell: bash -l {0} 38 | run: pytest partd --doctest-modules --verbose 39 | env: 40 | PYTHON_VERSION: ${{ matrix.python-version }} 41 | 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | 3 | *.egg-info/ 4 | build/ 5 | dist/ 6 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Dask is a community maintained project. We welcome contributions in the form of bug reports, documentation, code, design proposals, and more. 2 | 3 | For general information on how to contribute see https://docs.dask.org/en/latest/develop.html. 4 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Continuum Analytics, Inc. and contributors 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 
13 | 
14 | Neither the name of Continuum Analytics nor the names of any contributors
15 | may be used to endorse or promote products derived from this software
16 | without specific prior written permission.
17 | 
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
28 | THE POSSIBILITY OF SUCH DAMAGE.
29 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include partd *.py
2 | 
3 | include setup.py
4 | include README.rst
5 | include LICENSE.txt
6 | include MANIFEST.in
7 | include versioneer.py
8 | include partd/_version.py
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | PartD
2 | =====
3 | 
4 | |Build Status| |Version Status|
5 | 
6 | Key-value byte store with appendable values
7 | 
8 | Partd stores key-value pairs.
9 | Values are raw bytes.
10 | New values are appended onto old values.
11 | 
12 | Partd excels at shuffling operations.
13 | 
14 | Operations
15 | ----------
16 | 
17 | PartD has two main operations, ``append`` and ``get``.
18 | 
19 | 
20 | Example
21 | -------
22 | 
23 | 1. Create a Partd backed by a directory::
24 | 
25 |     >>> import partd
26 |     >>> p = partd.File('/path/to/new/dataset/')
27 | 
28 | 2. Append key-byte pairs to the dataset::
29 | 
30 |     >>> p.append({'x': b'Hello ', 'y': b'123'})
31 |     >>> p.append({'x': b'world!', 'y': b'456'})
32 | 
33 | 3. Get bytes associated with keys::
34 | 
35 |     >>> p.get('x')         # One key
36 |     b'Hello world!'
37 | 
38 |     >>> p.get(['y', 'x'])  # List of keys
39 |     [b'123456', b'Hello world!']
40 | 
41 | 4. Destroy the partd dataset::
42 | 
43 |     >>> p.drop()
44 | 
45 | That's it.
46 | 
47 | 
48 | Implementations
49 | ---------------
50 | 
51 | We can back a partd by an in-memory dictionary::
52 | 
53 |     >>> p = Dict()
54 | 
55 | For larger amounts of data, or to share data between processes, we back a partd
56 | by a directory of files.  This uses file-based locks for consistency::
57 | 
58 |     >>> p = File('/path/to/dataset/')
59 | 
60 | However, this can fail for many small writes.  In these cases you may wish to buffer one partd with another, keeping a fixed maximum amount of data in the buffering partd.  This writes the larger elements of the first partd to the second partd when space runs low::
61 | 
62 |     >>> p = Buffer(Dict(), File(), available_memory=2e9)  # 2GB memory buffer
63 | 
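A quick sketch of the buffering behavior, mirroring ``partd/tests/test_buffer.py``
below (exactly which keys spill to the slow store is chosen by an internal
heuristic, so treat the placement shown in the comments as illustrative)::

    >>> fast, slow = Dict(), Dict()
    >>> p = Buffer(fast, slow, available_memory=10)  # tiny limit, for demonstration
    >>> p.append({'x': b'Hello', 'y': b'abc'})       # 8 bytes buffered in fast
    >>> p.append({'x': b'World!', 'y': b'def'})      # over the limit: 'x' spills to slow
    >>> p.get('x')                                   # get() joins fast and slow transparently
    b'HelloWorld!'
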
64 | You might also want to have many distributed processes write to a single
65 | partd consistently.  This can be done with a server
66 | 
67 | *  Server Process::
68 | 
69 |     >>> p = Buffer(Dict(), File(), available_memory=2e9)  # 2GB memory buffer
70 |     >>> s = Server(p, address='ipc://server')
71 | 
72 | *  Worker processes::
73 | 
74 |     >>> p = Client('ipc://server')  # Client machine talks to remote server
75 | 
76 | 
77 | Encodings and Compression
78 | -------------------------
79 | 
80 | Once we can robustly and efficiently append bytes to a partd, we consider
81 | compression and encodings.  This is generally available with the ``Encode``
82 | partd, which accepts three functions: one to apply to bytes as they are
83 | written, one to apply to bytes as they are read, and one to join bytestreams.
84 | Common configurations already exist for common data and compression formats.
85 | 
86 | We may wish to compress and decompress data transparently as we interact with a
87 | partd.  Objects like ``BZ2``, ``Blosc``, ``ZLib`` and ``Snappy`` exist and take
88 | another partd as an argument::
89 | 
90 |     >>> p = File(...)
91 |     >>> p = ZLib(p)
92 | 
93 | These work exactly as before; the (de)compression happens automatically.
94 | 
95 | Common data formats like Python lists, numpy arrays, and pandas
96 | dataframes are also supported out of the box::
97 | 
98 |     >>> p = File(...)
99 |     >>> p = Numpy(p)
100 |     >>> p.append({'x': np.array([...])})
101 | 
102 | This lets us forget about bytes and think instead in our normal data types.
103 | 
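``Encode`` can also host your own serialization.  A minimal sketch (the JSON
codec here is illustrative, not a partd built-in; ``Encode`` itself is the
real class and takes exactly the three functions described above)::

    >>> import json
    >>> from partd import Encode, Dict

    >>> def concat(lists):                # join decoded frames from separate appends
    ...     return sum(lists, [])

    >>> p = Encode(lambda l: json.dumps(l).encode(),  # encode one value to bytes
    ...            lambda b: json.loads(b.decode()),  # decode one frame back
    ...            concat,
    ...            Dict())
    >>> p.append({'x': [1, 2]})
    >>> p.append({'x': [3]})
    >>> p.get('x')
    [1, 2, 3]
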
104 | Composition
105 | -----------
106 | 
107 | In principle we want to compose all of these choices together:
108 | 
109 | 1. Write policy:  ``Dict``, ``File``, ``Buffer``, ``Client``
110 | 2. Encoding:  ``Pickle``, ``Numpy``, ``Pandas``, ...
111 | 3. Compression:  ``Blosc``, ``Snappy``, ...
112 | 
113 | Partd objects compose by nesting.  Here we make a partd that writes
114 | pickle-encoded, BZ2-compressed bytes directly to disk::
115 | 
116 |     >>> p = Pickle(BZ2(File('foo')))
117 | 
118 | We could construct more complex systems that include compression,
119 | serialization, buffering, and remote access::
120 | 
121 |     >>> server = Server(Buffer(Dict(), File(), available_memory=2e9))
122 | 
123 |     >>> client = Pickle(Snappy(Client(server.address)))
124 |     >>> client.append({'x': [1, 2, 3]})
125 | 
126 | .. |Build Status| image:: https://github.com/dask/partd/workflows/CI/badge.svg
127 |    :target: https://github.com/dask/partd/actions?query=workflow%3ACI
128 | .. |Version Status| image:: https://img.shields.io/pypi/v/partd.svg
129 |    :target: https://pypi.python.org/pypi/partd/
130 | 
--------------------------------------------------------------------------------
/docs/zmq.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dask/partd/e832b655606342dc742ec1c564b07abf1ad58383/docs/zmq.png
--------------------------------------------------------------------------------
/docs/zmq.svg:
--------------------------------------------------------------------------------
[SVG diagram: "in memory" groups send to files; text labels "File system" and "ZeroMQ sockets"]
--------------------------------------------------------------------------------
/partd/__init__.py:
--------------------------------------------------------------------------------
1 | from contextlib import suppress
2 | 
3 | from .file import File
4 | from .dict import Dict
5 | from .buffer import Buffer
6 | from .encode import Encode
7 | from .pickle import Pickle
8 | from .python import Python
9 | from .compressed import *
10 | with suppress(ImportError):
11 |     from .numpy import Numpy
12 | with suppress(ImportError):
13 |     from .pandas import PandasColumns, PandasBlocks
14 | with suppress(ImportError):
15 |     from .zmq import Client, Server
16 | 
17 | from . import _version
18 | __version__ = _version.get_versions()['version']
--------------------------------------------------------------------------------
/partd/_version.py:
--------------------------------------------------------------------------------
1 | 
2 | # This file helps to compute a version number in source trees obtained from
3 | # git-archive tarball (such as those provided by githubs download-from-tag
4 | # feature). Distribution tarballs (built by setup.py sdist) and build
5 | # directories (produced by setup.py build) will contain a much shorter file
6 | # that just contains the computed version number.
7 | 
8 | # This file is released into the public domain.
9 | # Generated by versioneer-0.29
10 | # https://github.com/python-versioneer/python-versioneer
11 | 
12 | """Git implementation of _version.py."""
13 | 
14 | import errno
15 | import os
16 | import re
17 | import subprocess
18 | import sys
19 | from typing import Any, Callable, Dict, List, Optional, Tuple
20 | import functools
21 | 
22 | 
23 | def get_keywords() -> Dict[str, str]:
24 |     """Get the keywords needed to look up the version information."""
25 |     # these strings will be replaced by git during git-archive.
26 |     # setup.py/versioneer.py will grep for the variable names, so they must
27 |     # each be defined on a line of their own. _version.py will just call
28 |     # get_keywords().
29 | git_refnames = " (HEAD -> main)" 30 | git_full = "e832b655606342dc742ec1c564b07abf1ad58383" 31 | git_date = "2024-07-15 16:21:10 -0500" 32 | keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} 33 | return keywords 34 | 35 | 36 | class VersioneerConfig: 37 | """Container for Versioneer configuration parameters.""" 38 | 39 | VCS: str 40 | style: str 41 | tag_prefix: str 42 | parentdir_prefix: str 43 | versionfile_source: str 44 | verbose: bool 45 | 46 | 47 | def get_config() -> VersioneerConfig: 48 | """Create, populate and return the VersioneerConfig() object.""" 49 | # these strings are filled in when 'setup.py versioneer' creates 50 | # _version.py 51 | cfg = VersioneerConfig() 52 | cfg.VCS = "git" 53 | cfg.style = "pep440" 54 | cfg.tag_prefix = "" 55 | cfg.parentdir_prefix = "partd-" 56 | cfg.versionfile_source = "partd/_version.py" 57 | cfg.verbose = False 58 | return cfg 59 | 60 | 61 | class NotThisMethod(Exception): 62 | """Exception raised if a method is not valid for the current scenario.""" 63 | 64 | 65 | LONG_VERSION_PY: Dict[str, str] = {} 66 | HANDLERS: Dict[str, Dict[str, Callable]] = {} 67 | 68 | 69 | def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator 70 | """Create decorator to mark a method as the handler of a VCS.""" 71 | def decorate(f: Callable) -> Callable: 72 | """Store f in HANDLERS[vcs][method].""" 73 | if vcs not in HANDLERS: 74 | HANDLERS[vcs] = {} 75 | HANDLERS[vcs][method] = f 76 | return f 77 | return decorate 78 | 79 | 80 | def run_command( 81 | commands: List[str], 82 | args: List[str], 83 | cwd: Optional[str] = None, 84 | verbose: bool = False, 85 | hide_stderr: bool = False, 86 | env: Optional[Dict[str, str]] = None, 87 | ) -> Tuple[Optional[str], Optional[int]]: 88 | """Call the given command(s).""" 89 | assert isinstance(commands, list) 90 | process = None 91 | 92 | popen_kwargs: Dict[str, Any] = {} 93 | if sys.platform == "win32": 94 | # This hides the console window if pythonw.exe is used 95 | startupinfo = subprocess.STARTUPINFO() 96 | startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW 97 | popen_kwargs["startupinfo"] = startupinfo 98 | 99 | for command in commands: 100 | try: 101 | dispcmd = str([command] + args) 102 | # remember shell=False, so use git.cmd on windows, not just git 103 | process = subprocess.Popen([command] + args, cwd=cwd, env=env, 104 | stdout=subprocess.PIPE, 105 | stderr=(subprocess.PIPE if hide_stderr 106 | else None), **popen_kwargs) 107 | break 108 | except OSError as e: 109 | if e.errno == errno.ENOENT: 110 | continue 111 | if verbose: 112 | print("unable to run %s" % dispcmd) 113 | print(e) 114 | return None, None 115 | else: 116 | if verbose: 117 | print("unable to find command, tried %s" % (commands,)) 118 | return None, None 119 | stdout = process.communicate()[0].strip().decode() 120 | if process.returncode != 0: 121 | if verbose: 122 | print("unable to run %s (error)" % dispcmd) 123 | print("stdout was %s" % stdout) 124 | return None, process.returncode 125 | return stdout, process.returncode 126 | 127 | 128 | def versions_from_parentdir( 129 | parentdir_prefix: str, 130 | root: str, 131 | verbose: bool, 132 | ) -> Dict[str, Any]: 133 | """Try to determine the version from the parent directory name. 134 | 135 | Source tarballs conventionally unpack into a directory that includes both 136 | the project name and a version string. 
We will also support searching up 137 | two directory levels for an appropriately named parent directory 138 | """ 139 | rootdirs = [] 140 | 141 | for _ in range(3): 142 | dirname = os.path.basename(root) 143 | if dirname.startswith(parentdir_prefix): 144 | return {"version": dirname[len(parentdir_prefix):], 145 | "full-revisionid": None, 146 | "dirty": False, "error": None, "date": None} 147 | rootdirs.append(root) 148 | root = os.path.dirname(root) # up a level 149 | 150 | if verbose: 151 | print("Tried directories %s but none started with prefix %s" % 152 | (str(rootdirs), parentdir_prefix)) 153 | raise NotThisMethod("rootdir doesn't start with parentdir_prefix") 154 | 155 | 156 | @register_vcs_handler("git", "get_keywords") 157 | def git_get_keywords(versionfile_abs: str) -> Dict[str, str]: 158 | """Extract version information from the given file.""" 159 | # the code embedded in _version.py can just fetch the value of these 160 | # keywords. When used from setup.py, we don't want to import _version.py, 161 | # so we do it with a regexp instead. This function is not used from 162 | # _version.py. 163 | keywords: Dict[str, str] = {} 164 | try: 165 | with open(versionfile_abs, "r") as fobj: 166 | for line in fobj: 167 | if line.strip().startswith("git_refnames ="): 168 | mo = re.search(r'=\s*"(.*)"', line) 169 | if mo: 170 | keywords["refnames"] = mo.group(1) 171 | if line.strip().startswith("git_full ="): 172 | mo = re.search(r'=\s*"(.*)"', line) 173 | if mo: 174 | keywords["full"] = mo.group(1) 175 | if line.strip().startswith("git_date ="): 176 | mo = re.search(r'=\s*"(.*)"', line) 177 | if mo: 178 | keywords["date"] = mo.group(1) 179 | except OSError: 180 | pass 181 | return keywords 182 | 183 | 184 | @register_vcs_handler("git", "keywords") 185 | def git_versions_from_keywords( 186 | keywords: Dict[str, str], 187 | tag_prefix: str, 188 | verbose: bool, 189 | ) -> Dict[str, Any]: 190 | """Get version information from git keywords.""" 191 | if "refnames" not in keywords: 192 | raise NotThisMethod("Short version file found") 193 | date = keywords.get("date") 194 | if date is not None: 195 | # Use only the last line. Previous lines may contain GPG signature 196 | # information. 197 | date = date.splitlines()[-1] 198 | 199 | # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant 200 | # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 201 | # -like" string, which we must then edit to make compliant), because 202 | # it's been around since git-1.5.3, and it's too difficult to 203 | # discover which version we're using, or to work around using an 204 | # older one. 205 | date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) 206 | refnames = keywords["refnames"].strip() 207 | if refnames.startswith("$Format"): 208 | if verbose: 209 | print("keywords are unexpanded, not using") 210 | raise NotThisMethod("unexpanded keywords, not a git-archive tarball") 211 | refs = {r.strip() for r in refnames.strip("()").split(",")} 212 | # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of 213 | # just "foo-1.0". If we see a "tag: " prefix, prefer those. 214 | TAG = "tag: " 215 | tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} 216 | if not tags: 217 | # Either we're using git < 1.8.3, or there really are no tags. We use 218 | # a heuristic: assume all version tags have a digit. 
The old git %d 219 | # expansion behaves like git log --decorate=short and strips out the 220 | # refs/heads/ and refs/tags/ prefixes that would let us distinguish 221 | # between branches and tags. By ignoring refnames without digits, we 222 | # filter out many common branch names like "release" and 223 | # "stabilization", as well as "HEAD" and "master". 224 | tags = {r for r in refs if re.search(r'\d', r)} 225 | if verbose: 226 | print("discarding '%s', no digits" % ",".join(refs - tags)) 227 | if verbose: 228 | print("likely tags: %s" % ",".join(sorted(tags))) 229 | for ref in sorted(tags): 230 | # sorting will prefer e.g. "2.0" over "2.0rc1" 231 | if ref.startswith(tag_prefix): 232 | r = ref[len(tag_prefix):] 233 | # Filter out refs that exactly match prefix or that don't start 234 | # with a number once the prefix is stripped (mostly a concern 235 | # when prefix is '') 236 | if not re.match(r'\d', r): 237 | continue 238 | if verbose: 239 | print("picking %s" % r) 240 | return {"version": r, 241 | "full-revisionid": keywords["full"].strip(), 242 | "dirty": False, "error": None, 243 | "date": date} 244 | # no suitable tags, so version is "0+unknown", but full hex is still there 245 | if verbose: 246 | print("no suitable tags, using unknown + full revision id") 247 | return {"version": "0+unknown", 248 | "full-revisionid": keywords["full"].strip(), 249 | "dirty": False, "error": "no suitable tags", "date": None} 250 | 251 | 252 | @register_vcs_handler("git", "pieces_from_vcs") 253 | def git_pieces_from_vcs( 254 | tag_prefix: str, 255 | root: str, 256 | verbose: bool, 257 | runner: Callable = run_command 258 | ) -> Dict[str, Any]: 259 | """Get version from 'git describe' in the root of the source tree. 260 | 261 | This only gets called if the git-archive 'subst' keywords were *not* 262 | expanded, and _version.py hasn't already been rewritten with a short 263 | version string, meaning we're inside a checked out source tree. 264 | """ 265 | GITS = ["git"] 266 | if sys.platform == "win32": 267 | GITS = ["git.cmd", "git.exe"] 268 | 269 | # GIT_DIR can interfere with correct operation of Versioneer. 270 | # It may be intended to be passed to the Versioneer-versioned project, 271 | # but that should not change where we get our version from. 
272 | env = os.environ.copy() 273 | env.pop("GIT_DIR", None) 274 | runner = functools.partial(runner, env=env) 275 | 276 | _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, 277 | hide_stderr=not verbose) 278 | if rc != 0: 279 | if verbose: 280 | print("Directory %s not under git control" % root) 281 | raise NotThisMethod("'git rev-parse --git-dir' returned error") 282 | 283 | # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] 284 | # if there isn't one, this yields HEX[-dirty] (no NUM) 285 | describe_out, rc = runner(GITS, [ 286 | "describe", "--tags", "--dirty", "--always", "--long", 287 | "--match", f"{tag_prefix}[[:digit:]]*" 288 | ], cwd=root) 289 | # --long was added in git-1.5.5 290 | if describe_out is None: 291 | raise NotThisMethod("'git describe' failed") 292 | describe_out = describe_out.strip() 293 | full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) 294 | if full_out is None: 295 | raise NotThisMethod("'git rev-parse' failed") 296 | full_out = full_out.strip() 297 | 298 | pieces: Dict[str, Any] = {} 299 | pieces["long"] = full_out 300 | pieces["short"] = full_out[:7] # maybe improved later 301 | pieces["error"] = None 302 | 303 | branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], 304 | cwd=root) 305 | # --abbrev-ref was added in git-1.6.3 306 | if rc != 0 or branch_name is None: 307 | raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") 308 | branch_name = branch_name.strip() 309 | 310 | if branch_name == "HEAD": 311 | # If we aren't exactly on a branch, pick a branch which represents 312 | # the current commit. If all else fails, we are on a branchless 313 | # commit. 314 | branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) 315 | # --contains was added in git-1.5.4 316 | if rc != 0 or branches is None: 317 | raise NotThisMethod("'git branch --contains' returned error") 318 | branches = branches.split("\n") 319 | 320 | # Remove the first line if we're running detached 321 | if "(" in branches[0]: 322 | branches.pop(0) 323 | 324 | # Strip off the leading "* " from the list of branches. 325 | branches = [branch[2:] for branch in branches] 326 | if "master" in branches: 327 | branch_name = "master" 328 | elif not branches: 329 | branch_name = None 330 | else: 331 | # Pick the first branch that is returned. Good or bad. 332 | branch_name = branches[0] 333 | 334 | pieces["branch"] = branch_name 335 | 336 | # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] 337 | # TAG might have hyphens. 338 | git_describe = describe_out 339 | 340 | # look for -dirty suffix 341 | dirty = git_describe.endswith("-dirty") 342 | pieces["dirty"] = dirty 343 | if dirty: 344 | git_describe = git_describe[:git_describe.rindex("-dirty")] 345 | 346 | # now we have TAG-NUM-gHEX or HEX 347 | 348 | if "-" in git_describe: 349 | # TAG-NUM-gHEX 350 | mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) 351 | if not mo: 352 | # unparsable. Maybe git-describe is misbehaving? 
353 | pieces["error"] = ("unable to parse git-describe output: '%s'" 354 | % describe_out) 355 | return pieces 356 | 357 | # tag 358 | full_tag = mo.group(1) 359 | if not full_tag.startswith(tag_prefix): 360 | if verbose: 361 | fmt = "tag '%s' doesn't start with prefix '%s'" 362 | print(fmt % (full_tag, tag_prefix)) 363 | pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" 364 | % (full_tag, tag_prefix)) 365 | return pieces 366 | pieces["closest-tag"] = full_tag[len(tag_prefix):] 367 | 368 | # distance: number of commits since tag 369 | pieces["distance"] = int(mo.group(2)) 370 | 371 | # commit: short hex revision ID 372 | pieces["short"] = mo.group(3) 373 | 374 | else: 375 | # HEX: no tags 376 | pieces["closest-tag"] = None 377 | out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) 378 | pieces["distance"] = len(out.split()) # total number of commits 379 | 380 | # commit date: see ISO-8601 comment in git_versions_from_keywords() 381 | date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() 382 | # Use only the last line. Previous lines may contain GPG signature 383 | # information. 384 | date = date.splitlines()[-1] 385 | pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) 386 | 387 | return pieces 388 | 389 | 390 | def plus_or_dot(pieces: Dict[str, Any]) -> str: 391 | """Return a + if we don't already have one, else return a .""" 392 | if "+" in pieces.get("closest-tag", ""): 393 | return "." 394 | return "+" 395 | 396 | 397 | def render_pep440(pieces: Dict[str, Any]) -> str: 398 | """Build up version string, with post-release "local version identifier". 399 | 400 | Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you 401 | get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty 402 | 403 | Exceptions: 404 | 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] 405 | """ 406 | if pieces["closest-tag"]: 407 | rendered = pieces["closest-tag"] 408 | if pieces["distance"] or pieces["dirty"]: 409 | rendered += plus_or_dot(pieces) 410 | rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) 411 | if pieces["dirty"]: 412 | rendered += ".dirty" 413 | else: 414 | # exception #1 415 | rendered = "0+untagged.%d.g%s" % (pieces["distance"], 416 | pieces["short"]) 417 | if pieces["dirty"]: 418 | rendered += ".dirty" 419 | return rendered 420 | 421 | 422 | def render_pep440_branch(pieces: Dict[str, Any]) -> str: 423 | """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . 424 | 425 | The ".dev0" means not master branch. Note that .dev0 sorts backwards 426 | (a feature branch will appear "older" than the master branch). 427 | 428 | Exceptions: 429 | 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty] 430 | """ 431 | if pieces["closest-tag"]: 432 | rendered = pieces["closest-tag"] 433 | if pieces["distance"] or pieces["dirty"]: 434 | if pieces["branch"] != "master": 435 | rendered += ".dev0" 436 | rendered += plus_or_dot(pieces) 437 | rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) 438 | if pieces["dirty"]: 439 | rendered += ".dirty" 440 | else: 441 | # exception #1 442 | rendered = "0" 443 | if pieces["branch"] != "master": 444 | rendered += ".dev0" 445 | rendered += "+untagged.%d.g%s" % (pieces["distance"], 446 | pieces["short"]) 447 | if pieces["dirty"]: 448 | rendered += ".dirty" 449 | return rendered 450 | 451 | 452 | def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]: 453 | """Split pep440 version string at the post-release segment. 
454 | 455 | Returns the release segments before the post-release and the 456 | post-release version number (or -1 if no post-release segment is present). 457 | """ 458 | vc = str.split(ver, ".post") 459 | return vc[0], int(vc[1] or 0) if len(vc) == 2 else None 460 | 461 | 462 | def render_pep440_pre(pieces: Dict[str, Any]) -> str: 463 | """TAG[.postN.devDISTANCE] -- No -dirty. 464 | 465 | Exceptions: 466 | 1: no tags. 0.post0.devDISTANCE 467 | """ 468 | if pieces["closest-tag"]: 469 | if pieces["distance"]: 470 | # update the post release segment 471 | tag_version, post_version = pep440_split_post(pieces["closest-tag"]) 472 | rendered = tag_version 473 | if post_version is not None: 474 | rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"]) 475 | else: 476 | rendered += ".post0.dev%d" % (pieces["distance"]) 477 | else: 478 | # no commits, use the tag as the version 479 | rendered = pieces["closest-tag"] 480 | else: 481 | # exception #1 482 | rendered = "0.post0.dev%d" % pieces["distance"] 483 | return rendered 484 | 485 | 486 | def render_pep440_post(pieces: Dict[str, Any]) -> str: 487 | """TAG[.postDISTANCE[.dev0]+gHEX] . 488 | 489 | The ".dev0" means dirty. Note that .dev0 sorts backwards 490 | (a dirty tree will appear "older" than the corresponding clean one), 491 | but you shouldn't be releasing software with -dirty anyways. 492 | 493 | Exceptions: 494 | 1: no tags. 0.postDISTANCE[.dev0] 495 | """ 496 | if pieces["closest-tag"]: 497 | rendered = pieces["closest-tag"] 498 | if pieces["distance"] or pieces["dirty"]: 499 | rendered += ".post%d" % pieces["distance"] 500 | if pieces["dirty"]: 501 | rendered += ".dev0" 502 | rendered += plus_or_dot(pieces) 503 | rendered += "g%s" % pieces["short"] 504 | else: 505 | # exception #1 506 | rendered = "0.post%d" % pieces["distance"] 507 | if pieces["dirty"]: 508 | rendered += ".dev0" 509 | rendered += "+g%s" % pieces["short"] 510 | return rendered 511 | 512 | 513 | def render_pep440_post_branch(pieces: Dict[str, Any]) -> str: 514 | """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . 515 | 516 | The ".dev0" means not master branch. 517 | 518 | Exceptions: 519 | 1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty] 520 | """ 521 | if pieces["closest-tag"]: 522 | rendered = pieces["closest-tag"] 523 | if pieces["distance"] or pieces["dirty"]: 524 | rendered += ".post%d" % pieces["distance"] 525 | if pieces["branch"] != "master": 526 | rendered += ".dev0" 527 | rendered += plus_or_dot(pieces) 528 | rendered += "g%s" % pieces["short"] 529 | if pieces["dirty"]: 530 | rendered += ".dirty" 531 | else: 532 | # exception #1 533 | rendered = "0.post%d" % pieces["distance"] 534 | if pieces["branch"] != "master": 535 | rendered += ".dev0" 536 | rendered += "+g%s" % pieces["short"] 537 | if pieces["dirty"]: 538 | rendered += ".dirty" 539 | return rendered 540 | 541 | 542 | def render_pep440_old(pieces: Dict[str, Any]) -> str: 543 | """TAG[.postDISTANCE[.dev0]] . 544 | 545 | The ".dev0" means dirty. 546 | 547 | Exceptions: 548 | 1: no tags. 0.postDISTANCE[.dev0] 549 | """ 550 | if pieces["closest-tag"]: 551 | rendered = pieces["closest-tag"] 552 | if pieces["distance"] or pieces["dirty"]: 553 | rendered += ".post%d" % pieces["distance"] 554 | if pieces["dirty"]: 555 | rendered += ".dev0" 556 | else: 557 | # exception #1 558 | rendered = "0.post%d" % pieces["distance"] 559 | if pieces["dirty"]: 560 | rendered += ".dev0" 561 | return rendered 562 | 563 | 564 | def render_git_describe(pieces: Dict[str, Any]) -> str: 565 | """TAG[-DISTANCE-gHEX][-dirty]. 
566 | 567 | Like 'git describe --tags --dirty --always'. 568 | 569 | Exceptions: 570 | 1: no tags. HEX[-dirty] (note: no 'g' prefix) 571 | """ 572 | if pieces["closest-tag"]: 573 | rendered = pieces["closest-tag"] 574 | if pieces["distance"]: 575 | rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) 576 | else: 577 | # exception #1 578 | rendered = pieces["short"] 579 | if pieces["dirty"]: 580 | rendered += "-dirty" 581 | return rendered 582 | 583 | 584 | def render_git_describe_long(pieces: Dict[str, Any]) -> str: 585 | """TAG-DISTANCE-gHEX[-dirty]. 586 | 587 | Like 'git describe --tags --dirty --always -long'. 588 | The distance/hash is unconditional. 589 | 590 | Exceptions: 591 | 1: no tags. HEX[-dirty] (note: no 'g' prefix) 592 | """ 593 | if pieces["closest-tag"]: 594 | rendered = pieces["closest-tag"] 595 | rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) 596 | else: 597 | # exception #1 598 | rendered = pieces["short"] 599 | if pieces["dirty"]: 600 | rendered += "-dirty" 601 | return rendered 602 | 603 | 604 | def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]: 605 | """Render the given version pieces into the requested style.""" 606 | if pieces["error"]: 607 | return {"version": "unknown", 608 | "full-revisionid": pieces.get("long"), 609 | "dirty": None, 610 | "error": pieces["error"], 611 | "date": None} 612 | 613 | if not style or style == "default": 614 | style = "pep440" # the default 615 | 616 | if style == "pep440": 617 | rendered = render_pep440(pieces) 618 | elif style == "pep440-branch": 619 | rendered = render_pep440_branch(pieces) 620 | elif style == "pep440-pre": 621 | rendered = render_pep440_pre(pieces) 622 | elif style == "pep440-post": 623 | rendered = render_pep440_post(pieces) 624 | elif style == "pep440-post-branch": 625 | rendered = render_pep440_post_branch(pieces) 626 | elif style == "pep440-old": 627 | rendered = render_pep440_old(pieces) 628 | elif style == "git-describe": 629 | rendered = render_git_describe(pieces) 630 | elif style == "git-describe-long": 631 | rendered = render_git_describe_long(pieces) 632 | else: 633 | raise ValueError("unknown style '%s'" % style) 634 | 635 | return {"version": rendered, "full-revisionid": pieces["long"], 636 | "dirty": pieces["dirty"], "error": None, 637 | "date": pieces.get("date")} 638 | 639 | 640 | def get_versions() -> Dict[str, Any]: 641 | """Get version information or return default if unable to do so.""" 642 | # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have 643 | # __file__, we can work backwards from there to the root. Some 644 | # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which 645 | # case we can only use expanded keywords. 646 | 647 | cfg = get_config() 648 | verbose = cfg.verbose 649 | 650 | try: 651 | return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, 652 | verbose) 653 | except NotThisMethod: 654 | pass 655 | 656 | try: 657 | root = os.path.realpath(__file__) 658 | # versionfile_source is the relative path from the top of the source 659 | # tree (where the .git directory might live) to this file. Invert 660 | # this to find the root from __file__. 
661 | for _ in cfg.versionfile_source.split('/'): 662 | root = os.path.dirname(root) 663 | except NameError: 664 | return {"version": "0+unknown", "full-revisionid": None, 665 | "dirty": None, 666 | "error": "unable to find root of source tree", 667 | "date": None} 668 | 669 | try: 670 | pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) 671 | return render(pieces, cfg.style) 672 | except NotThisMethod: 673 | pass 674 | 675 | try: 676 | if cfg.parentdir_prefix: 677 | return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) 678 | except NotThisMethod: 679 | pass 680 | 681 | return {"version": "0+unknown", "full-revisionid": None, 682 | "dirty": None, 683 | "error": "unable to compute version", "date": None} 684 | -------------------------------------------------------------------------------- /partd/buffer.py: -------------------------------------------------------------------------------- 1 | from .core import Interface 2 | from threading import Lock 3 | from toolz import merge_with, topk, accumulate, pluck 4 | from operator import add 5 | from bisect import bisect 6 | from collections import defaultdict 7 | from queue import Queue, Empty 8 | 9 | 10 | def zero(): 11 | return 0 12 | 13 | class Buffer(Interface): 14 | def __init__(self, fast, slow, available_memory=1e9): 15 | self.lock = Lock() 16 | self.fast = fast 17 | self.slow = slow 18 | self.available_memory = available_memory 19 | self.lengths = defaultdict(zero) 20 | self.memory_usage = 0 21 | Interface.__init__(self) 22 | 23 | def __getstate__(self): 24 | return {'fast': self.fast, 25 | 'slow': self.slow, 26 | 'memory_usage': self.memory_usage, 27 | 'lengths': self.lengths, 28 | 'available_memory': self.available_memory} 29 | 30 | def __setstate__(self, state): 31 | Interface.__setstate__(self, state) 32 | self.lock = Lock() 33 | self.__dict__.update(state) 34 | 35 | def append(self, data, lock=True, **kwargs): 36 | if lock: self.lock.acquire() 37 | try: 38 | for k, v in data.items(): 39 | self.lengths[k] += len(v) 40 | self.memory_usage += len(v) 41 | self.fast.append(data, lock=False, **kwargs) 42 | 43 | while self.memory_usage > self.available_memory: 44 | keys = keys_to_flush(self.lengths, 0.1, maxcount=20) 45 | self.flush(keys) 46 | 47 | finally: 48 | if lock: self.lock.release() 49 | 50 | def _get(self, keys, lock=True, **kwargs): 51 | if lock: self.lock.acquire() 52 | try: 53 | result = list(map(add, self.fast.get(keys, lock=False), 54 | self.slow.get(keys, lock=False))) 55 | finally: 56 | if lock: self.lock.release() 57 | return result 58 | 59 | def _iset(self, key, value, lock=True): 60 | """ Idempotent set """ 61 | if lock: self.lock.acquire() 62 | try: 63 | self.fast.iset(key, value, lock=False) 64 | finally: 65 | if lock: self.lock.release() 66 | 67 | def _delete(self, keys, lock=True): 68 | if lock: self.lock.acquire() 69 | try: 70 | self.fast.delete(keys, lock=False) 71 | self.slow.delete(keys, lock=False) 72 | finally: 73 | if lock: self.lock.release() 74 | 75 | def drop(self): 76 | self._iset_seen.clear() 77 | self.fast.drop() 78 | self.slow.drop() 79 | 80 | def __exit__(self, *args): 81 | self.drop() 82 | 83 | def flush(self, keys=None, block=None): 84 | """ Flush keys to disk 85 | 86 | Parameters 87 | ---------- 88 | 89 | keys: list or None 90 | list of keys to flush 91 | block: bool (defaults to None) 92 | Whether or not to block until all writing is complete 93 | 94 | If no keys are given then flush all keys 95 | """ 96 | if keys is None: 97 | keys = list(self.lengths) 98 | 99 | 
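        # Move the buffered bytes for the chosen keys from the fast store into
        # the slow store, then release them from fast and from the memory accounting.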
self.slow.append(dict(zip(keys, self.fast.get(keys)))) 100 | self.fast.delete(keys) 101 | 102 | for key in keys: 103 | self.memory_usage -= self.lengths[key] 104 | del self.lengths[key] 105 | 106 | 107 | def keys_to_flush(lengths, fraction=0.1, maxcount=100000): 108 | """ Which keys to remove 109 | 110 | >>> lengths = {'a': 20, 'b': 10, 'c': 15, 'd': 15, 111 | ... 'e': 10, 'f': 25, 'g': 5} 112 | >>> keys_to_flush(lengths, 0.5) 113 | ['f', 'a'] 114 | """ 115 | top = topk(max(len(lengths) // 2, 1), 116 | lengths.items(), 117 | key=1) 118 | total = sum(lengths.values()) 119 | cutoff = min(maxcount, max(1, 120 | bisect(list(accumulate(add, pluck(1, top))), 121 | total * fraction))) 122 | result = [k for k, v in top[:cutoff]] 123 | assert result 124 | return result 125 | -------------------------------------------------------------------------------- /partd/compressed.py: -------------------------------------------------------------------------------- 1 | from contextlib import suppress 2 | from functools import partial 3 | 4 | from .encode import Encode 5 | 6 | __all__ = [] 7 | 8 | 9 | def bytes_concat(L): 10 | return b''.join(L) 11 | 12 | 13 | with suppress(ImportError, AttributeError): 14 | # In case snappy is not installed, or another package called snappy that does not implement compress / decompress. 15 | # For example, SnapPy (https://pypi.org/project/snappy/) 16 | import snappy 17 | Snappy = partial(Encode, 18 | snappy.compress, 19 | snappy.decompress, 20 | bytes_concat) 21 | __all__.append('Snappy') 22 | 23 | 24 | with suppress(ImportError): 25 | import zlib 26 | ZLib = partial(Encode, 27 | zlib.compress, 28 | zlib.decompress, 29 | bytes_concat) 30 | __all__.append('ZLib') 31 | 32 | 33 | with suppress(ImportError): 34 | import bz2 35 | BZ2 = partial(Encode, 36 | bz2.compress, 37 | bz2.decompress, 38 | bytes_concat) 39 | __all__.append('BZ2') 40 | 41 | 42 | with suppress(ImportError): 43 | import blosc 44 | Blosc = partial(Encode, 45 | blosc.compress, 46 | blosc.decompress, 47 | bytes_concat) 48 | __all__.append('Blosc') 49 | -------------------------------------------------------------------------------- /partd/core.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import locket 4 | import string 5 | from toolz import memoize 6 | from contextlib import contextmanager 7 | from .utils import nested_get, flatten 8 | 9 | 10 | 11 | # http://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename-in-python 12 | valid_chars = "-_.() " + string.ascii_letters + string.digits + os.path.sep 13 | 14 | 15 | def escape_filename(fn): 16 | """ Escape text so that it is a valid filename 17 | 18 | >>> escape_filename('Foo!bar?') 19 | 'Foobar' 20 | 21 | """ 22 | return ''.join(filter(valid_chars.__contains__, fn)) 23 | 24 | 25 | def filename(path, key): 26 | return os.path.join(path, escape_filename(token(key))) 27 | 28 | 29 | def token(key): 30 | """ 31 | 32 | >>> token('hello') 33 | 'hello' 34 | >>> token(('hello', 'world')) # doctest: +SKIP 35 | 'hello/world' 36 | """ 37 | if isinstance(key, str): 38 | return key 39 | elif isinstance(key, tuple): 40 | return os.path.join(*map(token, key)) 41 | else: 42 | return str(key) 43 | 44 | 45 | class Interface: 46 | def __init__(self): 47 | self._iset_seen = set() 48 | 49 | def __setstate__(self, state): 50 | self.__dict__.update(state) 51 | self._iset_seen = set() 52 | 53 | def iset(self, key, value, **kwargs): 54 | if key in self._iset_seen: 55 | return 56 | else: 57 | 
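            # First write wins: perform the set once and remember the key, so
            # repeated iset() calls for the same key become no-ops.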
self._iset(key, value, **kwargs) 58 | self._iset_seen.add(key) 59 | 60 | def __enter__(self): 61 | return self 62 | 63 | def __exit__(self, type, value, traceback): 64 | self.drop() 65 | 66 | def iget(self, key): 67 | return self._get([key], lock=False)[0] 68 | 69 | def get(self, keys, **kwargs): 70 | if not isinstance(keys, list): 71 | return self.get([keys], **kwargs)[0] 72 | elif any(isinstance(key, list) for key in keys): # nested case 73 | flatkeys = list(flatten(keys)) 74 | result = self.get(flatkeys, **kwargs) 75 | return nested_get(keys, dict(zip(flatkeys, result))) 76 | else: 77 | return self._get(keys, **kwargs) 78 | 79 | def delete(self, keys, **kwargs): 80 | if not isinstance(keys, list): 81 | return self._delete([keys], **kwargs) 82 | else: 83 | return self._delete(keys, **kwargs) 84 | 85 | def pop(self, keys, **kwargs): 86 | with self.partd.lock: 87 | result = self.partd.get(keys, lock=False) 88 | self.partd.delete(keys, lock=False) 89 | return result 90 | 91 | -------------------------------------------------------------------------------- /partd/dict.py: -------------------------------------------------------------------------------- 1 | from .core import Interface 2 | from threading import Lock 3 | 4 | 5 | class Dict(Interface): 6 | def __init__(self): 7 | self.lock = Lock() 8 | self.data = dict() 9 | Interface.__init__(self) 10 | 11 | def __getstate__(self): 12 | return {'data': self.data} 13 | 14 | def __setstate__(self, state): 15 | Interface.__setstate__(self, state) 16 | Dict.__init__(self) 17 | self.data = state['data'] 18 | 19 | def append(self, data, lock=True, **kwargs): 20 | if lock: self.lock.acquire() 21 | try: 22 | for k, v in data.items(): 23 | if k not in self.data: 24 | self.data[k] = [] 25 | self.data[k].append(v) 26 | finally: 27 | if lock: self.lock.release() 28 | 29 | def _get(self, keys, lock=True, **kwargs): 30 | assert isinstance(keys, (list, tuple, set)) 31 | if lock: 32 | self.lock.acquire() 33 | try: 34 | result = [b''.join(self.data.get(key, [])) for key in keys] 35 | finally: 36 | if lock: 37 | self.lock.release() 38 | return result 39 | 40 | def _iset(self, key, value, lock=True): 41 | """ Idempotent set """ 42 | if lock: 43 | self.lock.acquire() 44 | try: 45 | self.data[key] = [value] 46 | finally: 47 | if lock: 48 | self.lock.release() 49 | 50 | def _delete(self, keys, lock=True): 51 | if lock: 52 | self.lock.acquire() 53 | try: 54 | for key in keys: 55 | if key in self.data: 56 | del self.data[key] 57 | finally: 58 | if lock: 59 | self.lock.release() 60 | 61 | def drop(self): 62 | self._iset_seen.clear() 63 | self.data.clear() 64 | 65 | def __exit__(self, *args): 66 | self.drop() 67 | -------------------------------------------------------------------------------- /partd/encode.py: -------------------------------------------------------------------------------- 1 | from .core import Interface 2 | from .file import File 3 | from toolz import valmap 4 | from .utils import frame, framesplit 5 | 6 | 7 | class Encode(Interface): 8 | def __init__(self, encode, decode, join, partd=None): 9 | if not partd or isinstance(partd, str): 10 | partd = File(partd) 11 | self.partd = partd 12 | self.encode = encode 13 | self.decode = decode 14 | self.join = join 15 | Interface.__init__(self) 16 | 17 | def __getstate__(self): 18 | return self.__dict__ 19 | 20 | __setstate__ = Interface.__setstate__ 21 | 22 | def append(self, data, **kwargs): 23 | data = valmap(self.encode, data) 24 | data = valmap(frame, data) 25 | self.partd.append(data, **kwargs) 26 | 27 | 
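    # append() above framed each encoded value (frame() length-prefixes it);
    # _get() below reverses that: framesplit() yields the individual frames,
    # decode() deserializes each, and join() merges them into a single value.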
def _get(self, keys, **kwargs): 28 | raw = self.partd._get(keys, **kwargs) 29 | return [self.join([self.decode(frame) for frame in framesplit(chunk)]) 30 | for chunk in raw] 31 | 32 | def delete(self, keys, **kwargs): 33 | return self.partd.delete(keys, **kwargs) 34 | 35 | def _iset(self, key, value, **kwargs): 36 | return self.partd.iset(key, frame(self.encode(value)), **kwargs) 37 | 38 | def drop(self): 39 | return self.partd.drop() 40 | 41 | @property 42 | def lock(self): 43 | return self.partd.lock 44 | 45 | def __exit__(self, *args): 46 | self.drop() 47 | self.partd.__exit__(*args) 48 | -------------------------------------------------------------------------------- /partd/file.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | from contextlib import suppress 3 | import os 4 | import shutil 5 | import string 6 | import tempfile 7 | 8 | from .core import Interface 9 | import locket 10 | 11 | 12 | class File(Interface): 13 | def __init__(self, path=None, dir=None): 14 | if not path: 15 | path = tempfile.mkdtemp(suffix='.partd', dir=dir) 16 | cleanup_files.append(path) 17 | self._explicitly_given_path = False 18 | else: 19 | self._explicitly_given_path = True 20 | self.path = path 21 | if not os.path.exists(path): 22 | with suppress(OSError): 23 | os.makedirs(path) 24 | self.lock = locket.lock_file(self.filename('.lock')) 25 | Interface.__init__(self) 26 | 27 | def __getstate__(self): 28 | return {'path': self.path} 29 | 30 | def __setstate__(self, state): 31 | Interface.__setstate__(self, state) 32 | File.__init__(self, state['path']) 33 | 34 | def append(self, data, lock=True, fsync=False, **kwargs): 35 | if lock: self.lock.acquire() 36 | try: 37 | for k, v in data.items(): 38 | fn = self.filename(k) 39 | if not os.path.exists(os.path.dirname(fn)): 40 | os.makedirs(os.path.dirname(fn)) 41 | with open(fn, 'ab') as f: 42 | f.write(v) 43 | if fsync: 44 | os.fsync(f) 45 | finally: 46 | if lock: self.lock.release() 47 | 48 | def _get(self, keys, lock=True, **kwargs): 49 | assert isinstance(keys, (list, tuple, set)) 50 | if lock: 51 | self.lock.acquire() 52 | try: 53 | result = [] 54 | for key in keys: 55 | try: 56 | with open(self.filename(key), 'rb') as f: 57 | result.append(f.read()) 58 | except OSError: 59 | result.append(b'') 60 | finally: 61 | if lock: 62 | self.lock.release() 63 | return result 64 | 65 | def _iset(self, key, value, lock=True): 66 | """ Idempotent set """ 67 | fn = self.filename(key) 68 | if not os.path.exists(os.path.dirname(fn)): 69 | os.makedirs(os.path.dirname(fn)) 70 | if lock: 71 | self.lock.acquire() 72 | try: 73 | with open(self.filename(key), 'wb') as f: 74 | f.write(value) 75 | finally: 76 | if lock: 77 | self.lock.release() 78 | 79 | def _delete(self, keys, lock=True): 80 | if lock: 81 | self.lock.acquire() 82 | try: 83 | for key in keys: 84 | path = filename(self.path, key) 85 | if os.path.exists(path): 86 | os.remove(path) 87 | finally: 88 | if lock: 89 | self.lock.release() 90 | 91 | def drop(self): 92 | if os.path.exists(self.path): 93 | shutil.rmtree(self.path) 94 | self._iset_seen.clear() 95 | os.mkdir(self.path) 96 | 97 | def filename(self, key): 98 | return filename(self.path, key) 99 | 100 | def __exit__(self, *args): 101 | self.drop() 102 | os.rmdir(self.path) 103 | 104 | def __del__(self): 105 | if not self._explicitly_given_path: 106 | self.drop() 107 | os.rmdir(self.path) 108 | 109 | 110 | def filename(path, key): 111 | return os.path.join(path, escape_filename(token(key))) 112 | 113 | 114 
| # http://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename-in-python
115 | valid_chars = "-_.() " + string.ascii_letters + string.digits + os.path.sep
116 | 
117 | 
118 | def escape_filename(fn):
119 |     """ Escape text so that it is a valid filename
120 | 
121 |     >>> escape_filename('Foo!bar?')
122 |     'Foobar'
123 | 
124 |     """
125 |     return ''.join(filter(valid_chars.__contains__, fn))
126 | 
127 | 
128 | 
129 | def token(key):
130 |     """
131 | 
132 |     >>> token('hello')
133 |     'hello'
134 |     >>> token(('hello', 'world'))  # doctest: +SKIP
135 |     'hello/world'
136 |     """
137 |     if isinstance(key, str):
138 |         return key
139 |     elif isinstance(key, tuple):
140 |         return os.path.join(*map(token, key))
141 |     else:
142 |         return str(key)
143 | 
144 | 
145 | cleanup_files = list()
146 | 
147 | @atexit.register
148 | def cleanup():
149 |     for fn in cleanup_files:
150 |         if os.path.exists(fn):
151 |             shutil.rmtree(fn)
152 | 
--------------------------------------------------------------------------------
/partd/numpy.py:
--------------------------------------------------------------------------------
1 | """ Store arrays
2 | 
3 | We put arrays on disk as raw bytes, extending along the first dimension.
4 | Alongside each array x we ensure the value x.dtype which stores the string
5 | description of the array's dtype.
6 | """
7 | from contextlib import suppress
8 | import pickle
9 | 
10 | import numpy as np
11 | from toolz import valmap, identity, partial
12 | from .core import Interface
13 | from .file import File
14 | from .utils import frame, framesplit, suffix
15 | 
16 | 
17 | def serialize_dtype(dt):
18 |     """ Serialize dtype to bytes
19 | 
20 |     >>> serialize_dtype(np.dtype('i4'))
21 |     b'<i4'
22 |     >>> serialize_dtype(np.dtype('M8[us]'))
23 |     b'<M8[us]'
24 |     """
25 |     return dt.str.encode()
26 | 
27 | 
28 | def parse_dtype(s):
29 |     """ Parse text as numpy dtype
30 | 
31 |     >>> parse_dtype(b'i4')
32 |     dtype('int32')
33 | 
34 |     >>> parse_dtype(b"[('a', 'i4')]")
35 |     dtype([('a', '<i4')])
36 |     """
37 |     if s.startswith(b'['):
38 |         return np.dtype(eval(s))  # Dangerous!
39 |     else:
40 |         return np.dtype(s)

[lines 41-107 are not recoverable from this dump: the Numpy wrapper class, the msgpack import fallback, serialize(), and the opening of deserialize(bytes, dtype, copy=False), whose body resumes below]

108 |             if msgpack.version >= (0, 5, 2):
109 |                 unpack_kwargs = {'raw': False}
110 |             else:
111 |                 unpack_kwargs = {'encoding': 'utf-8'}
112 | 
113 |             blocks = [msgpack.unpackb(f, **unpack_kwargs)
114 |                       for f in framesplit(bytes)]
115 |         except Exception:
116 |             blocks = [pickle.loads(f) for f in framesplit(bytes)]
117 | 
118 |         result = np.empty(sum(map(len, blocks)), dtype='O')
119 |         i = 0
120 |         for block in blocks:
121 |             result[i:i + len(block)] = block
122 |             i += len(block)
123 |         return result
124 |     else:
125 |         result = np.frombuffer(bytes, dtype)
126 |         if copy:
127 |             result = result.copy()
128 |         return result
129 | 
130 | 
131 | compress_text = identity
132 | decompress_text = identity
133 | compress_bytes = lambda bytes, itemsize: bytes
134 | decompress_bytes = identity
135 | 
136 | with suppress(ImportError):
137 |     import blosc
138 |     blosc.set_nthreads(1)
139 | 
140 |     compress_bytes = blosc.compress
141 |     decompress_bytes = blosc.decompress
142 | 
143 |     compress_text = partial(blosc.compress, typesize=1)
144 |     decompress_text = blosc.decompress
145 | 
146 | with suppress(ImportError):
147 |     from snappy import compress as compress_text
148 |     from snappy import decompress as decompress_text
149 | 
150 | 
151 | def compress(bytes, dtype):
152 |     if dtype == 'O':
153 |         return compress_text(bytes)
154 |     else:
155 |         return compress_bytes(bytes, dtype.itemsize)
156 | 
157 | 
158 | def decompress(bytes, dtype):
159 |     if dtype == 'O':
160 |         return decompress_text(bytes)
161 |     else:
162 |         return decompress_bytes(bytes)
163 | 
--------------------------------------------------------------------------------
/partd/pandas.py:
--------------------------------------------------------------------------------
1 | from functools
import partial 2 | import pickle 3 | 4 | import pandas as pd 5 | from packaging.version import Version 6 | 7 | PANDAS_GE_210 = Version(pd.__version__).release >= (2, 1, 0) 8 | PANDAS_GE_300 = Version(pd.__version__).major >= 3 9 | 10 | if PANDAS_GE_300: 11 | from pandas.api.internals import create_dataframe_from_blocks 12 | create_block_manager_from_blocks = None 13 | make_block = None 14 | else: 15 | create_dataframe_from_blocks = None 16 | try: 17 | from pandas.core.internals.managers import create_block_manager_from_blocks 18 | except ImportError: 19 | from pandas.core.internals import create_block_manager_from_blocks 20 | 21 | from pandas.core.internals import make_block 22 | 23 | from . import numpy as pnp 24 | from .core import Interface 25 | from .encode import Encode 26 | from .utils import extend, framesplit, frame 27 | from pandas.api.types import is_extension_array_dtype 28 | from pandas.api.extensions import ExtensionArray 29 | 30 | def is_extension_array(x): 31 | return isinstance(x, ExtensionArray) 32 | 33 | 34 | dumps = partial(pickle.dumps, protocol=pickle.HIGHEST_PROTOCOL) 35 | 36 | 37 | 38 | class PandasColumns(Interface): 39 | def __init__(self, partd=None): 40 | self.partd = pnp.Numpy(partd) 41 | Interface.__init__(self) 42 | 43 | def append(self, data, **kwargs): 44 | for k, df in data.items(): 45 | self.iset(extend(k, '.columns'), dumps(list(df.columns))) 46 | self.iset(extend(k, '.index-name'), dumps(df.index.name)) 47 | 48 | # TODO: don't use values, it does some work. Look at _blocks instead 49 | # pframe/cframe do this well 50 | arrays = {extend(k, col): df[col].values 51 | for k, df in data.items() 52 | for col in df.columns} 53 | arrays.update({extend(k, '.index'): df.index.values 54 | for k, df in data.items()}) 55 | # TODO: handle categoricals 56 | self.partd.append(arrays, **kwargs) 57 | 58 | def _get(self, keys, columns=None, **kwargs): 59 | if columns is None: 60 | columns = self.partd.partd.get([extend(k, '.columns') for k in keys], 61 | **kwargs) 62 | columns = list(map(pickle.loads, columns)) 63 | else: 64 | columns = [columns] * len(keys) 65 | index_names = self.partd.partd.get([extend(k, '.index-name') 66 | for k in keys], **kwargs) 67 | index_names = map(pickle.loads, index_names) 68 | 69 | keys = [[extend(k, '.index'), [extend(k, col) for col in cols]] 70 | for k, cols in zip(keys, columns)] 71 | 72 | arrays = self.partd.get(keys, **kwargs) 73 | 74 | return [pd.DataFrame(dict(zip(cols, arrs)), columns=cols, 75 | index=pd.Index(index, name=iname)) 76 | for iname, (index, arrs), cols in zip(index_names, arrays, columns)] 77 | 78 | def __getstate__(self): 79 | return {'partd': self.partd} 80 | 81 | def _iset(self, key, value): 82 | return self.partd._iset(key, value) 83 | 84 | def drop(self): 85 | return self.partd.drop() 86 | 87 | @property 88 | def lock(self): 89 | return self.partd.partd.lock 90 | 91 | def __exit__(self, *args): 92 | self.drop() 93 | self.partd.__exit__(self, *args) 94 | 95 | def __del__(self): 96 | self.partd.__del__() 97 | 98 | 99 | def index_to_header_bytes(ind): 100 | # These have special `__reduce__` methods, just use pickle 101 | if isinstance(ind, (pd.DatetimeIndex, 102 | pd.MultiIndex, 103 | pd.RangeIndex)): 104 | return None, dumps(ind) 105 | 106 | if isinstance(ind, pd.CategoricalIndex): 107 | cat = (ind.ordered, ind.categories) 108 | values = ind.codes 109 | else: 110 | cat = None 111 | values = ind.values 112 | 113 | if is_extension_array_dtype(ind): 114 | return None, dumps(ind) 115 | 116 | header = (type(ind), {k: 
getattr(ind, k, None) for k in ind._attributes}, values.dtype, cat) 117 | bytes = pnp.compress(pnp.serialize(values), values.dtype) 118 | return header, bytes 119 | 120 | 121 | def index_from_header_bytes(header, bytes): 122 | if header is None: 123 | return pickle.loads(bytes) 124 | 125 | typ, attr, dtype, cat = header 126 | data = pnp.deserialize(pnp.decompress(bytes, dtype), dtype, copy=True) 127 | if cat: 128 | data = pd.Categorical.from_codes(data, cat[1], ordered=cat[0]) 129 | return typ.__new__(typ, data=data, **attr) 130 | 131 | 132 | def block_to_header_bytes(block): 133 | values = block.values 134 | if isinstance(values, pd.Categorical): 135 | extension = ('categorical_type', (values.ordered, values.categories)) 136 | values = values.codes 137 | elif isinstance(block, pd.DatetimeTZDtype): 138 | extension = ('datetime64_tz_type', (block.values.tzinfo,)) 139 | values = values.view('i8') 140 | elif is_extension_array_dtype(block.dtype) or is_extension_array(values): 141 | extension = ("other", ()) 142 | else: 143 | extension = ('numpy_type', ()) 144 | 145 | header = (block.mgr_locs.as_array, values.dtype, values.shape, extension) 146 | if extension == ("other", ()): 147 | bytes = pickle.dumps(values) 148 | else: 149 | bytes = pnp.compress(pnp.serialize(values), values.dtype) 150 | return header, bytes 151 | 152 | 153 | def block_from_header_bytes(header, bytes, create_block: bool): 154 | placement, dtype, shape, (extension_type, extension_values) = header 155 | 156 | if extension_type == "other": 157 | values = pickle.loads(bytes) 158 | else: 159 | values = pnp.deserialize(pnp.decompress(bytes, dtype), dtype, 160 | copy=True).reshape(shape) 161 | if extension_type == 'categorical_type': 162 | values = pd.Categorical.from_codes(values, 163 | extension_values[1], 164 | ordered=extension_values[0]) 165 | elif extension_type == 'datetime64_tz_type': 166 | tz_info = extension_values[0] 167 | values = pd.DatetimeIndex(values).tz_localize('utc').tz_convert( 168 | tz_info) 169 | if create_block: 170 | return make_block(values, placement=placement) 171 | return values, placement 172 | 173 | 174 | def serialize(df): 175 | """ Serialize and compress a Pandas DataFrame 176 | 177 | Uses Pandas blocks, snappy, and blosc to deconstruct an array into bytes 178 | """ 179 | col_header, col_bytes = index_to_header_bytes(df.columns) 180 | ind_header, ind_bytes = index_to_header_bytes(df.index) 181 | headers = [col_header, ind_header] 182 | bytes = [col_bytes, ind_bytes] 183 | 184 | for block in df._mgr.blocks: 185 | h, b = block_to_header_bytes(block) 186 | headers.append(h) 187 | bytes.append(b) 188 | 189 | frames = [dumps(headers)] + bytes 190 | return b''.join(map(frame, frames)) 191 | 192 | 193 | def deserialize(bytes): 194 | """ Deserialize and decompress bytes back to a pandas DataFrame """ 195 | frames = list(framesplit(bytes)) 196 | headers = pickle.loads(frames[0]) 197 | bytes = frames[1:] 198 | axes = [index_from_header_bytes(headers[0], bytes[0]), 199 | index_from_header_bytes(headers[1], bytes[1])] 200 | blocks = [block_from_header_bytes(h, b, create_block=not PANDAS_GE_300) 201 | for (h, b) in zip(headers[2:], bytes[2:])] 202 | if PANDAS_GE_300: 203 | return pd.api.internals.create_dataframe_from_blocks(blocks, axes[1], axes[0]) 204 | elif PANDAS_GE_210: 205 | return pd.DataFrame._from_mgr(create_block_manager_from_blocks(blocks, axes), axes=axes) 206 | else: 207 | return pd.DataFrame(create_block_manager_from_blocks(blocks, axes)) 208 | 209 | 210 | def join(dfs): 211 | if not dfs: 212 | 
217 | def join(dfs):
218 |     if not dfs:
219 |         return pd.DataFrame()
220 |     else:
221 |         result = pd.concat(dfs)
222 |         dtypes = {
223 |             col: "category"
224 |             for col in result.columns
225 |             if (
226 |                 isinstance(dfs[0][col].dtype, pd.CategoricalDtype)
227 |                 and not isinstance(result[col].dtype, pd.CategoricalDtype)
228 |             )
229 |         }
230 |         if dtypes:
231 |             result = result.astype(dtypes)
232 |         return result
233 | 
234 | PandasBlocks = partial(Encode, serialize, deserialize, join)
235 | --------------------------------------------------------------------------------
/partd/pickle.py:
--------------------------------------------------------------------------------
1 | """
2 | get/put functions that consume/produce Python lists using Pickle to serialize
3 | """
4 | import pickle
5 | 
6 | from .encode import Encode
7 | from functools import partial
8 | 
9 | def concat(lists):
10 |     return sum(lists, [])
11 | 
12 | Pickle = partial(Encode,
13 |                  partial(pickle.dumps, protocol=pickle.HIGHEST_PROTOCOL),
14 |                  pickle.loads,
15 |                  concat)
16 | --------------------------------------------------------------------------------
/partd/python.py:
--------------------------------------------------------------------------------
1 | """
2 | get/put functions that consume/produce Python lists using msgpack or pickle
3 | to serialize.
4 | 
5 | First we try msgpack (it's faster).  If that fails then we default to pickle.
6 | """
7 | import pickle
8 | 
9 | try:
10 |     from pandas import msgpack
11 | except ImportError:
12 |     try:
13 |         import msgpack
14 |     except ImportError:
15 |         msgpack = False
16 | 
17 | 
18 | from .encode import Encode
19 | from functools import partial
20 | 
21 | 
22 | def dumps(x):
23 |     try:
24 |         return msgpack.packb(x, use_bin_type=True)
25 |     except Exception:  # msgpack missing, or data not msgpack-serializable
26 |         return pickle.dumps(x, protocol=pickle.HIGHEST_PROTOCOL)
27 | 
28 | def loads(x):
29 |     try:
30 |         if msgpack.version >= (0, 5, 2):
31 |             unpack_kwargs = {'raw': False}
32 |         else:
33 |             unpack_kwargs = {'encoding': 'utf-8'}
34 |         return msgpack.unpackb(x, **unpack_kwargs)
35 |     except Exception:  # fall back to pickle, matching dumps above
36 |         return pickle.loads(x)
37 | 
38 | 
39 | def concat(lists):
40 |     return sum(lists, [])
41 | 
42 | 
43 | Python = partial(Encode, dumps, loads, concat)
44 | --------------------------------------------------------------------------------
/partd/tests/test_buffer.py:
--------------------------------------------------------------------------------
1 | from partd.dict import Dict
2 | from partd.file import File
3 | from partd.buffer import Buffer, keys_to_flush
4 | import pickle
5 | 
6 | import shutil
7 | import os
8 | 
9 | 
10 | def test_partd():
11 |     a = Dict()
12 |     b = Dict()
13 |     with Buffer(a, b, available_memory=10) as p:
14 |         p.append({'x': b'Hello', 'y': b'abc'})
15 |         assert a.get(['x', 'y']) == [b'Hello', b'abc']
16 | 
17 |         p.append({'x': b'World!', 'y': b'def'})
18 |         assert a.get(['x', 'y']) == [b'', b'abcdef']
19 |         assert b.get(['x', 'y']) == [b'HelloWorld!', b'']
20 | 
21 |         result = p.get(['y', 'x'])
22 |         assert result == [b'abcdef', b'HelloWorld!']
23 | 
24 |         assert p.get('z') == b''
25 | 
26 |         with p.lock:  # uh oh, possible deadlock
27 |             result = p.get(['x'], lock=False)
28 | 
29 | 
30 | def test_keys_to_flush():
31 |     lengths = {'a': 20, 'b': 10, 'c': 15, 'd': 15, 'e': 10, 'f': 25, 'g': 5}
32 |     assert keys_to_flush(lengths, 0.5) == ['f', 'a']
33 | 
34 | 
35 | def test_pickle():
36 |     with Dict() as a:
37 |         with File() as b:
38 |             c = Buffer(a, b)
39 | 
40 |             c.append({'x': b'123'})
41 | 
42 |             d = pickle.loads(pickle.dumps(c))
43 | 
44 |             assert d.get('x') == c.get('x')
45 | 
46 |             pickled_attrs = ('memory_usage', 'lengths', 'available_memory')
47 | for attr in pickled_attrs: 48 | assert hasattr(d, attr) 49 | assert getattr(d, attr) == getattr(c, attr) 50 | # special case Dict and File -- some attrs do not pickle 51 | assert hasattr(d, 'fast') 52 | assert d.fast.data == c.fast.data 53 | assert hasattr(d, 'slow') 54 | assert d.slow.path == c.slow.path 55 | -------------------------------------------------------------------------------- /partd/tests/test_compressed.py: -------------------------------------------------------------------------------- 1 | from partd.compressed import ZLib 2 | 3 | 4 | import shutil 5 | import os 6 | import pickle 7 | 8 | 9 | def test_partd(): 10 | with ZLib() as p: 11 | p.append({'x': b'Hello', 'y': b'abc'}) 12 | p.append({'x': b'World!', 'y': b'def'}) 13 | assert os.path.exists(p.partd.filename('x')) 14 | assert os.path.exists(p.partd.filename('y')) 15 | 16 | result = p.get(['y', 'x']) 17 | assert result == [b'abcdef', b'HelloWorld!'] 18 | 19 | assert p.get('z') == b'' 20 | 21 | with p.lock: # uh oh, possible deadlock 22 | result = p.get(['x'], lock=False) 23 | 24 | assert not os.path.exists(p.partd.path) 25 | 26 | 27 | def test_pickle(): 28 | with ZLib() as p: 29 | p.append({'x': b'123'}) 30 | q = pickle.loads(pickle.dumps(p)) 31 | assert q.get('x') == b'123' 32 | -------------------------------------------------------------------------------- /partd/tests/test_dict.py: -------------------------------------------------------------------------------- 1 | from partd.dict import Dict 2 | 3 | import shutil 4 | import os 5 | 6 | 7 | def test_partd(): 8 | with Dict() as p: 9 | p.append({'x': b'Hello', 'y': b'abc'}) 10 | p.append({'x': b'World!', 'y': b'def'}) 11 | 12 | result = p.get(['y', 'x']) 13 | assert result == [b'abcdef', b'HelloWorld!'] 14 | 15 | assert p.get('z') == b'' 16 | 17 | with p.lock: # uh oh, possible deadlock 18 | result = p.get(['x'], lock=False) 19 | 20 | 21 | def test_key_tuple(): 22 | with Dict() as p: 23 | p.append({('a', 'b'): b'123'}) 24 | assert p.get(('a', 'b')) == b'123' 25 | 26 | 27 | def test_iset(): 28 | with Dict() as p: 29 | p.iset('x', b'123') 30 | assert 'x' in p._iset_seen 31 | assert 'y' not in p._iset_seen 32 | p.iset('x', b'123') 33 | p.iset('x', b'123') 34 | assert p.get('x') == b'123' 35 | 36 | 37 | def test_delete_non_existent_key(): 38 | with Dict() as p: 39 | p.append({'x': b'123'}) 40 | p.delete(['x', 'y']) 41 | assert p.get(['x', 'y']) == [b'', b''] 42 | -------------------------------------------------------------------------------- /partd/tests/test_encode.py: -------------------------------------------------------------------------------- 1 | from partd.file import File 2 | from partd.encode import Encode 3 | 4 | import zlib 5 | import shutil 6 | import os 7 | 8 | 9 | def test_partd(): 10 | with Encode(zlib.compress, zlib.decompress, b''.join) as p: 11 | p.append({'x': b'Hello', 'y': b'abc'}) 12 | p.append({'x': b'World!', 'y': b'def'}) 13 | 14 | result = p.get(['y', 'x']) 15 | assert result == [b'abcdef', b'HelloWorld!'] 16 | 17 | assert p.get('z') == b'' 18 | 19 | with p.lock: # uh oh, possible deadlock 20 | result = p.get(['x'], lock=False) 21 | 22 | 23 | def test_ensure(): 24 | with Encode(zlib.compress, zlib.decompress, b''.join) as p: 25 | p.iset('x', b'123') 26 | p.iset('x', b'123') 27 | p.iset('x', b'123') 28 | assert p.get('x') == b'123' 29 | -------------------------------------------------------------------------------- /partd/tests/test_file.py: -------------------------------------------------------------------------------- 1 | from partd.file 
import File 2 | 3 | import shutil 4 | import os 5 | 6 | 7 | def test_partd(): 8 | with File() as p: 9 | p.append({'x': b'Hello', 'y': b'abc'}) 10 | p.append({'x': b'World!', 'y': b'def'}) 11 | assert os.path.exists(p.filename('x')) 12 | assert os.path.exists(p.filename('y')) 13 | 14 | result = p.get(['y', 'x']) 15 | assert result == [b'abcdef', b'HelloWorld!'] 16 | 17 | assert p.get('z') == b'' 18 | 19 | with p.lock: # uh oh, possible deadlock 20 | result = p.get(['x'], lock=False) 21 | 22 | assert not os.path.exists(p.path) 23 | 24 | 25 | def test_key_tuple(): 26 | with File() as p: 27 | p.append({('a', 'b'): b'123'}) 28 | assert os.path.exists(p.filename(('a', 'b'))) 29 | 30 | 31 | def test_iset(): 32 | with File() as p: 33 | p.iset('x', b'123') 34 | assert 'x' in p._iset_seen 35 | assert 'y' not in p._iset_seen 36 | p.iset('x', b'123') 37 | p.iset('x', b'123') 38 | assert p.get('x') == b'123' 39 | 40 | 41 | def test_nested_get(): 42 | with File() as p: 43 | p.append({'x': b'1', 'y': b'2', 'z': b'3'}) 44 | assert p.get(['x', ['y', 'z']]) == [b'1', [b'2', b'3']] 45 | 46 | 47 | def test_drop(): 48 | with File() as p: 49 | p.append({'x': b'123'}) 50 | p.iset('y', b'abc') 51 | assert p.get('x') == b'123' 52 | assert p.get('y') == b'abc' 53 | 54 | p.drop() 55 | assert p.get('x') == b'' 56 | assert p.get('y') == b'' 57 | 58 | p.append({'x': b'123'}) 59 | p.iset('y', b'def') 60 | assert p.get('x') == b'123' 61 | assert p.get('y') == b'def' 62 | 63 | 64 | def test_del(): 65 | f = File() 66 | 67 | assert f.path 68 | assert os.path.exists(f.path) 69 | 70 | f.__del__() 71 | assert not os.path.exists(f.path) 72 | 73 | with File('Foo') as p: 74 | p.__del__() 75 | assert os.path.exists(p.path) 76 | 77 | 78 | def test_specify_dirname(): 79 | with File(dir=os.getcwd()) as f: 80 | assert os.getcwd() in f.path 81 | -------------------------------------------------------------------------------- /partd/tests/test_numpy.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | np = pytest.importorskip('numpy') # noqa 3 | 4 | import pickle 5 | 6 | import partd 7 | from partd.numpy import Numpy 8 | 9 | 10 | def test_numpy(): 11 | dt = np.dtype([('a', 'i4'), ('b', 'i2'), ('c', 'f8')]) 12 | with Numpy() as p: 13 | p.append({'a': np.array([10, 20, 30], dtype=dt['a']), 14 | 'b': np.array([ 1, 2, 3], dtype=dt['b']), 15 | 'c': np.array([.1, .2, .3], dtype=dt['c'])}) 16 | p.append({'a': np.array([70, 80, 90], dtype=dt['a']), 17 | 'b': np.array([ 7, 8, 9], dtype=dt['b']), 18 | 'c': np.array([.7, .8, .9], dtype=dt['c'])}) 19 | 20 | result = p.get(['a', 'c']) 21 | assert (result[0] == np.array([10, 20, 30, 70, 80, 90],dtype=dt['a'])).all() 22 | assert (result[1] == np.array([.1, .2, .3, .7, .8, .9],dtype=dt['c'])).all() 23 | 24 | with p.lock: # uh oh, possible deadlock 25 | result = p.get(['a'], lock=False) 26 | 27 | 28 | def test_nested(): 29 | with Numpy() as p: 30 | p.append({'x': np.array([1, 2, 3]), 31 | ('y', 1): np.array([4, 5, 6]), 32 | ('z', 'a', 3): np.array([.1, .2, .3])}) 33 | assert (p.get(('z', 'a', 3)) == np.array([.1, .2, .3])).all() 34 | 35 | 36 | def test_serialization(): 37 | with Numpy() as p: 38 | p.append({'x': np.array([1, 2, 3])}) 39 | q = pickle.loads(pickle.dumps(p)) 40 | assert (q.get('x') == [1, 2, 3]).all() 41 | 42 | 43 | array_of_lists = np.empty(3, dtype='O') 44 | array_of_lists[:] = [[1, 2], [3, 4], [5, 6]] 45 | 46 | 47 | @pytest.mark.parametrize('x', [np.array(['Alice', 'Bob', 'Charlie'], dtype='O'), 48 | array_of_lists]) 49 | def 
test_object_dtype(x): 50 | with Numpy() as p: 51 | p.append({'x': x}) 52 | p.append({'x': x}) 53 | assert isinstance(p.get('x'), np.ndarray) 54 | assert (p.get('x') == np.concatenate([x, x])).all() 55 | 56 | 57 | def test_datetime_types(): 58 | x = np.array(['2014-01-01T12:00:00'], dtype='M8[us]') 59 | y = np.array(['2014-01-01T12:00:00'], dtype='M8[s]') 60 | with Numpy() as p: 61 | p.append({'x': x, 'y': y}) 62 | assert p.get('x').dtype == x.dtype 63 | assert p.get('y').dtype == y.dtype 64 | 65 | 66 | def test_non_utf8_bytes(): 67 | a = np.array([b'\xc3\x28', b'\xa0\xa1', b'\xe2\x28\xa1', b'\xe2\x82\x28', 68 | b'\xf0\x28\x8c\xbc'], dtype='O') 69 | s = partd.numpy.serialize(a) 70 | assert (partd.numpy.deserialize(s, 'O') == a).all() 71 | -------------------------------------------------------------------------------- /partd/tests/test_pandas.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | pytest.importorskip('pandas') # noqa 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import pandas.testing as tm 7 | import os 8 | 9 | try: 10 | import pyarrow as pa 11 | except ImportError: 12 | pa = None 13 | 14 | from partd.pandas import PandasColumns, PandasBlocks, serialize, deserialize 15 | 16 | 17 | df1 = pd.DataFrame({'a': [1, 2, 3], 18 | 'b': [1., 2., 3.], 19 | 'c': ['x', 'y', 'x']}, columns=['a', 'b', 'c'], 20 | index=pd.Index([1, 2, 3], name='myindex')) 21 | 22 | df2 = pd.DataFrame({'a': [10, 20, 30], 23 | 'b': [10., 20., 30.], 24 | 'c': ['X', 'Y', 'X']}, columns=['a', 'b', 'c'], 25 | index=pd.Index([10, 20, 30], name='myindex')) 26 | 27 | 28 | def test_PandasColumns(): 29 | with PandasColumns() as p: 30 | assert os.path.exists(p.partd.partd.path) 31 | 32 | p.append({'x': df1, 'y': df2}) 33 | p.append({'x': df2, 'y': df1}) 34 | assert os.path.exists(p.partd.partd.filename('x')) 35 | assert os.path.exists(p.partd.partd.filename(('x', 'a'))) 36 | assert os.path.exists(p.partd.partd.filename(('x', '.index'))) 37 | assert os.path.exists(p.partd.partd.filename('y')) 38 | 39 | result = p.get(['y', 'x']) 40 | tm.assert_frame_equal(result[0], pd.concat([df2, df1])) 41 | tm.assert_frame_equal(result[1], pd.concat([df1, df2])) 42 | 43 | with p.lock: # uh oh, possible deadlock 44 | result = p.get(['x'], lock=False) 45 | 46 | assert not os.path.exists(p.partd.partd.path) 47 | 48 | 49 | def test_column_selection(): 50 | with PandasColumns('foo') as p: 51 | p.append({'x': df1, 'y': df2}) 52 | p.append({'x': df2, 'y': df1}) 53 | result = p.get('x', columns=['c', 'b']) 54 | tm.assert_frame_equal(result, pd.concat([df1, df2])[['c', 'b']]) 55 | 56 | 57 | def test_PandasBlocks(): 58 | with PandasBlocks() as p: 59 | assert os.path.exists(p.partd.path) 60 | 61 | p.append({'x': df1, 'y': df2}) 62 | p.append({'x': df2, 'y': df1}) 63 | assert os.path.exists(p.partd.filename('x')) 64 | assert os.path.exists(p.partd.filename('y')) 65 | 66 | result = p.get(['y', 'x']) 67 | tm.assert_frame_equal(result[0], pd.concat([df2, df1])) 68 | tm.assert_frame_equal(result[1], pd.concat([df1, df2])) 69 | 70 | with p.lock: # uh oh, possible deadlock 71 | result = p.get(['x'], lock=False) 72 | 73 | assert not os.path.exists(p.partd.path) 74 | 75 | 76 | @pytest.mark.parametrize('ordered', [False, True]) 77 | def test_serialize_categoricals(ordered): 78 | frame = pd.DataFrame({'x': [1, 2, 3, 4], 79 | 'y': pd.Categorical(['c', 'a', 'b', 'a'], 80 | ordered=ordered)}, 81 | index=pd.Categorical(['x', 'y', 'z', 'x'], 82 | ordered=ordered)) 83 | frame.index.name = 'foo' 84 | 
frame.columns.name = 'bar' 85 | 86 | for ind, df in [(0, frame), (1, frame.T)]: 87 | df2 = deserialize(serialize(df)) 88 | tm.assert_frame_equal(df, df2) 89 | 90 | 91 | def test_serialize_multi_index(): 92 | df = pd.DataFrame({'x': ['a', 'b', 'c', 'a', 'b', 'c'], 93 | 'y': [1, 2, 3, 4, 5, 6], 94 | 'z': [7., 8, 9, 10, 11, 12]}) 95 | df = df.groupby([df.x, df.y]).sum() 96 | df.index.name = 'foo' 97 | df.columns.name = 'bar' 98 | 99 | df2 = deserialize(serialize(df)) 100 | tm.assert_frame_equal(df, df2) 101 | 102 | 103 | @pytest.mark.parametrize('base', [ 104 | pd.Timestamp('1987-03-3T01:01:01+0001'), 105 | pd.Timestamp('1987-03-03 01:01:01-0600', tz='US/Central'), 106 | ]) 107 | def test_serialize(base): 108 | df = pd.DataFrame({'x': [ 109 | base + pd.Timedelta(seconds=i) 110 | for i in np.random.randint(0, 1000, size=10)], 111 | 'y': list(range(10)), 112 | 'z': pd.date_range('2017', periods=10)}) 113 | df2 = deserialize(serialize(df)) 114 | tm.assert_frame_equal(df, df2) 115 | 116 | 117 | def test_other_extension_types(): 118 | pytest.importorskip("pandas", minversion="0.25.0") 119 | a = pd.array([pd.Period("2000"), pd.Period("2001")]) 120 | df = pd.DataFrame({"A": a}) 121 | df2 = deserialize(serialize(df)) 122 | tm.assert_frame_equal(df, df2) 123 | 124 | @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Float64", "Float32"]) 125 | def test_index_numeric_extension_types(dtype): 126 | pytest.importorskip("pandas", minversion="1.4.0") 127 | 128 | df = pd.DataFrame({"x": [1, 2, 3]}, index=[4, 5, 6]) 129 | df.index = df.index.astype(dtype) 130 | df2 = deserialize(serialize(df)) 131 | tm.assert_frame_equal(df, df2) 132 | 133 | @pytest.mark.parametrize( 134 | "dtype", 135 | [ 136 | "string[python]", 137 | pytest.param( 138 | "string[pyarrow]", 139 | marks=pytest.mark.skipif(pa is None, reason="Requires pyarrow"), 140 | ), 141 | ], 142 | ) 143 | def test_index_non_numeric_extension_types(dtype): 144 | pytest.importorskip("pandas", minversion="1.4.0") 145 | df = pd.DataFrame({"x": [1, 2, 3]}, index=["a", "b", "c"]) 146 | df.index = df.index.astype(dtype) 147 | df2 = deserialize(serialize(df)) 148 | tm.assert_frame_equal(df, df2) 149 | 150 | 151 | def test_categorical_concat(): 152 | pytest.importorskip("pandas", minversion="2") 153 | 154 | df1 = pd.DataFrame({"a": ["x", "y"]}, dtype="category") 155 | df2 = pd.DataFrame({"a": ["y", "z"]}, dtype="category") 156 | 157 | with PandasBlocks() as p: 158 | p.append({'x': df1}) 159 | p.append({'x': df2}) 160 | 161 | result = p.get(["x"]) 162 | pd.testing.assert_frame_equal(result[0], pd.concat([df1, df2]).astype("category")) 163 | -------------------------------------------------------------------------------- /partd/tests/test_partd.py: -------------------------------------------------------------------------------- 1 | from partd import File 2 | from partd.core import token, escape_filename, filename 3 | from partd import core 4 | import os 5 | import shutil 6 | from contextlib import contextmanager 7 | 8 | 9 | def test_partd(): 10 | path = 'tmp.partd' 11 | 12 | with File(path) as p: 13 | p.append({'x': b'Hello', 'y': b'abc'}) 14 | p.append({'x': b'World!', 'y': b'def'}) 15 | assert os.path.exists(p.filename('x')) 16 | assert os.path.exists(p.filename('y')) 17 | 18 | result = p.get(['y', 'x']) 19 | assert result == [b'abcdef', b'HelloWorld!'] 20 | 21 | assert p.get('z') == b'' 22 | 23 | with p.lock: # uh oh, possible deadlock 24 | result = p.get(['x'], lock=False) 25 | 26 | assert not os.path.exists(path) 27 | 28 | 29 | def test_key_tuple(): 30 | with 
File('foo') as p: 31 | p.append({('a', 'b'): b'123'}) 32 | assert os.path.exists(os.path.join(p.path, 'a', 'b')) 33 | 34 | 35 | def test_ensure(): 36 | with File('foo') as p: 37 | p.iset('x', b'123') 38 | p.iset('x', b'123') 39 | p.iset('x', b'123') 40 | 41 | assert p.get('x') == b'123' 42 | 43 | 44 | def test_filenames(): 45 | assert token('hello') == 'hello' 46 | assert token(('hello', 'world')) == os.path.join('hello', 'world') 47 | assert escape_filename(os.path.join('a', 'b')) == os.path.join('a', 'b') 48 | assert filename('dir', ('a', 'b')) == os.path.join('dir', 'a', 'b') 49 | -------------------------------------------------------------------------------- /partd/tests/test_pickle.py: -------------------------------------------------------------------------------- 1 | from partd.pickle import Pickle 2 | 3 | 4 | import os 5 | import shutil 6 | 7 | def test_pickle(): 8 | with Pickle() as p: 9 | p.append({'x': ['Hello', 'World!'], 'y': [1, 2, 3]}) 10 | p.append({'x': ['Alice', 'Bob!'], 'y': [4, 5, 6]}) 11 | assert os.path.exists(p.partd.filename('x')) 12 | assert os.path.exists(p.partd.filename('y')) 13 | 14 | result = p.get(['y', 'x']) 15 | assert result == [[1, 2, 3, 4, 5, 6], 16 | ['Hello', 'World!', 'Alice', 'Bob!']] 17 | 18 | with p.lock: # uh oh, possible deadlock 19 | result = p.get(['x'], lock=False) 20 | 21 | assert not os.path.exists(p.partd.path) 22 | 23 | 24 | def test_ensure(): 25 | with Pickle() as p: 26 | p.iset('x', [1, 2, 3]) 27 | p.iset('x', [1, 2, 3]) 28 | 29 | assert p.get('x') == [1, 2, 3] 30 | -------------------------------------------------------------------------------- /partd/tests/test_python.py: -------------------------------------------------------------------------------- 1 | from partd.python import dumps, loads 2 | 3 | 4 | import os 5 | import shutil 6 | from math import sin 7 | 8 | 9 | def test_pack_unpack(): 10 | data = [1, 2, b'Hello', 'Hello'] 11 | assert loads(dumps(data)) == data 12 | 13 | data = [1, 2, sin] 14 | assert loads(dumps(data)) == data 15 | -------------------------------------------------------------------------------- /partd/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from partd.utils import frame, framesplit 2 | import struct 3 | 4 | 5 | def test_frame(): 6 | assert frame(b'Hello') == struct.pack('Q', 5) + b'Hello' 7 | 8 | 9 | def test_framesplit(): 10 | L = [b'Hello', b'World!', b'123'] 11 | assert list(framesplit(b''.join(map(frame, L)))) == L 12 | -------------------------------------------------------------------------------- /partd/tests/test_zmq.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | pytest.importorskip('zmq') 3 | 4 | from partd.zmq import Server, keys_to_flush, File, Client 5 | from partd import core, Dict 6 | from threading import Thread 7 | from time import sleep 8 | from contextlib import contextmanager 9 | import pickle 10 | 11 | import os 12 | import shutil 13 | 14 | 15 | def test_server(): 16 | s = Server() 17 | try: 18 | s.start() 19 | s.append({'x': b'abc', 'y': b'1234'}) 20 | s.append({'x': b'def', 'y': b'5678'}) 21 | 22 | assert s.get(['x']) == [b'abcdef'] 23 | assert s.get(['x', 'y']) == [b'abcdef', b'12345678'] 24 | 25 | assert s.get(['x']) == [b'abcdef'] 26 | finally: 27 | s.close() 28 | 29 | 30 | def dont_test_flow_control(): 31 | path = 'bar' 32 | if os.path.exists('bar'): 33 | shutil.rmtree('bar') 34 | s = Server('bar', available_memory=1, n_outstanding_writes=3, start=False) 35 | p 
= Client(s.address) 36 | try: 37 | listen_thread = Thread(target=s.listen) 38 | listen_thread.start() 39 | """ Don't start these threads 40 | self._write_to_disk_thread = Thread(target=self._write_to_disk) 41 | self._write_to_disk_thread.start() 42 | self._free_frozen_sockets_thread = Thread(target=self._free_frozen_sockets) 43 | self._free_frozen_sockets_thread.start() 44 | """ 45 | p.append({'x': b'12345'}) 46 | sleep(0.1) 47 | assert s._out_disk_buffer.qsize() == 1 48 | p.append({'x': b'12345'}) 49 | p.append({'x': b'12345'}) 50 | sleep(0.1) 51 | assert s._out_disk_buffer.qsize() == 3 52 | 53 | held_append = Thread(target=p.append, args=({'x': b'123'},)) 54 | held_append.start() 55 | 56 | sleep(0.1) 57 | assert held_append.is_alive() # held! 58 | 59 | assert not s._frozen_sockets.empty() 60 | 61 | write_to_disk_thread = Thread(target=s._write_to_disk) 62 | write_to_disk_thread.start() 63 | free_frozen_sockets_thread = Thread(target=s._free_frozen_sockets) 64 | free_frozen_sockets_thread.start() 65 | 66 | sleep(0.2) 67 | assert not held_append.is_alive() 68 | assert s._frozen_sockets.empty() 69 | finally: 70 | s.close() 71 | 72 | 73 | @contextmanager 74 | def partd_server(**kwargs): 75 | with Server(**kwargs) as server: 76 | with Client(server.address) as p: 77 | yield (p, server) 78 | 79 | 80 | def test_partd_object(): 81 | with partd_server() as (p, server): 82 | p.append({'x': b'Hello', 'y': b'abc'}) 83 | p.append({'x': b'World!', 'y': b'def'}) 84 | 85 | result = p.get(['y', 'x']) 86 | assert result == [b'abcdef', b'HelloWorld!'] 87 | 88 | 89 | def test_delete(): 90 | with partd_server() as (p, server): 91 | p.append({'x': b'Hello'}) 92 | assert p.get('x') == b'Hello' 93 | p.delete(['x']) 94 | assert p.get('x') == b'' 95 | 96 | 97 | def test_iset(): 98 | with partd_server() as (p, server): 99 | p.iset('x', b'111') 100 | p.iset('x', b'111') 101 | assert p.get('x') == b'111' 102 | 103 | 104 | def test_tuple_keys(): 105 | with partd_server() as (p, server): 106 | p.append({('x', 'y'): b'123'}) 107 | assert p.get(('x', 'y')) == b'123' 108 | 109 | 110 | def test_serialization(): 111 | with partd_server() as (p, server): 112 | p.append({'x': b'123'}) 113 | q = pickle.loads(pickle.dumps(p)) 114 | assert q.get('x') == b'123' 115 | 116 | 117 | def test_drop(): 118 | with partd_server() as (p, server): 119 | p.append({'x': b'123'}) 120 | p.drop() 121 | assert p.get('x') == b'' 122 | 123 | 124 | def dont_test_server_autocreation(): 125 | with Client() as p: 126 | p.append({'x': b'123'}) 127 | assert p.get('x') == b'123' 128 | -------------------------------------------------------------------------------- /partd/utils.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | import os 3 | import shutil 4 | import tempfile 5 | import struct 6 | 7 | 8 | def raises(exc, lamda): 9 | try: 10 | lamda() 11 | return False 12 | except exc: 13 | return True 14 | 15 | 16 | @contextmanager 17 | def tmpfile(extension=''): 18 | extension = '.' + extension.lstrip('.') 19 | handle, filename = tempfile.mkstemp(extension) 20 | os.close(handle) 21 | os.remove(filename) 22 | 23 | try: 24 | yield filename 25 | finally: 26 | if os.path.exists(filename): 27 | if os.path.isdir(filename): 28 | shutil.rmtree(filename) 29 | else: 30 | os.remove(filename) 31 | 32 | 33 | def frame(bytes): 34 | """ Pack the length of the bytes in front of the bytes 35 | 36 | TODO: This does a full copy. 
This should maybe be inlined somehow
37 |     wherever this gets used instead.  My laptop shows a data bandwidth of
38 |     2GB/s
39 |     """
40 |     return struct.pack('Q', len(bytes)) + bytes
41 | 
42 | 
43 | def framesplit(bytes):
44 |     """ Split buffer into frames of concatenated chunks
45 | 
46 |     >>> data = frame(b'Hello') + frame(b'World')
47 |     >>> list(framesplit(data))  # doctest: +SKIP
48 |     [b'Hello', b'World']
49 |     """
50 |     i = 0; n = len(bytes)
51 |     # read an 8-byte length prefix, then yield that many bytes, repeatedly
52 |     while i < n:
53 |         nbytes = struct.unpack('Q', bytes[i:i+8])[0]
54 |         i += 8
55 |         yield bytes[i: i + nbytes]
56 |         i += nbytes
57 | 
58 | 
59 | def partition_all(n, bytes):
60 |     """ Partition bytes into evenly sized blocks
61 | 
62 |     The final block holds the remainder and so may not be of equal size
63 | 
64 |     >>> list(partition_all(2, b'Hello'))
65 |     [b'He', b'll', b'o']
66 | 
67 |     See Also:
68 |         toolz.partition_all
69 |     """
70 |     if len(bytes) < n:  # zero copy fast common case
71 |         yield bytes
72 |     else:
73 |         for i in range(0, len(bytes), n):
74 |             yield bytes[i: i+n]
75 | 
76 | 
77 | def nested_get(ind, coll, lazy=False):
78 |     """ Get nested index from collection
79 | 
80 |     Examples
81 |     --------
82 | 
83 |     >>> nested_get(1, 'abc')
84 |     'b'
85 |     >>> nested_get([1, 0], 'abc')
86 |     ['b', 'a']
87 |     >>> nested_get([[1, 0], [0, 1]], 'abc')
88 |     [['b', 'a'], ['a', 'b']]
89 |     """
90 |     if isinstance(ind, list):
91 |         if lazy:
92 |             return (nested_get(i, coll, lazy=lazy) for i in ind)
93 |         else:
94 |             return [nested_get(i, coll, lazy=lazy) for i in ind]
95 |     else:
96 |         return coll[ind]
97 | 
98 | 
99 | def flatten(seq):
100 |     """
101 | 
102 |     >>> list(flatten([1]))
103 |     [1]
104 | 
105 |     >>> list(flatten([[1, 2], [1, 2]]))
106 |     [1, 2, 1, 2]
107 | 
108 |     >>> list(flatten([[[1], [2]], [[1], [2]]]))
109 |     [1, 2, 1, 2]
110 | 
111 |     >>> list(flatten(((1, 2), (1, 2))))  # Don't flatten tuples
112 |     [(1, 2), (1, 2)]
113 | 
114 |     >>> list(flatten((1, 2, [3, 4])))  # support heterogeneous
115 |     [1, 2, 3, 4]
116 |     """
117 |     for item in seq:
118 |         if isinstance(item, list):
119 |             yield from flatten(item)
120 |         else:
121 |             yield item
122 | 
123 | 
124 | def suffix(key, term):
125 |     """ suffix a key with a suffix
126 | 
127 |     Works if the key is a string or a tuple
128 | 
129 |     >>> suffix('x', '.dtype')
130 |     'x.dtype'
131 |     >>> suffix(('a', 'b', 'c'), '.dtype')
132 |     ('a', 'b', 'c.dtype')
133 |     """
134 |     if isinstance(key, str):
135 |         return key + term
136 |     elif isinstance(key, tuple):
137 |         return key[:-1] + (suffix(key[-1], term),)
138 |     else:
139 |         return suffix(str(key), term)
140 | 
141 | 
142 | def extend(key, term):
143 |     """ extend a key with another element in a tuple
144 | 
145 |     Works if the key is a string or a tuple
146 | 
147 |     >>> extend('x', '.dtype')
148 |     ('x', '.dtype')
149 |     >>> extend(('a', 'b', 'c'), '.dtype')
150 |     ('a', 'b', 'c', '.dtype')
151 |     """
152 |     if isinstance(term, tuple):
153 |         pass
154 |     elif isinstance(term, str):
155 |         term = (term,)
156 |     else:
157 |         term = (str(term),)
158 | 
159 |     if not isinstance(key, tuple):
160 |         key = (key,)
161 | 
162 |     return key + term
163 | --------------------------------------------------------------------------------
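The zmq module below exposes a partd over a ZeroMQ ROUTER/DEALER socket pair, with a small ack-based protocol for flow control. As orientation before the source, a minimal usage sketch pieced together from partd/tests/test_zmq.py earlier in this tree (illustrative only; the server defaults to a Buffer-over-Dict-and-File partd, and both ends work as context managers):

    from partd.zmq import Server, Client

    with Server() as server:
        with Client(server.address) as p:
            p.append({'x': b'Hello'})
            assert p.get('x') == b'Hello'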
/partd/zmq.py:
--------------------------------------------------------------------------------
1 | import zmq
2 | import logging
3 | from itertools import chain
4 | from bisect import bisect
5 | import socket
6 | from operator import add
7 | from time import sleep, time
8 | from toolz import accumulate, topk, pluck, merge, keymap
9 | import uuid
10 | from collections import defaultdict
11 | from contextlib import contextmanager, suppress
12 | from threading import Thread, Lock
13 | from datetime import datetime
14 | from multiprocessing import Process
15 | import traceback
16 | import sys
17 | from .dict import Dict
18 | from .file import File
19 | from .buffer import Buffer
20 | from . import core
21 | 
22 | 
23 | tuple_sep = b'-|-'
24 | 
25 | logger = logging.getLogger(__name__)
26 | 
27 | 
28 | @contextmanager
29 | def logerrors():
30 |     try:
31 |         yield
32 |     except Exception as e:
33 |         logger.exception(e)
34 |         raise
35 | 
36 | 
37 | class Server:
38 |     def __init__(self, partd=None, bind=None, start=True, block=False,
39 |                  hostname=None):
40 |         self.context = zmq.Context()
41 |         if partd is None:
42 |             partd = Buffer(Dict(), File())
43 |         self.partd = partd
44 | 
45 |         self.socket = self.context.socket(zmq.ROUTER)
46 | 
47 |         if hostname is None:
48 |             hostname = socket.gethostname()
49 |         if isinstance(bind, str):
50 |             bind = bind.encode()
51 |         if bind is None:
52 |             port = self.socket.bind_to_random_port('tcp://*')
53 |         else:
54 |             self.socket.bind(bind)
55 |             port = int(bind.decode().split(':')[-1].rstrip('/'))  # bind is bytes here
56 |         self.address = ('tcp://%s:%d' % (hostname, port)).encode()
57 | 
58 |         self.status = 'created'
59 | 
60 |         self.partd.lock.acquire()
61 |         self._lock = Lock()
62 |         self._socket_lock = Lock()
63 | 
64 |         if start:
65 |             self.start()
66 | 
67 |         if block:
68 |             self.block()
69 | 
70 |     def start(self):
71 |         if self.status != 'run':
72 |             self.status = 'run'
73 |             self._listen_thread = Thread(target=self.listen)
74 |             self._listen_thread.start()
75 |             logger.debug('Start server at %s', self.address)
76 | 
77 |     def block(self):
78 |         """ Block until all threads close """
79 |         try:
80 |             self._listen_thread.join()
81 |         except AttributeError:
82 |             pass
83 | 
84 |     def listen(self):
85 |         with logerrors():
86 |             logger.debug('Start listening %s', self.address)
87 |             while self.status != 'closed':
88 |                 if not self.socket.poll(100):
89 |                     continue
90 | 
91 |                 with self._socket_lock:
92 |                     payload = self.socket.recv_multipart()
93 | 
94 |                 address, command, payload = payload[0], payload[1], payload[2:]
95 |                 logger.debug('Server receives %s %s', address, command)
96 |                 if command == b'close':
97 |                     logger.debug('Server closes')
98 |                     self.ack(address)
99 |                     self.status = 'closed'
100 |                     break
101 |                     # self.close()
102 | 
103 |                 elif command == b'append':
104 |                     keys, values = payload[::2], payload[1::2]
105 |                     keys = list(map(deserialize_key, keys))
106 |                     data = dict(zip(keys, values))
107 |                     self.partd.append(data, lock=False)
108 |                     logger.debug('Server appends %d keys', len(data))
109 |                     self.ack(address)
110 | 
111 |                 elif command == b'iset':
112 |                     key, value = payload
113 |                     key = deserialize_key(key)
114 |                     self.partd.iset(key, value, lock=False)
115 |                     self.ack(address)
116 | 
117 |                 elif command == b'get':
118 |                     keys = list(map(deserialize_key, payload))
119 |                     logger.debug('get %s', keys)
120 |                     result = self.get(keys)
121 |                     self.send_to_client(address, result)
122 |                     self.ack(address, flow_control=False)
123 | 
124 |                 elif command == b'delete':
125 |                     keys = list(map(deserialize_key, payload))
126 |                     logger.debug('delete %s', keys)
127 |                     self.partd.delete(keys, lock=False)
128 |                     self.ack(address, flow_control=False)
129 | 
130 |                 elif command == b'syn':
131 |                     self.ack(address)
132 | 
133 |                 elif command == b'drop':
134 |                     self.drop()
135 |                     self.ack(address)
136 | 
137 |                 else:
138 |                     logger.debug("Unknown command: %s", command)
139 |                     raise ValueError("Unknown command: " + str(command))
140 | 
141 |     def send_to_client(self, address, result):
142 |         with logerrors():
143 | if not isinstance(result, list): 144 | result = [result] 145 | with self._socket_lock: 146 | self.socket.send_multipart([address] + result) 147 | 148 | def ack(self, address, flow_control=True): 149 | with logerrors(): 150 | logger.debug('Server sends ack') 151 | self.send_to_client(address, b'ack') 152 | 153 | def append(self, data): 154 | self.partd.append(data, lock=False) 155 | logger.debug('Server appends %d keys', len(data)) 156 | 157 | def drop(self): 158 | with logerrors(): 159 | self.partd.drop() 160 | 161 | def get(self, keys): 162 | with logerrors(): 163 | logger.debug('Server gets keys: %s', keys) 164 | with self._lock: 165 | result = self.partd.get(keys, lock=False) 166 | return result 167 | 168 | def close(self): 169 | logger.debug('Server closes') 170 | self.status = 'closed' 171 | self.block() 172 | with suppress(zmq.error.ZMQError): 173 | self.socket.close(1) 174 | with suppress(zmq.error.ZMQError): 175 | self.context.destroy(3) 176 | self.partd.lock.release() 177 | 178 | def __enter__(self): 179 | self.start() 180 | return self 181 | 182 | def __exit__(self, *args): 183 | self.close() 184 | self.partd.__exit__(*args) 185 | 186 | 187 | def keys_to_flush(lengths, fraction=0.1, maxcount=100000): 188 | """ Which keys to remove 189 | 190 | >>> lengths = {'a': 20, 'b': 10, 'c': 15, 'd': 15, 191 | ... 'e': 10, 'f': 25, 'g': 5} 192 | >>> keys_to_flush(lengths, 0.5) 193 | ['f', 'a'] 194 | """ 195 | top = topk(max(len(lengths) // 2, 1), 196 | lengths.items(), 197 | key=1) 198 | total = sum(lengths.values()) 199 | cutoff = min(maxcount, max(1, 200 | bisect(list(accumulate(add, pluck(1, top))), 201 | total * fraction))) 202 | result = [k for k, v in top[:cutoff]] 203 | assert result 204 | return result 205 | 206 | 207 | def serialize_key(key): 208 | """ 209 | 210 | >>> serialize_key('x') 211 | b'x' 212 | >>> serialize_key(('a', 'b', 1)) 213 | b'a-|-b-|-1' 214 | """ 215 | if isinstance(key, tuple): 216 | return tuple_sep.join(map(serialize_key, key)) 217 | if isinstance(key, bytes): 218 | return key 219 | if isinstance(key, str): 220 | return key.encode() 221 | return str(key).encode() 222 | 223 | 224 | def deserialize_key(text): 225 | """ 226 | 227 | >>> deserialize_key(b'x') 228 | b'x' 229 | >>> deserialize_key(b'a-|-b-|-1') 230 | (b'a', b'b', b'1') 231 | """ 232 | if tuple_sep in text: 233 | return tuple(text.split(tuple_sep)) 234 | else: 235 | return text 236 | 237 | 238 | from .core import Interface 239 | from .file import File 240 | 241 | 242 | class Client(Interface): 243 | def __init__(self, address=None, create_server=False, **kwargs): 244 | self.address = address 245 | self.context = zmq.Context() 246 | self.socket = self.context.socket(zmq.DEALER) 247 | logger.debug('Client connects to %s', address) 248 | self.socket.connect(address) 249 | self.send(b'syn', [], ack_required=False) 250 | self.lock = NotALock() # Server sequentializes everything 251 | Interface.__init__(self) 252 | 253 | def __getstate__(self): 254 | return {'address': self.address} 255 | 256 | def __setstate__(self, state): 257 | self.__init__(state['address']) 258 | logger.debug('Reconstruct client from pickled state') 259 | 260 | def send(self, command, payload, recv=False, ack_required=True): 261 | if ack_required: 262 | ack = self.socket.recv_multipart() 263 | assert ack == [b'ack'] 264 | logger.debug('Client sends command: %s', command) 265 | self.socket.send_multipart([command] + payload) 266 | if recv: 267 | result = self.socket.recv_multipart() 268 | else: 269 | result = None 270 | return 
result 271 | 272 | def _get(self, keys, lock=None): 273 | """ 274 | 275 | Lock argument is ignored. Everything is sequential (I think) 276 | """ 277 | logger.debug('Client gets %s %s', self.address, keys) 278 | keys = list(map(serialize_key, keys)) 279 | return self.send(b'get', keys, recv=True) 280 | 281 | def append(self, data, lock=None): 282 | logger.debug('Client appends %s %s', self.address, str(len(data)) + ' keys') 283 | data = keymap(serialize_key, data) 284 | payload = list(chain.from_iterable(data.items())) 285 | self.send(b'append', payload) 286 | 287 | def _delete(self, keys, lock=None): 288 | logger.debug('Client deletes %s %s', self.address, str(len(keys)) + ' keys') 289 | keys = list(map(serialize_key, keys)) 290 | self.send(b'delete', keys) 291 | 292 | def _iset(self, key, value): 293 | self.send(b'iset', [serialize_key(key), value]) 294 | 295 | def drop(self): 296 | self.send(b'drop', []) 297 | sleep(0.05) 298 | 299 | def close_server(self): 300 | self.send(b'close', []) 301 | 302 | def close(self): 303 | if hasattr(self, 'server_process'): 304 | with suppress(zmq.error.ZMQError): 305 | self.close_server() 306 | self.server_process.join() 307 | with suppress(zmq.error.ZMQError): 308 | self.socket.close(1) 309 | with suppress(zmq.error.ZMQError): 310 | self.context.destroy(1) 311 | 312 | def __exit__(self, type, value, traceback): 313 | self.drop() 314 | self.close() 315 | 316 | def __del__(self): 317 | self.close() 318 | 319 | 320 | class NotALock: 321 | def acquire(self): pass 322 | def release(self): pass 323 | 324 | def __enter__(self): 325 | return self 326 | 327 | def __exit__(self, *args): 328 | pass 329 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.2", "versioneer[toml]==0.29"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "partd" 7 | description = "Appendable key-value storage" 8 | maintainers = [{name = "Matthew Rocklin", email = "mrocklin@gmail.com"}] 9 | license = {text = "BSD"} 10 | keywords = [] 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "Programming Language :: Python :: 3.9", 14 | "Programming Language :: Python :: 3.10", 15 | "Programming Language :: Python :: 3.11", 16 | "Programming Language :: Python :: 3.12", 17 | ] 18 | readme = "README.rst" 19 | urls = {Homepage = "http://github.com/dask/partd/"} 20 | requires-python = ">=3.9" 21 | dynamic = ["version"] 22 | dependencies = [ 23 | "locket", 24 | "toolz", 25 | ] 26 | 27 | [project.optional-dependencies] 28 | complete = [ 29 | "numpy >= 1.20.0", 30 | "pandas >=1.3", 31 | "pyzmq", 32 | "blosc", 33 | ] 34 | 35 | [tool.setuptools] 36 | packages = ["partd"] 37 | zip-safe = false 38 | include-package-data = false 39 | 40 | [tool.versioneer] 41 | VCS = "git" 42 | style = "pep440" 43 | versionfile_source = "partd/_version.py" 44 | versionfile_build = "partd/_version.py" 45 | tag_prefix = "" 46 | parentdir_prefix = "partd-" 47 | 48 | [tool.pytest.ini_options] 49 | addopts = "--strict-markers --strict-config" 50 | filterwarnings = ["error"] 51 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | locket 2 | toolz 3 | -------------------------------------------------------------------------------- /setup.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import annotations 4 | 5 | import versioneer 6 | from setuptools import setup 7 | 8 | setup( 9 | version=versioneer.get_version(), 10 | cmdclass=versioneer.get_cmdclass(), 11 | ) 12 | --------------------------------------------------------------------------------
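A closing sketch of the pandas path, assembled from partd/pandas.py and partd/tests/test_pandas.py above (illustrative only; PandasBlocks composes Encode with the serialize, deserialize, and join functions, and its tests assume a temporary File-backed partd by default):

    import pandas as pd
    from partd.pandas import PandasBlocks

    df1 = pd.DataFrame({'a': [1, 2]})
    df2 = pd.DataFrame({'a': [3, 4]})

    with PandasBlocks() as p:
        p.append({'x': df1})
        p.append({'x': df2})
        result = p.get('x')  # the concatenation, pd.concat([df1, df2])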