├── .gitattributes
├── .github
│   └── workflows
│       └── ci.yml
├── .gitignore
├── CONTRIBUTING.md
├── LICENSE.txt
├── MANIFEST.in
├── README.rst
├── docs
│   ├── zmq.png
│   └── zmq.svg
├── partd
│   ├── __init__.py
│   ├── _version.py
│   ├── buffer.py
│   ├── compressed.py
│   ├── core.py
│   ├── dict.py
│   ├── encode.py
│   ├── file.py
│   ├── numpy.py
│   ├── pandas.py
│   ├── pickle.py
│   ├── python.py
│   ├── tests
│   │   ├── test_buffer.py
│   │   ├── test_compressed.py
│   │   ├── test_dict.py
│   │   ├── test_encode.py
│   │   ├── test_file.py
│   │   ├── test_numpy.py
│   │   ├── test_pandas.py
│   │   ├── test_partd.py
│   │   ├── test_pickle.py
│   │   ├── test_python.py
│   │   ├── test_utils.py
│   │   └── test_zmq.py
│   ├── utils.py
│   └── zmq.py
├── pyproject.toml
├── requirements.txt
└── setup.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | partd/_version.py export-subst
2 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | test:
7 | name: Python ${{ matrix.python-version }}
8 | runs-on: ubuntu-latest
9 | strategy:
10 | fail-fast: false
11 | matrix:
12 | python-version: ["3.9", "3.10", "3.11", "3.12"]
13 |
14 | steps:
15 | - name: Checkout source
16 | uses: actions/checkout@v2
17 |
18 | - name: Setup Conda Environment
19 | uses: conda-incubator/setup-miniconda@v2.2.0
20 | with:
21 | miniforge-variant: Mambaforge
22 | miniforge-version: latest
23 | use-mamba: true
24 | channel-priority: strict
25 | python-version: ${{ matrix.python-version }}
26 | auto-activate-base: false
27 |
28 | - name: Install dependencies
29 | shell: bash -l {0}
30 | run: mamba install pytest locket numpy toolz pandas blosc pyzmq pyarrow -c conda-forge
31 |
32 | - name: Install
33 | shell: bash -l {0}
34 | run: pip install .
35 |
36 | - name: Run Tests
37 | shell: bash -l {0}
38 | run: pytest partd --doctest-modules --verbose
39 | env:
40 | PYTHON_VERSION: ${{ matrix.python-version }}
41 |
42 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 |
3 | *.egg-info/
4 | build/
5 | dist/
6 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | Dask is a community maintained project. We welcome contributions in the form of bug reports, documentation, code, design proposals, and more.
2 |
3 | For general information on how to contribute see https://docs.dask.org/en/latest/develop.html.
4 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2015, Continuum Analytics, Inc. and contributors
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without modification,
5 | are permitted provided that the following conditions are met:
6 |
7 | Redistributions of source code must retain the above copyright notice,
8 | this list of conditions and the following disclaimer.
9 |
10 | Redistributions in binary form must reproduce the above copyright notice,
11 | this list of conditions and the following disclaimer in the documentation
12 | and/or other materials provided with the distribution.
13 |
14 | Neither the name of Continuum Analytics nor the names of any contributors
15 | may be used to endorse or promote products derived from this software
16 | without specific prior written permission.
17 |
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
28 | THE POSSIBILITY OF SUCH DAMAGE.
29 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include partd *.py
2 |
3 | include setup.py
4 | include README.rst
5 | include LICENSE.txt
6 | include MANIFEST.in
7 | include versioneer.py
8 | include partd/_version.py
9 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | PartD
2 | =====
3 |
4 | |Build Status| |Version Status|
5 |
6 | Key-value byte store with appendable values
7 |
8 | Partd stores key-value pairs.
9 | Values are raw bytes.
10 | We append new values onto old ones.
11 |
12 | Partd excels at shuffling operations.
13 |
14 | Operations
15 | ----------
16 |
17 | PartD has two main operations, ``append`` and ``get``.
18 |
19 |
20 | Example
21 | -------
22 |
23 | 1. Create a Partd backed by a directory::
24 |
25 | >>> import partd
26 | >>> p = partd.File('/path/to/new/dataset/')
27 |
28 | 2. Append key-byte pairs to dataset::
29 |
30 | >>> p.append({'x': b'Hello ', 'y': b'123'})
31 | >>> p.append({'x': b'world!', 'y': b'456'})
32 |
33 | 3. Get bytes associated with keys::
34 |
35 | >>> p.get('x') # One key
36 | b'Hello world!'
37 |
38 | >>> p.get(['y', 'x']) # List of keys
39 | [b'123456', b'Hello world!']
40 |
41 | 4. Destroy partd dataset::
42 |
43 | >>> p.drop()
44 |
45 | That's it.
46 |
47 |
48 | Implementations
49 | ---------------
50 |
51 | We can back a partd by an in-memory dictionary::
52 |
53 | >>> p = Dict()
54 |
55 | For larger amounts of data, or to share data between processes, we back a
56 | partd by a directory of files. This uses file-based locks for consistency::
57 |
58 | >>> p = File('/path/to/dataset/')
59 |
60 | However, this can perform poorly for many small writes. In these cases you may wish to buffer one partd with another, keeping a fixed maximum of data in the buffering partd. This writes the larger elements of the first partd to the second partd when space runs low::
61 |
62 | >>> p = Buffer(Dict(), File(), available_memory=2e9) # 2GB memory buffer
63 |
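For illustration, here is a sketch with a deliberately tiny buffer so the
spill to disk is easy to trigger (exactly when flushes happen is an internal
detail)::

    >>> p = Buffer(Dict(), File(), available_memory=10)  # toy size
    >>> p.append({'x': b'0123456789abcdef'})  # exceeds the budget, spills to disk
    >>> p.get('x')  # reads transparently from both memory and disk
    b'0123456789abcdef'
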
64 | You might also want to have many distributed processes write to a single
65 | partd consistently. This can be done with a server:
66 |
67 | * Server Process::
68 |
69 | >>> p = Buffer(Dict(), File(), available_memory=2e9) # 2GB memory buffer
70 | >>> s = Server(p, address='ipc://server')
71 |
72 | * Worker processes::
73 |
74 | >>> p = Client('ipc://server') # Client machine talks to remote server
75 |
76 |
77 | Encodings and Compression
78 | -------------------------
79 |
80 | Once we can robustly and efficiently append bytes to a partd we consider
81 | compression and encodings. This is generally available with the ``Encode``
82 | partd, which accepts three functions: one to apply to bytes as they are
83 | written, one to apply to bytes as they are read, and one to join bytestreams.
84 | Common configurations already exist for common data and compression formats.
85 |
86 | We may wish to compress and decompress data transparently as we interact with a
87 | partd. Objects like ``BZ2``, ``Blosc``, ``ZLib`` and ``Snappy`` exist and take
88 | another partd as an argument::
89 |
90 | >>> p = File(...)
91 | >>> p = ZLib(p)
92 |
93 | These work exactly as before; the (de)compression happens automatically.
94 |
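For example, a compressed partd round-trips appended bytes unchanged; only
the stored representation is compressed. A small sketch using the
always-available ``zlib``::

    >>> p = ZLib(Dict())
    >>> p.append({'x': b'abc' * 1000})
    >>> p.get('x') == b'abc' * 1000  # decompressed transparently on read
    True
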
95 | Common data formats like Python lists, numpy arrays, and pandas
96 | dataframes are also supported out of the box::
97 |
98 | >>> p = File(...)
99 | >>> p = Numpy(p)
100 | >>> p.append({'x': np.array([...])})
101 |
102 | This lets us forget about bytes and think instead in our normal data types.
103 |
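The ``Encode`` partd itself makes it easy to roll a custom format. As a
sketch (the helper names below are illustrative, not part of the library),
a JSON-encoded partd for lists needs only the three functions::

    >>> import json
    >>> encode = lambda obj: json.dumps(obj).encode()
    >>> decode = lambda blob: json.loads(blob.decode())
    >>> join = lambda lists: sum(lists, [])

    >>> p = Encode(encode, decode, join, Dict())
    >>> p.append({'x': [1, 2]})
    >>> p.append({'x': [3]})
    >>> p.get('x')
    [1, 2, 3]
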
104 | Composition
105 | -----------
106 |
107 | In principle we want to compose all of these choices together:
108 |
109 | 1. Write policy: ``Dict``, ``File``, ``Buffer``, ``Client``
110 | 2. Encoding: ``Pickle``, ``Numpy``, ``Pandas``, ...
111 | 3. Compression: ``Blosc``, ``Snappy``, ...
112 |
113 | Partd objects compose by nesting. Here we make a partd that writes
114 | pickle-encoded, BZ2-compressed bytes directly to disk::
115 |
116 | >>> p = Pickle(BZ2(File('foo')))
117 |
118 | We could construct more complex systems that include compression,
119 | serialization, buffering, and remote access::
120 |
121 | >>> server = Server(Buffer(Dict(), File(), available_memory=2e9))
122 |
123 | >>> client = Pickle(Snappy(Client(server.address)))
124 | >>> client.append({'x': [1, 2, 3]})
125 |
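Reads compose the same way; the client's ``get`` unwinds each layer in turn
(Snappy decompression, then unpickling, then list concatenation)::

    >>> client.append({'x': [4, 5, 6]})
    >>> client.get('x')
    [1, 2, 3, 4, 5, 6]
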
126 | .. |Build Status| image:: https://github.com/dask/partd/workflows/CI/badge.svg
127 | :target: https://github.com/dask/partd/actions?query=workflow%3ACI
128 | .. |Version Status| image:: https://img.shields.io/pypi/v/partd.svg
129 | :target: https://pypi.python.org/pypi/partd/
130 |
--------------------------------------------------------------------------------
/docs/zmq.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dask/partd/e832b655606342dc742ec1c564b07abf1ad58383/docs/zmq.png
--------------------------------------------------------------------------------
/docs/zmq.svg:
--------------------------------------------------------------------------------
(SVG source elided: 498 lines of vector markup; see docs/zmq.png for the rendered diagram)
--------------------------------------------------------------------------------
/partd/__init__.py:
--------------------------------------------------------------------------------
1 | from contextlib import suppress
2 |
3 | from .file import File
4 | from .dict import Dict
5 | from .buffer import Buffer
6 | from .encode import Encode
7 | from .pickle import Pickle
8 | from .python import Python
9 | from .compressed import *
10 | with suppress(ImportError):
11 | from .numpy import Numpy
12 | with suppress(ImportError):
13 | from .pandas import PandasColumns, PandasBlocks
14 | with suppress(ImportError):
15 | from .zmq import Client, Server
16 |
17 | from . import _version
18 | __version__ = _version.get_versions()['version']
19 |
--------------------------------------------------------------------------------
/partd/_version.py:
--------------------------------------------------------------------------------
1 |
2 | # This file helps to compute a version number in source trees obtained from
3 | # git-archive tarball (such as those provided by githubs download-from-tag
4 | # feature). Distribution tarballs (built by setup.py sdist) and build
5 | # directories (produced by setup.py build) will contain a much shorter file
6 | # that just contains the computed version number.
7 |
8 | # This file is released into the public domain.
9 | # Generated by versioneer-0.29
10 | # https://github.com/python-versioneer/python-versioneer
11 |
12 | """Git implementation of _version.py."""
13 |
14 | import errno
15 | import os
16 | import re
17 | import subprocess
18 | import sys
19 | from typing import Any, Callable, Dict, List, Optional, Tuple
20 | import functools
21 |
22 |
23 | def get_keywords() -> Dict[str, str]:
24 | """Get the keywords needed to look up the version information."""
25 | # these strings will be replaced by git during git-archive.
26 | # setup.py/versioneer.py will grep for the variable names, so they must
27 | # each be defined on a line of their own. _version.py will just call
28 | # get_keywords().
29 | git_refnames = " (HEAD -> main)"
30 | git_full = "e832b655606342dc742ec1c564b07abf1ad58383"
31 | git_date = "2024-07-15 16:21:10 -0500"
32 | keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
33 | return keywords
34 |
35 |
36 | class VersioneerConfig:
37 | """Container for Versioneer configuration parameters."""
38 |
39 | VCS: str
40 | style: str
41 | tag_prefix: str
42 | parentdir_prefix: str
43 | versionfile_source: str
44 | verbose: bool
45 |
46 |
47 | def get_config() -> VersioneerConfig:
48 | """Create, populate and return the VersioneerConfig() object."""
49 | # these strings are filled in when 'setup.py versioneer' creates
50 | # _version.py
51 | cfg = VersioneerConfig()
52 | cfg.VCS = "git"
53 | cfg.style = "pep440"
54 | cfg.tag_prefix = ""
55 | cfg.parentdir_prefix = "partd-"
56 | cfg.versionfile_source = "partd/_version.py"
57 | cfg.verbose = False
58 | return cfg
59 |
60 |
61 | class NotThisMethod(Exception):
62 | """Exception raised if a method is not valid for the current scenario."""
63 |
64 |
65 | LONG_VERSION_PY: Dict[str, str] = {}
66 | HANDLERS: Dict[str, Dict[str, Callable]] = {}
67 |
68 |
69 | def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator
70 | """Create decorator to mark a method as the handler of a VCS."""
71 | def decorate(f: Callable) -> Callable:
72 | """Store f in HANDLERS[vcs][method]."""
73 | if vcs not in HANDLERS:
74 | HANDLERS[vcs] = {}
75 | HANDLERS[vcs][method] = f
76 | return f
77 | return decorate
78 |
79 |
80 | def run_command(
81 | commands: List[str],
82 | args: List[str],
83 | cwd: Optional[str] = None,
84 | verbose: bool = False,
85 | hide_stderr: bool = False,
86 | env: Optional[Dict[str, str]] = None,
87 | ) -> Tuple[Optional[str], Optional[int]]:
88 | """Call the given command(s)."""
89 | assert isinstance(commands, list)
90 | process = None
91 |
92 | popen_kwargs: Dict[str, Any] = {}
93 | if sys.platform == "win32":
94 | # This hides the console window if pythonw.exe is used
95 | startupinfo = subprocess.STARTUPINFO()
96 | startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
97 | popen_kwargs["startupinfo"] = startupinfo
98 |
99 | for command in commands:
100 | try:
101 | dispcmd = str([command] + args)
102 | # remember shell=False, so use git.cmd on windows, not just git
103 | process = subprocess.Popen([command] + args, cwd=cwd, env=env,
104 | stdout=subprocess.PIPE,
105 | stderr=(subprocess.PIPE if hide_stderr
106 | else None), **popen_kwargs)
107 | break
108 | except OSError as e:
109 | if e.errno == errno.ENOENT:
110 | continue
111 | if verbose:
112 | print("unable to run %s" % dispcmd)
113 | print(e)
114 | return None, None
115 | else:
116 | if verbose:
117 | print("unable to find command, tried %s" % (commands,))
118 | return None, None
119 | stdout = process.communicate()[0].strip().decode()
120 | if process.returncode != 0:
121 | if verbose:
122 | print("unable to run %s (error)" % dispcmd)
123 | print("stdout was %s" % stdout)
124 | return None, process.returncode
125 | return stdout, process.returncode
126 |
127 |
128 | def versions_from_parentdir(
129 | parentdir_prefix: str,
130 | root: str,
131 | verbose: bool,
132 | ) -> Dict[str, Any]:
133 | """Try to determine the version from the parent directory name.
134 |
135 | Source tarballs conventionally unpack into a directory that includes both
136 | the project name and a version string. We will also support searching up
137 | two directory levels for an appropriately named parent directory
138 | """
139 | rootdirs = []
140 |
141 | for _ in range(3):
142 | dirname = os.path.basename(root)
143 | if dirname.startswith(parentdir_prefix):
144 | return {"version": dirname[len(parentdir_prefix):],
145 | "full-revisionid": None,
146 | "dirty": False, "error": None, "date": None}
147 | rootdirs.append(root)
148 | root = os.path.dirname(root) # up a level
149 |
150 | if verbose:
151 | print("Tried directories %s but none started with prefix %s" %
152 | (str(rootdirs), parentdir_prefix))
153 | raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
154 |
155 |
156 | @register_vcs_handler("git", "get_keywords")
157 | def git_get_keywords(versionfile_abs: str) -> Dict[str, str]:
158 | """Extract version information from the given file."""
159 | # the code embedded in _version.py can just fetch the value of these
160 | # keywords. When used from setup.py, we don't want to import _version.py,
161 | # so we do it with a regexp instead. This function is not used from
162 | # _version.py.
163 | keywords: Dict[str, str] = {}
164 | try:
165 | with open(versionfile_abs, "r") as fobj:
166 | for line in fobj:
167 | if line.strip().startswith("git_refnames ="):
168 | mo = re.search(r'=\s*"(.*)"', line)
169 | if mo:
170 | keywords["refnames"] = mo.group(1)
171 | if line.strip().startswith("git_full ="):
172 | mo = re.search(r'=\s*"(.*)"', line)
173 | if mo:
174 | keywords["full"] = mo.group(1)
175 | if line.strip().startswith("git_date ="):
176 | mo = re.search(r'=\s*"(.*)"', line)
177 | if mo:
178 | keywords["date"] = mo.group(1)
179 | except OSError:
180 | pass
181 | return keywords
182 |
183 |
184 | @register_vcs_handler("git", "keywords")
185 | def git_versions_from_keywords(
186 | keywords: Dict[str, str],
187 | tag_prefix: str,
188 | verbose: bool,
189 | ) -> Dict[str, Any]:
190 | """Get version information from git keywords."""
191 | if "refnames" not in keywords:
192 | raise NotThisMethod("Short version file found")
193 | date = keywords.get("date")
194 | if date is not None:
195 | # Use only the last line. Previous lines may contain GPG signature
196 | # information.
197 | date = date.splitlines()[-1]
198 |
199 | # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant
200 | # datestamp. However we prefer "%ci" (which expands to an "ISO-8601
201 | # -like" string, which we must then edit to make compliant), because
202 | # it's been around since git-1.5.3, and it's too difficult to
203 | # discover which version we're using, or to work around using an
204 | # older one.
205 | date = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
206 | refnames = keywords["refnames"].strip()
207 | if refnames.startswith("$Format"):
208 | if verbose:
209 | print("keywords are unexpanded, not using")
210 | raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
211 | refs = {r.strip() for r in refnames.strip("()").split(",")}
212 | # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
213 | # just "foo-1.0". If we see a "tag: " prefix, prefer those.
214 | TAG = "tag: "
215 | tags = {r[len(TAG):] for r in refs if r.startswith(TAG)}
216 | if not tags:
217 | # Either we're using git < 1.8.3, or there really are no tags. We use
218 | # a heuristic: assume all version tags have a digit. The old git %d
219 | # expansion behaves like git log --decorate=short and strips out the
220 | # refs/heads/ and refs/tags/ prefixes that would let us distinguish
221 | # between branches and tags. By ignoring refnames without digits, we
222 | # filter out many common branch names like "release" and
223 | # "stabilization", as well as "HEAD" and "master".
224 | tags = {r for r in refs if re.search(r'\d', r)}
225 | if verbose:
226 | print("discarding '%s', no digits" % ",".join(refs - tags))
227 | if verbose:
228 | print("likely tags: %s" % ",".join(sorted(tags)))
229 | for ref in sorted(tags):
230 | # sorting will prefer e.g. "2.0" over "2.0rc1"
231 | if ref.startswith(tag_prefix):
232 | r = ref[len(tag_prefix):]
233 | # Filter out refs that exactly match prefix or that don't start
234 | # with a number once the prefix is stripped (mostly a concern
235 | # when prefix is '')
236 | if not re.match(r'\d', r):
237 | continue
238 | if verbose:
239 | print("picking %s" % r)
240 | return {"version": r,
241 | "full-revisionid": keywords["full"].strip(),
242 | "dirty": False, "error": None,
243 | "date": date}
244 | # no suitable tags, so version is "0+unknown", but full hex is still there
245 | if verbose:
246 | print("no suitable tags, using unknown + full revision id")
247 | return {"version": "0+unknown",
248 | "full-revisionid": keywords["full"].strip(),
249 | "dirty": False, "error": "no suitable tags", "date": None}
250 |
251 |
252 | @register_vcs_handler("git", "pieces_from_vcs")
253 | def git_pieces_from_vcs(
254 | tag_prefix: str,
255 | root: str,
256 | verbose: bool,
257 | runner: Callable = run_command
258 | ) -> Dict[str, Any]:
259 | """Get version from 'git describe' in the root of the source tree.
260 |
261 | This only gets called if the git-archive 'subst' keywords were *not*
262 | expanded, and _version.py hasn't already been rewritten with a short
263 | version string, meaning we're inside a checked out source tree.
264 | """
265 | GITS = ["git"]
266 | if sys.platform == "win32":
267 | GITS = ["git.cmd", "git.exe"]
268 |
269 | # GIT_DIR can interfere with correct operation of Versioneer.
270 | # It may be intended to be passed to the Versioneer-versioned project,
271 | # but that should not change where we get our version from.
272 | env = os.environ.copy()
273 | env.pop("GIT_DIR", None)
274 | runner = functools.partial(runner, env=env)
275 |
276 | _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root,
277 | hide_stderr=not verbose)
278 | if rc != 0:
279 | if verbose:
280 | print("Directory %s not under git control" % root)
281 | raise NotThisMethod("'git rev-parse --git-dir' returned error")
282 |
283 | # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
284 | # if there isn't one, this yields HEX[-dirty] (no NUM)
285 | describe_out, rc = runner(GITS, [
286 | "describe", "--tags", "--dirty", "--always", "--long",
287 | "--match", f"{tag_prefix}[[:digit:]]*"
288 | ], cwd=root)
289 | # --long was added in git-1.5.5
290 | if describe_out is None:
291 | raise NotThisMethod("'git describe' failed")
292 | describe_out = describe_out.strip()
293 | full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root)
294 | if full_out is None:
295 | raise NotThisMethod("'git rev-parse' failed")
296 | full_out = full_out.strip()
297 |
298 | pieces: Dict[str, Any] = {}
299 | pieces["long"] = full_out
300 | pieces["short"] = full_out[:7] # maybe improved later
301 | pieces["error"] = None
302 |
303 | branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"],
304 | cwd=root)
305 | # --abbrev-ref was added in git-1.6.3
306 | if rc != 0 or branch_name is None:
307 | raise NotThisMethod("'git rev-parse --abbrev-ref' returned error")
308 | branch_name = branch_name.strip()
309 |
310 | if branch_name == "HEAD":
311 | # If we aren't exactly on a branch, pick a branch which represents
312 | # the current commit. If all else fails, we are on a branchless
313 | # commit.
314 | branches, rc = runner(GITS, ["branch", "--contains"], cwd=root)
315 | # --contains was added in git-1.5.4
316 | if rc != 0 or branches is None:
317 | raise NotThisMethod("'git branch --contains' returned error")
318 | branches = branches.split("\n")
319 |
320 | # Remove the first line if we're running detached
321 | if "(" in branches[0]:
322 | branches.pop(0)
323 |
324 | # Strip off the leading "* " from the list of branches.
325 | branches = [branch[2:] for branch in branches]
326 | if "master" in branches:
327 | branch_name = "master"
328 | elif not branches:
329 | branch_name = None
330 | else:
331 | # Pick the first branch that is returned. Good or bad.
332 | branch_name = branches[0]
333 |
334 | pieces["branch"] = branch_name
335 |
336 | # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
337 | # TAG might have hyphens.
338 | git_describe = describe_out
339 |
340 | # look for -dirty suffix
341 | dirty = git_describe.endswith("-dirty")
342 | pieces["dirty"] = dirty
343 | if dirty:
344 | git_describe = git_describe[:git_describe.rindex("-dirty")]
345 |
346 | # now we have TAG-NUM-gHEX or HEX
347 |
348 | if "-" in git_describe:
349 | # TAG-NUM-gHEX
350 | mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe)
351 | if not mo:
352 | # unparsable. Maybe git-describe is misbehaving?
353 | pieces["error"] = ("unable to parse git-describe output: '%s'"
354 | % describe_out)
355 | return pieces
356 |
357 | # tag
358 | full_tag = mo.group(1)
359 | if not full_tag.startswith(tag_prefix):
360 | if verbose:
361 | fmt = "tag '%s' doesn't start with prefix '%s'"
362 | print(fmt % (full_tag, tag_prefix))
363 | pieces["error"] = ("tag '%s' doesn't start with prefix '%s'"
364 | % (full_tag, tag_prefix))
365 | return pieces
366 | pieces["closest-tag"] = full_tag[len(tag_prefix):]
367 |
368 | # distance: number of commits since tag
369 | pieces["distance"] = int(mo.group(2))
370 |
371 | # commit: short hex revision ID
372 | pieces["short"] = mo.group(3)
373 |
374 | else:
375 | # HEX: no tags
376 | pieces["closest-tag"] = None
377 | out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root)
378 | pieces["distance"] = len(out.split()) # total number of commits
379 |
380 | # commit date: see ISO-8601 comment in git_versions_from_keywords()
381 | date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip()
382 | # Use only the last line. Previous lines may contain GPG signature
383 | # information.
384 | date = date.splitlines()[-1]
385 | pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
386 |
387 | return pieces
388 |
389 |
390 | def plus_or_dot(pieces: Dict[str, Any]) -> str:
391 | """Return a + if we don't already have one, else return a ."""
392 | if "+" in pieces.get("closest-tag", ""):
393 | return "."
394 | return "+"
395 |
396 |
397 | def render_pep440(pieces: Dict[str, Any]) -> str:
398 | """Build up version string, with post-release "local version identifier".
399 |
400 | Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
401 | get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty
402 |
403 | Exceptions:
404 | 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty]
405 | """
406 | if pieces["closest-tag"]:
407 | rendered = pieces["closest-tag"]
408 | if pieces["distance"] or pieces["dirty"]:
409 | rendered += plus_or_dot(pieces)
410 | rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
411 | if pieces["dirty"]:
412 | rendered += ".dirty"
413 | else:
414 | # exception #1
415 | rendered = "0+untagged.%d.g%s" % (pieces["distance"],
416 | pieces["short"])
417 | if pieces["dirty"]:
418 | rendered += ".dirty"
419 | return rendered
420 |
421 |
422 | def render_pep440_branch(pieces: Dict[str, Any]) -> str:
423 | """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] .
424 |
425 | The ".dev0" means not master branch. Note that .dev0 sorts backwards
426 | (a feature branch will appear "older" than the master branch).
427 |
428 | Exceptions:
429 | 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty]
430 | """
431 | if pieces["closest-tag"]:
432 | rendered = pieces["closest-tag"]
433 | if pieces["distance"] or pieces["dirty"]:
434 | if pieces["branch"] != "master":
435 | rendered += ".dev0"
436 | rendered += plus_or_dot(pieces)
437 | rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
438 | if pieces["dirty"]:
439 | rendered += ".dirty"
440 | else:
441 | # exception #1
442 | rendered = "0"
443 | if pieces["branch"] != "master":
444 | rendered += ".dev0"
445 | rendered += "+untagged.%d.g%s" % (pieces["distance"],
446 | pieces["short"])
447 | if pieces["dirty"]:
448 | rendered += ".dirty"
449 | return rendered
450 |
451 |
452 | def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]:
453 | """Split pep440 version string at the post-release segment.
454 |
455 | Returns the release segments before the post-release and the
456 | post-release version number (or -1 if no post-release segment is present).
457 | """
458 | vc = str.split(ver, ".post")
459 | return vc[0], int(vc[1] or 0) if len(vc) == 2 else None
460 |
461 |
462 | def render_pep440_pre(pieces: Dict[str, Any]) -> str:
463 | """TAG[.postN.devDISTANCE] -- No -dirty.
464 |
465 | Exceptions:
466 | 1: no tags. 0.post0.devDISTANCE
467 | """
468 | if pieces["closest-tag"]:
469 | if pieces["distance"]:
470 | # update the post release segment
471 | tag_version, post_version = pep440_split_post(pieces["closest-tag"])
472 | rendered = tag_version
473 | if post_version is not None:
474 | rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"])
475 | else:
476 | rendered += ".post0.dev%d" % (pieces["distance"])
477 | else:
478 | # no commits, use the tag as the version
479 | rendered = pieces["closest-tag"]
480 | else:
481 | # exception #1
482 | rendered = "0.post0.dev%d" % pieces["distance"]
483 | return rendered
484 |
485 |
486 | def render_pep440_post(pieces: Dict[str, Any]) -> str:
487 | """TAG[.postDISTANCE[.dev0]+gHEX] .
488 |
489 | The ".dev0" means dirty. Note that .dev0 sorts backwards
490 | (a dirty tree will appear "older" than the corresponding clean one),
491 | but you shouldn't be releasing software with -dirty anyways.
492 |
493 | Exceptions:
494 | 1: no tags. 0.postDISTANCE[.dev0]
495 | """
496 | if pieces["closest-tag"]:
497 | rendered = pieces["closest-tag"]
498 | if pieces["distance"] or pieces["dirty"]:
499 | rendered += ".post%d" % pieces["distance"]
500 | if pieces["dirty"]:
501 | rendered += ".dev0"
502 | rendered += plus_or_dot(pieces)
503 | rendered += "g%s" % pieces["short"]
504 | else:
505 | # exception #1
506 | rendered = "0.post%d" % pieces["distance"]
507 | if pieces["dirty"]:
508 | rendered += ".dev0"
509 | rendered += "+g%s" % pieces["short"]
510 | return rendered
511 |
512 |
513 | def render_pep440_post_branch(pieces: Dict[str, Any]) -> str:
514 | """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] .
515 |
516 | The ".dev0" means not master branch.
517 |
518 | Exceptions:
519 | 1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty]
520 | """
521 | if pieces["closest-tag"]:
522 | rendered = pieces["closest-tag"]
523 | if pieces["distance"] or pieces["dirty"]:
524 | rendered += ".post%d" % pieces["distance"]
525 | if pieces["branch"] != "master":
526 | rendered += ".dev0"
527 | rendered += plus_or_dot(pieces)
528 | rendered += "g%s" % pieces["short"]
529 | if pieces["dirty"]:
530 | rendered += ".dirty"
531 | else:
532 | # exception #1
533 | rendered = "0.post%d" % pieces["distance"]
534 | if pieces["branch"] != "master":
535 | rendered += ".dev0"
536 | rendered += "+g%s" % pieces["short"]
537 | if pieces["dirty"]:
538 | rendered += ".dirty"
539 | return rendered
540 |
541 |
542 | def render_pep440_old(pieces: Dict[str, Any]) -> str:
543 | """TAG[.postDISTANCE[.dev0]] .
544 |
545 | The ".dev0" means dirty.
546 |
547 | Exceptions:
548 | 1: no tags. 0.postDISTANCE[.dev0]
549 | """
550 | if pieces["closest-tag"]:
551 | rendered = pieces["closest-tag"]
552 | if pieces["distance"] or pieces["dirty"]:
553 | rendered += ".post%d" % pieces["distance"]
554 | if pieces["dirty"]:
555 | rendered += ".dev0"
556 | else:
557 | # exception #1
558 | rendered = "0.post%d" % pieces["distance"]
559 | if pieces["dirty"]:
560 | rendered += ".dev0"
561 | return rendered
562 |
563 |
564 | def render_git_describe(pieces: Dict[str, Any]) -> str:
565 | """TAG[-DISTANCE-gHEX][-dirty].
566 |
567 | Like 'git describe --tags --dirty --always'.
568 |
569 | Exceptions:
570 | 1: no tags. HEX[-dirty] (note: no 'g' prefix)
571 | """
572 | if pieces["closest-tag"]:
573 | rendered = pieces["closest-tag"]
574 | if pieces["distance"]:
575 | rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
576 | else:
577 | # exception #1
578 | rendered = pieces["short"]
579 | if pieces["dirty"]:
580 | rendered += "-dirty"
581 | return rendered
582 |
583 |
584 | def render_git_describe_long(pieces: Dict[str, Any]) -> str:
585 | """TAG-DISTANCE-gHEX[-dirty].
586 |
587 | Like 'git describe --tags --dirty --always -long'.
588 | The distance/hash is unconditional.
589 |
590 | Exceptions:
591 | 1: no tags. HEX[-dirty] (note: no 'g' prefix)
592 | """
593 | if pieces["closest-tag"]:
594 | rendered = pieces["closest-tag"]
595 | rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
596 | else:
597 | # exception #1
598 | rendered = pieces["short"]
599 | if pieces["dirty"]:
600 | rendered += "-dirty"
601 | return rendered
602 |
603 |
604 | def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]:
605 | """Render the given version pieces into the requested style."""
606 | if pieces["error"]:
607 | return {"version": "unknown",
608 | "full-revisionid": pieces.get("long"),
609 | "dirty": None,
610 | "error": pieces["error"],
611 | "date": None}
612 |
613 | if not style or style == "default":
614 | style = "pep440" # the default
615 |
616 | if style == "pep440":
617 | rendered = render_pep440(pieces)
618 | elif style == "pep440-branch":
619 | rendered = render_pep440_branch(pieces)
620 | elif style == "pep440-pre":
621 | rendered = render_pep440_pre(pieces)
622 | elif style == "pep440-post":
623 | rendered = render_pep440_post(pieces)
624 | elif style == "pep440-post-branch":
625 | rendered = render_pep440_post_branch(pieces)
626 | elif style == "pep440-old":
627 | rendered = render_pep440_old(pieces)
628 | elif style == "git-describe":
629 | rendered = render_git_describe(pieces)
630 | elif style == "git-describe-long":
631 | rendered = render_git_describe_long(pieces)
632 | else:
633 | raise ValueError("unknown style '%s'" % style)
634 |
635 | return {"version": rendered, "full-revisionid": pieces["long"],
636 | "dirty": pieces["dirty"], "error": None,
637 | "date": pieces.get("date")}
638 |
639 |
640 | def get_versions() -> Dict[str, Any]:
641 | """Get version information or return default if unable to do so."""
642 | # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have
643 | # __file__, we can work backwards from there to the root. Some
644 | # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which
645 | # case we can only use expanded keywords.
646 |
647 | cfg = get_config()
648 | verbose = cfg.verbose
649 |
650 | try:
651 | return git_versions_from_keywords(get_keywords(), cfg.tag_prefix,
652 | verbose)
653 | except NotThisMethod:
654 | pass
655 |
656 | try:
657 | root = os.path.realpath(__file__)
658 | # versionfile_source is the relative path from the top of the source
659 | # tree (where the .git directory might live) to this file. Invert
660 | # this to find the root from __file__.
661 | for _ in cfg.versionfile_source.split('/'):
662 | root = os.path.dirname(root)
663 | except NameError:
664 | return {"version": "0+unknown", "full-revisionid": None,
665 | "dirty": None,
666 | "error": "unable to find root of source tree",
667 | "date": None}
668 |
669 | try:
670 | pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose)
671 | return render(pieces, cfg.style)
672 | except NotThisMethod:
673 | pass
674 |
675 | try:
676 | if cfg.parentdir_prefix:
677 | return versions_from_parentdir(cfg.parentdir_prefix, root, verbose)
678 | except NotThisMethod:
679 | pass
680 |
681 | return {"version": "0+unknown", "full-revisionid": None,
682 | "dirty": None,
683 | "error": "unable to compute version", "date": None}
684 |
--------------------------------------------------------------------------------
/partd/buffer.py:
--------------------------------------------------------------------------------
1 | from .core import Interface
2 | from threading import Lock
3 | from toolz import merge_with, topk, accumulate, pluck
4 | from operator import add
5 | from bisect import bisect
6 | from collections import defaultdict
7 | from queue import Queue, Empty
8 |
9 |
10 | def zero():
11 | return 0
12 |
13 | class Buffer(Interface):
14 | def __init__(self, fast, slow, available_memory=1e9):
15 | self.lock = Lock()
16 | self.fast = fast
17 | self.slow = slow
18 | self.available_memory = available_memory
19 | self.lengths = defaultdict(zero)
20 | self.memory_usage = 0
21 | Interface.__init__(self)
22 |
23 | def __getstate__(self):
24 | return {'fast': self.fast,
25 | 'slow': self.slow,
26 | 'memory_usage': self.memory_usage,
27 | 'lengths': self.lengths,
28 | 'available_memory': self.available_memory}
29 |
30 | def __setstate__(self, state):
31 | Interface.__setstate__(self, state)
32 | self.lock = Lock()
33 | self.__dict__.update(state)
34 |
35 | def append(self, data, lock=True, **kwargs):
36 | if lock: self.lock.acquire()
37 | try:
38 | for k, v in data.items():
39 | self.lengths[k] += len(v)
40 | self.memory_usage += len(v)
41 | self.fast.append(data, lock=False, **kwargs)
42 |
43 | while self.memory_usage > self.available_memory:
44 | keys = keys_to_flush(self.lengths, 0.1, maxcount=20)
45 | self.flush(keys)
46 |
47 | finally:
48 | if lock: self.lock.release()
49 |
50 | def _get(self, keys, lock=True, **kwargs):
51 | if lock: self.lock.acquire()
52 | try:
53 | result = list(map(add, self.fast.get(keys, lock=False),
54 | self.slow.get(keys, lock=False)))
55 | finally:
56 | if lock: self.lock.release()
57 | return result
58 |
59 | def _iset(self, key, value, lock=True):
60 | """ Idempotent set """
61 | if lock: self.lock.acquire()
62 | try:
63 | self.fast.iset(key, value, lock=False)
64 | finally:
65 | if lock: self.lock.release()
66 |
67 | def _delete(self, keys, lock=True):
68 | if lock: self.lock.acquire()
69 | try:
70 | self.fast.delete(keys, lock=False)
71 | self.slow.delete(keys, lock=False)
72 | finally:
73 | if lock: self.lock.release()
74 |
75 | def drop(self):
76 | self._iset_seen.clear()
77 | self.fast.drop()
78 | self.slow.drop()
79 |
80 | def __exit__(self, *args):
81 | self.drop()
82 |
83 | def flush(self, keys=None, block=None):
84 | """ Flush keys to disk
85 |
86 | Parameters
87 | ----------
88 |
89 | keys: list or None
90 | list of keys to flush
91 | block: bool (defaults to None)
92 | Whether or not to block until all writing is complete
93 |
94 | If no keys are given then flush all keys
95 | """
96 | if keys is None:
97 | keys = list(self.lengths)
98 |
99 | self.slow.append(dict(zip(keys, self.fast.get(keys))))
100 | self.fast.delete(keys)
101 |
102 | for key in keys:
103 | self.memory_usage -= self.lengths[key]
104 | del self.lengths[key]
105 |
106 |
107 | def keys_to_flush(lengths, fraction=0.1, maxcount=100000):
108 | """ Which keys to remove
109 |
110 | >>> lengths = {'a': 20, 'b': 10, 'c': 15, 'd': 15,
111 | ... 'e': 10, 'f': 25, 'g': 5}
112 | >>> keys_to_flush(lengths, 0.5)
113 | ['f', 'a']
114 | """
115 | top = topk(max(len(lengths) // 2, 1),
116 | lengths.items(),
117 | key=1)
118 | total = sum(lengths.values())
119 | cutoff = min(maxcount, max(1,
120 | bisect(list(accumulate(add, pluck(1, top))),
121 | total * fraction)))
122 | result = [k for k, v in top[:cutoff]]
123 | assert result
124 | return result
125 |
--------------------------------------------------------------------------------
/partd/compressed.py:
--------------------------------------------------------------------------------
1 | from contextlib import suppress
2 | from functools import partial
3 |
4 | from .encode import Encode
5 |
6 | __all__ = []
7 |
8 |
9 | def bytes_concat(L):
10 | return b''.join(L)
11 |
12 |
13 | with suppress(ImportError, AttributeError):
14 |     # Skipped if snappy is not installed, or if an unrelated package named
15 |     # "snappy" (e.g. SnapPy, https://pypi.org/project/snappy/) lacks compress / decompress.
16 | import snappy
17 | Snappy = partial(Encode,
18 | snappy.compress,
19 | snappy.decompress,
20 | bytes_concat)
21 | __all__.append('Snappy')
22 |
23 |
24 | with suppress(ImportError):
25 | import zlib
26 | ZLib = partial(Encode,
27 | zlib.compress,
28 | zlib.decompress,
29 | bytes_concat)
30 | __all__.append('ZLib')
31 |
32 |
33 | with suppress(ImportError):
34 | import bz2
35 | BZ2 = partial(Encode,
36 | bz2.compress,
37 | bz2.decompress,
38 | bytes_concat)
39 | __all__.append('BZ2')
40 |
41 |
42 | with suppress(ImportError):
43 | import blosc
44 | Blosc = partial(Encode,
45 | blosc.compress,
46 | blosc.decompress,
47 | bytes_concat)
48 | __all__.append('Blosc')
49 |
--------------------------------------------------------------------------------
/partd/core.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import locket
4 | import string
5 | from toolz import memoize
6 | from contextlib import contextmanager
7 | from .utils import nested_get, flatten
8 |
9 |
10 |
11 | # http://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename-in-python
12 | valid_chars = "-_.() " + string.ascii_letters + string.digits + os.path.sep
13 |
14 |
15 | def escape_filename(fn):
16 | """ Escape text so that it is a valid filename
17 |
18 | >>> escape_filename('Foo!bar?')
19 | 'Foobar'
20 |
21 | """
22 | return ''.join(filter(valid_chars.__contains__, fn))
23 |
24 |
25 | def filename(path, key):
26 | return os.path.join(path, escape_filename(token(key)))
27 |
28 |
29 | def token(key):
30 | """
31 |
32 | >>> token('hello')
33 | 'hello'
34 | >>> token(('hello', 'world')) # doctest: +SKIP
35 | 'hello/world'
36 | """
37 | if isinstance(key, str):
38 | return key
39 | elif isinstance(key, tuple):
40 | return os.path.join(*map(token, key))
41 | else:
42 | return str(key)
43 |
44 |
45 | class Interface:
46 | def __init__(self):
47 | self._iset_seen = set()
48 |
49 | def __setstate__(self, state):
50 | self.__dict__.update(state)
51 | self._iset_seen = set()
52 |
53 | def iset(self, key, value, **kwargs):
54 | if key in self._iset_seen:
55 | return
56 | else:
57 | self._iset(key, value, **kwargs)
58 | self._iset_seen.add(key)
59 |
60 | def __enter__(self):
61 | return self
62 |
63 | def __exit__(self, type, value, traceback):
64 | self.drop()
65 |
66 | def iget(self, key):
67 | return self._get([key], lock=False)[0]
68 |
69 | def get(self, keys, **kwargs):
70 | if not isinstance(keys, list):
71 | return self.get([keys], **kwargs)[0]
72 | elif any(isinstance(key, list) for key in keys): # nested case
73 | flatkeys = list(flatten(keys))
74 | result = self.get(flatkeys, **kwargs)
75 | return nested_get(keys, dict(zip(flatkeys, result)))
76 | else:
77 | return self._get(keys, **kwargs)
78 |
79 | def delete(self, keys, **kwargs):
80 | if not isinstance(keys, list):
81 | return self._delete([keys], **kwargs)
82 | else:
83 | return self._delete(keys, **kwargs)
84 |
85 |     def pop(self, keys, **kwargs):  # relies on self.partd; only valid on wrapping partds such as Encode
86 | with self.partd.lock:
87 | result = self.partd.get(keys, lock=False)
88 | self.partd.delete(keys, lock=False)
89 | return result
90 |
91 |
--------------------------------------------------------------------------------
/partd/dict.py:
--------------------------------------------------------------------------------
1 | from .core import Interface
2 | from threading import Lock
3 |
4 |
5 | class Dict(Interface):
6 | def __init__(self):
7 | self.lock = Lock()
8 | self.data = dict()
9 | Interface.__init__(self)
10 |
11 | def __getstate__(self):
12 | return {'data': self.data}
13 |
14 | def __setstate__(self, state):
15 | Interface.__setstate__(self, state)
16 | Dict.__init__(self)
17 | self.data = state['data']
18 |
19 | def append(self, data, lock=True, **kwargs):
20 | if lock: self.lock.acquire()
21 | try:
22 | for k, v in data.items():
23 | if k not in self.data:
24 | self.data[k] = []
25 | self.data[k].append(v)
26 | finally:
27 | if lock: self.lock.release()
28 |
29 | def _get(self, keys, lock=True, **kwargs):
30 | assert isinstance(keys, (list, tuple, set))
31 | if lock:
32 | self.lock.acquire()
33 | try:
34 | result = [b''.join(self.data.get(key, [])) for key in keys]
35 | finally:
36 | if lock:
37 | self.lock.release()
38 | return result
39 |
40 | def _iset(self, key, value, lock=True):
41 | """ Idempotent set """
42 | if lock:
43 | self.lock.acquire()
44 | try:
45 | self.data[key] = [value]
46 | finally:
47 | if lock:
48 | self.lock.release()
49 |
50 | def _delete(self, keys, lock=True):
51 | if lock:
52 | self.lock.acquire()
53 | try:
54 | for key in keys:
55 | if key in self.data:
56 | del self.data[key]
57 | finally:
58 | if lock:
59 | self.lock.release()
60 |
61 | def drop(self):
62 | self._iset_seen.clear()
63 | self.data.clear()
64 |
65 | def __exit__(self, *args):
66 | self.drop()
67 |
--------------------------------------------------------------------------------
/partd/encode.py:
--------------------------------------------------------------------------------
1 | from .core import Interface
2 | from .file import File
3 | from toolz import valmap
4 | from .utils import frame, framesplit
5 |
6 |
7 | class Encode(Interface):
8 | def __init__(self, encode, decode, join, partd=None):
9 | if not partd or isinstance(partd, str):
10 | partd = File(partd)
11 | self.partd = partd
12 | self.encode = encode
13 | self.decode = decode
14 | self.join = join
15 | Interface.__init__(self)
16 |
17 | def __getstate__(self):
18 | return self.__dict__
19 |
20 | __setstate__ = Interface.__setstate__
21 |
22 | def append(self, data, **kwargs):
23 | data = valmap(self.encode, data)
24 | data = valmap(frame, data)
25 | self.partd.append(data, **kwargs)
26 |
27 | def _get(self, keys, **kwargs):
28 | raw = self.partd._get(keys, **kwargs)
29 | return [self.join([self.decode(frame) for frame in framesplit(chunk)])
30 | for chunk in raw]
31 |
32 | def delete(self, keys, **kwargs):
33 | return self.partd.delete(keys, **kwargs)
34 |
35 | def _iset(self, key, value, **kwargs):
36 | return self.partd.iset(key, frame(self.encode(value)), **kwargs)
37 |
38 | def drop(self):
39 | return self.partd.drop()
40 |
41 | @property
42 | def lock(self):
43 | return self.partd.lock
44 |
45 | def __exit__(self, *args):
46 | self.drop()
47 | self.partd.__exit__(*args)
48 |
--------------------------------------------------------------------------------
/partd/file.py:
--------------------------------------------------------------------------------
1 | import atexit
2 | from contextlib import suppress
3 | import os
4 | import shutil
5 | import string
6 | import tempfile
7 |
8 | from .core import Interface
9 | import locket
10 |
11 |
12 | class File(Interface):
13 | def __init__(self, path=None, dir=None):
14 | if not path:
15 | path = tempfile.mkdtemp(suffix='.partd', dir=dir)
16 | cleanup_files.append(path)
17 | self._explicitly_given_path = False
18 | else:
19 | self._explicitly_given_path = True
20 | self.path = path
21 | if not os.path.exists(path):
22 | with suppress(OSError):
23 | os.makedirs(path)
24 | self.lock = locket.lock_file(self.filename('.lock'))
25 | Interface.__init__(self)
26 |
27 | def __getstate__(self):
28 | return {'path': self.path}
29 |
30 | def __setstate__(self, state):
31 | Interface.__setstate__(self, state)
32 | File.__init__(self, state['path'])
33 |
34 | def append(self, data, lock=True, fsync=False, **kwargs):
35 | if lock: self.lock.acquire()
36 | try:
37 | for k, v in data.items():
38 | fn = self.filename(k)
39 | if not os.path.exists(os.path.dirname(fn)):
40 | os.makedirs(os.path.dirname(fn))
41 | with open(fn, 'ab') as f:
42 | f.write(v)
43 | if fsync:
44 | os.fsync(f)
45 | finally:
46 | if lock: self.lock.release()
47 |
48 | def _get(self, keys, lock=True, **kwargs):
49 | assert isinstance(keys, (list, tuple, set))
50 | if lock:
51 | self.lock.acquire()
52 | try:
53 | result = []
54 | for key in keys:
55 | try:
56 | with open(self.filename(key), 'rb') as f:
57 | result.append(f.read())
58 | except OSError:
59 | result.append(b'')
60 | finally:
61 | if lock:
62 | self.lock.release()
63 | return result
64 |
65 | def _iset(self, key, value, lock=True):
66 | """ Idempotent set """
67 | fn = self.filename(key)
68 | if not os.path.exists(os.path.dirname(fn)):
69 | os.makedirs(os.path.dirname(fn))
70 | if lock:
71 | self.lock.acquire()
72 | try:
73 | with open(self.filename(key), 'wb') as f:
74 | f.write(value)
75 | finally:
76 | if lock:
77 | self.lock.release()
78 |
79 | def _delete(self, keys, lock=True):
80 | if lock:
81 | self.lock.acquire()
82 | try:
83 | for key in keys:
84 | path = filename(self.path, key)
85 | if os.path.exists(path):
86 | os.remove(path)
87 | finally:
88 | if lock:
89 | self.lock.release()
90 |
91 | def drop(self):
92 | if os.path.exists(self.path):
93 | shutil.rmtree(self.path)
94 | self._iset_seen.clear()
95 | os.mkdir(self.path)
96 |
97 | def filename(self, key):
98 | return filename(self.path, key)
99 |
100 | def __exit__(self, *args):
101 | self.drop()
102 | os.rmdir(self.path)
103 |
104 | def __del__(self):
105 | if not self._explicitly_given_path:
106 | self.drop()
107 | os.rmdir(self.path)
108 |
109 |
110 | def filename(path, key):
111 | return os.path.join(path, escape_filename(token(key)))
112 |
113 |
114 | # http://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename-in-python
115 | valid_chars = "-_.() " + string.ascii_letters + string.digits + os.path.sep
116 |
117 |
118 | def escape_filename(fn):
119 | """ Escape text so that it is a valid filename
120 |
121 | >>> escape_filename('Foo!bar?')
122 | 'Foobar'
123 |
124 | """
125 | return ''.join(filter(valid_chars.__contains__, fn))
126 |
127 |
128 |
129 | def token(key):
130 | """
131 |
132 | >>> token('hello')
133 | 'hello'
134 | >>> token(('hello', 'world')) # doctest: +SKIP
135 | 'hello/world'
136 | """
137 | if isinstance(key, str):
138 | return key
139 | elif isinstance(key, tuple):
140 | return os.path.join(*map(token, key))
141 | else:
142 | return str(key)
143 |
144 |
145 | cleanup_files = list()
146 |
147 | @atexit.register
148 | def cleanup():
149 | for fn in cleanup_files:
150 | if os.path.exists(fn):
151 | shutil.rmtree(fn)
152 |
--------------------------------------------------------------------------------
/partd/numpy.py:
--------------------------------------------------------------------------------
1 | """ Store arrays
2 |
3 | We put arrays on disk as raw bytes, extending along the first dimension.
4 | Alongside each array x we also store the value x.dtype, which holds the
5 | string description of the array's dtype.
6 | """
7 | from contextlib import suppress
8 | import pickle
9 |
10 | import numpy as np
11 | from toolz import valmap, identity, partial
12 | from .core import Interface
13 | from .file import File
14 | from .utils import frame, framesplit, suffix
15 |
16 |
17 | def serialize_dtype(dt):
18 | """ Serialize dtype to bytes
19 |
20 |     >>> serialize_dtype(np.dtype('i4'))
21 |     b'<i4'
22 |     >>> serialize_dtype(np.dtype('M8[us]'))
23 |     b'<M8[us]'
24 |     """
25 |     return dt.str.encode()
26 |
27 |
28 | def parse_dtype(s):
29 |     """ Parse dtype from bytes
30 |
31 |     >>> parse_dtype(b'i4')
32 |     dtype('int32')
33 |
34 |     >>> parse_dtype(b"[('a', 'i4')]")
35 |     dtype([('a', '<i4')])
36 |     """
108 |             if msgpack.version >= (0, 5, 2):
109 |                 unpack_kwargs = {'raw': False}
110 | else:
111 | unpack_kwargs = {'encoding': 'utf-8'}
112 |
113 | blocks = [msgpack.unpackb(f, **unpack_kwargs)
114 | for f in framesplit(bytes)]
115 | except Exception:
116 | blocks = [pickle.loads(f) for f in framesplit(bytes)]
117 |
118 | result = np.empty(sum(map(len, blocks)), dtype='O')
119 | i = 0
120 | for block in blocks:
121 | result[i:i + len(block)] = block
122 | i += len(block)
123 | return result
124 | else:
125 | result = np.frombuffer(bytes, dtype)
126 | if copy:
127 | result = result.copy()
128 | return result
129 |
130 |
131 | compress_text = identity
132 | decompress_text = identity
133 | compress_bytes = lambda bytes, itemsize: bytes
134 | decompress_bytes = identity
135 |
136 | with suppress(ImportError):
137 | import blosc
138 | blosc.set_nthreads(1)
139 |
140 | compress_bytes = blosc.compress
141 | decompress_bytes = blosc.decompress
142 |
143 | compress_text = partial(blosc.compress, typesize=1)
144 | decompress_text = blosc.decompress
145 |
146 | with suppress(ImportError):
147 | from snappy import compress as compress_text
148 | from snappy import decompress as decompress_text
149 |
150 |
151 | def compress(bytes, dtype):
152 | if dtype == 'O':
153 | return compress_text(bytes)
154 | else:
155 | return compress_bytes(bytes, dtype.itemsize)
156 |
157 |
158 | def decompress(bytes, dtype):
159 | if dtype == 'O':
160 | return decompress_text(bytes)
161 | else:
162 | return decompress_bytes(bytes)
163 |
--------------------------------------------------------------------------------
/partd/pandas.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 | import pickle
3 |
4 | import pandas as pd
5 | from packaging.version import Version
6 |
7 | PANDAS_GE_210 = Version(pd.__version__).release >= (2, 1, 0)
8 | PANDAS_GE_300 = Version(pd.__version__).major >= 3
9 |
10 | if PANDAS_GE_300:
11 | from pandas.api.internals import create_dataframe_from_blocks
12 | create_block_manager_from_blocks = None
13 | make_block = None
14 | else:
15 | create_dataframe_from_blocks = None
16 | try:
17 | from pandas.core.internals.managers import create_block_manager_from_blocks
18 | except ImportError:
19 | from pandas.core.internals import create_block_manager_from_blocks
20 |
21 | from pandas.core.internals import make_block
22 |
23 | from . import numpy as pnp
24 | from .core import Interface
25 | from .encode import Encode
26 | from .utils import extend, framesplit, frame
27 | from pandas.api.types import is_extension_array_dtype
28 | from pandas.api.extensions import ExtensionArray
29 |
30 | def is_extension_array(x):
31 | return isinstance(x, ExtensionArray)
32 |
33 |
34 | dumps = partial(pickle.dumps, protocol=pickle.HIGHEST_PROTOCOL)
35 |
36 |
37 |
38 | class PandasColumns(Interface):
39 | def __init__(self, partd=None):
40 | self.partd = pnp.Numpy(partd)
41 | Interface.__init__(self)
42 |
43 | def append(self, data, **kwargs):
44 | for k, df in data.items():
45 | self.iset(extend(k, '.columns'), dumps(list(df.columns)))
46 | self.iset(extend(k, '.index-name'), dumps(df.index.name))
47 |
48 | # TODO: don't use values, it does some work. Look at _blocks instead
49 | # pframe/cframe do this well
50 | arrays = {extend(k, col): df[col].values
51 | for k, df in data.items()
52 | for col in df.columns}
53 | arrays.update({extend(k, '.index'): df.index.values
54 | for k, df in data.items()})
55 | # TODO: handle categoricals
56 | self.partd.append(arrays, **kwargs)
57 |
58 | def _get(self, keys, columns=None, **kwargs):
59 | if columns is None:
60 | columns = self.partd.partd.get([extend(k, '.columns') for k in keys],
61 | **kwargs)
62 | columns = list(map(pickle.loads, columns))
63 | else:
64 | columns = [columns] * len(keys)
65 | index_names = self.partd.partd.get([extend(k, '.index-name')
66 | for k in keys], **kwargs)
67 | index_names = map(pickle.loads, index_names)
68 |
69 | keys = [[extend(k, '.index'), [extend(k, col) for col in cols]]
70 | for k, cols in zip(keys, columns)]
71 |
72 | arrays = self.partd.get(keys, **kwargs)
73 |
74 | return [pd.DataFrame(dict(zip(cols, arrs)), columns=cols,
75 | index=pd.Index(index, name=iname))
76 | for iname, (index, arrs), cols in zip(index_names, arrays, columns)]
77 |
78 | def __getstate__(self):
79 | return {'partd': self.partd}
80 |
81 | def _iset(self, key, value):
82 | return self.partd._iset(key, value)
83 |
84 | def drop(self):
85 | return self.partd.drop()
86 |
87 | @property
88 | def lock(self):
89 | return self.partd.partd.lock
90 |
91 | def __exit__(self, *args):
92 | self.drop()
93 |         self.partd.__exit__(*args)
94 |
95 | def __del__(self):
96 | self.partd.__del__()
97 |
98 |
99 | def index_to_header_bytes(ind):
100 | # These have special `__reduce__` methods, just use pickle
101 | if isinstance(ind, (pd.DatetimeIndex,
102 | pd.MultiIndex,
103 | pd.RangeIndex)):
104 | return None, dumps(ind)
105 |
106 | if isinstance(ind, pd.CategoricalIndex):
107 | cat = (ind.ordered, ind.categories)
108 | values = ind.codes
109 | else:
110 | cat = None
111 | values = ind.values
112 |
113 | if is_extension_array_dtype(ind):
114 | return None, dumps(ind)
115 |
116 | header = (type(ind), {k: getattr(ind, k, None) for k in ind._attributes}, values.dtype, cat)
117 | bytes = pnp.compress(pnp.serialize(values), values.dtype)
118 | return header, bytes
119 |
120 |
121 | def index_from_header_bytes(header, bytes):
122 | if header is None:
123 | return pickle.loads(bytes)
124 |
125 | typ, attr, dtype, cat = header
126 | data = pnp.deserialize(pnp.decompress(bytes, dtype), dtype, copy=True)
127 | if cat:
128 | data = pd.Categorical.from_codes(data, cat[1], ordered=cat[0])
129 | return typ.__new__(typ, data=data, **attr)
130 |
131 |
132 | def block_to_header_bytes(block):
133 | values = block.values
134 | if isinstance(values, pd.Categorical):
135 | extension = ('categorical_type', (values.ordered, values.categories))
136 | values = values.codes
137 |     elif isinstance(block.dtype, pd.DatetimeTZDtype):
138 | extension = ('datetime64_tz_type', (block.values.tzinfo,))
139 | values = values.view('i8')
140 | elif is_extension_array_dtype(block.dtype) or is_extension_array(values):
141 | extension = ("other", ())
142 | else:
143 | extension = ('numpy_type', ())
144 |
145 | header = (block.mgr_locs.as_array, values.dtype, values.shape, extension)
146 | if extension == ("other", ()):
147 | bytes = pickle.dumps(values)
148 | else:
149 | bytes = pnp.compress(pnp.serialize(values), values.dtype)
150 | return header, bytes
151 |
152 |
153 | def block_from_header_bytes(header, bytes, create_block: bool):
154 | placement, dtype, shape, (extension_type, extension_values) = header
155 |
156 | if extension_type == "other":
157 | values = pickle.loads(bytes)
158 | else:
159 | values = pnp.deserialize(pnp.decompress(bytes, dtype), dtype,
160 | copy=True).reshape(shape)
161 | if extension_type == 'categorical_type':
162 | values = pd.Categorical.from_codes(values,
163 | extension_values[1],
164 | ordered=extension_values[0])
165 | elif extension_type == 'datetime64_tz_type':
166 | tz_info = extension_values[0]
167 | values = pd.DatetimeIndex(values).tz_localize('utc').tz_convert(
168 | tz_info)
169 | if create_block:
170 | return make_block(values, placement=placement)
171 | return values, placement
172 |
173 |
174 | def serialize(df):
175 | """ Serialize and compress a Pandas DataFrame
176 |
177 |     Uses pandas blocks, snappy, and blosc to deconstruct a DataFrame into bytes
178 | """
179 | col_header, col_bytes = index_to_header_bytes(df.columns)
180 | ind_header, ind_bytes = index_to_header_bytes(df.index)
181 | headers = [col_header, ind_header]
182 | bytes = [col_bytes, ind_bytes]
183 |
184 | for block in df._mgr.blocks:
185 | h, b = block_to_header_bytes(block)
186 | headers.append(h)
187 | bytes.append(b)
188 |
189 | frames = [dumps(headers)] + bytes
190 | return b''.join(map(frame, frames))
191 |
192 |
193 | def deserialize(bytes):
194 | """ Deserialize and decompress bytes back to a pandas DataFrame """
195 | frames = list(framesplit(bytes))
196 | headers = pickle.loads(frames[0])
197 | bytes = frames[1:]
198 | axes = [index_from_header_bytes(headers[0], bytes[0]),
199 | index_from_header_bytes(headers[1], bytes[1])]
200 | blocks = [block_from_header_bytes(h, b, create_block=not PANDAS_GE_300)
201 | for (h, b) in zip(headers[2:], bytes[2:])]
202 | if PANDAS_GE_300:
203 | return pd.api.internals.create_dataframe_from_blocks(blocks, axes[1], axes[0])
204 | elif PANDAS_GE_210:
205 | return pd.DataFrame._from_mgr(create_block_manager_from_blocks(blocks, axes), axes=axes)
206 | else:
207 | return pd.DataFrame(create_block_manager_from_blocks(blocks, axes))
208 |
209 |
210 | def join(dfs):
211 | if not dfs:
212 | return pd.DataFrame()
213 | else:
214 | result = pd.concat(dfs)
215 | dtypes = {
216 | col: "category"
217 | for col in result.columns
218 | if (
219 | isinstance(dfs[0][col].dtype, pd.CategoricalDtype)
220 | and not isinstance(result[col].dtype, pd.CategoricalDtype)
221 | )
222 | }
223 | if dtypes:
224 | result = result.astype(dtypes)
225 | return result
226 |
227 | PandasBlocks = partial(Encode, serialize, deserialize, join)
228 |
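229 | # A minimal usage sketch, kept as a comment so nothing runs on import;
230 | # it assumes only the names defined in this module (storage goes to a
231 | # temporary directory by default -- see tests/test_pandas.py):
232 | #
233 | #     import pandas as pd
234 | #     from partd.pandas import PandasBlocks
235 | #
236 | #     with PandasBlocks() as p:
237 | #         p.append({'x': pd.DataFrame({'a': [1, 2]})})
238 | #         p.append({'x': pd.DataFrame({'a': [3, 4]})})
239 | #         df = p.get('x')   # both appends, concatenated by join()
240 | #
241 | # serialize/deserialize also round-trip a single frame without storage:
242 | #
243 | #     df2 = deserialize(serialize(df))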
--------------------------------------------------------------------------------
/partd/pickle.py:
--------------------------------------------------------------------------------
1 | """
2 | get/put functions that consume/produce Python lists using Pickle to serialize
3 | """
4 | import pickle
5 |
6 | from .encode import Encode
7 | from functools import partial
8 |
9 | def concat(lists):
10 | return sum(lists, [])
11 |
12 | Pickle = partial(Encode,
13 | partial(pickle.dumps, protocol=pickle.HIGHEST_PROTOCOL),
14 | pickle.loads,
15 | concat)
16 |
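17 | # A minimal usage sketch, kept as a comment so nothing runs on import:
18 | #
19 | #     from partd.pickle import Pickle
20 | #
21 | #     with Pickle() as p:
22 | #         p.append({'x': [1, 2], 'y': ['a']})
23 | #         p.append({'x': [3]})
24 | #         p.get('x')   # [1, 2, 3] -- pickled per append, joined by concat()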
--------------------------------------------------------------------------------
/partd/python.py:
--------------------------------------------------------------------------------
1 | """
2 | get/put functions that consume/produce Python lists using msgpack or pickle
3 | to serialize.
4 |
5 | First we try msgpack (it's faster). If that fails then we fall back to pickle.
6 | """
7 | import pickle
8 |
9 | try:
10 | from pandas import msgpack
11 | except ImportError:
12 | try:
13 | import msgpack
14 | except ImportError:
15 | msgpack = False
16 |
17 |
18 | from .encode import Encode
19 | from functools import partial
20 |
21 |
22 | def dumps(x):
23 | try:
24 | return msgpack.packb(x, use_bin_type=True)
25 |     except Exception:
26 | return pickle.dumps(x, protocol=pickle.HIGHEST_PROTOCOL)
27 |
28 | def loads(x):
29 | try:
30 | if msgpack.version >= (0, 5, 2):
31 | unpack_kwargs = {'raw': False}
32 | else:
33 | unpack_kwargs = {'encoding': 'utf-8'}
34 | return msgpack.unpackb(x, **unpack_kwargs)
35 |     except Exception:
36 | return pickle.loads(x)
37 |
38 |
39 | def concat(lists):
40 | return sum(lists, [])
41 |
42 |
43 | Python = partial(Encode, dumps, loads, concat)
44 |
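45 | # A minimal sketch of the fallback behaviour, kept as a comment. Plain
46 | # data round-trips through msgpack when it is installed; values msgpack
47 | # cannot encode (e.g. functions) transparently fall back to pickle:
48 | #
49 | #     from partd.python import dumps, loads
50 | #
51 | #     loads(dumps([1, 2, b'bytes', 'text']))   # msgpack round-trip
52 | #     loads(dumps([sum, len]))                 # msgpack fails -> pickle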
--------------------------------------------------------------------------------
/partd/tests/test_buffer.py:
--------------------------------------------------------------------------------
1 | from partd.dict import Dict
2 | from partd.file import File
3 | from partd.buffer import Buffer, keys_to_flush
4 | import pickle
5 |
6 | import shutil
7 | import os
8 |
9 |
10 | def test_partd():
11 | a = Dict()
12 | b = Dict()
13 | with Buffer(a, b, available_memory=10) as p:
14 | p.append({'x': b'Hello', 'y': b'abc'})
15 | assert a.get(['x', 'y']) == [b'Hello', b'abc']
16 |
17 | p.append({'x': b'World!', 'y': b'def'})
18 | assert a.get(['x', 'y']) == [b'', b'abcdef']
19 | assert b.get(['x', 'y']) == [b'HelloWorld!', b'']
20 |
21 | result = p.get(['y', 'x'])
22 | assert result == [b'abcdef', b'HelloWorld!']
23 |
24 | assert p.get('z') == b''
25 |
26 | with p.lock: # uh oh, possible deadlock
27 | result = p.get(['x'], lock=False)
28 |
29 |
30 | def test_keys_to_flush():
31 | lengths = {'a': 20, 'b': 10, 'c': 15, 'd': 15, 'e': 10, 'f': 25, 'g': 5}
32 | assert keys_to_flush(lengths, 0.5) == ['f', 'a']
33 |
34 |
35 | def test_pickle():
36 | with Dict() as a:
37 | with File() as b:
38 | c = Buffer(a, b)
39 |
40 | c.append({'x': b'123'})
41 |
42 | d = pickle.loads(pickle.dumps(c))
43 |
44 | assert d.get('x') == c.get('x')
45 |
46 | pickled_attrs = ('memory_usage', 'lengths', 'available_memory')
47 | for attr in pickled_attrs:
48 | assert hasattr(d, attr)
49 | assert getattr(d, attr) == getattr(c, attr)
50 | # special case Dict and File -- some attrs do not pickle
51 | assert hasattr(d, 'fast')
52 | assert d.fast.data == c.fast.data
53 | assert hasattr(d, 'slow')
54 | assert d.slow.path == c.slow.path
55 |
--------------------------------------------------------------------------------
/partd/tests/test_compressed.py:
--------------------------------------------------------------------------------
1 | from partd.compressed import ZLib
2 |
3 |
4 | import shutil
5 | import os
6 | import pickle
7 |
8 |
9 | def test_partd():
10 | with ZLib() as p:
11 | p.append({'x': b'Hello', 'y': b'abc'})
12 | p.append({'x': b'World!', 'y': b'def'})
13 | assert os.path.exists(p.partd.filename('x'))
14 | assert os.path.exists(p.partd.filename('y'))
15 |
16 | result = p.get(['y', 'x'])
17 | assert result == [b'abcdef', b'HelloWorld!']
18 |
19 | assert p.get('z') == b''
20 |
21 | with p.lock: # uh oh, possible deadlock
22 | result = p.get(['x'], lock=False)
23 |
24 | assert not os.path.exists(p.partd.path)
25 |
26 |
27 | def test_pickle():
28 | with ZLib() as p:
29 | p.append({'x': b'123'})
30 | q = pickle.loads(pickle.dumps(p))
31 | assert q.get('x') == b'123'
32 |
--------------------------------------------------------------------------------
/partd/tests/test_dict.py:
--------------------------------------------------------------------------------
1 | from partd.dict import Dict
2 |
3 | import shutil
4 | import os
5 |
6 |
7 | def test_partd():
8 | with Dict() as p:
9 | p.append({'x': b'Hello', 'y': b'abc'})
10 | p.append({'x': b'World!', 'y': b'def'})
11 |
12 | result = p.get(['y', 'x'])
13 | assert result == [b'abcdef', b'HelloWorld!']
14 |
15 | assert p.get('z') == b''
16 |
17 | with p.lock: # uh oh, possible deadlock
18 | result = p.get(['x'], lock=False)
19 |
20 |
21 | def test_key_tuple():
22 | with Dict() as p:
23 | p.append({('a', 'b'): b'123'})
24 | assert p.get(('a', 'b')) == b'123'
25 |
26 |
27 | def test_iset():
28 | with Dict() as p:
29 | p.iset('x', b'123')
30 | assert 'x' in p._iset_seen
31 | assert 'y' not in p._iset_seen
32 | p.iset('x', b'123')
33 | p.iset('x', b'123')
34 | assert p.get('x') == b'123'
35 |
36 |
37 | def test_delete_non_existent_key():
38 | with Dict() as p:
39 | p.append({'x': b'123'})
40 | p.delete(['x', 'y'])
41 | assert p.get(['x', 'y']) == [b'', b'']
42 |
--------------------------------------------------------------------------------
/partd/tests/test_encode.py:
--------------------------------------------------------------------------------
1 | from partd.file import File
2 | from partd.encode import Encode
3 |
4 | import zlib
5 | import shutil
6 | import os
7 |
8 |
9 | def test_partd():
10 | with Encode(zlib.compress, zlib.decompress, b''.join) as p:
11 | p.append({'x': b'Hello', 'y': b'abc'})
12 | p.append({'x': b'World!', 'y': b'def'})
13 |
14 | result = p.get(['y', 'x'])
15 | assert result == [b'abcdef', b'HelloWorld!']
16 |
17 | assert p.get('z') == b''
18 |
19 | with p.lock: # uh oh, possible deadlock
20 | result = p.get(['x'], lock=False)
21 |
22 |
23 | def test_ensure():
24 | with Encode(zlib.compress, zlib.decompress, b''.join) as p:
25 | p.iset('x', b'123')
26 | p.iset('x', b'123')
27 | p.iset('x', b'123')
28 | assert p.get('x') == b'123'
29 |
--------------------------------------------------------------------------------
/partd/tests/test_file.py:
--------------------------------------------------------------------------------
1 | from partd.file import File
2 |
3 | import shutil
4 | import os
5 |
6 |
7 | def test_partd():
8 | with File() as p:
9 | p.append({'x': b'Hello', 'y': b'abc'})
10 | p.append({'x': b'World!', 'y': b'def'})
11 | assert os.path.exists(p.filename('x'))
12 | assert os.path.exists(p.filename('y'))
13 |
14 | result = p.get(['y', 'x'])
15 | assert result == [b'abcdef', b'HelloWorld!']
16 |
17 | assert p.get('z') == b''
18 |
19 | with p.lock: # uh oh, possible deadlock
20 | result = p.get(['x'], lock=False)
21 |
22 | assert not os.path.exists(p.path)
23 |
24 |
25 | def test_key_tuple():
26 | with File() as p:
27 | p.append({('a', 'b'): b'123'})
28 | assert os.path.exists(p.filename(('a', 'b')))
29 |
30 |
31 | def test_iset():
32 | with File() as p:
33 | p.iset('x', b'123')
34 | assert 'x' in p._iset_seen
35 | assert 'y' not in p._iset_seen
36 | p.iset('x', b'123')
37 | p.iset('x', b'123')
38 | assert p.get('x') == b'123'
39 |
40 |
41 | def test_nested_get():
42 | with File() as p:
43 | p.append({'x': b'1', 'y': b'2', 'z': b'3'})
44 | assert p.get(['x', ['y', 'z']]) == [b'1', [b'2', b'3']]
45 |
46 |
47 | def test_drop():
48 | with File() as p:
49 | p.append({'x': b'123'})
50 | p.iset('y', b'abc')
51 | assert p.get('x') == b'123'
52 | assert p.get('y') == b'abc'
53 |
54 | p.drop()
55 | assert p.get('x') == b''
56 | assert p.get('y') == b''
57 |
58 | p.append({'x': b'123'})
59 | p.iset('y', b'def')
60 | assert p.get('x') == b'123'
61 | assert p.get('y') == b'def'
62 |
63 |
64 | def test_del():
65 | f = File()
66 |
67 | assert f.path
68 | assert os.path.exists(f.path)
69 |
70 | f.__del__()
71 | assert not os.path.exists(f.path)
72 |
73 | with File('Foo') as p:
74 | p.__del__()
75 | assert os.path.exists(p.path)
76 |
77 |
78 | def test_specify_dirname():
79 | with File(dir=os.getcwd()) as f:
80 | assert os.getcwd() in f.path
81 |
--------------------------------------------------------------------------------
/partd/tests/test_numpy.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | np = pytest.importorskip('numpy') # noqa
3 |
4 | import pickle
5 |
6 | import partd
7 | from partd.numpy import Numpy
8 |
9 |
10 | def test_numpy():
11 | dt = np.dtype([('a', 'i4'), ('b', 'i2'), ('c', 'f8')])
12 | with Numpy() as p:
13 | p.append({'a': np.array([10, 20, 30], dtype=dt['a']),
14 | 'b': np.array([ 1, 2, 3], dtype=dt['b']),
15 | 'c': np.array([.1, .2, .3], dtype=dt['c'])})
16 | p.append({'a': np.array([70, 80, 90], dtype=dt['a']),
17 | 'b': np.array([ 7, 8, 9], dtype=dt['b']),
18 | 'c': np.array([.7, .8, .9], dtype=dt['c'])})
19 |
20 | result = p.get(['a', 'c'])
21 | assert (result[0] == np.array([10, 20, 30, 70, 80, 90],dtype=dt['a'])).all()
22 | assert (result[1] == np.array([.1, .2, .3, .7, .8, .9],dtype=dt['c'])).all()
23 |
24 | with p.lock: # uh oh, possible deadlock
25 | result = p.get(['a'], lock=False)
26 |
27 |
28 | def test_nested():
29 | with Numpy() as p:
30 | p.append({'x': np.array([1, 2, 3]),
31 | ('y', 1): np.array([4, 5, 6]),
32 | ('z', 'a', 3): np.array([.1, .2, .3])})
33 | assert (p.get(('z', 'a', 3)) == np.array([.1, .2, .3])).all()
34 |
35 |
36 | def test_serialization():
37 | with Numpy() as p:
38 | p.append({'x': np.array([1, 2, 3])})
39 | q = pickle.loads(pickle.dumps(p))
40 | assert (q.get('x') == [1, 2, 3]).all()
41 |
42 |
43 | array_of_lists = np.empty(3, dtype='O')
44 | array_of_lists[:] = [[1, 2], [3, 4], [5, 6]]
45 |
46 |
47 | @pytest.mark.parametrize('x', [np.array(['Alice', 'Bob', 'Charlie'], dtype='O'),
48 | array_of_lists])
49 | def test_object_dtype(x):
50 | with Numpy() as p:
51 | p.append({'x': x})
52 | p.append({'x': x})
53 | assert isinstance(p.get('x'), np.ndarray)
54 | assert (p.get('x') == np.concatenate([x, x])).all()
55 |
56 |
57 | def test_datetime_types():
58 | x = np.array(['2014-01-01T12:00:00'], dtype='M8[us]')
59 | y = np.array(['2014-01-01T12:00:00'], dtype='M8[s]')
60 | with Numpy() as p:
61 | p.append({'x': x, 'y': y})
62 | assert p.get('x').dtype == x.dtype
63 | assert p.get('y').dtype == y.dtype
64 |
65 |
66 | def test_non_utf8_bytes():
67 | a = np.array([b'\xc3\x28', b'\xa0\xa1', b'\xe2\x28\xa1', b'\xe2\x82\x28',
68 | b'\xf0\x28\x8c\xbc'], dtype='O')
69 | s = partd.numpy.serialize(a)
70 | assert (partd.numpy.deserialize(s, 'O') == a).all()
71 |
--------------------------------------------------------------------------------
/partd/tests/test_pandas.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | pytest.importorskip('pandas') # noqa
3 |
4 | import numpy as np
5 | import pandas as pd
6 | import pandas.testing as tm
7 | import os
8 |
9 | try:
10 | import pyarrow as pa
11 | except ImportError:
12 | pa = None
13 |
14 | from partd.pandas import PandasColumns, PandasBlocks, serialize, deserialize
15 |
16 |
17 | df1 = pd.DataFrame({'a': [1, 2, 3],
18 | 'b': [1., 2., 3.],
19 | 'c': ['x', 'y', 'x']}, columns=['a', 'b', 'c'],
20 | index=pd.Index([1, 2, 3], name='myindex'))
21 |
22 | df2 = pd.DataFrame({'a': [10, 20, 30],
23 | 'b': [10., 20., 30.],
24 | 'c': ['X', 'Y', 'X']}, columns=['a', 'b', 'c'],
25 | index=pd.Index([10, 20, 30], name='myindex'))
26 |
27 |
28 | def test_PandasColumns():
29 | with PandasColumns() as p:
30 | assert os.path.exists(p.partd.partd.path)
31 |
32 | p.append({'x': df1, 'y': df2})
33 | p.append({'x': df2, 'y': df1})
34 | assert os.path.exists(p.partd.partd.filename('x'))
35 | assert os.path.exists(p.partd.partd.filename(('x', 'a')))
36 | assert os.path.exists(p.partd.partd.filename(('x', '.index')))
37 | assert os.path.exists(p.partd.partd.filename('y'))
38 |
39 | result = p.get(['y', 'x'])
40 | tm.assert_frame_equal(result[0], pd.concat([df2, df1]))
41 | tm.assert_frame_equal(result[1], pd.concat([df1, df2]))
42 |
43 | with p.lock: # uh oh, possible deadlock
44 | result = p.get(['x'], lock=False)
45 |
46 | assert not os.path.exists(p.partd.partd.path)
47 |
48 |
49 | def test_column_selection():
50 | with PandasColumns('foo') as p:
51 | p.append({'x': df1, 'y': df2})
52 | p.append({'x': df2, 'y': df1})
53 | result = p.get('x', columns=['c', 'b'])
54 | tm.assert_frame_equal(result, pd.concat([df1, df2])[['c', 'b']])
55 |
56 |
57 | def test_PandasBlocks():
58 | with PandasBlocks() as p:
59 | assert os.path.exists(p.partd.path)
60 |
61 | p.append({'x': df1, 'y': df2})
62 | p.append({'x': df2, 'y': df1})
63 | assert os.path.exists(p.partd.filename('x'))
64 | assert os.path.exists(p.partd.filename('y'))
65 |
66 | result = p.get(['y', 'x'])
67 | tm.assert_frame_equal(result[0], pd.concat([df2, df1]))
68 | tm.assert_frame_equal(result[1], pd.concat([df1, df2]))
69 |
70 | with p.lock: # uh oh, possible deadlock
71 | result = p.get(['x'], lock=False)
72 |
73 | assert not os.path.exists(p.partd.path)
74 |
75 |
76 | @pytest.mark.parametrize('ordered', [False, True])
77 | def test_serialize_categoricals(ordered):
78 | frame = pd.DataFrame({'x': [1, 2, 3, 4],
79 | 'y': pd.Categorical(['c', 'a', 'b', 'a'],
80 | ordered=ordered)},
81 | index=pd.Categorical(['x', 'y', 'z', 'x'],
82 | ordered=ordered))
83 | frame.index.name = 'foo'
84 | frame.columns.name = 'bar'
85 |
86 | for ind, df in [(0, frame), (1, frame.T)]:
87 | df2 = deserialize(serialize(df))
88 | tm.assert_frame_equal(df, df2)
89 |
90 |
91 | def test_serialize_multi_index():
92 | df = pd.DataFrame({'x': ['a', 'b', 'c', 'a', 'b', 'c'],
93 | 'y': [1, 2, 3, 4, 5, 6],
94 | 'z': [7., 8, 9, 10, 11, 12]})
95 | df = df.groupby([df.x, df.y]).sum()
96 | df.index.name = 'foo'
97 | df.columns.name = 'bar'
98 |
99 | df2 = deserialize(serialize(df))
100 | tm.assert_frame_equal(df, df2)
101 |
102 |
103 | @pytest.mark.parametrize('base', [
104 | pd.Timestamp('1987-03-3T01:01:01+0001'),
105 | pd.Timestamp('1987-03-03 01:01:01-0600', tz='US/Central'),
106 | ])
107 | def test_serialize(base):
108 | df = pd.DataFrame({'x': [
109 | base + pd.Timedelta(seconds=i)
110 | for i in np.random.randint(0, 1000, size=10)],
111 | 'y': list(range(10)),
112 | 'z': pd.date_range('2017', periods=10)})
113 | df2 = deserialize(serialize(df))
114 | tm.assert_frame_equal(df, df2)
115 |
116 |
117 | def test_other_extension_types():
118 | pytest.importorskip("pandas", minversion="0.25.0")
119 | a = pd.array([pd.Period("2000"), pd.Period("2001")])
120 | df = pd.DataFrame({"A": a})
121 | df2 = deserialize(serialize(df))
122 | tm.assert_frame_equal(df, df2)
123 |
124 | @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Float64", "Float32"])
125 | def test_index_numeric_extension_types(dtype):
126 | pytest.importorskip("pandas", minversion="1.4.0")
127 |
128 | df = pd.DataFrame({"x": [1, 2, 3]}, index=[4, 5, 6])
129 | df.index = df.index.astype(dtype)
130 | df2 = deserialize(serialize(df))
131 | tm.assert_frame_equal(df, df2)
132 |
133 | @pytest.mark.parametrize(
134 | "dtype",
135 | [
136 | "string[python]",
137 | pytest.param(
138 | "string[pyarrow]",
139 | marks=pytest.mark.skipif(pa is None, reason="Requires pyarrow"),
140 | ),
141 | ],
142 | )
143 | def test_index_non_numeric_extension_types(dtype):
144 | pytest.importorskip("pandas", minversion="1.4.0")
145 | df = pd.DataFrame({"x": [1, 2, 3]}, index=["a", "b", "c"])
146 | df.index = df.index.astype(dtype)
147 | df2 = deserialize(serialize(df))
148 | tm.assert_frame_equal(df, df2)
149 |
150 |
151 | def test_categorical_concat():
152 | pytest.importorskip("pandas", minversion="2")
153 |
154 | df1 = pd.DataFrame({"a": ["x", "y"]}, dtype="category")
155 | df2 = pd.DataFrame({"a": ["y", "z"]}, dtype="category")
156 |
157 | with PandasBlocks() as p:
158 | p.append({'x': df1})
159 | p.append({'x': df2})
160 |
161 | result = p.get(["x"])
162 | pd.testing.assert_frame_equal(result[0], pd.concat([df1, df2]).astype("category"))
163 |
--------------------------------------------------------------------------------
/partd/tests/test_partd.py:
--------------------------------------------------------------------------------
1 | from partd import File
2 | from partd.core import token, escape_filename, filename
3 | from partd import core
4 | import os
5 | import shutil
6 | from contextlib import contextmanager
7 |
8 |
9 | def test_partd():
10 | path = 'tmp.partd'
11 |
12 | with File(path) as p:
13 | p.append({'x': b'Hello', 'y': b'abc'})
14 | p.append({'x': b'World!', 'y': b'def'})
15 | assert os.path.exists(p.filename('x'))
16 | assert os.path.exists(p.filename('y'))
17 |
18 | result = p.get(['y', 'x'])
19 | assert result == [b'abcdef', b'HelloWorld!']
20 |
21 | assert p.get('z') == b''
22 |
23 | with p.lock: # uh oh, possible deadlock
24 | result = p.get(['x'], lock=False)
25 |
26 | assert not os.path.exists(path)
27 |
28 |
29 | def test_key_tuple():
30 | with File('foo') as p:
31 | p.append({('a', 'b'): b'123'})
32 | assert os.path.exists(os.path.join(p.path, 'a', 'b'))
33 |
34 |
35 | def test_ensure():
36 | with File('foo') as p:
37 | p.iset('x', b'123')
38 | p.iset('x', b'123')
39 | p.iset('x', b'123')
40 |
41 | assert p.get('x') == b'123'
42 |
43 |
44 | def test_filenames():
45 | assert token('hello') == 'hello'
46 | assert token(('hello', 'world')) == os.path.join('hello', 'world')
47 | assert escape_filename(os.path.join('a', 'b')) == os.path.join('a', 'b')
48 | assert filename('dir', ('a', 'b')) == os.path.join('dir', 'a', 'b')
49 |
--------------------------------------------------------------------------------
/partd/tests/test_pickle.py:
--------------------------------------------------------------------------------
1 | from partd.pickle import Pickle
2 |
3 |
4 | import os
5 | import shutil
6 |
7 | def test_pickle():
8 | with Pickle() as p:
9 | p.append({'x': ['Hello', 'World!'], 'y': [1, 2, 3]})
10 | p.append({'x': ['Alice', 'Bob!'], 'y': [4, 5, 6]})
11 | assert os.path.exists(p.partd.filename('x'))
12 | assert os.path.exists(p.partd.filename('y'))
13 |
14 | result = p.get(['y', 'x'])
15 | assert result == [[1, 2, 3, 4, 5, 6],
16 | ['Hello', 'World!', 'Alice', 'Bob!']]
17 |
18 | with p.lock: # uh oh, possible deadlock
19 | result = p.get(['x'], lock=False)
20 |
21 | assert not os.path.exists(p.partd.path)
22 |
23 |
24 | def test_ensure():
25 | with Pickle() as p:
26 | p.iset('x', [1, 2, 3])
27 | p.iset('x', [1, 2, 3])
28 |
29 | assert p.get('x') == [1, 2, 3]
30 |
--------------------------------------------------------------------------------
/partd/tests/test_python.py:
--------------------------------------------------------------------------------
1 | from partd.python import dumps, loads
2 |
3 |
4 | import os
5 | import shutil
6 | from math import sin
7 |
8 |
9 | def test_pack_unpack():
10 | data = [1, 2, b'Hello', 'Hello']
11 | assert loads(dumps(data)) == data
12 |
13 | data = [1, 2, sin]
14 | assert loads(dumps(data)) == data
15 |
--------------------------------------------------------------------------------
/partd/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | from partd.utils import frame, framesplit
2 | import struct
3 |
4 |
5 | def test_frame():
6 | assert frame(b'Hello') == struct.pack('Q', 5) + b'Hello'
7 |
8 |
9 | def test_framesplit():
10 | L = [b'Hello', b'World!', b'123']
11 | assert list(framesplit(b''.join(map(frame, L)))) == L
12 |
--------------------------------------------------------------------------------
/partd/tests/test_zmq.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | pytest.importorskip('zmq')
3 |
4 | from partd.zmq import Server, keys_to_flush, File, Client
5 | from partd import core, Dict
6 | from threading import Thread
7 | from time import sleep
8 | from contextlib import contextmanager
9 | import pickle
10 |
11 | import os
12 | import shutil
13 |
14 |
15 | def test_server():
16 | s = Server()
17 | try:
18 | s.start()
19 | s.append({'x': b'abc', 'y': b'1234'})
20 | s.append({'x': b'def', 'y': b'5678'})
21 |
22 | assert s.get(['x']) == [b'abcdef']
23 | assert s.get(['x', 'y']) == [b'abcdef', b'12345678']
24 |
25 | assert s.get(['x']) == [b'abcdef']
26 | finally:
27 | s.close()
28 |
29 |
30 | def dont_test_flow_control():
31 | path = 'bar'
32 | if os.path.exists('bar'):
33 | shutil.rmtree('bar')
34 | s = Server('bar', available_memory=1, n_outstanding_writes=3, start=False)
35 | p = Client(s.address)
36 | try:
37 | listen_thread = Thread(target=s.listen)
38 | listen_thread.start()
39 | """ Don't start these threads
40 | self._write_to_disk_thread = Thread(target=self._write_to_disk)
41 | self._write_to_disk_thread.start()
42 | self._free_frozen_sockets_thread = Thread(target=self._free_frozen_sockets)
43 | self._free_frozen_sockets_thread.start()
44 | """
45 | p.append({'x': b'12345'})
46 | sleep(0.1)
47 | assert s._out_disk_buffer.qsize() == 1
48 | p.append({'x': b'12345'})
49 | p.append({'x': b'12345'})
50 | sleep(0.1)
51 | assert s._out_disk_buffer.qsize() == 3
52 |
53 | held_append = Thread(target=p.append, args=({'x': b'123'},))
54 | held_append.start()
55 |
56 | sleep(0.1)
57 | assert held_append.is_alive() # held!
58 |
59 | assert not s._frozen_sockets.empty()
60 |
61 | write_to_disk_thread = Thread(target=s._write_to_disk)
62 | write_to_disk_thread.start()
63 | free_frozen_sockets_thread = Thread(target=s._free_frozen_sockets)
64 | free_frozen_sockets_thread.start()
65 |
66 | sleep(0.2)
67 | assert not held_append.is_alive()
68 | assert s._frozen_sockets.empty()
69 | finally:
70 | s.close()
71 |
72 |
73 | @contextmanager
74 | def partd_server(**kwargs):
75 | with Server(**kwargs) as server:
76 | with Client(server.address) as p:
77 | yield (p, server)
78 |
79 |
80 | def test_partd_object():
81 | with partd_server() as (p, server):
82 | p.append({'x': b'Hello', 'y': b'abc'})
83 | p.append({'x': b'World!', 'y': b'def'})
84 |
85 | result = p.get(['y', 'x'])
86 | assert result == [b'abcdef', b'HelloWorld!']
87 |
88 |
89 | def test_delete():
90 | with partd_server() as (p, server):
91 | p.append({'x': b'Hello'})
92 | assert p.get('x') == b'Hello'
93 | p.delete(['x'])
94 | assert p.get('x') == b''
95 |
96 |
97 | def test_iset():
98 | with partd_server() as (p, server):
99 | p.iset('x', b'111')
100 | p.iset('x', b'111')
101 | assert p.get('x') == b'111'
102 |
103 |
104 | def test_tuple_keys():
105 | with partd_server() as (p, server):
106 | p.append({('x', 'y'): b'123'})
107 | assert p.get(('x', 'y')) == b'123'
108 |
109 |
110 | def test_serialization():
111 | with partd_server() as (p, server):
112 | p.append({'x': b'123'})
113 | q = pickle.loads(pickle.dumps(p))
114 | assert q.get('x') == b'123'
115 |
116 |
117 | def test_drop():
118 | with partd_server() as (p, server):
119 | p.append({'x': b'123'})
120 | p.drop()
121 | assert p.get('x') == b''
122 |
123 |
124 | def dont_test_server_autocreation():
125 | with Client() as p:
126 | p.append({'x': b'123'})
127 | assert p.get('x') == b'123'
128 |
--------------------------------------------------------------------------------
/partd/utils.py:
--------------------------------------------------------------------------------
1 | from contextlib import contextmanager
2 | import os
3 | import shutil
4 | import tempfile
5 | import struct
6 |
7 |
8 | def raises(exc, func):
9 |     try:
10 |         func()
11 | return False
12 | except exc:
13 | return True
14 |
15 |
16 | @contextmanager
17 | def tmpfile(extension=''):
18 | extension = '.' + extension.lstrip('.')
19 | handle, filename = tempfile.mkstemp(extension)
20 | os.close(handle)
21 | os.remove(filename)
22 |
23 | try:
24 | yield filename
25 | finally:
26 | if os.path.exists(filename):
27 | if os.path.isdir(filename):
28 | shutil.rmtree(filename)
29 | else:
30 | os.remove(filename)
31 |
32 |
33 | def frame(bytes):
34 | """ Pack the length of the bytes in front of the bytes
35 |
36 | TODO: This does a full copy. This should maybe be inlined somehow
37 | wherever this gets used instead. My laptop shows a data bandwidth of
38 | 2GB/s
39 | """
40 | return struct.pack('Q', len(bytes)) + bytes
41 |
42 |
43 | def framesplit(bytes):
44 | """ Split buffer into frames of concatenated chunks
45 |
46 | >>> data = frame(b'Hello') + frame(b'World')
47 | >>> list(framesplit(data)) # doctest: +SKIP
48 | [b'Hello', b'World']
49 | """
50 |     i = 0
51 |     n = len(bytes)
52 | while i < n:
53 | nbytes = struct.unpack('Q', bytes[i:i+8])[0]
54 | i += 8
55 | yield bytes[i: i + nbytes]
56 | i += nbytes
57 |
58 |
59 | def partition_all(n, bytes):
60 | """ Partition bytes into evenly sized blocks
61 |
62 | The final block holds the remainder and so may not be of equal size
63 |
64 | >>> list(partition_all(2, b'Hello'))
65 | [b'He', b'll', b'o']
66 |
67 | See Also:
68 | toolz.partition_all
69 | """
70 | if len(bytes) < n: # zero copy fast common case
71 | yield bytes
72 | else:
73 | for i in range(0, len(bytes), n):
74 | yield bytes[i: i+n]
75 |
76 |
77 | def nested_get(ind, coll, lazy=False):
78 | """ Get nested index from collection
79 |
80 | Examples
81 | --------
82 |
83 | >>> nested_get(1, 'abc')
84 | 'b'
85 | >>> nested_get([1, 0], 'abc')
86 | ['b', 'a']
87 | >>> nested_get([[1, 0], [0, 1]], 'abc')
88 | [['b', 'a'], ['a', 'b']]
89 | """
90 | if isinstance(ind, list):
91 | if lazy:
92 | return (nested_get(i, coll, lazy=lazy) for i in ind)
93 | else:
94 | return [nested_get(i, coll, lazy=lazy) for i in ind]
95 | else:
96 | return coll[ind]
97 |
98 |
99 | def flatten(seq):
100 | """
101 |
102 | >>> list(flatten([1]))
103 | [1]
104 |
105 | >>> list(flatten([[1, 2], [1, 2]]))
106 | [1, 2, 1, 2]
107 |
108 | >>> list(flatten([[[1], [2]], [[1], [2]]]))
109 | [1, 2, 1, 2]
110 |
111 | >>> list(flatten(((1, 2), (1, 2)))) # Don't flatten tuples
112 | [(1, 2), (1, 2)]
113 |
114 | >>> list(flatten((1, 2, [3, 4]))) # support heterogeneous
115 | [1, 2, 3, 4]
116 | """
117 | for item in seq:
118 | if isinstance(item, list):
119 | yield from flatten(item)
120 | else:
121 | yield item
122 |
123 |
124 | def suffix(key, term):
125 |     """ Append a suffix to the last element of a key
126 | 
127 |     Works if the key is a string or a tuple
128 |
129 | >>> suffix('x', '.dtype')
130 | 'x.dtype'
131 | >>> suffix(('a', 'b', 'c'), '.dtype')
132 | ('a', 'b', 'c.dtype')
133 | """
134 | if isinstance(key, str):
135 | return key + term
136 | elif isinstance(key, tuple):
137 | return key[:-1] + (suffix(key[-1], term),)
138 | else:
139 | return suffix(str(key), term)
140 |
141 |
142 | def extend(key, term):
143 |     """ Extend a key with another element, returning a tuple
144 | 
145 |     Works if the key is a string or a tuple
146 |
147 | >>> extend('x', '.dtype')
148 | ('x', '.dtype')
149 | >>> extend(('a', 'b', 'c'), '.dtype')
150 | ('a', 'b', 'c', '.dtype')
151 | """
152 | if isinstance(term, tuple):
153 | pass
154 | elif isinstance(term, str):
155 | term = (term,)
156 | else:
157 | term = (str(term),)
158 |
159 | if not isinstance(key, tuple):
160 | key = (key,)
161 |
162 | return key + term
163 |
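164 | # A quick sketch of the framing helpers above, kept as a comment: frame()
165 | # prefixes a payload with an 8-byte length (struct format 'Q'), so framed
166 | # chunks can be concatenated and split apart again without a separator:
167 | #
168 | #     data = frame(b'Hello') + frame(b'World')
169 | #     list(framesplit(data))   # [b'Hello', b'World']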
--------------------------------------------------------------------------------
/partd/zmq.py:
--------------------------------------------------------------------------------
1 | import zmq
2 | import logging
3 | from itertools import chain
4 | from bisect import bisect
5 | import socket
6 | from operator import add
7 | from time import sleep, time
8 | from toolz import accumulate, topk, pluck, merge, keymap
9 | import uuid
10 | from collections import defaultdict
11 | from contextlib import contextmanager, suppress
12 | from threading import Thread, Lock
13 | from datetime import datetime
14 | from multiprocessing import Process
15 | import traceback
16 | import sys
17 | from .dict import Dict
18 | from .file import File
19 | from .buffer import Buffer
20 | from . import core
21 |
22 |
23 | tuple_sep = b'-|-'
24 |
25 | logger = logging.getLogger(__name__)
26 |
27 |
28 | @contextmanager
29 | def logerrors():
30 | try:
31 | yield
32 | except Exception as e:
33 | logger.exception(e)
34 | raise
35 |
36 |
37 | class Server:
38 | def __init__(self, partd=None, bind=None, start=True, block=False,
39 | hostname=None):
40 | self.context = zmq.Context()
41 | if partd is None:
42 | partd = Buffer(Dict(), File())
43 | self.partd = partd
44 |
45 | self.socket = self.context.socket(zmq.ROUTER)
46 |
47 | if hostname is None:
48 | hostname = socket.gethostname()
49 | if isinstance(bind, str):
50 | bind = bind.encode()
51 | if bind is None:
52 | port = self.socket.bind_to_random_port('tcp://*')
53 | else:
54 | self.socket.bind(bind)
55 |             port = int(bind.split(b':')[-1].rstrip(b'/'))
56 | self.address = ('tcp://%s:%d' % (hostname, port)).encode()
57 |
58 | self.status = 'created'
59 |
60 | self.partd.lock.acquire()
61 | self._lock = Lock()
62 | self._socket_lock = Lock()
63 |
64 | if start:
65 | self.start()
66 |
67 | if block:
68 | self.block()
69 |
70 | def start(self):
71 | if self.status != 'run':
72 | self.status = 'run'
73 | self._listen_thread = Thread(target=self.listen)
74 | self._listen_thread.start()
75 | logger.debug('Start server at %s', self.address)
76 |
77 | def block(self):
78 | """ Block until all threads close """
79 | try:
80 | self._listen_thread.join()
81 | except AttributeError:
82 | pass
83 |
84 | def listen(self):
85 | with logerrors():
86 | logger.debug('Start listening %s', self.address)
87 | while self.status != 'closed':
88 | if not self.socket.poll(100):
89 | continue
90 |
91 | with self._socket_lock:
92 | payload = self.socket.recv_multipart()
93 |
94 | address, command, payload = payload[0], payload[1], payload[2:]
95 | logger.debug('Server receives %s %s', address, command)
96 | if command == b'close':
97 | logger.debug('Server closes')
98 | self.ack(address)
99 | self.status = 'closed'
100 | break
101 | # self.close()
102 |
103 | elif command == b'append':
104 | keys, values = payload[::2], payload[1::2]
105 | keys = list(map(deserialize_key, keys))
106 | data = dict(zip(keys, values))
107 | self.partd.append(data, lock=False)
108 | logger.debug('Server appends %d keys', len(data))
109 | self.ack(address)
110 |
111 | elif command == b'iset':
112 | key, value = payload
113 | key = deserialize_key(key)
114 | self.partd.iset(key, value, lock=False)
115 | self.ack(address)
116 |
117 | elif command == b'get':
118 | keys = list(map(deserialize_key, payload))
119 | logger.debug('get %s', keys)
120 | result = self.get(keys)
121 | self.send_to_client(address, result)
122 | self.ack(address, flow_control=False)
123 |
124 | elif command == b'delete':
125 | keys = list(map(deserialize_key, payload))
126 | logger.debug('delete %s', keys)
127 | self.partd.delete(keys, lock=False)
128 | self.ack(address, flow_control=False)
129 |
130 | elif command == b'syn':
131 | self.ack(address)
132 |
133 | elif command == b'drop':
134 | self.drop()
135 | self.ack(address)
136 |
137 | else:
138 | logger.debug("Unknown command: %s", command)
139 |                     raise ValueError("Unknown command: %s" % command)
140 |
141 | def send_to_client(self, address, result):
142 | with logerrors():
143 | if not isinstance(result, list):
144 | result = [result]
145 | with self._socket_lock:
146 | self.socket.send_multipart([address] + result)
147 |
148 | def ack(self, address, flow_control=True):
149 | with logerrors():
150 | logger.debug('Server sends ack')
151 | self.send_to_client(address, b'ack')
152 |
153 | def append(self, data):
154 | self.partd.append(data, lock=False)
155 | logger.debug('Server appends %d keys', len(data))
156 |
157 | def drop(self):
158 | with logerrors():
159 | self.partd.drop()
160 |
161 | def get(self, keys):
162 | with logerrors():
163 | logger.debug('Server gets keys: %s', keys)
164 | with self._lock:
165 | result = self.partd.get(keys, lock=False)
166 | return result
167 |
168 | def close(self):
169 | logger.debug('Server closes')
170 | self.status = 'closed'
171 | self.block()
172 | with suppress(zmq.error.ZMQError):
173 | self.socket.close(1)
174 | with suppress(zmq.error.ZMQError):
175 | self.context.destroy(3)
176 | self.partd.lock.release()
177 |
178 | def __enter__(self):
179 | self.start()
180 | return self
181 |
182 | def __exit__(self, *args):
183 | self.close()
184 | self.partd.__exit__(*args)
185 |
186 |
187 | def keys_to_flush(lengths, fraction=0.1, maxcount=100000):
188 | """ Which keys to remove
189 |
190 | >>> lengths = {'a': 20, 'b': 10, 'c': 15, 'd': 15,
191 | ... 'e': 10, 'f': 25, 'g': 5}
192 | >>> keys_to_flush(lengths, 0.5)
193 | ['f', 'a']
194 | """
195 | top = topk(max(len(lengths) // 2, 1),
196 | lengths.items(),
197 | key=1)
198 | total = sum(lengths.values())
199 | cutoff = min(maxcount, max(1,
200 | bisect(list(accumulate(add, pluck(1, top))),
201 | total * fraction)))
202 | result = [k for k, v in top[:cutoff]]
203 | assert result
204 | return result
205 |
206 |
207 | def serialize_key(key):
208 | """
209 |
210 | >>> serialize_key('x')
211 | b'x'
212 | >>> serialize_key(('a', 'b', 1))
213 | b'a-|-b-|-1'
214 | """
215 | if isinstance(key, tuple):
216 | return tuple_sep.join(map(serialize_key, key))
217 | if isinstance(key, bytes):
218 | return key
219 | if isinstance(key, str):
220 | return key.encode()
221 | return str(key).encode()
222 |
223 |
224 | def deserialize_key(text):
225 | """
226 |
227 | >>> deserialize_key(b'x')
228 | b'x'
229 | >>> deserialize_key(b'a-|-b-|-1')
230 | (b'a', b'b', b'1')
231 | """
232 | if tuple_sep in text:
233 | return tuple(text.split(tuple_sep))
234 | else:
235 | return text
236 |
237 |
238 | from .core import Interface
239 | from .file import File
240 |
241 |
242 | class Client(Interface):
243 | def __init__(self, address=None, create_server=False, **kwargs):
244 | self.address = address
245 | self.context = zmq.Context()
246 | self.socket = self.context.socket(zmq.DEALER)
247 | logger.debug('Client connects to %s', address)
248 | self.socket.connect(address)
249 | self.send(b'syn', [], ack_required=False)
250 | self.lock = NotALock() # Server sequentializes everything
251 | Interface.__init__(self)
252 |
253 | def __getstate__(self):
254 | return {'address': self.address}
255 |
256 | def __setstate__(self, state):
257 | self.__init__(state['address'])
258 | logger.debug('Reconstruct client from pickled state')
259 |
260 | def send(self, command, payload, recv=False, ack_required=True):
261 | if ack_required:
262 | ack = self.socket.recv_multipart()
263 | assert ack == [b'ack']
264 | logger.debug('Client sends command: %s', command)
265 | self.socket.send_multipart([command] + payload)
266 | if recv:
267 | result = self.socket.recv_multipart()
268 | else:
269 | result = None
270 | return result
271 |
272 | def _get(self, keys, lock=None):
273 |         """ Get values for a list of keys
274 | 
275 |         The lock argument is ignored; the server handles all requests sequentially.
276 |         """
277 | logger.debug('Client gets %s %s', self.address, keys)
278 | keys = list(map(serialize_key, keys))
279 | return self.send(b'get', keys, recv=True)
280 |
281 | def append(self, data, lock=None):
282 | logger.debug('Client appends %s %s', self.address, str(len(data)) + ' keys')
283 | data = keymap(serialize_key, data)
284 | payload = list(chain.from_iterable(data.items()))
285 | self.send(b'append', payload)
286 |
287 | def _delete(self, keys, lock=None):
288 | logger.debug('Client deletes %s %s', self.address, str(len(keys)) + ' keys')
289 | keys = list(map(serialize_key, keys))
290 | self.send(b'delete', keys)
291 |
292 | def _iset(self, key, value):
293 | self.send(b'iset', [serialize_key(key), value])
294 |
295 | def drop(self):
296 | self.send(b'drop', [])
297 | sleep(0.05)
298 |
299 | def close_server(self):
300 | self.send(b'close', [])
301 |
302 | def close(self):
303 | if hasattr(self, 'server_process'):
304 | with suppress(zmq.error.ZMQError):
305 | self.close_server()
306 | self.server_process.join()
307 | with suppress(zmq.error.ZMQError):
308 | self.socket.close(1)
309 | with suppress(zmq.error.ZMQError):
310 | self.context.destroy(1)
311 |
312 | def __exit__(self, type, value, traceback):
313 | self.drop()
314 | self.close()
315 |
316 | def __del__(self):
317 | self.close()
318 |
319 |
320 | class NotALock:
321 | def acquire(self): pass
322 | def release(self): pass
323 |
324 | def __enter__(self):
325 | return self
326 |
327 | def __exit__(self, *args):
328 | pass
329 |
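330 | # A minimal client/server sketch, kept as a comment (see
331 | # tests/test_zmq.py for working examples). The Server owns a ROUTER
332 | # socket and handles requests one at a time, which is why the Client can
333 | # get away with NotALock:
334 | #
335 | #     with Server() as server:
336 | #         with Client(server.address) as p:
337 | #             p.append({'x': b'Hello'})
338 | #             p.get('x')   # b'Hello'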
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.2", "versioneer[toml]==0.29"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "partd"
7 | description = "Appendable key-value storage"
8 | maintainers = [{name = "Matthew Rocklin", email = "mrocklin@gmail.com"}]
9 | license = {text = "BSD"}
10 | keywords = []
11 | classifiers = [
12 | "Programming Language :: Python :: 3",
13 | "Programming Language :: Python :: 3.9",
14 | "Programming Language :: Python :: 3.10",
15 | "Programming Language :: Python :: 3.11",
16 | "Programming Language :: Python :: 3.12",
17 | ]
18 | readme = "README.rst"
19 | urls = {Homepage = "http://github.com/dask/partd/"}
20 | requires-python = ">=3.9"
21 | dynamic = ["version"]
22 | dependencies = [
23 | "locket",
24 | "toolz",
25 | ]
26 |
27 | [project.optional-dependencies]
28 | complete = [
29 | "numpy >= 1.20.0",
30 | "pandas >=1.3",
31 | "pyzmq",
32 | "blosc",
33 | ]
34 |
35 | [tool.setuptools]
36 | packages = ["partd"]
37 | zip-safe = false
38 | include-package-data = false
39 |
40 | [tool.versioneer]
41 | VCS = "git"
42 | style = "pep440"
43 | versionfile_source = "partd/_version.py"
44 | versionfile_build = "partd/_version.py"
45 | tag_prefix = ""
46 | parentdir_prefix = "partd-"
47 |
48 | [tool.pytest.ini_options]
49 | addopts = "--strict-markers --strict-config"
50 | filterwarnings = ["error"]
51 |
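52 | # The optional features above can be installed with, for example:
53 | #   pip install "partd[complete]"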
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | locket
2 | toolz
3 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from __future__ import annotations
4 |
5 | import versioneer
6 | from setuptools import setup
7 |
8 | setup(
9 | version=versioneer.get_version(),
10 | cmdclass=versioneer.get_cmdclass(),
11 | )
12 |
--------------------------------------------------------------------------------