├── .gitignore ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.rst ├── castra │ ├── __init__.py │ ├── core.py │ └── tests │ └── test_core.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: False 2 | 3 | language: python 4 | 5 | matrix: 6 | include: 7 | - python: 2.7 8 | - python: 3.3 9 | - python: 3.4 10 | - python: 3.5 11 | 12 | install: 13 | # Install conda 14 | - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh 15 | - bash miniconda.sh -b -p $HOME/miniconda 16 | - export PATH="$HOME/miniconda/bin:$PATH" 17 | - conda config --set always_yes yes --set changeps1 no 18 | - conda update conda 19 | 20 | # Install dependencies 21 | - conda create -n castra python=$TRAVIS_PYTHON_VERSION pytest numpy pandas dask 22 | - source activate castra 23 | - pip install blosc 24 | - pip install bloscpack 25 | - pip install dask --upgrade 26 | 27 | script: 28 | - py.test -x --doctest-modules --pyargs castra 29 | 30 | notifications: 31 | email: false 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Continuum Analytics, Inc. 2 | Copyright (c) 2015, Valentin Haenel 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, 6 | are permitted provided that the following conditions are met: 7 | 8 | Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer.
10 | 11 | Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | Neither the name of Continuum Analytics nor the names of any contributors 16 | may be used to endorse or promote products derived from this software 17 | without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 29 | THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include castra *.py 2 | recursive-include docs *.rst 3 | 4 | include setup.py 5 | include README.rst 6 | include LICENSE.txt 7 | include requirements.txt 8 | include MANIFEST.in 9 | 10 | prune docs/_build 11 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Castra 2 | ====== 3 | 4 | |Build Status| 5 | 6 | Castra is an on-disk, partitioned, compressed, column store. 7 | Castra provides efficient columnar range queries. 
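The range-query speed comes from partition pruning: Castra records the span of index values held by each partition, so a slice only has to open the partitions it overlaps. A minimal standard-library sketch of that idea (illustrative only, not Castra's own code; the real routine is ``select_partitions`` in ``castra/core.py``):

```python
from bisect import bisect_left

def partitions_for_range(maxima, start, stop):
    """Indices of the partitions a [start, stop] slice has to read.

    ``maxima[i]`` is the largest index value stored in partition i,
    so only the few overlapping partitions are ever read from disk.
    """
    lo = bisect_left(maxima, start)   # first partition that may contain `start`
    hi = bisect_left(maxima, stop)    # last partition that may contain `stop`
    return list(range(lo, min(hi + 1, len(maxima))))

# Four partitions whose index values run up to 10, 20, 30 and 40:
print(partitions_for_range([10, 20, 30, 40], 15, 35))  # → [1, 2, 3]
```

Everything outside the selected partitions is skipped entirely, which is why range queries stay fast as the store grows.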
8 | 9 | * **Efficient on-disk:** Castra stores data on your hard drive in a form that loads quickly, making inconveniently large data more comfortable to work with. 10 | * **Partitioned:** Castra partitions your data along an index, allowing rapid loads of ranges of data, like "All records between January and March". 11 | * **Compressed:** Castra uses Blosc_ to compress data, increasing effective disk bandwidth and decreasing storage costs. 12 | * **Column-store:** Castra stores columns separately, drastically reducing I/O costs for analytic queries. 13 | * **Tabular data:** Castra plays well with Pandas and is an ideal fit for append-only applications like time-series. 14 | 15 | Maintenance 16 | ----------- 17 | 18 | This project is no longer actively maintained. Use at your own risk. 19 | 20 | Example 21 | ------- 22 | 23 | Consider some Pandas DataFrames: 24 | 25 | .. code-block:: python 26 | 27 | In [1]: import pandas as pd 28 | In [2]: A = pd.DataFrame({'price': [10.0, 11.0], 'volume': [100, 200]}, 29 | ...: index=pd.DatetimeIndex(['2010', '2011'])) 30 | 31 | In [3]: B = pd.DataFrame({'price': [12.0, 13.0], 'volume': [300, 400]}, 32 | ...: index=pd.DatetimeIndex(['2012', '2013'])) 33 | 34 | We create a Castra with a filename and a template dataframe from which to get 35 | column names, index, and dtype information: 36 | 37 | .. code-block:: python 38 | 39 | In [4]: from castra import Castra 40 | In [5]: c = Castra('data.castra', template=A) 41 | 42 | The castra starts empty, but we can extend it with new dataframes: 43 | 44 | .. code-block:: python 45 | 46 | In [6]: c.extend(A) 47 | 48 | In [7]: c[:] 49 | Out[7]: 50 | price volume 51 | 2010-01-01 10 100 52 | 2011-01-01 11 200 53 | 54 | In [8]: c.extend(B) 55 | 56 | In [9]: c[:] 57 | Out[9]: 58 | price volume 59 | 2010-01-01 10 100 60 | 2011-01-01 11 200 61 | 2012-01-01 12 300 62 | 2013-01-01 13 400 63 | 64 | We can select particular columns: 65 | 66 | ..
code-block:: python 67 | 68 | In [10]: c[:, 'price'] 69 | Out[10]: 70 | 2010-01-01 10 71 | 2011-01-01 11 72 | 2012-01-01 12 73 | 2013-01-01 13 74 | Name: price, dtype: float64 75 | 76 | Particular ranges: 77 | 78 | .. code-block:: python 79 | 80 | In [12]: c['2011':'2013'] 81 | Out[12]: 82 | price volume 83 | 2011-01-01 11 200 84 | 2012-01-01 12 300 85 | 2013-01-01 13 400 86 | 87 | Or both: 88 | 89 | .. code-block:: python 90 | 91 | In [13]: c['2011':'2013', 'volume'] 92 | Out[13]: 93 | 2011-01-01 200 94 | 2012-01-01 300 95 | 2013-01-01 400 96 | Name: volume, dtype: int64 97 | 98 | Storage 99 | ------- 100 | 101 | Castra stores your dataframes as they arrive; you can see the divisions along 102 | which your data is partitioned. 103 | 104 | .. code-block:: python 105 | 106 | In [14]: c.partitions 107 | Out[14]: 108 | 2011-01-01 2009-12-31T16:00:00.000000000-0800--2010-12-31... 109 | 2013-01-01 2011-12-31T16:00:00.000000000-0800--2012-12-31... 110 | dtype: object 111 | 112 | Each column in each partition lives in a separate compressed file:: 113 | 114 | $ ls -a data.castra/2011-12-31T16:00:00.000000000-0800--2012-12-31T16:00:00.000000000-0800 115 | . .. .index price volume 116 | 117 | Restrictions 118 | ------------ 119 | 120 | Castra is both fast and restrictive. 121 | 122 | * You must always give it dataframes that match its template (same column 123 | names, index type, dtypes). 124 | * You can only give castra dataframes with **increasing index values**. For 125 | example, you can give it one dataframe a day for values on that day. You 126 | cannot go back and update previous days. 127 | 128 | Text and Categoricals 129 | --------------------- 130 | 131 | Castra tries to encode text and object dtype columns with 132 | msgpack_, using the implementation found in 133 | the Pandas library. It falls back to ``pickle`` with a high protocol if that 134 | fails. 135 | 136 | Alternatively, Castra can categorize your data as it receives it: 137 | 138 | ..
code-block:: python 139 | 140 | >>> c = Castra('data.castra', template=df, categories=['list', 'of', 'columns']) 141 | 142 | or 143 | 144 | >>> c = Castra('data.castra', template=df, categories=True) # all object dtype columns 145 | 146 | Categorizing columns that have repetitive text, like ``'sex'`` or 147 | ``'ticker-symbol'``, can greatly improve both read times and computational 148 | performance with Pandas. See this blogpost_ for more information. 149 | 150 | .. _msgpack: http://msgpack.org/index.html 151 | 152 | 153 | Dask dataframe 154 | -------------- 155 | 156 | Castra interoperates smoothly with dask.dataframe_. 157 | 158 | .. code-block:: python 159 | 160 | >>> import dask.dataframe as dd 161 | >>> df = dd.read_csv('myfiles.*.csv') 162 | >>> df.set_index('timestamp', compute=False).to_castra('myfile.castra', categories=True) 163 | 164 | >>> df = dd.from_castra('myfile.castra') 165 | 166 | Work in Progress 167 | ---------------- 168 | 169 | Castra is immature and intended largely for experimental use. 170 | 171 | The developers do not promise backwards compatibility with future versions. 172 | You should treat castra as a very efficient temporary format and archive your 173 | data with some other system. 174 | 175 | 176 | 177 | .. _Blosc: https://github.com/Blosc 178 | 179 | .. _dask.dataframe: https://dask.pydata.org/en/latest/dataframe.html 180 | 181 | .. _blogpost: http://matthewrocklin.com/blog/work/2015/06/18/Categoricals 182 | 183 | ..
|Build Status| image:: https://travis-ci.org/blaze/castra.svg 184 | :target: https://travis-ci.org/blaze/castra 185 | -------------------------------------------------------------------------------- /castra/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import Castra 2 | 3 | __version__ = '0.1.8' 4 | -------------------------------------------------------------------------------- /castra/core.py: -------------------------------------------------------------------------------- 1 | from collections import Iterator 2 | 3 | import os 4 | 5 | from os.path import exists, isdir 6 | 7 | try: 8 | import cPickle as pickle 9 | except ImportError: 10 | import pickle 11 | 12 | import shutil 13 | import tempfile 14 | from hashlib import md5 15 | 16 | from functools import partial 17 | 18 | import blosc 19 | import bloscpack 20 | 21 | import numpy as np 22 | import pandas as pd 23 | 24 | from pandas import msgpack 25 | 26 | 27 | bp_args = bloscpack.BloscpackArgs(offsets=False, checksum='None') 28 | 29 | def blosc_args(dt): 30 | if np.issubdtype(dt, int): 31 | return bloscpack.BloscArgs(dt.itemsize, clevel=3, shuffle=True) 32 | if np.issubdtype(dt, np.datetime64): 33 | return bloscpack.BloscArgs(dt.itemsize, clevel=3, shuffle=True) 34 | if np.issubdtype(dt, float): 35 | return bloscpack.BloscArgs(dt.itemsize, clevel=1, shuffle=False) 36 | return None 37 | 38 | 39 | # http://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename-in-python 40 | import string 41 | valid_chars = "-_%s%s" % (string.ascii_letters, string.digits) 42 | 43 | def escape(text): 44 | """ 45 | 46 | >>> escape("Hello!") # Remove punctuation from names 47 | 'Hello' 48 | 49 | >>> escape("/!.") # completely invalid names produce hash string 50 | 'cb6698330c63e87fc35933a0474238b0' 51 | """ 52 | result = ''.join(c for c in str(text) if c in valid_chars) 53 | if not result: 54 | result = md5(str(text).encode()).hexdigest() 55 | return 
result 56 | 57 | 58 | def mkdir(path): 59 | if not exists(path): 60 | os.makedirs(path) 61 | 62 | 63 | class Castra(object): 64 | meta_fields = ['columns', 'dtypes', 'index_dtype', 'axis_names'] 65 | 66 | def __init__(self, path=None, template=None, categories=None, readonly=False): 67 | self._readonly = readonly 68 | # check if we should create a random path 69 | self._explicitly_given_path = path is not None 70 | 71 | if not self._explicitly_given_path: 72 | self.path = tempfile.mkdtemp(prefix='castra-') 73 | else: 74 | self.path = path 75 | 76 | # either we have a meta directory 77 | if isdir(self.dirname('meta')): 78 | if template is not None: 79 | raise ValueError( 80 | "Opening a castra with a template, yet this castra\n" 81 | "already exists. Filename: %s" % self.path) 82 | self.load_meta() 83 | self.load_partitions() 84 | self.load_categories() 85 | 86 | # or we don't, in which case we need a template 87 | elif template is not None: 88 | if self._readonly: 89 | raise ValueError("Can't create new castra in readonly mode") 90 | 91 | if isinstance(categories, (list, tuple)): 92 | if template.index.name in categories: 93 | categories.remove(template.index.name) 94 | categories.append('.index') 95 | self.categories = dict((col, []) for col in categories) 96 | elif categories is True: 97 | self.categories = dict((col, []) 98 | for col in template.columns 99 | if template.dtypes[col] == 'object') 100 | if isinstance(template.index, pd.CategoricalIndex): 101 | self.categories['.index'] = [] 102 | else: 103 | self.categories = dict() 104 | 105 | if self.categories: 106 | categories = set(self.categories) 107 | template_categories = set(template.dtypes.index.values) 108 | if categories.difference(template_categories) - set(['.index']): 109 | raise ValueError('passed in categories %s are not all ' 110 | 'contained in template dataframe columns ' 111 | '%s' % (categories, template_categories)) 112 | 113 | template2 = _decategorize(self.categories, template)[2] 114 | 115 |
self.columns, self.dtypes, self.index_dtype = \ 116 | list(template2.columns), template2.dtypes, template2.index.dtype 117 | self.axis_names = [template2.index.name, template2.columns.name] 118 | 119 | # If index is a RangeIndex, use Int64Index instead 120 | ind_type = type(template2.index) 121 | try: 122 | if isinstance(template2.index, pd.RangeIndex): 123 | ind_type = pd.Int64Index 124 | except AttributeError: 125 | pass 126 | self.partitions = pd.Series([], dtype='O', index=ind_type([])) 127 | self.minimum = None 128 | 129 | # check if the given path exists already and create it if it doesn't 130 | mkdir(self.path) 131 | 132 | # raise an Exception if it isn't a directory 133 | if not isdir(self.path): 134 | raise ValueError("'path': %s must be a directory" % self.path) 135 | 136 | mkdir(self.dirname('meta', 'categories')) 137 | self.flush_meta() 138 | self.save_partitions() 139 | else: 140 | raise ValueError( 141 | "must specify a 'template' when creating a new Castra") 142 | 143 | def _empty_dataframe(self): 144 | data = dict((n, pd.Series([], dtype=d, name=n)) 145 | for (n, d) in self.dtypes.iteritems()) 146 | index = pd.Index([], name=self.axis_names[0]) 147 | columns = pd.Index(self.columns, name=self.axis_names[1]) 148 | df = pd.DataFrame(data, columns=columns, index=index) 149 | return _categorize(self.categories, df) 150 | 151 | def load_meta(self, loads=pickle.loads): 152 | for name in self.meta_fields: 153 | with open(self.dirname('meta', name), 'rb') as f: 154 | setattr(self, name, loads(f.read())) 155 | 156 | def flush_meta(self, dumps=partial(pickle.dumps, protocol=2)): 157 | if self._readonly: 158 | raise IOError('File not open for writing') 159 | for name in self.meta_fields: 160 | with open(self.dirname('meta', name), 'wb') as f: 161 | f.write(dumps(getattr(self, name))) 162 | 163 | def load_partitions(self, loads=pickle.loads): 164 | with open(self.dirname('meta', 'plist'), 'rb') as f: 165 | self.partitions = loads(f.read()) 166 | with
open(self.dirname('meta', 'minimum'), 'rb') as f: 167 | self.minimum = loads(f.read()) 168 | 169 | def save_partitions(self, dumps=partial(pickle.dumps, protocol=2)): 170 | if self._readonly: 171 | raise IOError('File not open for writing') 172 | with open(self.dirname('meta', 'minimum'), 'wb') as f: 173 | f.write(dumps(self.minimum)) 174 | with open(self.dirname('meta', 'plist'), 'wb') as f: 175 | f.write(dumps(self.partitions)) 176 | 177 | def append_categories(self, new, dumps=partial(pickle.dumps, protocol=2)): 178 | if self._readonly: 179 | raise IOError('File not open for writing') 180 | separator = b'-sep-' 181 | for col, cat in new.items(): 182 | if cat: 183 | with open(self.dirname('meta', 'categories', col), 'ab') as f: 184 | f.write(separator.join(map(dumps, cat))) 185 | f.write(separator) 186 | 187 | def load_categories(self, loads=pickle.loads): 188 | separator = b'-sep-' 189 | self.categories = dict() 190 | for col in list(self.columns) + ['.index']: 191 | fn = self.dirname('meta', 'categories', col) 192 | if os.path.exists(fn): 193 | with open(fn, 'rb') as f: 194 | text = f.read() 195 | self.categories[col] = [loads(x) 196 | for x in text.split(separator)[:-1]] 197 | 198 | def extend(self, df): 199 | if self._readonly: 200 | raise IOError('File not open for writing') 201 | if len(df) == 0: 202 | return 203 | # TODO: Ensure that df is consistent with existing data 204 | if not df.index.is_monotonic_increasing: 205 | df = df.sort_index(inplace=False) 206 | 207 | new_categories, self.categories, df = _decategorize(self.categories, 208 | df) 209 | self.append_categories(new_categories) 210 | 211 | if len(self.partitions) and df.index[0] <= self.partitions.index[-1]: 212 | if is_trivial_index(df.index): 213 | df = df.copy() 214 | start = self.partitions.index[-1] + 1 215 | new_index = pd.Index(np.arange(start, start + len(df)), 216 | name = df.index.name) 217 | df.index = new_index 218 | else: 219 | raise ValueError("Index of new dataframe less than known 
data") 220 | 221 | index = df.index.values 222 | partition_name = '--'.join([escape(index.min()), escape(index.max())]) 223 | 224 | mkdir(self.dirname(partition_name)) 225 | 226 | # Store columns 227 | for col in df.columns: 228 | pack_file(df[col].values, self.dirname(partition_name, col)) 229 | 230 | # Store index 231 | fn = self.dirname(partition_name, '.index') 232 | bloscpack.pack_ndarray_file(index, fn, bloscpack_args=bp_args, 233 | blosc_args=blosc_args(index.dtype)) 234 | 235 | if not len(self.partitions): 236 | self.minimum = coerce_index(index.dtype, index.min()) 237 | self.partitions.loc[index.max()] = partition_name 238 | self.flush() 239 | 240 | def extend_sequence(self, seq, freq=None): 241 | """Add dataframes from an iterable, optionally repartitioning by freq. 242 | 243 | Parameters 244 | ---------- 245 | seq : iterable 246 | An iterable of dataframes 247 | freq : frequency, optional 248 | A pandas datetime offset. If provided, the dataframes will be 249 | partitioned by this frequency. 
250 | """ 251 | if self._readonly: 252 | raise IOError('File not open for writing') 253 | if isinstance(freq, str): 254 | freq = pd.datetools.to_offset(freq) 255 | partitioner = lambda buf, df: partitionby_freq(freq, buf, df) 256 | elif freq is None: 257 | partitioner = partitionby_none 258 | else: 259 | raise ValueError("Invalid 'freq': {0}".format(repr(freq))) 260 | buf = self._empty_dataframe() 261 | for df in seq: 262 | write, buf = partitioner(buf, df) 263 | for frame in write: 264 | self.extend(frame) 265 | if buf is not None and not buf.empty: 266 | self.extend(buf) 267 | 268 | def dirname(self, *args): 269 | return os.path.join(self.path, *list(map(escape, args))) 270 | 271 | def load_partition(self, name, columns, categorize=True): 272 | if isinstance(columns, Iterator): 273 | columns = list(columns) 274 | if '.index' in self.categories and name in self.partitions.index: 275 | name = self.categories['.index'].index(name) - 1 276 | if not isinstance(columns, list): 277 | df = self.load_partition(name, [columns], categorize=categorize) 278 | return df.iloc[:, 0] 279 | arrays = [unpack_file(self.dirname(name, col)) for col in columns] 280 | 281 | df = pd.DataFrame(dict(zip(columns, arrays)), 282 | columns=pd.Index(columns, name=self.axis_names[1], 283 | tupleize_cols=False), 284 | index=self.load_index(name)) 285 | if categorize: 286 | df = _categorize(self.categories, df) 287 | return df 288 | 289 | def load_index(self, name): 290 | return pd.Index(unpack_file(self.dirname(name, '.index')), 291 | dtype=self.index_dtype, 292 | name=self.axis_names[0], 293 | tupleize_cols=False) 294 | 295 | def __getitem__(self, key): 296 | if isinstance(key, tuple): 297 | key, columns = key 298 | else: 299 | columns = self.columns 300 | if isinstance(columns, slice): 301 | columns = self.columns[columns] 302 | 303 | if isinstance(key, slice): 304 | start, stop = key.start, key.stop 305 | else: 306 | start, stop = key, key 307 | 308 | if '.index' in self.categories: 309 | if 
start is not None: 310 | start = self.categories['.index'].index(start) 311 | if stop is not None: 312 | stop = self.categories['.index'].index(stop) 313 | key = slice(start, stop) 314 | 315 | names = select_partitions(self.partitions, key) 316 | 317 | if not names: 318 | return self._empty_dataframe()[columns] 319 | 320 | data_frames = [self.load_partition(name, columns, categorize=False) 321 | for name in names] 322 | 323 | data_frames[0] = data_frames[0].loc[start:] 324 | data_frames[-1] = data_frames[-1].loc[:stop] 325 | df = pd.concat(data_frames) 326 | df = _categorize(self.categories, df) 327 | return df 328 | 329 | def drop(self): 330 | if self._readonly: 331 | raise IOError('File not open for writing') 332 | if os.path.exists(self.path): 333 | shutil.rmtree(self.path) 334 | 335 | def flush(self): 336 | if self._readonly: 337 | raise IOError('File not open for writing') 338 | self.save_partitions() 339 | 340 | def __enter__(self): 341 | return self 342 | 343 | def __exit__(self, *args): 344 | if not self._explicitly_given_path: 345 | self.drop() 346 | elif not self._readonly: 347 | self.flush() 348 | 349 | __del__ = __exit__ 350 | 351 | def __getstate__(self): 352 | if not self._readonly: 353 | self.flush() 354 | return (self.path, self._explicitly_given_path, self._readonly) 355 | 356 | def __setstate__(self, state): 357 | self.path = state[0] 358 | self._explicitly_given_path = state[1] 359 | self._readonly = state[2] 360 | self.load_meta() 361 | self.load_partitions() 362 | self.load_categories() 363 | 364 | def to_dask(self, columns=None): 365 | import dask.dataframe as dd 366 | 367 | meta = self._empty_dataframe() 368 | if columns is None: 369 | columns = self.columns 370 | else: 371 | meta = meta[columns] 372 | 373 | token = md5(str((self.path, os.path.getmtime(self.path))).encode()).hexdigest() 374 | name = 'from-castra-' + token 375 | 376 | divisions = [self.minimum] + self.partitions.index.tolist() 377 | if '.index' in self.categories: 378 | 
divisions = ([self.categories['.index'][0]] 379 | + [self.categories['.index'][d + 1] for d in divisions[1:-1]] 380 | + [self.categories['.index'][-1]]) 381 | 382 | key_parts = list(enumerate(self.partitions.values)) 383 | 384 | dsk = dict(((name, i), (Castra.load_partition, self, part, columns)) 385 | for i, part in key_parts) 386 | if isinstance(columns, list): 387 | return dd.DataFrame(dsk, name, meta, divisions) 388 | else: 389 | return dd.Series(dsk, name, meta, divisions) 390 | 391 | 392 | def pack_file(x, fn, encoding='utf8'): 393 | """ Pack numpy array into filename 394 | 395 | Supports binary data with bloscpack and text data with msgpack+blosc 396 | 397 | >>> pack_file(np.array([1, 2, 3]), 'foo.blp') # doctest: +SKIP 398 | 399 | See also: 400 | unpack_file 401 | """ 402 | if x.dtype != 'O': 403 | bloscpack.pack_ndarray_file(x, fn, bloscpack_args=bp_args, 404 | blosc_args=blosc_args(x.dtype)) 405 | else: 406 | bytes = blosc.compress(msgpack.packb(x.tolist(), encoding=encoding), 1) 407 | with open(fn, 'wb') as f: 408 | f.write(bytes) 409 | 410 | 411 | def unpack_file(fn, encoding='utf8'): 412 | """ Unpack numpy array from filename 413 | 414 | Supports binary data with bloscpack and text data with msgpack+blosc 415 | 416 | >>> unpack_file('foo.blp') # doctest: +SKIP 417 | array([1, 2, 3]) 418 | 419 | See also: 420 | pack_file 421 | """ 422 | try: 423 | return bloscpack.unpack_ndarray_file(fn) 424 | except ValueError: 425 | with open(fn, 'rb') as f: 426 | data = msgpack.unpackb(blosc.decompress(f.read()), 427 | encoding=encoding) 428 | return np.array(data, object, copy=False) 429 | 430 | 431 | def coerce_index(dt, o): 432 | if np.issubdtype(dt, np.datetime64): 433 | return pd.Timestamp(o) 434 | return o 435 | 436 | 437 | def select_partitions(partitions, key): 438 | """ Select partitions from partition list given slice 439 | 440 | >>> p = pd.Series(['a', 'b', 'c', 'd', 'e'], index=[0, 10, 20, 30, 40]) 441 | >>> select_partitions(p, slice(3, 25)) 442 | ['b', 
'c', 'd'] 443 | """ 444 | assert key.step is None, 'step must be None but was %s' % key.step 445 | start, stop = key.start, key.stop 446 | if start is not None: 447 | start = coerce_index(partitions.index.dtype, start) 448 | istart = partitions.index.searchsorted(start) 449 | else: 450 | istart = 0 451 | if stop is not None: 452 | stop = coerce_index(partitions.index.dtype, stop) 453 | istop = partitions.index.searchsorted(stop) 454 | else: 455 | istop = len(partitions) - 1 456 | 457 | names = partitions.iloc[istart: istop + 1].values.tolist() 458 | return names 459 | 460 | 461 | def _decategorize(categories, df): 462 | """ Strip object dtypes from dataframe, update categories 463 | 464 | Given a DataFrame 465 | 466 | >>> df = pd.DataFrame({'x': [1, 2, 3], 'y': ['C', 'B', 'B']}) 467 | 468 | And a dict of known categories 469 | 470 | >>> _ = categories = {'y': ['A', 'B']} 471 | 472 | Update dict and dataframe in place 473 | 474 | >>> extra, categories, df = _decategorize(categories, df) 475 | >>> extra 476 | {'y': ['C']} 477 | >>> categories 478 | {'y': ['A', 'B', 'C']} 479 | >>> df 480 | x y 481 | 0 1 2 482 | 1 2 1 483 | 2 3 1 484 | """ 485 | extra = dict() 486 | new_categories = dict() 487 | new_columns = dict((col, df[col].values) for col in df.columns) 488 | for col, cat in categories.items(): 489 | if col == '.index' or col not in df.columns: 490 | continue 491 | idx = pd.Index(df[col]) 492 | idx = getattr(idx, 'categories', idx) 493 | ex = idx[~idx.isin(cat)].unique() 494 | if any(pd.isnull(c) for c in cat): 495 | ex = ex[~pd.isnull(ex)] 496 | extra[col] = ex.tolist() 497 | new_categories[col] = cat + extra[col] 498 | new_columns[col] = pd.Categorical(df[col].values, new_categories[col]).codes 499 | 500 | if '.index' in categories: 501 | cat = categories['.index'] 502 | idx = getattr(df.index, 'categories', df.index) 503 | ex = idx[~idx.isin(cat)].unique() 504 | if any(pd.isnull(c) for c in cat): 505 | ex = ex[~pd.isnull(ex)] 506 | extra['.index'] = ex.tolist() 507 |
new_categories['.index'] = cat + extra['.index'] 508 | 509 | new_index = pd.Categorical(df.index, new_categories['.index']).codes 510 | new_index = pd.Index(new_index, name=df.index.name) 511 | else: 512 | new_index = df.index 513 | 514 | new_df = pd.DataFrame(new_columns, columns=df.columns, index=new_index) 515 | return extra, new_categories, new_df 516 | 517 | 518 | def make_categorical(s, categories): 519 | name = '.index' if isinstance(s, pd.Index) else s.name 520 | if name in categories: 521 | idx = pd.Index(categories[name], tupleize_cols=False, dtype='object') 522 | idx.is_unique = True 523 | cat = pd.Categorical(s.values, categories=idx, fastpath=True, ordered=False) 524 | return pd.CategoricalIndex(cat, name=s.name, ordered=True) if name == '.index' else cat 525 | return s if name == '.index' else s.values 526 | 527 | 528 | 529 | def _categorize(categories, df): 530 | """ Categorize columns in dataframe 531 | 532 | >>> df = pd.DataFrame({'x': [1, 2, 3], 'y': [0, 2, 0]}) 533 | >>> categories = {'y': ['A', 'B', 'c']} 534 | >>> _categorize(categories, df) 535 | x y 536 | 0 1 A 537 | 1 2 c 538 | 2 3 A 539 | """ 540 | if isinstance(df, pd.Series): 541 | return pd.Series(make_categorical(df, categories), 542 | index=make_categorical(df.index, categories), 543 | name=df.name) 544 | else: 545 | return pd.DataFrame(dict((col, make_categorical(df[col], categories)) 546 | for col in df.columns), 547 | columns=df.columns, 548 | index=make_categorical(df.index, categories)) 549 | 550 | 551 | def partitionby_none(buf, new): 552 | """Repartition to ensure partitions don't split duplicate indices""" 553 | if new.empty: 554 | return [], buf 555 | elif buf.empty: 556 | return [], new 557 | if not new.index.is_monotonic_increasing: 558 | new = new.sort_index(inplace=False) 559 | end = buf.index[-1] 560 | if end >= new.index[0] and not is_trivial_index(new.index): 561 | i = new.index.searchsorted(end, side='right') 562 | # Only need to concat, `castra.extend` will resort if 
needed 563 | buf = pd.concat([buf, new.iloc[:i]]) 564 | new = new.iloc[i:] 565 | return [buf], new 566 | 567 | 568 | def partitionby_freq(freq, buf, new): 569 | """Partition frames into blocks by a freq""" 570 | df = pd.concat([buf, new]) 571 | if not df.index.is_monotonic_increasing: 572 | df = df.sort_index(inplace=False) 573 | start, end = pd.tseries.resample._get_range_edges(df.index[0], 574 | df.index[-1], freq) 575 | inds = [df.index.searchsorted(i) for i in 576 | pd.date_range(start, end, freq=freq)[1:]] 577 | slices = [(inds[i-1], inds[i]) if i else (0, inds[i]) for i in 578 | range(len(inds))] 579 | frames = [df.iloc[i:j] for (i, j) in slices] 580 | return frames[:-1], frames[-1] 581 | 582 | 583 | def is_trivial_index(ind): 584 | """ Is this index just 0..n ? 585 | 586 | If so then we can probably ignore or change it around as necessary 587 | 588 | >>> is_trivial_index(pd.Index([0, 1, 2])) 589 | True 590 | 591 | >>> is_trivial_index(pd.Index([0, 3, 5])) 592 | False 593 | """ 594 | return ind[0] == 0 and (ind == np.arange(len(ind))).all() 595 | -------------------------------------------------------------------------------- /castra/tests/test_core.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import pickle 4 | import shutil 5 | 6 | import pandas as pd 7 | import pandas.util.testing as tm 8 | 9 | import pytest 10 | 11 | import numpy as np 12 | 13 | from castra import Castra 14 | from castra.core import mkdir, select_partitions, _decategorize, _categorize 15 | 16 | 17 | A = pd.DataFrame({'x': [1, 2], 18 | 'y': [1., 2.]}, 19 | columns=['x', 'y'], 20 | index=[1, 2]) 21 | 22 | B = pd.DataFrame({'x': [10, 20], 23 | 'y': [10., 20.]}, 24 | columns=['x', 'y'], 25 | index=[10, 20]) 26 | 27 | 28 | C = pd.DataFrame({'x': [10, 20], 29 | 'y': [10., 20.], 30 | 'z': [0, 1]}, 31 | columns=['x', 'y', 'z']).set_index('z') 32 | C.columns.name = 'cols' 33 | 34 | 35 | @pytest.yield_fixture 36 | def 
base(): 37 | d = tempfile.mkdtemp(prefix='castra-') 38 | try: 39 | yield d 40 | finally: 41 | shutil.rmtree(d) 42 | 43 | 44 | def test_safe_mkdir_with_new(base): 45 | path = os.path.join(base, 'db') 46 | mkdir(path) 47 | assert os.path.exists(path) 48 | assert os.path.isdir(path) 49 | 50 | 51 | def test_safe_mkdir_with_existing(base): 52 | # an existing path should not raise an exception 53 | mkdir(base) 54 | 55 | 56 | def test_create_with_random_directory(): 57 | Castra(template=A) 58 | 59 | 60 | def test_create_with_non_existing_path(base): 61 | path = os.path.join(base, 'db') 62 | Castra(path=path, template=A) 63 | 64 | 65 | def test_create_with_existing_path(base): 66 | Castra(path=base, template=A) 67 | 68 | 69 | def test_get_empty(base): 70 | df = Castra(path=base, template=A)[:] 71 | assert (df.columns == A.columns).all() 72 | 73 | 74 | def test_get_empty_result(base): 75 | c = Castra(path=base, template=A) 76 | c.extend(A) 77 | 78 | df = c[100:200] 79 | 80 | assert (df.columns == A.columns).all() 81 | 82 | 83 | def test_get_slice(base): 84 | c = Castra(path=base, template=A) 85 | c.extend(A) 86 | 87 | tm.assert_frame_equal(c[:], c[:, :]) 88 | tm.assert_frame_equal(c[:, 1:], c[:][['y']]) 89 | 90 | 91 | def test_exception_with_non_dir(base): 92 | file_ = os.path.join(base, 'file') 93 | with open(file_, 'w') as f: 94 | f.write('file') 95 | with pytest.raises(ValueError): 96 | Castra(file_) 97 | 98 | 99 | def test_exception_with_existing_castra_and_template(base): 100 | with Castra(path=base, template=A) as c: 101 | c.extend(A) 102 | with pytest.raises(ValueError): 103 | Castra(path=base, template=A) 104 | 105 | 106 | def test_exception_with_empty_dir_and_no_template(base): 107 | with pytest.raises(ValueError): 108 | Castra(path=base) 109 | 110 | 111 | def test_load(base): 112 | with Castra(path=base, template=A) as c: 113 | c.extend(A) 114 | c.extend(B) 115 | 116 | loaded = Castra(path=base) 117 | tm.assert_frame_equal(pd.concat([A, B]), loaded[:]) 118 | 119 | 
120 | def test_del_with_random_dir(): 121 | c = Castra(template=A) 122 | assert os.path.exists(c.path) 123 | c.__del__() 124 | assert not os.path.exists(c.path) 125 | 126 | 127 | def test_context_manager_with_random_dir(): 128 | with Castra(template=A) as c: 129 | assert os.path.exists(c.path) 130 | assert not os.path.exists(c.path) 131 | 132 | 133 | def test_context_manager_with_specific_dir(base): 134 | with Castra(path=base, template=A) as c: 135 | assert os.path.exists(c.path) 136 | assert os.path.exists(c.path) 137 | 138 | 139 | def test_timeseries(): 140 | indices = [pd.DatetimeIndex(start=str(i), end=str(i+1), freq='w') 141 | for i in range(2000, 2015)] 142 | dfs = [pd.DataFrame({'x': list(range(len(ind)))}, ind).iloc[:-1] 143 | for ind in indices] 144 | 145 | with Castra(template=dfs[0]) as c: 146 | for df in dfs: 147 | c.extend(df) 148 | df = c['2010-05': '2013-02'] 149 | assert len(df) > 100 150 | 151 | 152 | def test_Castra(): 153 | c = Castra(template=A) 154 | c.extend(A) 155 | c.extend(B) 156 | 157 | assert c.columns == ['x', 'y'] 158 | 159 | tm.assert_frame_equal(c[0:100], pd.concat([A, B])) 160 | tm.assert_frame_equal(c[:5], A) 161 | tm.assert_frame_equal(c[5:], B) 162 | 163 | tm.assert_frame_equal(c[2:5], A[1:]) 164 | tm.assert_frame_equal(c[2:15], pd.concat([A[1:], B[:1]])) 165 | 166 | 167 | def test_pickle_Castra(): 168 | path = tempfile.mkdtemp(prefix='castra-') 169 | c = Castra(path=path, template=A) 170 | c.extend(A) 171 | c.extend(B) 172 | 173 | dumped = pickle.dumps(c) 174 | undumped = pickle.loads(dumped) 175 | 176 | tm.assert_frame_equal(pd.concat([A, B]), undumped[:]) 177 | 178 | 179 | def test_text(): 180 | df = pd.DataFrame({'name': ['Alice', 'Bob'], 181 | 'balance': [100, 200]}, columns=['name', 'balance']) 182 | with Castra(template=df) as c: 183 | c.extend(df) 184 | 185 | tm.assert_frame_equal(c[:], df) 186 | 187 | 188 | def test_column_access(): 189 | with Castra(template=A) as c: 190 | c.extend(A) 191 | c.extend(B) 192 | df = c[:, 
['x']]
193 | 
194 |         tm.assert_frame_equal(df, pd.concat([A[['x']], B[['x']]]))
195 | 
196 |         df = c[:, 'x']
197 |         tm.assert_series_equal(df, pd.concat([A.x, B.x]))
198 | 
199 | 
200 | def test_reload():
201 |     path = tempfile.mkdtemp(prefix='castra-')
202 |     try:
203 |         c = Castra(template=A, path=path)
204 |         c.extend(A)
205 | 
206 |         d = Castra(path=path)
207 | 
208 |         assert c.columns == d.columns
209 |         assert (c.partitions == d.partitions).all()
210 |         assert c.minimum == d.minimum
211 |     finally:
212 |         shutil.rmtree(path)
213 | 
214 | 
215 | def test_readonly():
216 |     path = tempfile.mkdtemp(prefix='castra-')
217 |     try:
218 |         c = Castra(path=path, template=A)
219 |         c.extend(A)
220 |         d = Castra(path=path, readonly=True)
221 |         with pytest.raises(IOError):
222 |             d.extend(B)
223 |         with pytest.raises(IOError):
224 |             d.extend_sequence([B])
225 |         with pytest.raises(IOError):
226 |             d.flush()
227 |         with pytest.raises(IOError):
228 |             d.drop()
229 |         with pytest.raises(IOError):
230 |             d.save_partitions()
231 |         with pytest.raises(IOError):
232 |             d.flush_meta()
233 |         assert c.columns == d.columns
234 |         assert (c.partitions == d.partitions).all()
235 |         assert c.minimum == d.minimum
236 |     finally:
237 |         shutil.rmtree(path)
238 | 
239 | 
240 | def test_index_dtype_matches_template():
241 |     with Castra(template=A) as c:
242 |         assert c.partitions.index.dtype == A.index.dtype
243 | 
244 | 
245 | def test_to_dask_dataframe():
246 |     pytest.importorskip('dask.dataframe')
247 | 
248 |     # importorskip already skips this test when dask.dataframe is
249 |     # unavailable, so the plain import below cannot fail with an
250 |     # ImportError at test time
251 |     import dask.dataframe as dd
252 | 
253 |     with Castra(template=A) as c:
254 |         c.extend(A)
255 |         c.extend(B)
256 | 
257 |         df = c.to_dask()
258 |         assert isinstance(df, dd.DataFrame)
259 |         assert list(df.divisions) == [1, 2, 20]
260 |         tm.assert_frame_equal(df.compute(), c[:])
261 | 
262 |         df = c.to_dask('x')
263 |         assert isinstance(df, dd.Series)
264 |         assert list(df.divisions) == [1, 2, 20]
265 |         tm.assert_series_equal(df.compute(), c[:, 'x'])
266 | 
267 | 
268 | def
test_categorize(): 269 | A = pd.DataFrame({'x': [1, 2, 3], 'y': ['A', None, 'A']}, 270 | columns=['x', 'y'], index=[0, 10, 20]) 271 | B = pd.DataFrame({'x': [4, 5, 6], 'y': ['C', None, 'A']}, 272 | columns=['x', 'y'], index=[30, 40, 50]) 273 | 274 | with Castra(template=A, categories=['y']) as c: 275 | c.extend(A) 276 | assert c[:].dtypes['y'] == 'category' 277 | assert c[:]['y'].cat.codes.dtype == np.dtype('i1') 278 | assert list(c[:, 'y'].cat.categories) == ['A', None] 279 | 280 | c.extend(B) 281 | assert list(c[:, 'y'].cat.categories) == ['A', None, 'C'] 282 | 283 | assert c.load_partition(c.partitions.iloc[0], 'y').dtype == 'category' 284 | 285 | c.flush() 286 | 287 | d = Castra(path=c.path) 288 | tm.assert_frame_equal(c[:], d[:]) 289 | 290 | 291 | def test_save_axis_names(): 292 | with Castra(template=C) as c: 293 | c.extend(C) 294 | assert c[:].index.name == 'z' 295 | assert c[:].columns.name == 'cols' 296 | tm.assert_frame_equal(c[:], C) 297 | 298 | 299 | def test_same_categories_when_already_categorized(): 300 | A = pd.DataFrame({'x': [1, 2] * 1000, 301 | 'y': [1., 2.] * 1000, 302 | 'z': np.random.choice(list('abc'), size=2000)}, 303 | columns=list('xyz')) 304 | A['z'] = A.z.astype('category') 305 | with Castra(template=A, categories=['z']) as c: 306 | c.extend(A) 307 | assert c.categories['z'] == A.z.cat.categories.tolist() 308 | 309 | 310 | def test_category_dtype(): 311 | A = pd.DataFrame({'x': [1, 2] * 3, 312 | 'y': [1., 2.] * 3, 313 | 'z': list('abcabc')}, 314 | columns=list('xyz')) 315 | with Castra(template=A, categories=['z']) as c: 316 | c.extend(A) 317 | assert A.dtypes['z'] == 'object' 318 | 319 | 320 | def test_do_not_create_dirs_if_template_fails(): 321 | A = pd.DataFrame({'x': [1, 2] * 3, 322 | 'y': [1., 2.] 
* 3, 323 | 'z': list('abcabc')}, 324 | columns=list('xyz')) 325 | with pytest.raises(ValueError): 326 | Castra(template=A, path='foo', categories=['w']) 327 | assert not os.path.exists('foo') 328 | 329 | 330 | def test_sort_on_extend(): 331 | df = pd.DataFrame({'x': [1, 2, 3]}, index=[3, 2, 1]) 332 | expected = pd.DataFrame({'x': [3, 2, 1]}, index=[1, 2, 3]) 333 | with Castra(template=df) as c: 334 | c.extend(df) 335 | tm.assert_frame_equal(c[:], expected) 336 | 337 | 338 | def test_select_partitions(): 339 | p = pd.Series(['a', 'b', 'c', 'd', 'e'], index=[0, 10, 20, 30, 40]) 340 | assert select_partitions(p, slice(3, 25)) == ['b', 'c', 'd'] 341 | assert select_partitions(p, slice(None, 25)) == ['a', 'b', 'c', 'd'] 342 | assert select_partitions(p, slice(3, None)) == ['b', 'c', 'd', 'e'] 343 | assert select_partitions(p, slice(None, None)) == ['a', 'b', 'c', 'd', 'e'] 344 | assert select_partitions(p, slice(10, 30)) == ['b', 'c', 'd'] 345 | 346 | 347 | def test_first_index_is_timestamp(): 348 | pytest.importorskip('dask.dataframe') 349 | 350 | df = pd.DataFrame({'x': [1, 2] * 3, 351 | 'y': [1., 2.] 
* 3, 352 | 'z': list('abcabc')}, 353 | columns=list('xyz'), 354 | index=pd.date_range(start='20120101', periods=6)) 355 | with Castra(template=df) as c: 356 | c.extend(df) 357 | 358 | assert isinstance(c.minimum, pd.Timestamp) 359 | assert isinstance(c.to_dask().divisions[0], pd.Timestamp) 360 | 361 | 362 | def test_minimum_dtype(): 363 | df = tm.makeTimeDataFrame() 364 | 365 | with Castra(template=df) as c: 366 | c.extend(df) 367 | assert type(c.minimum) == type(c.partitions.index[0]) 368 | 369 | 370 | def test_many_default_indexes(): 371 | a = pd.DataFrame({'x': [1, 2, 3]}) 372 | b = pd.DataFrame({'x': [4, 5, 6]}) 373 | c = pd.DataFrame({'x': [7, 8, 9]}) 374 | 375 | e = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6, 7, 8, 9]}) 376 | 377 | with Castra(template=a) as C: 378 | C.extend(a) 379 | C.extend(b) 380 | C.extend(c) 381 | 382 | tm.assert_frame_equal(C[:], e) 383 | 384 | 385 | def test_raise_error_on_mismatched_index(): 386 | x = pd.DataFrame({'x': [1, 2, 3]}, index=[1, 2, 3]) 387 | y = pd.DataFrame({'x': [1, 2, 3]}, index=[4, 5, 6]) 388 | z = pd.DataFrame({'x': [4, 5, 6]}, index=[5, 6, 7]) 389 | 390 | with Castra(template=x) as c: 391 | c.extend(x) 392 | c.extend(y) 393 | 394 | with pytest.raises(ValueError): 395 | c.extend(z) 396 | 397 | 398 | def test_raise_error_on_equal_index(): 399 | a = pd.DataFrame({'x': [1, 2, 3]}, index=[1, 2, 3]) 400 | b = pd.DataFrame({'x': [4, 5, 6]}, index=[3, 4, 5]) 401 | 402 | with Castra(template=a) as c: 403 | c.extend(a) 404 | 405 | with pytest.raises(ValueError): 406 | c.extend(b) 407 | 408 | 409 | def test_categories_nan(): 410 | a = pd.DataFrame({'x': ['A', np.nan]}) 411 | b = pd.DataFrame({'x': ['B', np.nan]}) 412 | 413 | with Castra(template=a, categories=['x']) as c: 414 | c.extend(a) 415 | c.extend(b) 416 | assert len(c.categories['x']) == 3 417 | 418 | 419 | def test_extend_sequence_freq(): 420 | df = pd.util.testing.makeTimeDataFrame(1000, 'min') 421 | seq = [df.iloc[i:i+100] for i in range(0,1000,100)] 422 | with 
Castra(template=df) as c: 423 | c.extend_sequence(seq, freq='h') 424 | tm.assert_frame_equal(c[:], df) 425 | parts = pd.date_range(start=df.index[59], freq='h', 426 | periods=16).insert(17, df.index[-1]) 427 | tm.assert_index_equal(c.partitions.index, parts) 428 | 429 | with Castra(template=df) as c: 430 | c.extend_sequence(seq, freq='d') 431 | tm.assert_frame_equal(c[:], df) 432 | assert len(c.partitions) == 1 433 | 434 | 435 | def test_extend_sequence_none(): 436 | data = {'a': range(5), 'b': range(5)} 437 | p1 = pd.DataFrame(data, index=[1, 2, 3, 4, 5]) 438 | p2 = pd.DataFrame(data, index=[5, 5, 5, 6, 7]) 439 | p3 = pd.DataFrame(data, index=[7, 9, 10, 11, 12]) 440 | seq = [p1, p2, p3] 441 | df = pd.concat(seq) 442 | with Castra(template=df) as c: 443 | c.extend_sequence(seq) 444 | tm.assert_frame_equal(c[:], df) 445 | assert len(c.partitions) == 3 446 | assert len(c.load_partition('1--5', ['a', 'b']).index) == 8 447 | assert len(c.load_partition('6--7', ['a', 'b']).index) == 3 448 | assert len(c.load_partition('9--12', ['a', 'b']).index) == 4 449 | 450 | 451 | def test_extend_sequence_overlap(): 452 | df = pd.util.testing.makeTimeDataFrame(20, 'min') 453 | p1 = df.iloc[:15] 454 | p2 = df.iloc[10:20] 455 | seq = [p1,p2] 456 | df = pd.concat(seq) 457 | with Castra(template=df) as c: 458 | c.extend_sequence(seq) 459 | tm.assert_frame_equal(c[:], df.sort_index()) 460 | assert (c.partitions.index == [p.index[-1] for p in seq]).all() 461 | # Check with trivial index 462 | p1 = pd.DataFrame({'a': range(10), 'b': range(10)}) 463 | p2 = pd.DataFrame({'a': range(10, 17), 'b': range(10, 17)}) 464 | seq = [p1,p2] 465 | df = pd.DataFrame({'a': range(17), 'b': range(17)}) 466 | with Castra(template=df) as c: 467 | c.extend_sequence(seq) 468 | tm.assert_frame_equal(c[:], df) 469 | assert (c.partitions.index == [9, 16]).all() 470 | 471 | 472 | def test_extend_sequence_single_frame(): 473 | df = pd.util.testing.makeTimeDataFrame(100, 'h') 474 | seq = [df] 475 | with 
Castra(template=df) as c:
476 |         c.extend_sequence(seq, freq='d')
477 |         assert (c.partitions.index == ['2000-01-01 23:00:00', '2000-01-02 23:00:00',
478 |                 '2000-01-03 23:00:00', '2000-01-04 23:00:00', '2000-01-05 03:00:00']).all()
479 |     df = pd.DataFrame({'a': range(10), 'b': range(10)})
480 |     seq = [df]
481 |     with Castra(template=df) as c:
482 |         c.extend_sequence(seq)
483 |         tm.assert_frame_equal(c[:], df)
484 | 
485 | 
486 | def test_column_with_period():
487 |     df = pd.DataFrame({'x': [10, 20],
488 |                        '.': [10., 20.]},
489 |                       columns=['x', '.'],
490 |                       index=[10, 20])
491 | 
492 |     with Castra(template=df) as c:
493 |         c.extend(df)
494 | 
495 | 
496 | def test_empty():
497 |     with Castra(template=A) as c:
498 |         c.extend(pd.DataFrame(columns=A.columns))
499 |         assert len(c[:]) == 0
500 | 
501 | 
502 | def test_index_with_single_value():
503 |     df = pd.DataFrame({'x': [1, 2, 3]}, index=[1, 1, 2])
504 |     with Castra(template=df) as c:
505 |         c.extend(df)
506 | 
507 |         tm.assert_frame_equal(c[1], df.loc[1])
508 | 
509 | 
510 | def test_categorical_index():
511 |     df = pd.DataFrame({'x': [1, 2, 3]},
512 |                       index=pd.CategoricalIndex(['a', 'a', 'b'], ordered=True, name='foo'))
513 | 
514 |     with Castra(template=df, categories=True) as c:
515 |         c.extend(df)
516 |         result = c[:]
517 |         tm.assert_frame_equal(result, df)
518 | 
519 |     A = pd.DataFrame({'x': [1, 2, 3]},
520 |                      index=pd.Index(['a', 'a', 'b'], name='foo'))
521 |     B = pd.DataFrame({'x': [4, 5, 6]},
522 |                      index=pd.Index(['c', 'd', 'd'], name='foo'))
523 | 
524 |     path = tempfile.mkdtemp(prefix='castra-')
525 |     try:
526 |         with Castra(path=path, template=A, categories=['foo']) as c:
527 |             c.extend(A)
528 |             c.extend(B)
529 | 
530 |         c2 = Castra(path=path)
531 |         result = c2[:]
532 | 
533 |         expected = pd.concat([A, B])
534 |         expected.index = pd.CategoricalIndex(expected.index,
535 |                                              name=expected.index.name, ordered=True)
536 |         tm.assert_frame_equal(result, expected)
537 | 
538 |         tm.assert_frame_equal(c['a'], expected.loc['a'])
539 |     finally:
540 | 
shutil.rmtree(path) 541 | 542 | 543 | def test_categorical_index_with_dask_dataframe(): 544 | pytest.importorskip('dask.dataframe') 545 | import dask.dataframe as dd 546 | import dask 547 | 548 | A = pd.DataFrame({'x': [1, 2, 3, 4]}, 549 | index=pd.Index(['a', 'a', 'b', 'b'], name='foo')) 550 | B = pd.DataFrame({'x': [4, 5, 6]}, 551 | index=pd.Index(['c', 'd', 'd'], name='foo')) 552 | 553 | 554 | path = tempfile.mkdtemp(prefix='castra-') 555 | try: 556 | with Castra(path=path, template=A, categories=['foo']) as c: 557 | c.extend(A) 558 | c.extend(B) 559 | 560 | df = dd.from_castra(path) 561 | assert df.divisions == ('a', 'c', 'd') 562 | 563 | result = df.compute(get=dask.async.get_sync) 564 | 565 | expected = pd.concat([A, B]) 566 | expected.index = pd.CategoricalIndex(expected.index, 567 | name=expected.index.name, ordered=True) 568 | 569 | tm.assert_frame_equal(result, expected) 570 | 571 | tm.assert_frame_equal(df.loc['a'].compute(), expected.loc['a']) 572 | tm.assert_frame_equal(df.loc['b'].compute(get=dask.async.get_sync), 573 | expected.loc['b']) 574 | finally: 575 | shutil.rmtree(path) 576 | 577 | 578 | def test__decategorize(): 579 | df = pd.DataFrame({'x': [1, 2, 3]}, 580 | index=pd.CategoricalIndex(['a', 'a', 'b'], ordered=True, 581 | name='foo')) 582 | 583 | extra, categories, df2 = _decategorize({'.index': []}, df) 584 | 585 | assert (df2.index == [0, 0, 1]).all() 586 | 587 | df3 = _categorize(categories, df2) 588 | 589 | tm.assert_frame_equal(df, df3) 590 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | bloscpack>=0.8.0 4 | blosc 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from os.path import exists 4 | from setuptools import setup 
5 | 6 | setup(name='castra', 7 | version='0.1.8', 8 | description='On-disk partitioned store', 9 | url='http://github.com/blaze/Castra/', 10 | maintainer='Matthew Rocklin', 11 | maintainer_email='mrocklin@gmail.com', 12 | license='BSD', 13 | keywords='', 14 | packages=['castra'], 15 | package_data={'castra': ['tests/*.py']}, 16 | install_requires=list(open('requirements.txt').read().strip().split('\n')), 17 | long_description=(open('README.rst').read() if exists('README.rst') 18 | else ''), 19 | zip_safe=False) 20 | --------------------------------------------------------------------------------