├── benchmark.py
├── src
│   └── tstables
│       ├── _version.py
│       ├── tests
│       │   ├── __init__.py
│       │   ├── test_tstable_static.py
│       │   └── test_tstable_file.py
│       ├── __init__.py
│       ├── file.py
│       ├── group.py
│       ├── benchmark.py
│       └── tstable.py
├── release.py
├── setup.cfg
├── .gitignore
├── EXAMPLES.md
├── setup.py
└── README.md

--------------------------------------------------------------------------------
/benchmark.py:
--------------------------------------------------------------------------------
import tstables
tstables.Benchmark.main()
--------------------------------------------------------------------------------
/src/tstables/_version.py:
--------------------------------------------------------------------------------
# Store the version here so that:
# 1) we don't load dependencies by storing it in __init__.py
# 2) we can import it in setup.py for the same reason
# 3) we can import it into your module
__version__ = '0.0.15'
--------------------------------------------------------------------------------
/release.py:
--------------------------------------------------------------------------------
# Converts README.md to README.txt (in reStructuredText format), builds the package, and uploads it to PyPI

import pypandoc
import os

rst = pypandoc.convert('README.md', 'rst')
f = open('README.txt','w+')
f.write(rst)
f.close()
os.system("python3 setup.py register sdist upload")
os.remove('README.txt')
--------------------------------------------------------------------------------
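Note: the release script above targets older tooling. Newer pypandoc releases replace `convert()` with `convert_file()`, and PyPI no longer accepts `setup.py register sdist upload`; uploads go through twine. A sketch of the same flow with current tools (an assumption about the environment, not part of this repo):

```python
import pypandoc
import os

# convert_file replaces the removed pypandoc.convert
rst = pypandoc.convert_file('README.md', 'rst')
with open('README.txt', 'w+') as f:
    f.write(rst)

# Build the source distribution, then upload it with twine
os.system("python3 setup.py sdist")
os.system("twine upload dist/*")
os.remove('README.txt')
```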
--------------------------------------------------------------------------------
/src/tstables/tests/__init__.py:
--------------------------------------------------------------------------------
import unittest

from tstables.tests import test_tstable_static
from tstables.tests import test_tstable_file
#from tstables import tstable

def suite():
    import doctest
    suite = unittest.TestSuite()
    #suite.addTests(doctest.DocTestSuite(tstable))
    suite.addTests(test_tstable_static.suite())
    suite.addTests(test_tstable_file.suite())
    return suite

if __name__ == '__main__':
    unittest.TextTestRunner(verbosity=2).run(suite())
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[global]
# Just silently do your job
quiet = 1

[easy_install]
# Where to look for third-party dependencies
find_links = thirdparty

[build_py]
# No optimization for now
optimize = 0
# Force everything to be rebuilt
force = True

[egg_info]
# We are doing a development build
# tag_build = dev
# Include the date in the file name?
tag_date = 0

[bdist_egg]
# Do not ship source files inside the binary egg
exclude-source-files = True

[rotate]
# Keep only the last 10 eggs, clean up older ones
match = .egg
keep = 10
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/
--------------------------------------------------------------------------------
/src/tstables/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

########################################################################
#
# License: MIT
#
# $Id$
#
########################################################################

"""TsTables is a Python package to store time series data in HDF5 files using PyTables and Pandas.

:URL: http://afiedler.github.io/tstables

TsTables stores time series data into daily partitions and provides functions to query for
subsets of data across partitions.

Its goals are to support a workflow where large amounts (gigabytes) of time series data are
appended periodically to an HDF5 file, and need to be read many times (quickly) for analytical
models and research.
"""
from ._version import __version__
from tstables.tstable import TsTable
from tstables.file import create_ts
from tstables.group import timeseries_repr
from tstables.group import timeseries_str
from tstables.group import get_timeseries
from tstables.benchmark import Benchmark
import tables

# Augment the PyTables File class
tables.File.create_ts = create_ts

# Patch the Group class to return time series __str__ and __repr__
old_repr = tables.Group.__repr__
old_str = tables.Group.__str__

tables.Group.__repr__ = timeseries_repr
tables.Group.__str__ = timeseries_str

# Add _f_get_timeseries to Group
tables.Group._f_get_timeseries = get_timeseries
--------------------------------------------------------------------------------
/src/tstables/file.py:
--------------------------------------------------------------------------------
import tables
import tstables
import datetime
import numpy

def create_ts(self,where,name,description=None,title="",filters=None,
              expectedrows_per_partition=10000,chunkshape=None,
              byteorder=None,createparents=False):

    # Check the description to make sure the first column is "timestamp" with type Int64
    first_col_name = None
    for k in description.columns.keys():
        if description.columns[k]._v_pos == 0:
            first_col_name = k

    if first_col_name != 'timestamp':
        raise AttributeError("first column must be called 'timestamp' and have type Int64")

    if description.columns[first_col_name].dtype != numpy.dtype('int64'):
        raise AttributeError("first column must be called 'timestamp' and have type Int64")

    # The parent node of the time series
    tsnode = self.create_group(where,name,title,filters,createparents)

    try:
        # Decorate with TsTables attributes
        tsnode._v_attrs._TS_TABLES_CLASS='TIMESERIES'
        tsnode._v_attrs._TS_TABLES_VERSION='0.0.1'

        ts = tstables.TsTable(self,tsnode,description,title,filters,expectedrows_per_partition,
                              chunkshape,byteorder)

        # Need to create one partition to "save" the time series. This creates a new table to persist
        # the table description
        ts._TsTable__create_partition(datetime.datetime.utcnow().date())
    except:
        # Make sure that the group is deleted if an exception is raised
        self.remove_node(tsnode,recursive=True)
        raise

    return ts
--------------------------------------------------------------------------------
/EXAMPLES.md:
--------------------------------------------------------------------------------
# TsTables Examples

This document shows you a few examples of how to use TsTables to store and access data.

## Basic Examples

### Fetch the daily EURUSD exchange rate from FRED

This example fetches the daily EURUSD exchange rate from FRED, the St. Louis Fed's online database
of economic data. TsTables isn't really designed for storing daily data, but this simple example
illustrates how you can get a pandas DataFrame and append it to a time series.

```python
import tables
import tstables
import pandas.io.data as web
from datetime import *

# Create a class to describe the table structure. The column "timestamp" is required, and must be
# in the first position (pos=0) and have the type Int64.
class prices(tables.IsDescription):
    timestamp = tables.Int64Col(pos=0)
    price = tables.Float64Col(pos=1)

f = tables.open_file('eurusd.h5','a')

# This creates the time series, which is just a group called 'EURUSD' in the root of the HDF5 file.
ts = f.create_ts('/','EURUSD',prices)

start = datetime(2010,1,1)
end = datetime(2014,5,2)

euro = web.DataReader("DEXUSEU", "fred", start, end)
ts.append(euro)
f.flush()

# Now, read in a month of data
read_start_dt = datetime(2014,1,1)
read_end_dt = datetime(2014,1,31)

jan = ts.read_range(read_start_dt,read_end_dt)
```
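Note: `pandas.io.data` has since been removed from pandas itself; the same functionality now lives in the separate [pandas-datareader](https://github.com/pydata/pandas-datareader) package. A sketch of the equivalent fetch with that package (assuming it is installed):

```python
from pandas_datareader import data as web
from datetime import datetime

start = datetime(2010, 1, 1)
end = datetime(2014, 5, 2)

# Same FRED series, fetched with the standalone pandas-datareader package
euro = web.DataReader("DEXUSEU", "fred", start, end)
```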
### Load one month of minutely bitcoin price data

This example loads one month of the minutely [Bitcoin Price Index](http://coindesk.com/price) from
CoinDesk. First, you'll need to download
[this CSV file](http://afiedler.github.io/tstables/bpi_2014_01.csv). This example assumes that you've
stored the CSV file in the current directory.

```python
import tables
import tstables
import pandas
from datetime import *

# Class to use as the table description
class BpiValues(tables.IsDescription):
    timestamp = tables.Int64Col(pos=0)
    bpi = tables.Float64Col(pos=1)

# Use pandas to read in the CSV data
bpi = pandas.read_csv('bpi_2014_01.csv',index_col=0,names=['date','bpi'],parse_dates=True)

f = tables.open_file('bpi.h5','a')

# Create a new time series
ts = f.create_ts('/','BPI',BpiValues)

# Append the BPI data
ts.append(bpi)

# Read in some data
read_start_dt = datetime(2014,1,4,12,00)
read_end_dt = datetime(2014,1,4,14,30)

rows = ts.read_range(read_start_dt,read_end_dt)

# `rows` will be a pandas DataFrame with a DatetimeIndex.
```
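`read_range` localizes naive datetimes to UTC with pytz before querying (see `read_range` in `tstable.py`), so naive and explicitly UTC-localized datetimes are interchangeable. A minimal sketch, assuming the `ts` handle from the example above:

```python
import pytz
from datetime import datetime

# Naive datetimes are interpreted as UTC by read_range
naive_rows = ts.read_range(datetime(2014,1,4,12,00), datetime(2014,1,4,14,30))

# Passing explicitly UTC-localized datetimes returns the same rows
utc_start = pytz.utc.localize(datetime(2014,1,4,12,00))
utc_end = pytz.utc.localize(datetime(2014,1,4,14,30))
explicit_rows = ts.read_range(utc_start, utc_end)
```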
--------------------------------------------------------------------------------
/src/tstables/group.py:
--------------------------------------------------------------------------------
import tables
import tstables
import datetime

def timeseries_repr(self):
    """Return a detailed string representation of the group or time series.

    Examples
    --------

    ::

        >>> f = tables.open_file('data/test.h5')
        >>> f.root.group0
        /group0 (Group) 'First Group'
          children := ['tuple1' (Table), 'group1' (Group)]

    ::

        >>> f = tables.open_file('data/test_timeseries.h5')
        >>> f.root.timeseries0
        /timeseries0 (Group/Timeseries) 'A group that is also a time series'

    """
    try:
        tstables_class = self._v_attrs._TS_TABLES_CLASS

        # Additional representation (maybe min timestamp, max timestamp) goes here
        # Don't include all of the children here!

        return "%s" % str(self)

    except AttributeError:
        rep = [
            '%r (%s)' % (childname, child.__class__.__name__)
            for (childname, child) in self._v_children.items()
        ]
        childlist = '[%s]' % (', '.join(rep))

        return "%s\n  children := %s" % (str(self), childlist)



def timeseries_str(self):
    """Return a short string representation of the group or time series.

    Examples
    --------

    ::

        >>> f = tables.open_file('data/test.h5')
        >>> print(f.root.group0)
        /group0 (Group) 'First Group'

    ::

        >>> f = tables.open_file('data/test_timeseries.h5')
        >>> print(f.root.timeseries0)
        /timeseries0 (Group/Timeseries) 'A group that is also a time series'

    """

    try:
        tstables_class = self._v_attrs._TS_TABLES_CLASS
        classname = "%s/Timeseries" % self.__class__.__name__
    except AttributeError:
        classname = self.__class__.__name__

    pathname = self._v_pathname
    title = self._v_title
    return "%s (%s) %r" % (pathname, classname, title)

def get_timeseries(self):
    try:
        tstables_class = self._v_attrs._TS_TABLES_CLASS
    except AttributeError:
        return None

    ts_table = tstables.TsTable(self._v_file,self,None)

    # Need to determine the description, title, filters, expectedrows_per_partition,
    # chunkshape, byteorder
    ts_data = ts_table._TsTable__fetch_first_table()
    ts_table.table_description = ts_data.description
    ts_table.table_title = ts_data.title
    ts_table.table_filters = ts_data.filters
    ts_table.table_chunkshape = ts_data.chunkshape
    ts_table.table_byteorder = ts_data.byteorder
    ts_table.table_expectedrows_per_partition = ts_data.attrs._TS_TABLES_EXPECTEDROWS_PER_PARTITION

    return ts_table
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages
import os

# Handle the long description (read from README.txt, which is created by converting README.md)
long_description = ('TsTables is a Python package to store time series data in HDF5 files using '
                    'PyTables. It stores time series data into daily partitions and provides functions to query for '
                    'subsets of data across partitions.\n'
                    'Its goals are to support a workflow where large amounts (gigabytes) of time series data are '
                    'appended periodically to an HDF5 file, and need to be read many times (quickly) for analytical '
                    'models and research.')

if os.path.exists('README.txt'):
    long_description = open('README.txt').read()

exec(open('src/tstables/_version.py').read())


setup(

    # Package structure
    #
    # find_packages searches through a set of directories
    # looking for packages
    packages = find_packages('src', exclude = ['*.tests', '*.tests.*', 'tests.*', 'tests']),

    # package_dir directive maps package names to directories.
    # package_name:package_directory
    package_dir = {'': 'src'},

    # Not all packages are capable of running in compressed form,
    # because they may expect to be able to access either source
    # code or data files as normal operating system files.
    zip_safe = True,

    # Entry points
    #
    # install the executable
    entry_points = {
        'console_scripts': ['tstables_benchmark = tstables:Benchmark.main']
    },

    # Dependencies
    #
    # Dependency expressions have a package name on the left-hand
    # side, a version on the right-hand side, and a comparison
    # operator between them, e.g.
    # == an exact version, >= that version or higher
    install_requires = ['tables>=3.1.1', 'pandas>=0.13.1'],

    # Tests
    #
    # Tests must be wrapped in a unittest test suite by either a
    # function, a TestCase class or method, or a module or package
    # containing TestCase classes. If the named suite is a package,
    # any submodules and subpackages are recursively added to the
    # overall test suite.
    test_suite = 'tstables.tests.suite',
    # The unit tests themselves import these packages
    tests_require = ['mock', 'pytz'],

    name = "tstables",
    version = __version__,

    # metadata for upload to PyPI
    author = "Andy Fiedler",
    author_email = "andy@andyfiedler.com",
    description = "Handles large time series using PyTables and Pandas",
    license = "MIT",
    keywords = "time series high frequency HDF5",
    url = "http://github.com/afiedler/tstables", # project home page, if any
    long_description = long_description
    # could also include download_url, classifiers, etc.
)
--------------------------------------------------------------------------------
/src/tstables/benchmark.py:
--------------------------------------------------------------------------------
import tables
import tstables
import tempfile
import datetime
import pytz
import pandas
import numpy
import timeit
import os

# Class to define record structure
class Price(tables.IsDescription):
    timestamp = tables.Int64Col(pos=0)
    price = tables.Int32Col(pos=1)

class Benchmark:
    @classmethod
    def log_me(cls, s):
        cls.log.write(s)
        print(s)

    @classmethod
    def write_data(cls):
        # This simple benchmark creates an HDF5 file with a time series. It then loads about one year of random
        # secondly data into it, closes it, and reads it back.

        cls.log_me("Started benchmark at %s\n\n" % datetime.datetime.now())

        cls.temp_file = tempfile.mkstemp('.h5')[1]
        # 'w' (re)creates the file as a fresh HDF5 file; mkstemp only makes an empty placeholder
        cls.h5_file = tables.open_file(cls.temp_file,'w')
        ts = cls.h5_file.create_ts('/','EURUSD',description=Price)

        start_dt = datetime.datetime(2014,1,1,tzinfo=pytz.utc)

        # 2,678,400 is the number of seconds in 31 days, so each append below adds one 31-day "month";
        # twelve of them produce slightly more than a year of data.
        index = pandas.date_range(start_dt, periods=2678400, freq='S')
        values = numpy.int32(numpy.random.rand(2678400, 1)*numpy.iinfo(numpy.int32).max)
        df = pandas.DataFrame(values,index=index,columns=['price'],dtype=numpy.dtype('i4'))

        append_times = []
        for month in range(0,12):
            t = timeit.timeit(lambda: ts.append(df), number=1)
            df.index = df.index+pandas.offsets.Day(31) # Shift index to next month
            cls.log_me(" * finished appending month {0}\n".format(month))
            append_times.append(t)

        cls.log_me("Appended 12 months of data:\n")
        for a in append_times:
            cls.log_me(" * {0} seconds\n".format(a))

        cls.log_me("average {0} seconds, total {1} seconds\n\n".format(sum(append_times)/len(append_times),
                                                                       sum(append_times)))


        # Now, close the file and re-open it
        cls.h5_file.close()

    @classmethod
    def read_data(cls):
        # report the file size
        h5_size = os.stat(cls.temp_file).st_size
        cls.log_me("file size (bytes): {0}\n".format(h5_size))

        cls.h5_file = tables.open_file(cls.temp_file,'r')
        ts = cls.h5_file.root.EURUSD._f_get_timeseries()

        # Now, read random one hour increments

        def read_random_hour(ts,min_dt,max_dt):
            rnd = numpy.random.rand(1)[0]
            start_offset = datetime.timedelta(seconds=(max_dt - min_dt - datetime.timedelta(hours=1)).total_seconds() * rnd)
            start_dt = min_dt + start_offset
            end_dt = start_dt + datetime.timedelta(hours=1)

            ts.read_range(start_dt,end_dt)

        min_dt = ts.min_dt()
        max_dt = ts.max_dt()

        read_time = timeit.timeit(lambda: read_random_hour(ts, min_dt, max_dt), number=100)

        # timeit returns the total time for all repetitions, so divide to get the average
        cls.log_me("average time to read one hour of data (100 repetitions): {0} seconds\n".format(read_time/100))


    @classmethod
    def main(cls):
        cls.log = open('benchmark.txt', 'w')

        cls.write_data()

        cls.read_data()

        # Finished!
        cls.log.close()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# TsTables

TsTables is a Python package to store time series data in HDF5 files using PyTables. It stores time
series data into daily partitions and provides functions to query for subsets of data across
partitions.

Its goals are to support a workflow where large amounts (gigabytes) of time series data are
appended periodically to an HDF5 file, and need to be read many times (quickly) for analytical models
and research.

## Example

This example reads in minutely bitcoin price data and then fetches a range of that data. For this
example in full, and others, see [EXAMPLES.md](EXAMPLES.md).
```python
# Class to use as the table description
class BpiValues(tables.IsDescription):
    timestamp = tables.Int64Col(pos=0)
    bpi = tables.Float64Col(pos=1)

# Use pandas to read in the CSV data
bpi = pandas.read_csv('bpi_2014_01.csv',index_col=0,names=['date','bpi'],parse_dates=True)

f = tables.open_file('bpi.h5','a')

# Create a new time series
ts = f.create_ts('/','BPI',BpiValues)

# Append the BPI data
ts.append(bpi)

# Read in some data
read_start_dt = datetime(2014,1,4,12,00)
read_end_dt = datetime(2014,1,4,14,30)

rows = ts.read_range(read_start_dt,read_end_dt)

# `rows` will be a pandas DataFrame with a DatetimeIndex.
```

Here is how to open a pre-existing `bpi.h5` HDF5 file and get that time series from it.

```python
f = tables.open_file('bpi.h5','r')
ts = f.root.BPI._f_get_timeseries()

# Read in some data
read_start_dt = datetime(2014,1,4,12,00)
read_end_dt = datetime(2014,1,4,14,30)

rows = ts.read_range(read_start_dt,read_end_dt)
```

## Running unit tests

You can run the unit test suite from the command line at the root of the repository:

`python setup.py test`


## Preliminary benchmarks

The main goal of TsTables is to make it very fast to read subsets of data, given a date range. TsTables currently
includes a simple benchmark to track progress towards that goal. To run it, after installing the package, you can run
`tstables_benchmark` from the command line, or you can import the package in a Python console and run it directly:

```python
import tstables
tstables.Benchmark.main()
```

Running the benchmark both prints results to the screen and saves them in `benchmark.txt`.

The benchmark loads one year of random secondly data (just the timestamp column and a 32-bit integer "price" column)
into a file, and then reads random one-hour chunks of data.

Here are some current benchmark results for TsTables (from a MacBook Pro with an SSD):

Metric                                                       | Results
-------------------------------------------------------------|-----------------
Append one month of data (2.67 million rows)                 | 0.711 seconds
Fetch one hour of data into memory                           | 0.305 seconds
File size (one year of data, 32 million rows, uncompressed)  | 391.6 MB

HDF5 supports zlib and other compression algorithms, which can be enabled through PyTables to reduce the file
size. Without compression, the HDF5 file size is approximately 1.8% larger than the raw data in binary form, a
drastically lower overhead than CSV files.
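For example, a compressed time series can be created by passing a PyTables `Filters` instance through the `filters` argument of `create_ts`. A sketch, reusing the `BpiValues` description from the example above (the zlib settings are illustrative, not a recommendation):

```python
import tables
import tstables

# zlib at level 5 is illustrative; any PyTables-supported complib/complevel works
zlib_filters = tables.Filters(complib='zlib', complevel=5)

f = tables.open_file('bpi_compressed.h5', 'a')
ts = f.create_ts('/', 'BPI', BpiValues, filters=zlib_filters)
```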
## Contributing

If you are interested in the project (to contribute or to hear about updates), email Andy Fiedler
at <andy@andyfiedler.com> or submit a pull request.
--------------------------------------------------------------------------------
/src/tstables/tests/test_tstable_static.py:
--------------------------------------------------------------------------------
import tstables
import unittest
import datetime
import pytz

class TsTableStaticTestCase(unittest.TestCase):

    # One millisecond: the resolution of TsTables timestamps, and the gap between the end of one
    # partition and the start of the next
    TIME_EPS = datetime.timedelta(microseconds=1*1000)

    def test_partition_range_same_time(self):
        # 2014-04-01 01:00:00 UTC
        start_dt = datetime.datetime(2014,4,1,1,0,tzinfo=pytz.utc)

        # End at the exact same time
        end_dt = start_dt
        parts = tstables.TsTable._TsTable__dtrange_to_partition_ranges(start_dt,end_dt)

        # There should be only one partition, and the range should equal (start_dt, end_dt)
        self.assertEqual(parts[start_dt.date()], (start_dt,end_dt))
        self.assertEqual(len(parts.keys()), 1)

    def test_partition_range_same_day(self):
        # 2014-04-01 01:00:00 UTC
        start_dt = datetime.datetime(2014,4,1,1,0,tzinfo=pytz.utc)

        # 2014-04-01 04:00:00 UTC
        end_dt = datetime.datetime(2014,4,1,4,0,tzinfo=pytz.utc)
        parts = tstables.TsTable._TsTable__dtrange_to_partition_ranges(start_dt,end_dt)

        # There should be only one partition, and the range should equal (start_dt, end_dt)
        self.assertEqual(parts[start_dt.date()], (start_dt,end_dt))
        self.assertEqual(len(parts.keys()),1)

    def test_partition_range_two_day(self):
        # 2014-04-01 01:00:00 UTC
        start_dt = datetime.datetime(2014,4,1,1,0,tzinfo=pytz.utc)

        # 2014-04-02 04:00:00 UTC
        end_dt = datetime.datetime(2014,4,2,4,0,tzinfo=pytz.utc)
        parts = tstables.TsTable._TsTable__dtrange_to_partition_ranges(start_dt,end_dt)

        # Should be two parts: [2014-04-01 01:00:00 UTC, 2014-04-01 23:59:59.999 UTC] and
        # [2014-04-02 00:00:00 UTC, 2014-04-02 04:00:00 UTC]
        self.assertEqual(parts[start_dt.date()][0], start_dt)
        self.assertEqual(parts[start_dt.date()][1], datetime.datetime(2014,4,2,0,0,tzinfo=pytz.utc) - self.TIME_EPS)
        self.assertEqual(parts[end_dt.date()][0], datetime.datetime(2014,4,2,0,0,tzinfo=pytz.utc))
        self.assertEqual(parts[end_dt.date()][1],end_dt)
        self.assertEqual(len(parts.keys()),2)

    def test_partition_range_three_day(self):
        # 2014-04-01 01:00:00 UTC
        start_dt = datetime.datetime(2014,4,1,1,0,tzinfo=pytz.utc)

        # 2014-04-03 04:00:00 UTC
        end_dt = datetime.datetime(2014,4,3,4,0,tzinfo=pytz.utc)
        parts = tstables.TsTable._TsTable__dtrange_to_partition_ranges(start_dt,end_dt)

        # Should be three parts: [2014-04-01 01:00:00 UTC, 2014-04-01 23:59:59.999 UTC],
        # [2014-04-02 00:00:00 UTC, 2014-04-02 23:59:59.999 UTC] and
        # [2014-04-03 00:00:00 UTC, 2014-04-03 04:00:00 UTC]
        self.assertEqual(parts[start_dt.date()][0], start_dt)
        self.assertEqual(parts[start_dt.date()][1], datetime.datetime(2014,4,2,0,0,tzinfo=pytz.utc)-self.TIME_EPS)

        mid_date = start_dt.date() + datetime.timedelta(days=1)
        self.assertEqual(parts[mid_date][0], datetime.datetime(2014,4,2,0,0,tzinfo=pytz.utc))
        self.assertEqual(parts[mid_date][1], datetime.datetime(2014,4,3,0,0,tzinfo=pytz.utc)-self.TIME_EPS)

        self.assertEqual(parts[end_dt.date()][0], datetime.datetime(2014,4,3,0,0,tzinfo=pytz.utc))
        self.assertEqual(parts[end_dt.date()][1], end_dt)
        self.assertEqual(len(parts.keys()),3)

    def test_partition_range_just_cross_boundary(self):
        # 2014-03-31 23:59:59.999 UTC
        start_dt = datetime.datetime(2014,3,31,23,59,59,999*1000,tzinfo=pytz.utc)

        # 2014-04-01 00:00:00.001 UTC
        end_dt = datetime.datetime(2014,4,1,0,0,0,1*1000,tzinfo=pytz.utc)
        parts = tstables.TsTable._TsTable__dtrange_to_partition_ranges(start_dt,end_dt)

        # Should be two parts: [2014-03-31 23:59:59.999 UTC, 2014-03-31 23:59:59.999 UTC] and
        # [2014-04-01 00:00:00 UTC, 2014-04-01 00:00:00.001 UTC]
        self.assertEqual(parts[start_dt.date()][0], start_dt)
        self.assertEqual(parts[start_dt.date()][1], datetime.datetime(2014,4,1,0,0,tzinfo=pytz.utc) - self.TIME_EPS)
        self.assertEqual(parts[end_dt.date()][0], datetime.datetime(2014,4,1,0,0,tzinfo=pytz.utc))
        self.assertEqual(parts[end_dt.date()][1], end_dt)
        self.assertEqual(len(parts.keys()),2)

    def test_dt_to_ts(self):
        # Test 1 - the epoch itself
        dt = datetime.datetime(1970,1,1,tzinfo=pytz.utc)
        ts = tstables.TsTable._TsTable__dt_to_ts(dt)

        self.assertEqual(ts, 0)

        # Test 2 - 1971-01-01T00:00:00.000, one non-leap year after the epoch
        # (365 days * 86,400 seconds * 1,000 ms = 31,536,000,000 ms)
        dt = datetime.datetime(1971,1,1,tzinfo=pytz.utc)
        ts = tstables.TsTable._TsTable__dt_to_ts(dt)

        assert ts == 31536000000

        # Test 3 - 2014-05-05T01:01:01.100
        dt = datetime.datetime(year=2014,month=5,day=5,hour=1,minute=1,second=1,microsecond=100*1000,tzinfo=pytz.utc)
        ts = tstables.TsTable._TsTable__dt_to_ts(dt)

        assert ts == 1399251661100

    def test_ts_to_dt(self):
        # Test 1 - the epoch itself
        ts = 0
        dt = tstables.TsTable._TsTable__ts_to_dt(ts)

        assert dt == datetime.datetime(1970,1,1,tzinfo=pytz.utc)

        # Test 2 - 1971-01-01T00:00:00.000
        ts = 31536000000
        dt = tstables.TsTable._TsTable__ts_to_dt(ts)

        assert dt == datetime.datetime(1971,1,1,tzinfo=pytz.utc)

        # Test 3 - 2014-05-05T01:01:01.100
        ts = 1399251661100
        dt = tstables.TsTable._TsTable__ts_to_dt(ts)

        assert dt == datetime.datetime(year=2014,month=5,day=5,hour=1,minute=1,second=1,microsecond=100*1000,tzinfo=pytz.utc)

    def test_partition_date_to_path_array(self):
        dt = datetime.datetime(2014,5,5,1,1,1,tzinfo=pytz.utc)
        pa = tstables.TsTable._TsTable__partition_date_to_path_array(dt)
        expected = ['y2014','m05','d05']
        for idx,p in enumerate(pa):
            assert p == expected[idx]

def suite():
    loader = unittest.TestLoader()
    suite = unittest.TestSuite()
    suite.addTest(loader.loadTestsFromTestCase(TsTableStaticTestCase))
    return suite

if __name__ == '__main__':
    unittest.TextTestRunner(verbosity=2).run(suite())
--------------------------------------------------------------------------------
/src/tstables/tests/test_tstable_file.py:
--------------------------------------------------------------------------------
import tables
import tstables
import unittest
import datetime
import pytz
import tempfile
try:
    from io import StringIO
except ImportError:
    from cStringIO import StringIO
import os
import pandas
import mock
import numpy

# Class to define record structure
class Price(tables.IsDescription):
    timestamp = tables.Int64Col(pos=0)
    price = tables.Int32Col(pos=1)


class TsTableFileTestCase(unittest.TestCase):

    def setUp(self):
        self.temp_file = tempfile.mkstemp('.h5')[1]
        # 'w' (re)creates the file as a fresh HDF5 file; mkstemp only makes an empty placeholder
        self.h5_file = tables.open_file(self.temp_file,'w')

    def tearDown(self):
        self.h5_file.close()
        os.remove(self.temp_file)


    def test_create_ts(self):
        # Technically, there is a race condition here if you happen to run this at exactly midnight UTC!
        now = datetime.datetime.utcnow()
        self.h5_file.create_ts('/','EURUSD',description=Price)

        # Want to check that:
        # - the group exists
        # - it has a _TS_TABLES_CLASS attribute equal to "TIMESERIES"
        # - it has a table at yYYYY/mMM/dDD/ts_data, where YYYY-MM-DD is today (in UTC)
        # - the dtype is correct
        self.assertEqual(self.h5_file.root.EURUSD.__class__, tables.Group)
        self.assertEqual(self.h5_file.root.EURUSD._v_attrs._TS_TABLES_CLASS,'TIMESERIES')

        path = tstables.TsTable._TsTable__partition_date_to_path_array(now.date())

        ts_data = self.h5_file.root.EURUSD._f_get_child(path[0])._f_get_child(path[1])._f_get_child(
            path[2])._f_get_child('ts_data')

        self.assertEqual(ts_data.attrs._TS_TABLES_EXPECTEDROWS_PER_PARTITION,10000)

        self.assertEqual(ts_data._v_dtype[0],tables.dtype_from_descr(Price)[0])
        self.assertEqual(ts_data._v_dtype[1],tables.dtype_from_descr(Price)[1])


    def test_create_ts_with_invalid_description_incorrect_order(self):
        class InvalidDesc(tables.IsDescription):
            # Positions are out of order here!
            timestamp = tables.Int64Col(pos=1)
            price = tables.Int32Col(pos=0)

        self.assertRaises(AttributeError, self.h5_file.create_ts, '/', 'EURUSD', description=InvalidDesc)

    def test_create_ts_with_invalid_description_incorrect_type(self):
        class InvalidDesc(tables.IsDescription):
            # Type is incorrect here!
            timestamp = tables.Int32Col(pos=0)
            price = tables.Int32Col(pos=1)

        self.assertRaises(AttributeError, self.h5_file.create_ts, '/', 'EURUSD', description=InvalidDesc)

    def test_load_same_timestamp(self):

        # Test data that is multiple rows with the same timestamp
        csv = u"""2014-05-05T01:01:01.100Z,1
2014-05-05T01:01:01.100Z,2
2014-05-05T01:01:01.100Z,3
2014-05-05T01:01:01.100Z,4
2014-05-05T01:01:01.100Z,5"""

        sfile = StringIO(csv)

        # Note: don't need the 'timestamp' column in the dtype param here because it will become the DatetimeIndex.
        rows = pandas.read_csv(sfile,parse_dates=[0],index_col=0,names=['timestamp', 'price'],dtype={'price': 'i4'})

        ts = self.h5_file.create_ts('/','EURUSD',description=Price)
        ts.append(rows)

        # Inspect to ensure that data has been stored correctly
        tbl = ts.root_group.y2014.m05.d05.ts_data

        self.assertEqual(tbl.nrows,5)

        # Fetch rows over a larger range
        rows_read = ts.read_range(datetime.datetime(2014,5,5,tzinfo=pytz.utc),datetime.datetime(2014,5,6,tzinfo=pytz.utc))

        # Confirm equality
        for idx,p in enumerate(rows_read['price']):
            self.assertEqual(p,rows['price'][idx])

        # Fetch rows over the smallest possible range
        rows_read = ts.read_range(datetime.datetime(2014,5,5,1,1,1,100*1000,tzinfo=pytz.utc),
                                  datetime.datetime(2014,5,5,1,1,1,100*1000,tzinfo=pytz.utc))

        # Confirm equality
        for idx,p in enumerate(rows_read['price']):
            self.assertEqual(p,rows['price'][idx])


    # Same data as test_load_same_timestamp, but with MAX_FULL_PARTITION_READ_SIZE patched to 1
    # so that the partial-partition read path is exercised
    @mock.patch.object(tstables.TsTable, 'MAX_FULL_PARTITION_READ_SIZE', 1)
    def test_load_same_timestamp_partial_partition_read(self):

        # Test data that is multiple rows with the same timestamp
        csv = u"""2014-05-05T01:01:01.100Z,1
2014-05-05T01:01:01.100Z,2
2014-05-05T01:01:01.100Z,3
2014-05-05T01:01:01.100Z,4
2014-05-05T01:01:01.100Z,5"""

        sfile = StringIO(csv)

        # Note: don't need the 'timestamp' column in the dtype param here because it will become the DatetimeIndex.
        rows = pandas.read_csv(sfile,parse_dates=[0],index_col=0,names=['timestamp', 'price'],dtype={'price': 'i4'})

        ts = self.h5_file.create_ts('/','EURUSD',description=Price)
        ts.append(rows)

        # Inspect to ensure that data has been stored correctly
        tbl = ts.root_group.y2014.m05.d05.ts_data

        self.assertEqual(tbl.nrows,5)

        # Fetch rows over a larger range
        rows_read = ts.read_range(datetime.datetime(2014,5,5,tzinfo=pytz.utc),datetime.datetime(2014,5,6,tzinfo=pytz.utc))

        # Confirm equality
        for idx,p in enumerate(rows_read['price']):
            self.assertEqual(p,rows['price'][idx])

        # Fetch rows over the smallest possible range
        rows_read = ts.read_range(datetime.datetime(2014,5,5,1,1,1,100*1000,tzinfo=pytz.utc),
                                  datetime.datetime(2014,5,5,1,1,1,100*1000,tzinfo=pytz.utc))

        # Confirm equality
        for idx,p in enumerate(rows_read['price']):
            self.assertEqual(p,rows['price'][idx])


    @mock.patch.object(tstables.TsTable, 'MAX_FULL_PARTITION_READ_SIZE', 1)
    @mock.patch.object(tables.Table, 'read_where')
    @mock.patch.object(tables.Table, 'read')
    def test_read_using_read_where(self, mock_read, mock_read_where):

        csv = u"""2014-05-05T01:01:01.100Z,1
2014-05-05T01:01:01.100Z,2
2014-05-05T01:01:01.100Z,3
2014-05-05T01:01:01.100Z,4
2014-05-05T01:01:01.100Z,5"""

        sfile = StringIO(csv)

        # Note: don't need the 'timestamp' column in the dtype param here because it will become the DatetimeIndex.
        rows = pandas.read_csv(sfile,parse_dates=[0],index_col=0,names=['timestamp', 'price'],dtype={'price': 'i4'})

        ts = self.h5_file.create_ts('/','EURUSD',description=Price)
        ts.append(rows)

        # Inspect to ensure that data has been stored correctly
        tbl = ts.root_group.y2014.m05.d05.ts_data

        self.assertEqual(tbl.nrows,5)

        # Table.read_where is a mock, so we need to give it a return value
        mock_read_where.return_value = numpy.ndarray(shape=0,dtype=[('timestamp', '<i8'), ('price', '<i4')])

# [The remainder of this file was lost in the source dump.]
--------------------------------------------------------------------------------
/src/tstables/tstable.py:
--------------------------------------------------------------------------------
# [The beginning of this file (imports, the TsTable class definition, and several
#  methods) was lost in the source dump; the text resumes mid-method, inside the
#  partition read helper whose read_where query survives.]

                '(timestamp >= {0}) & (timestamp <= {1})'.format(
                self.__dt_to_ts(start_dt),self.__dt_to_ts(end_dt)))

    def __fetch_first_table(self):
        y_group = self.root_group._f_list_nodes()[0]
        m_group = y_group._f_list_nodes()[0]
        d_group = m_group._f_list_nodes()[0]
        return d_group.ts_data

    def __fetch_last_table(self):
        y_group = sorted(self.root_group._f_list_nodes(), key=lambda x: x._v_name)[-1]
        m_group = sorted(y_group._f_list_nodes(), key=lambda x: x._v_name)[-1]
        d_group = sorted(m_group._f_list_nodes(), key=lambda x: x._v_name)[-1]
        return d_group.ts_data

    def __get_max_ts(self):
        max_group_dt = None
        max_ts = None
        for group in self.root_group._f_walk_groups():
            m = re.search('y([0-9]{4})/m([0-9]{2})/d([0-9]{2})',group._v_pathname)
            if m is not None:
                group_dt = datetime.date(int(m.group(1)),int(m.group(2)),int(m.group(3)))
            else:
                continue

            if (max_group_dt is not None) and (max_group_dt < group_dt):

                if group.ts_data.nrows == 0:
                    group_max_ts = None
                else:
                    group_max_ts = group.ts_data.cols.timestamp[-1]

                if (group_max_ts is not None) and (max_ts is None or max_ts < group_max_ts):
                    max_ts = group_max_ts
                    max_group_dt = group_dt
            elif (max_group_dt is None):

                if group.ts_data.nrows == 0:
                    group_max_ts = None
                else:
                    group_max_ts = group.ts_data.cols.timestamp[-1]

                if (group_max_ts is not None):
                    max_ts = group_max_ts
                    max_group_dt = group_dt


        return max_ts

    def __get_min_ts(self):
        min_group_dt = None
        min_ts = None
        for group in self.root_group._f_walk_groups():
            m = re.search('y([0-9]{4})/m([0-9]{2})/d([0-9]{2})',group._v_pathname)
            if m is not None:
                group_dt = datetime.date(int(m.group(1)),int(m.group(2)),int(m.group(3)))
            else:
                continue

            if (min_group_dt is not None) and (min_group_dt > group_dt):
                if group.ts_data.nrows == 0:
                    group_min_ts = None
                else:
                    group_min_ts = group.ts_data.cols.timestamp[0]

                if (group_min_ts is not None) and (min_ts is None or min_ts > group_min_ts):
                    min_ts = group_min_ts
                    min_group_dt = group_dt
            elif (min_group_dt is None):

                if group.ts_data.nrows == 0:
                    group_min_ts = None
                else:
                    group_min_ts = group.ts_data.cols.timestamp[0]

                if (group_min_ts is not None):
                    min_ts = group_min_ts
                    min_group_dt = group_dt


        return min_ts

    def min_dt(self):
        return self.__ts_to_dt(self.__get_min_ts())

    def max_dt(self):
        return self.__ts_to_dt(self.__get_max_ts())

    def read_range(self,start_dt,end_dt,as_pandas_dataframe=True):
        # Convert start_dt and end_dt to UTC if they are naive
        if start_dt.tzinfo is None:
            start_dt = pytz.utc.localize(start_dt)
        if end_dt.tzinfo is None:
            end_dt = pytz.utc.localize(end_dt)


        if start_dt > end_dt:
            raise AttributeError('start_dt must be <= end_dt')


        partitions = self.__dtrange_to_partition_ranges(start_dt,end_dt)
        sorted_pkeys = sorted(partitions.keys())

        # Start with an empty array
        result = numpy.ndarray(shape=0,dtype=self.__v_dtype())

        for p in sorted_pkeys:
            result = numpy.concatenate(
                (result,self.__fetch_rows_from_partition(p,start_dt,end_dt)))

        # Turn into a pandas DataFrame with a timeseries index
        if as_pandas_dataframe:
            result = pandas.DataFrame.from_records(result,
                                                   index=result['timestamp'].astype('datetime64[ms]'),
                                                   exclude=['timestamp'])

        return result

    def append(self,rows,convert_strings=False):
        # This part is specific to pandas support. If rows is a pandas DataFrame, convert it to a
        # format suitable for PyTables
        if rows.__class__ == pandas.core.frame.DataFrame:
            if rows.empty:
                return # Do nothing if we are appending nothing

            # pandas 0.20+ moved DatetimeIndex to pandas.core.indexes.datetimes
            if rows.index.__class__ != pandas.core.indexes.datetimes.DatetimeIndex:
                raise ValueError('when rows is a DataFrame, the index must be a DatetimeIndex.')

            # Convert to records
            records = rows.to_records(index=True)

            # Need to make two type conversions:
            # 1. Pandas stores strings internally as variable-length strings, which become objects in
            #    NumPy. PyTables can't store those in a StringCol, so this converts them to fixed-length
            #    strings if convert_strings is set to True.
            # 2. Need to convert the timestamp to datetime64[ms] (milliseconds)

            dest_dtype = self.__fetch_first_table().description._v_dtype

            new_descr = []
            existing_descr = records.dtype.descr

            for idx,d in enumerate(existing_descr):
                if existing_descr[idx][1] == '|O8' and dest_dtype[idx].char == 'S' and convert_strings:
                    # records dtype is something like |O8 and dest dtype is a string
                    new_descr.append((existing_descr[idx][0], dest_dtype[idx]))
                elif idx == 0:
                    # Make sure timestamp is in milliseconds
                    new_descr.append((existing_descr[idx][0], '<M8[ms]'))
                else:
                    # [reconstructed: the original lines from here through the ValueError below were
                    #  lost in the source dump; this keeps the remaining columns' dtypes unchanged]
                    new_descr.append(existing_descr[idx])

            rows = records.astype(numpy.dtype(new_descr))

        try:
            # [reconstructed, following the corresponding PyTables conversion code]
            wbufRA = numpy.rec.array(rows, dtype=self.__v_dtype())
        except Exception as exc:
            raise ValueError("rows parameter cannot be converted into a recarray object compliant "
                             "with table format '%s'. The error was: <%s>" % (str(self), exc))

        # Confirm that first column is Int64. This is an additional constraint of TsTables.
        if not wbufRA.dtype[0] == numpy.dtype('int64'):
            raise ValueError("first column must be of type numpy.int64.")

        # We also need to confirm that the rows are sorted by timestamp. This is an additional
        # constraint of TsTables.
        if not (numpy.diff(wbufRA['timestamp']) >= 0).all():
            raise ValueError("timestamp column must be sorted in ascending order.")

        # Array is confirmed sorted at this point, so min and max are easy to get
        min_ts = wbufRA[0][0]
        max_ts = wbufRA[-1][0]

        # Confirm that min is >= the TsTable's max_ts
        if min_ts < (self.__get_max_ts() or numpy.iinfo('int64').min):
            raise ValueError("rows start prior to the end of existing rows, so they cannot be "
                             "appended.")

        # wbufRA is ready to be inserted at this point. Chop it up into partitions.
        min_dt = self.__ts_to_dt(min_ts)
        max_dt = self.__ts_to_dt(max_ts)
        possible_partitions = self.__dtrange_to_partition_ranges(min_dt,max_dt)

        sorted_pkeys = sorted(possible_partitions.keys())

        # For each partition, we are splitting on the end date
        split_on_idx = []
        for p in sorted_pkeys:
            # p_max_ts is the maximum timestamp that should be included in this partition. We need
            # the index just past the last row whose timestamp is <= p_max_ts, which is exactly
            # what searchsorted with side='right' returns.
            p_max_ts = self.__dt_to_ts(possible_partitions[p][1])
            split_on = numpy.searchsorted(wbufRA['timestamp'], p_max_ts, side='right')
            split_on_idx.append(split_on)

        # Now, split the array
        split_wbufRA = numpy.split(wbufRA,split_on_idx)

        # Save each partition
        for idx,p in enumerate(sorted_pkeys):
            self.__append_rows_to_partition(p,split_wbufRA[idx])
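    # ------------------------------------------------------------------
    # Illustration (not part of the library): how the searchsorted/split
    # combination above behaves on a toy sorted timestamp array.
    #
    #   >>> import numpy
    #   >>> timestamps = numpy.array([10, 50, 99, 120, 150])
    #   >>> partition_ends = [99, 199]
    #   >>> [numpy.searchsorted(timestamps, p, side='right') for p in partition_ends]
    #   [3, 5]
    #   >>> numpy.split(timestamps, [3, 5])
    #   [array([ 10,  50,  99]), array([120, 150]), array([], dtype=int64)]
    #
    # Each partition gets the slice up to the index just past its last
    # valid timestamp. The final remainder is always empty (the last
    # split index equals the array length), and the loop above only
    # indexes the first len(sorted_pkeys) pieces anyway.
    # ------------------------------------------------------------------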
    @staticmethod
    def __partition_date_to_path_array(partition_dt):
        """Converts a partition date to an array of partition names
        """

        return [partition_dt.strftime('y%Y'),partition_dt.strftime('m%m'),partition_dt.strftime('d%d')]

    def __append_rows_to_partition(self,partition_dt,rows):
        """Appends rows to a partition (which might not exist yet, and will then be created)

        The rows argument is assumed to be sorted and *only* contain rows that have timestamps that
        are valid for this partition.
        """

        ts_data = self.__fetch_or_create_partition_table(partition_dt)
        ts_data.append(rows)

    def __fetch_partition_group(self,partition_dt):
        """Fetches a partition group, or returns `False` if the partition group does not exist
        """

        try:
            p_array = self.__partition_date_to_path_array(partition_dt)
            return self.root_group._f_get_child(p_array[0])._f_get_child(p_array[1])._f_get_child(p_array[2])
        except (KeyError,tables.NoSuchNodeError):
            return False

    def __create_partition(self,partition_dt):
        """Creates a partition, including parent groups (if they don't exist) and the data table
        """

        p_array = self.__partition_date_to_path_array(partition_dt)

        # For each component, fetch the group or create it
        # Year
        try:
            y_group = self.root_group._f_get_child(p_array[0])
        except tables.NoSuchNodeError:
            y_group = self.file.create_group(self.root_group,p_array[0])

        # Month
        try:
            m_group = y_group._f_get_child(p_array[1])
        except tables.NoSuchNodeError:
            m_group = self.file.create_group(y_group,p_array[1])

        # Day
        try:
            d_group = m_group._f_get_child(p_array[2])
        except tables.NoSuchNodeError:
            d_group = self.file.create_group(m_group,p_array[2])

        # We need to create the table in the day group
        ts_data = self.file.create_table(d_group,'ts_data',self.table_description,self.table_title,
            self.table_filters, self.table_expectedrows, self.table_chunkshape, self.table_byteorder)

        # Need to save this as an attribute because it doesn't seem to be saved anywhere
        ts_data.attrs._TS_TABLES_EXPECTEDROWS_PER_PARTITION = self.table_expectedrows

        return ts_data

    def __fetch_or_create_partition_table(self,partition_dt):
        group = self.__fetch_partition_group(partition_dt)
        if group:
            return group._f_get_child('ts_data')
        else:
            return self.__create_partition(partition_dt)
--------------------------------------------------------------------------------
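For reference, the partition layout that the code above builds (nested `yYYYY/mMM/dDD` groups, each holding one `ts_data` table) can be inspected with plain PyTables. A sketch, assuming the `bpi.h5` file from the examples:

```python
import tables

f = tables.open_file('bpi.h5', 'r')
# Each daily partition is one Table named 'ts_data' under its yYYYY/mMM/dDD group
for node in f.walk_nodes('/BPI', classname='Table'):
    print(node._v_pathname)   # e.g. /BPI/y2014/m01/d04/ts_data
f.close()
```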