├── benchmark.py
├── src
│   └── tstables
│       ├── _version.py
│       ├── tests
│       │   ├── __init__.py
│       │   ├── test_tstable_static.py
│       │   └── test_tstable_file.py
│       ├── __init__.py
│       ├── file.py
│       ├── group.py
│       ├── benchmark.py
│       └── tstable.py
├── release.py
├── setup.cfg
├── .gitignore
├── EXAMPLES.md
├── setup.py
└── README.md

--------------------------------------------------------------------------------
/benchmark.py:
--------------------------------------------------------------------------------
import tstables
tstables.Benchmark.main()
--------------------------------------------------------------------------------
/src/tstables/_version.py:
--------------------------------------------------------------------------------
# Store the version here so that:
# 1) we don't load dependencies by storing it in __init__.py
# 2) we can import it in setup.py for the same reason
# 3) we can import it into your module
__version__ = '0.0.15'
--------------------------------------------------------------------------------
/release.py:
--------------------------------------------------------------------------------
# Converts README.md to README.txt (in reStructuredText format), builds the package, and uploads it to PyPI

import pypandoc
import os

rst = pypandoc.convert('README.md', 'rst')
f = open('README.txt','w+')
f.write(rst)
f.close()
os.system("python3 setup.py register sdist upload")
os.remove('README.txt')
--------------------------------------------------------------------------------
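Note: the release script above targets older tooling. Newer pypandoc releases replace `convert()` with `convert_file()`, and PyPI no longer accepts `setup.py register sdist upload`; uploads go through twine. A sketch of the same flow with current tools (an assumption about the environment, not part of this repo):

```python
import pypandoc
import os

# convert_file replaces the removed pypandoc.convert
rst = pypandoc.convert_file('README.md', 'rst')
with open('README.txt', 'w+') as f:
    f.write(rst)

# Build the source distribution, then upload it with twine
os.system("python3 setup.py sdist")
os.system("twine upload dist/*")
os.remove('README.txt')
```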
--------------------------------------------------------------------------------
/src/tstables/tests/__init__.py:
--------------------------------------------------------------------------------
import unittest

from tstables.tests import test_tstable_static
from tstables.tests import test_tstable_file
#from tstables import tstable

def suite():
    import doctest
    suite = unittest.TestSuite()
    #suite.addTests(doctest.DocTestSuite(tstable))
    suite.addTests(test_tstable_static.suite())
    suite.addTests(test_tstable_file.suite())
    return suite

if __name__ == '__main__':
    unittest.TextTestRunner(verbosity=2).run(suite())
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[global]
# Just silently do your job
quiet = 1

[easy_install]
# Where to look for third-party dependencies
find_links = thirdparty

[build_py]
# No optimization for now
optimize = 0
# Force everything to be rebuilt
force = True

[egg_info]
# We are doing a development build
# tag_build = dev
# Include the date in the file name?
tag_date = 0

[bdist_egg]
# Do not ship source files inside the binary egg
exclude-source-files = True

[rotate]
# Keep only the last 10 eggs, clean up older ones
match = .egg
keep = 10
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/
--------------------------------------------------------------------------------
/src/tstables/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

########################################################################
#
# License: MIT
#
# $Id$
#
########################################################################

"""TsTables is a Python package to store time series data in HDF5 files using PyTables and Pandas.

:URL: http://afiedler.github.io/tstables

TsTables stores time series data into daily partitions and provides functions to query for
subsets of data across partitions.

Its goals are to support a workflow where large amounts (gigabytes) of time series data are
appended periodically to an HDF5 file, and need to be read many times (quickly) for analytical
models and research.
"""
from ._version import __version__
from tstables.tstable import TsTable
from tstables.file import create_ts
from tstables.group import timeseries_repr
from tstables.group import timeseries_str
from tstables.group import get_timeseries
from tstables.benchmark import Benchmark
import tables

# Augment the PyTables File class
tables.File.create_ts = create_ts

# Patch the Group class to return time series __str__ and __repr__
old_repr = tables.Group.__repr__
old_str = tables.Group.__str__

tables.Group.__repr__ = timeseries_repr
tables.Group.__str__ = timeseries_str

# Add _f_get_timeseries to Group
tables.Group._f_get_timeseries = get_timeseries
--------------------------------------------------------------------------------
/src/tstables/file.py:
--------------------------------------------------------------------------------
import tables
import tstables
import datetime
import numpy

def create_ts(self,where,name,description=None,title="",filters=None,
              expectedrows_per_partition=10000,chunkshape=None,
              byteorder=None,createparents=False):

    # Check the description to make sure the first column is "timestamp" with type Int64
    first_col_name = None
    for k in description.columns.keys():
        if description.columns[k]._v_pos == 0:
            first_col_name = k

    if first_col_name != 'timestamp':
        raise AttributeError("first column must be called 'timestamp' and have type Int64")

    if description.columns[first_col_name].dtype != numpy.dtype('int64'):
        raise AttributeError("first column must be called 'timestamp' and have type Int64")

    # The parent node of the time series
    tsnode = self.create_group(where,name,title,filters,createparents)

    try:
        # Decorate with TsTables attributes
        tsnode._v_attrs._TS_TABLES_CLASS='TIMESERIES'
        tsnode._v_attrs._TS_TABLES_VERSION='0.0.1'

        ts = tstables.TsTable(self,tsnode,description,title,filters,expectedrows_per_partition,
                              chunkshape,byteorder)

        # Need to create one partition to "save" the time series. This creates a new table to persist
        # the table description
        ts._TsTable__create_partition(datetime.datetime.utcnow().date())
    except:
        # Make sure that the group is deleted if an exception is raised
        self.remove_node(tsnode,recursive=True)
        raise

    return ts
--------------------------------------------------------------------------------
/EXAMPLES.md:
--------------------------------------------------------------------------------
# TsTables Examples

This document shows you a few examples of how to use TsTables to store and access data.

## Basic Examples

### Fetch the daily EURUSD exchange rate from FRED

This example fetches the daily EURUSD exchange rate from FRED, the St. Louis Fed's online database
of economic data. TsTables isn't really designed for storing daily data, but this simple example
illustrates how you can get a pandas DataFrame and append it to a time series.

```python
import tables
import tstables
import pandas.io.data as web
from datetime import *

# Create a class to describe the table structure. The column "timestamp" is required, and must be
# in the first position (pos=0) and have the type Int64.
class prices(tables.IsDescription):
    timestamp = tables.Int64Col(pos=0)
    price = tables.Float64Col(pos=1)

f = tables.open_file('eurusd.h5','a')

# This creates the time series, which is just a group called 'EURUSD' in the root of the HDF5 file.
ts = f.create_ts('/','EURUSD',prices)

start = datetime(2010,1,1)
end = datetime(2014,5,2)

euro = web.DataReader("DEXUSEU", "fred", start, end)
ts.append(euro)
f.flush()

# Now, read in a month of data
read_start_dt = datetime(2014,1,1)
read_end_dt = datetime(2014,1,31)

jan = ts.read_range(read_start_dt,read_end_dt)
```
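Note: `pandas.io.data` has since been removed from pandas itself; the same functionality now lives in the separate [pandas-datareader](https://github.com/pydata/pandas-datareader) package. A sketch of the equivalent fetch with that package (assuming it is installed):

```python
from pandas_datareader import data as web
from datetime import datetime

start = datetime(2010, 1, 1)
end = datetime(2014, 5, 2)

# Same FRED series, fetched with the standalone pandas-datareader package
euro = web.DataReader("DEXUSEU", "fred", start, end)
```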
### Load one month of minutely bitcoin price data

This example loads one month of the minutely [Bitcoin Price Index](http://coindesk.com/price) from
CoinDesk. First, you'll need to download
[this CSV file](http://afiedler.github.io/tstables/bpi_2014_01.csv). This example assumes that you've
stored the CSV file in the current directory.

```python
import tables
import tstables
import pandas
from datetime import *

# Class to use as the table description
class BpiValues(tables.IsDescription):
    timestamp = tables.Int64Col(pos=0)
    bpi = tables.Float64Col(pos=1)

# Use pandas to read in the CSV data
bpi = pandas.read_csv('bpi_2014_01.csv',index_col=0,names=['date','bpi'],parse_dates=True)

f = tables.open_file('bpi.h5','a')

# Create a new time series
ts = f.create_ts('/','BPI',BpiValues)

# Append the BPI data
ts.append(bpi)

# Read in some data
read_start_dt = datetime(2014,1,4,12,00)
read_end_dt = datetime(2014,1,4,14,30)

rows = ts.read_range(read_start_dt,read_end_dt)

# `rows` will be a pandas DataFrame with a DatetimeIndex.
```
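`read_range` localizes naive datetimes to UTC with pytz before querying (see `read_range` in `tstable.py`), so naive and explicitly UTC-localized datetimes are interchangeable. A minimal sketch, assuming the `ts` handle from the example above:

```python
import pytz
from datetime import datetime

# Naive datetimes are interpreted as UTC by read_range
naive_rows = ts.read_range(datetime(2014,1,4,12,00), datetime(2014,1,4,14,30))

# Passing explicitly UTC-localized datetimes returns the same rows
utc_start = pytz.utc.localize(datetime(2014,1,4,12,00))
utc_end = pytz.utc.localize(datetime(2014,1,4,14,30))
explicit_rows = ts.read_range(utc_start, utc_end)
```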
--------------------------------------------------------------------------------
/src/tstables/group.py:
--------------------------------------------------------------------------------
import tables
import tstables
import datetime

def timeseries_repr(self):
    """Return a detailed string representation of the group or time series.

    Examples
    --------

    ::

        >>> f = tables.open_file('data/test.h5')
        >>> f.root.group0
        /group0 (Group) 'First Group'
          children := ['tuple1' (Table), 'group1' (Group)]

    ::

        >>> f = tables.open_file('data/test_timeseries.h5')
        >>> f.root.timeseries0
        /timeseries0 (Group/Timeseries) 'A group that is also a time series'

    """
    try:
        tstables_class = self._v_attrs._TS_TABLES_CLASS

        # Additional representation (maybe min timestamp, max timestamp) goes here
        # Don't include all of the children here!

        return "%s" % str(self)

    except AttributeError:
        rep = [
            '%r (%s)' % (childname, child.__class__.__name__)
            for (childname, child) in self._v_children.items()
        ]
        childlist = '[%s]' % (', '.join(rep))

        return "%s\n  children := %s" % (str(self), childlist)



def timeseries_str(self):
    """Return a short string representation of the group or time series.

    Examples
    --------

    ::

        >>> f = tables.open_file('data/test.h5')
        >>> print(f.root.group0)
        /group0 (Group) 'First Group'

    ::

        >>> f = tables.open_file('data/test_timeseries.h5')
        >>> print(f.root.timeseries0)
        /timeseries0 (Group/Timeseries) 'A group that is also a time series'

    """

    try:
        tstables_class = self._v_attrs._TS_TABLES_CLASS
        classname = "%s/Timeseries" % self.__class__.__name__
    except AttributeError:
        classname = self.__class__.__name__

    pathname = self._v_pathname
    title = self._v_title
    return "%s (%s) %r" % (pathname, classname, title)

def get_timeseries(self):
    try:
        tstables_class = self._v_attrs._TS_TABLES_CLASS
    except AttributeError:
        return None

    ts_table = tstables.TsTable(self._v_file,self,None)

    # Need to determine the description, title, filters, expectedrows_per_partition,
    # chunkshape, byteorder
    ts_data = ts_table._TsTable__fetch_first_table()
    ts_table.table_description = ts_data.description
    ts_table.table_title = ts_data.title
    ts_table.table_filters = ts_data.filters
    ts_table.table_chunkshape = ts_data.chunkshape
    ts_table.table_byteorder = ts_data.byteorder
    ts_table.table_expectedrows_per_partition = ts_data.attrs._TS_TABLES_EXPECTEDROWS_PER_PARTITION

    return ts_table
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages
import os

# Handle the long description (read from README.txt, which is created by converting README.md)
long_description = ('TsTables is a Python package to store time series data in HDF5 files using '
                    'PyTables. It stores time series data into daily partitions and provides functions to query for '
                    'subsets of data across partitions.\n'
                    'Its goals are to support a workflow where large amounts (gigabytes) of time series data are '
                    'appended periodically to an HDF5 file, and need to be read many times (quickly) for analytical '
                    'models and research.')

if os.path.exists('README.txt'):
    long_description = open('README.txt').read()

exec(open('src/tstables/_version.py').read())


setup(

    # Package structure
    #
    # find_packages searches through a set of directories
    # looking for packages
    packages = find_packages('src', exclude = ['*.tests', '*.tests.*', 'tests.*', 'tests']),

    # package_dir directive maps package names to directories.
    # package_name:package_directory
    package_dir = {'': 'src'},

    # Not all packages are capable of running in compressed form,
    # because they may expect to be able to access either source
    # code or data files as normal operating system files.
    zip_safe = True,

    # Entry points
    #
    # install the executable
    entry_points = {
        'console_scripts': ['tstables_benchmark = tstables:Benchmark.main']
    },

    # Dependencies
    #
    # Dependency expressions have a package name on the left-hand
    # side, a version on the right-hand side, and a comparison
    # operator between them, e.g.
    # == an exact version, >= that version or higher
    install_requires = ['tables>=3.1.1', 'pandas>=0.13.1'],

    # Tests
    #
    # Tests must be wrapped in a unittest test suite by either a
    # function, a TestCase class or method, or a module or package
    # containing TestCase classes. If the named suite is a package,
    # any submodules and subpackages are recursively added to the
    # overall test suite.
    test_suite = 'tstables.tests.suite',
    # The unit tests themselves import these packages
    tests_require = ['mock', 'pytz'],

    name = "tstables",
    version = __version__,

    # metadata for upload to PyPI
    author = "Andy Fiedler",
    author_email = "andy@andyfiedler.com",
    description = "Handles large time series using PyTables and Pandas",
    license = "MIT",
    keywords = "time series high frequency HDF5",
    url = "http://github.com/afiedler/tstables", # project home page, if any
    long_description = long_description
    # could also include download_url, classifiers, etc.
)
--------------------------------------------------------------------------------
/src/tstables/benchmark.py:
--------------------------------------------------------------------------------
import tables
import tstables
import tempfile
import datetime
import pytz
import pandas
import numpy
import timeit
import os

# Class to define record structure
class Price(tables.IsDescription):
    timestamp = tables.Int64Col(pos=0)
    price = tables.Int32Col(pos=1)

class Benchmark:
    @classmethod
    def log_me(cls, s):
        cls.log.write(s)
        print(s)

    @classmethod
    def write_data(cls):
        # This simple benchmark creates an HDF5 file with a time series. It then loads about one year of random
        # secondly data into it, closes it, and reads it back.

        cls.log_me("Started benchmark at %s\n\n" % datetime.datetime.now())

        cls.temp_file = tempfile.mkstemp('.h5')[1]
        # 'w' (re)creates the file as a fresh HDF5 file; mkstemp only makes an empty placeholder
        cls.h5_file = tables.open_file(cls.temp_file,'w')
        ts = cls.h5_file.create_ts('/','EURUSD',description=Price)

        start_dt = datetime.datetime(2014,1,1,tzinfo=pytz.utc)

        # 2,678,400 is the number of seconds in 31 days, so each append below adds one 31-day "month";
        # twelve of them produce slightly more than a year of data.
        index = pandas.date_range(start_dt, periods=2678400, freq='S')
        values = numpy.int32(numpy.random.rand(2678400, 1)*numpy.iinfo(numpy.int32).max)
        df = pandas.DataFrame(values,index=index,columns=['price'],dtype=numpy.dtype('i4'))

        append_times = []
        for month in range(0,12):
            t = timeit.timeit(lambda: ts.append(df), number=1)
            df.index = df.index+pandas.offsets.Day(31) # Shift index to next month
            cls.log_me(" * finished appending month {0}\n".format(month))
            append_times.append(t)

        cls.log_me("Appended 12 months of data:\n")
        for a in append_times:
            cls.log_me(" * {0} seconds\n".format(a))

        cls.log_me("average {0} seconds, total {1} seconds\n\n".format(sum(append_times)/len(append_times),
                                                                       sum(append_times)))


        # Now, close the file and re-open it
        cls.h5_file.close()

    @classmethod
    def read_data(cls):
        # report the file size
        h5_size = os.stat(cls.temp_file).st_size
        cls.log_me("file size (bytes): {0}\n".format(h5_size))

        cls.h5_file = tables.open_file(cls.temp_file,'r')
        ts = cls.h5_file.root.EURUSD._f_get_timeseries()

        # Now, read random one hour increments

        def read_random_hour(ts,min_dt,max_dt):
            rnd = numpy.random.rand(1)[0]
            start_offset = datetime.timedelta(seconds=(max_dt - min_dt - datetime.timedelta(hours=1)).total_seconds() * rnd)
            start_dt = min_dt + start_offset
            end_dt = start_dt + datetime.timedelta(hours=1)

            ts.read_range(start_dt,end_dt)

        min_dt = ts.min_dt()
        max_dt = ts.max_dt()

        read_time = timeit.timeit(lambda: read_random_hour(ts, min_dt, max_dt), number=100)

        # timeit returns the total time for all repetitions, so divide to get the average
        cls.log_me("average time to read one hour of data (100 repetitions): {0} seconds\n".format(read_time/100))


    @classmethod
    def main(cls):
        cls.log = open('benchmark.txt', 'w')

        cls.write_data()

        cls.read_data()

        # Finished!
        cls.log.close()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# TsTables

TsTables is a Python package to store time series data in HDF5 files using PyTables. It stores time
series data into daily partitions and provides functions to query for subsets of data across
partitions.

Its goals are to support a workflow where large amounts (gigabytes) of time series data are
appended periodically to an HDF5 file, and need to be read many times (quickly) for analytical models
and research.

## Example

This example reads in minutely bitcoin price data and then fetches a range of that data. For this
example in full, and others, see [EXAMPLES.md](EXAMPLES.md).
```python
# Class to use as the table description
class BpiValues(tables.IsDescription):
    timestamp = tables.Int64Col(pos=0)
    bpi = tables.Float64Col(pos=1)

# Use pandas to read in the CSV data
bpi = pandas.read_csv('bpi_2014_01.csv',index_col=0,names=['date','bpi'],parse_dates=True)

f = tables.open_file('bpi.h5','a')

# Create a new time series
ts = f.create_ts('/','BPI',BpiValues)

# Append the BPI data
ts.append(bpi)

# Read in some data
read_start_dt = datetime(2014,1,4,12,00)
read_end_dt = datetime(2014,1,4,14,30)

rows = ts.read_range(read_start_dt,read_end_dt)

# `rows` will be a pandas DataFrame with a DatetimeIndex.
```

Here is how to open a pre-existing `bpi.h5` HDF5 file and get that time series from it.

```python
f = tables.open_file('bpi.h5','r')
ts = f.root.BPI._f_get_timeseries()

# Read in some data
read_start_dt = datetime(2014,1,4,12,00)
read_end_dt = datetime(2014,1,4,14,30)

rows = ts.read_range(read_start_dt,read_end_dt)
```

## Running unit tests

You can run the unit test suite from the command line at the root of the repository:

`python setup.py test`


## Preliminary benchmarks

The main goal of TsTables is to make it very fast to read subsets of data, given a date range. TsTables currently
includes a simple benchmark to track progress towards that goal. To run it, after installing the package, you can run
`tstables_benchmark` from the command line, or you can import the package in a Python console and run it directly:

```python
import tstables
tstables.Benchmark.main()
```

Running the benchmark both prints results to the screen and saves them in `benchmark.txt`.

The benchmark loads one year of random secondly data (just the timestamp column and a 32-bit integer "price" column)
into a file, and then reads random one-hour chunks of data.

Here are some current benchmark results for TsTables (from a MacBook Pro with an SSD):

Metric                                                       | Results
-------------------------------------------------------------|-----------------
Append one month of data (2.67 million rows)                 | 0.711 seconds
Fetch one hour of data into memory                           | 0.305 seconds
File size (one year of data, 32 million rows, uncompressed)  | 391.6 MB

HDF5 supports zlib and other compression algorithms, which can be enabled through PyTables to reduce the file
size. Without compression, the HDF5 file size is approximately 1.8% larger than the raw data in binary form, a
drastically lower overhead than CSV files.
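For example, a compressed time series can be created by passing a PyTables `Filters` instance through the `filters` argument of `create_ts`. A sketch, reusing the `BpiValues` description from the example above (the zlib settings are illustrative, not a recommendation):

```python
import tables
import tstables

# zlib at level 5 is illustrative; any PyTables-supported complib/complevel works
zlib_filters = tables.Filters(complib='zlib', complevel=5)

f = tables.open_file('bpi_compressed.h5', 'a')
ts = f.create_ts('/', 'BPI', BpiValues, filters=zlib_filters)
```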
## Contributing

If you are interested in the project (to contribute or to hear about updates), email Andy Fiedler
at <andy@andyfiedler.com> or submit a pull request.
--------------------------------------------------------------------------------
/src/tstables/tests/test_tstable_static.py:
--------------------------------------------------------------------------------
import tstables
import unittest
import datetime
import pytz

class TsTableStaticTestCase(unittest.TestCase):

    # One millisecond: the resolution of TsTables timestamps, and the gap between the end of one
    # partition and the start of the next
    TIME_EPS = datetime.timedelta(microseconds=1*1000)

    def test_partition_range_same_time(self):
        # 2014-04-01 01:00:00 UTC
        start_dt = datetime.datetime(2014,4,1,1,0,tzinfo=pytz.utc)

        # End at the exact same time
        end_dt = start_dt
        parts = tstables.TsTable._TsTable__dtrange_to_partition_ranges(start_dt,end_dt)

        # There should be only one partition, and the range should equal (start_dt, end_dt)
        self.assertEqual(parts[start_dt.date()], (start_dt,end_dt))
        self.assertEqual(len(parts.keys()), 1)

    def test_partition_range_same_day(self):
        # 2014-04-01 01:00:00 UTC
        start_dt = datetime.datetime(2014,4,1,1,0,tzinfo=pytz.utc)

        # 2014-04-01 04:00:00 UTC
        end_dt = datetime.datetime(2014,4,1,4,0,tzinfo=pytz.utc)
        parts = tstables.TsTable._TsTable__dtrange_to_partition_ranges(start_dt,end_dt)

        # There should be only one partition, and the range should equal (start_dt, end_dt)
        self.assertEqual(parts[start_dt.date()], (start_dt,end_dt))
        self.assertEqual(len(parts.keys()),1)

    def test_partition_range_two_day(self):
        # 2014-04-01 01:00:00 UTC
        start_dt = datetime.datetime(2014,4,1,1,0,tzinfo=pytz.utc)

        # 2014-04-02 04:00:00 UTC
        end_dt = datetime.datetime(2014,4,2,4,0,tzinfo=pytz.utc)
        parts = tstables.TsTable._TsTable__dtrange_to_partition_ranges(start_dt,end_dt)

        # Should be two parts: [2014-04-01 01:00:00 UTC, 2014-04-01 23:59:59.999 UTC] and
        # [2014-04-02 00:00:00 UTC, 2014-04-02 04:00:00 UTC]
        self.assertEqual(parts[start_dt.date()][0], start_dt)
        self.assertEqual(parts[start_dt.date()][1], datetime.datetime(2014,4,2,0,0,tzinfo=pytz.utc) - self.TIME_EPS)
        self.assertEqual(parts[end_dt.date()][0], datetime.datetime(2014,4,2,0,0,tzinfo=pytz.utc))
        self.assertEqual(parts[end_dt.date()][1],end_dt)
        self.assertEqual(len(parts.keys()),2)

    def test_partition_range_three_day(self):
        # 2014-04-01 01:00:00 UTC
        start_dt = datetime.datetime(2014,4,1,1,0,tzinfo=pytz.utc)

        # 2014-04-03 04:00:00 UTC
        end_dt = datetime.datetime(2014,4,3,4,0,tzinfo=pytz.utc)
        parts = tstables.TsTable._TsTable__dtrange_to_partition_ranges(start_dt,end_dt)

        # Should be three parts: [2014-04-01 01:00:00 UTC, 2014-04-01 23:59:59.999 UTC],
        # [2014-04-02 00:00:00 UTC, 2014-04-02 23:59:59.999 UTC] and
        # [2014-04-03 00:00:00 UTC, 2014-04-03 04:00:00 UTC]
        self.assertEqual(parts[start_dt.date()][0], start_dt)
        self.assertEqual(parts[start_dt.date()][1], datetime.datetime(2014,4,2,0,0,tzinfo=pytz.utc)-self.TIME_EPS)

        mid_date = start_dt.date() + datetime.timedelta(days=1)
        self.assertEqual(parts[mid_date][0], datetime.datetime(2014,4,2,0,0,tzinfo=pytz.utc))
        self.assertEqual(parts[mid_date][1], datetime.datetime(2014,4,3,0,0,tzinfo=pytz.utc)-self.TIME_EPS)

        self.assertEqual(parts[end_dt.date()][0], datetime.datetime(2014,4,3,0,0,tzinfo=pytz.utc))
        self.assertEqual(parts[end_dt.date()][1], end_dt)
        self.assertEqual(len(parts.keys()),3)

    def test_partition_range_just_cross_boundary(self):
        # 2014-03-31 23:59:59.999 UTC
        start_dt = datetime.datetime(2014,3,31,23,59,59,999*1000,tzinfo=pytz.utc)

        # 2014-04-01 00:00:00.001 UTC
        end_dt = datetime.datetime(2014,4,1,0,0,0,1*1000,tzinfo=pytz.utc)
        parts = tstables.TsTable._TsTable__dtrange_to_partition_ranges(start_dt,end_dt)

        # Should be two parts: [2014-03-31 23:59:59.999 UTC, 2014-03-31 23:59:59.999 UTC] and
        # [2014-04-01 00:00:00 UTC, 2014-04-01 00:00:00.001 UTC]
        self.assertEqual(parts[start_dt.date()][0], start_dt)
        self.assertEqual(parts[start_dt.date()][1], datetime.datetime(2014,4,1,0,0,tzinfo=pytz.utc) - self.TIME_EPS)
        self.assertEqual(parts[end_dt.date()][0], datetime.datetime(2014,4,1,0,0,tzinfo=pytz.utc))
        self.assertEqual(parts[end_dt.date()][1], end_dt)
        self.assertEqual(len(parts.keys()),2)

    def test_dt_to_ts(self):
        # Test 1 - the epoch itself
        dt = datetime.datetime(1970,1,1,tzinfo=pytz.utc)
        ts = tstables.TsTable._TsTable__dt_to_ts(dt)

        self.assertEqual(ts, 0)

        # Test 2 - 1971-01-01T00:00:00.000, one non-leap year after the epoch
        # (365 days * 86,400 seconds * 1,000 ms = 31,536,000,000 ms)
        dt = datetime.datetime(1971,1,1,tzinfo=pytz.utc)
        ts = tstables.TsTable._TsTable__dt_to_ts(dt)

        assert ts == 31536000000

        # Test 3 - 2014-05-05T01:01:01.100
        dt = datetime.datetime(year=2014,month=5,day=5,hour=1,minute=1,second=1,microsecond=100*1000,tzinfo=pytz.utc)
        ts = tstables.TsTable._TsTable__dt_to_ts(dt)

        assert ts == 1399251661100

    def test_ts_to_dt(self):
        # Test 1 - the epoch itself
        ts = 0
        dt = tstables.TsTable._TsTable__ts_to_dt(ts)

        assert dt == datetime.datetime(1970,1,1,tzinfo=pytz.utc)

        # Test 2 - 1971-01-01T00:00:00.000
        ts = 31536000000
        dt = tstables.TsTable._TsTable__ts_to_dt(ts)

        assert dt == datetime.datetime(1971,1,1,tzinfo=pytz.utc)

        # Test 3 - 2014-05-05T01:01:01.100
        ts = 1399251661100
        dt = tstables.TsTable._TsTable__ts_to_dt(ts)

        assert dt == datetime.datetime(year=2014,month=5,day=5,hour=1,minute=1,second=1,microsecond=100*1000,tzinfo=pytz.utc)

    def test_partition_date_to_path_array(self):
        dt = datetime.datetime(2014,5,5,1,1,1,tzinfo=pytz.utc)
        pa = tstables.TsTable._TsTable__partition_date_to_path_array(dt)
        expected = ['y2014','m05','d05']
        for idx,p in enumerate(pa):
            assert p == expected[idx]

def suite():
    loader = unittest.TestLoader()
    suite = unittest.TestSuite()
    suite.addTest(loader.loadTestsFromTestCase(TsTableStaticTestCase))
    return suite

if __name__ == '__main__':
    unittest.TextTestRunner(verbosity=2).run(suite())
--------------------------------------------------------------------------------
/src/tstables/tests/test_tstable_file.py:
--------------------------------------------------------------------------------
import tables
import tstables
import unittest
import datetime
import pytz
import tempfile
try:
    from io import StringIO
except ImportError:
    from cStringIO import StringIO
import os
import pandas
import mock
import numpy

# Class to define record structure
class Price(tables.IsDescription):
    timestamp = tables.Int64Col(pos=0)
    price = tables.Int32Col(pos=1)


class TsTableFileTestCase(unittest.TestCase):

    def setUp(self):
        self.temp_file = tempfile.mkstemp('.h5')[1]
        # 'w' (re)creates the file as a fresh HDF5 file; mkstemp only makes an empty placeholder
        self.h5_file = tables.open_file(self.temp_file,'w')

    def tearDown(self):
        self.h5_file.close()
        os.remove(self.temp_file)


    def test_create_ts(self):
        # Technically, there is a race condition here if you happen to run this at exactly midnight UTC!
        now = datetime.datetime.utcnow()
        self.h5_file.create_ts('/','EURUSD',description=Price)

        # Want to check that:
        # - the group exists
        # - it has a _TS_TABLES_CLASS attribute equal to "TIMESERIES"
        # - it has a table at yYYYY/mMM/dDD/ts_data, where YYYY-MM-DD is today (in UTC)
        # - the dtype is correct
        self.assertEqual(self.h5_file.root.EURUSD.__class__, tables.Group)
        self.assertEqual(self.h5_file.root.EURUSD._v_attrs._TS_TABLES_CLASS,'TIMESERIES')

        path = tstables.TsTable._TsTable__partition_date_to_path_array(now.date())

        ts_data = self.h5_file.root.EURUSD._f_get_child(path[0])._f_get_child(path[1])._f_get_child(
            path[2])._f_get_child('ts_data')

        self.assertEqual(ts_data.attrs._TS_TABLES_EXPECTEDROWS_PER_PARTITION,10000)

        self.assertEqual(ts_data._v_dtype[0],tables.dtype_from_descr(Price)[0])
        self.assertEqual(ts_data._v_dtype[1],tables.dtype_from_descr(Price)[1])


    def test_create_ts_with_invalid_description_incorrect_order(self):
        class InvalidDesc(tables.IsDescription):
            # Positions are out of order here!
            timestamp = tables.Int64Col(pos=1)
            price = tables.Int32Col(pos=0)

        self.assertRaises(AttributeError, self.h5_file.create_ts, '/', 'EURUSD', description=InvalidDesc)

    def test_create_ts_with_invalid_description_incorrect_type(self):
        class InvalidDesc(tables.IsDescription):
            # Type is incorrect here!
            timestamp = tables.Int32Col(pos=0)
            price = tables.Int32Col(pos=1)

        self.assertRaises(AttributeError, self.h5_file.create_ts, '/', 'EURUSD', description=InvalidDesc)

    def test_load_same_timestamp(self):

        # Test data that is multiple rows with the same timestamp
        csv = u"""2014-05-05T01:01:01.100Z,1
2014-05-05T01:01:01.100Z,2
2014-05-05T01:01:01.100Z,3
2014-05-05T01:01:01.100Z,4
2014-05-05T01:01:01.100Z,5"""

        sfile = StringIO(csv)

        # Note: don't need the 'timestamp' column in the dtype param here because it will become the DatetimeIndex.
        rows = pandas.read_csv(sfile,parse_dates=[0],index_col=0,names=['timestamp', 'price'],dtype={'price': 'i4'})

        ts = self.h5_file.create_ts('/','EURUSD',description=Price)
        ts.append(rows)

        # Inspect to ensure that data has been stored correctly
        tbl = ts.root_group.y2014.m05.d05.ts_data

        self.assertEqual(tbl.nrows,5)

        # Fetch rows over a larger range
        rows_read = ts.read_range(datetime.datetime(2014,5,5,tzinfo=pytz.utc),datetime.datetime(2014,5,6,tzinfo=pytz.utc))

        # Confirm equality
        for idx,p in enumerate(rows_read['price']):
            self.assertEqual(p,rows['price'][idx])

        # Fetch rows over the smallest possible range
        rows_read = ts.read_range(datetime.datetime(2014,5,5,1,1,1,100*1000,tzinfo=pytz.utc),
                                  datetime.datetime(2014,5,5,1,1,1,100*1000,tzinfo=pytz.utc))

        # Confirm equality
        for idx,p in enumerate(rows_read['price']):
            self.assertEqual(p,rows['price'][idx])


    # Same data as test_load_same_timestamp, but with MAX_FULL_PARTITION_READ_SIZE patched to 1
    # so that the partial-partition read path is exercised
    @mock.patch.object(tstables.TsTable, 'MAX_FULL_PARTITION_READ_SIZE', 1)
    def test_load_same_timestamp_partial_partition_read(self):

        # Test data that is multiple rows with the same timestamp
        csv = u"""2014-05-05T01:01:01.100Z,1
2014-05-05T01:01:01.100Z,2
2014-05-05T01:01:01.100Z,3
2014-05-05T01:01:01.100Z,4
2014-05-05T01:01:01.100Z,5"""

        sfile = StringIO(csv)

        # Note: don't need the 'timestamp' column in the dtype param here because it will become the DatetimeIndex.
        rows = pandas.read_csv(sfile,parse_dates=[0],index_col=0,names=['timestamp', 'price'],dtype={'price': 'i4'})

        ts = self.h5_file.create_ts('/','EURUSD',description=Price)
        ts.append(rows)

        # Inspect to ensure that data has been stored correctly
        tbl = ts.root_group.y2014.m05.d05.ts_data

        self.assertEqual(tbl.nrows,5)

        # Fetch rows over a larger range
        rows_read = ts.read_range(datetime.datetime(2014,5,5,tzinfo=pytz.utc),datetime.datetime(2014,5,6,tzinfo=pytz.utc))

        # Confirm equality
        for idx,p in enumerate(rows_read['price']):
            self.assertEqual(p,rows['price'][idx])

        # Fetch rows over the smallest possible range
        rows_read = ts.read_range(datetime.datetime(2014,5,5,1,1,1,100*1000,tzinfo=pytz.utc),
                                  datetime.datetime(2014,5,5,1,1,1,100*1000,tzinfo=pytz.utc))

        # Confirm equality
        for idx,p in enumerate(rows_read['price']):
            self.assertEqual(p,rows['price'][idx])


    @mock.patch.object(tstables.TsTable, 'MAX_FULL_PARTITION_READ_SIZE', 1)
    @mock.patch.object(tables.Table, 'read_where')
    @mock.patch.object(tables.Table, 'read')
    def test_read_using_read_where(self, mock_read, mock_read_where):

        csv = u"""2014-05-05T01:01:01.100Z,1
2014-05-05T01:01:01.100Z,2
2014-05-05T01:01:01.100Z,3
2014-05-05T01:01:01.100Z,4
2014-05-05T01:01:01.100Z,5"""

        sfile = StringIO(csv)

        # Note: don't need the 'timestamp' column in the dtype param here because it will become the DatetimeIndex.
        rows = pandas.read_csv(sfile,parse_dates=[0],index_col=0,names=['timestamp', 'price'],dtype={'price': 'i4'})

        ts = self.h5_file.create_ts('/','EURUSD',description=Price)
        ts.append(rows)

        # Inspect to ensure that data has been stored correctly
        tbl = ts.root_group.y2014.m05.d05.ts_data

        self.assertEqual(tbl.nrows,5)

        # Table.read_where is a mock, so we need to give it a return value
        mock_read_where.return_value = numpy.ndarray(shape=0,dtype=[('timestamp', '<i8'), ('price', '<i4')])

# [The remainder of this file was lost in the source dump.]
--------------------------------------------------------------------------------
/src/tstables/tstable.py:
--------------------------------------------------------------------------------
# [The beginning of this file (imports, the TsTable class definition, and several
#  methods) was lost in the source dump; the text resumes mid-method, inside the
#  partition read helper whose read_where query survives.]

                '(timestamp >= {0}) & (timestamp <= {1})'.format(
                self.__dt_to_ts(start_dt),self.__dt_to_ts(end_dt)))

    def __fetch_first_table(self):
        y_group = self.root_group._f_list_nodes()[0]
        m_group = y_group._f_list_nodes()[0]
        d_group = m_group._f_list_nodes()[0]
        return d_group.ts_data

    def __fetch_last_table(self):
        y_group = sorted(self.root_group._f_list_nodes(), key=lambda x: x._v_name)[-1]
        m_group = sorted(y_group._f_list_nodes(), key=lambda x: x._v_name)[-1]
        d_group = sorted(m_group._f_list_nodes(), key=lambda x: x._v_name)[-1]
        return d_group.ts_data

    def __get_max_ts(self):
        max_group_dt = None
        max_ts = None
        for group in self.root_group._f_walk_groups():
            m = re.search('y([0-9]{4})/m([0-9]{2})/d([0-9]{2})',group._v_pathname)
            if m is not None:
                group_dt = datetime.date(int(m.group(1)),int(m.group(2)),int(m.group(3)))
            else:
                continue

            if (max_group_dt is not None) and (max_group_dt < group_dt):

                if group.ts_data.nrows == 0:
                    group_max_ts = None
                else:
                    group_max_ts = group.ts_data.cols.timestamp[-1]

                if (group_max_ts is not None) and (max_ts is None or max_ts < group_max_ts):
                    max_ts = group_max_ts
                    max_group_dt = group_dt
            elif (max_group_dt is None):

                if group.ts_data.nrows == 0:
                    group_max_ts = None
                else:
                    group_max_ts = group.ts_data.cols.timestamp[-1]

                if (group_max_ts is not None):
                    max_ts = group_max_ts
                    max_group_dt = group_dt


        return max_ts

    def __get_min_ts(self):
        min_group_dt = None
        min_ts = None
        for group in self.root_group._f_walk_groups():
            m = re.search('y([0-9]{4})/m([0-9]{2})/d([0-9]{2})',group._v_pathname)
            if m is not None:
                group_dt = datetime.date(int(m.group(1)),int(m.group(2)),int(m.group(3)))
            else:
                continue

            if (min_group_dt is not None) and (min_group_dt > group_dt):
                if group.ts_data.nrows == 0:
                    group_min_ts = None
                else:
                    group_min_ts = group.ts_data.cols.timestamp[0]

                if (group_min_ts is not None) and (min_ts is None or min_ts > group_min_ts):
                    min_ts = group_min_ts
                    min_group_dt = group_dt
            elif (min_group_dt is None):

                if group.ts_data.nrows == 0:
                    group_min_ts = None
                else:
                    group_min_ts = group.ts_data.cols.timestamp[0]

                if (group_min_ts is not None):
                    min_ts = group_min_ts
                    min_group_dt = group_dt


        return min_ts

    def min_dt(self):
        return self.__ts_to_dt(self.__get_min_ts())

    def max_dt(self):
        return self.__ts_to_dt(self.__get_max_ts())

    def read_range(self,start_dt,end_dt,as_pandas_dataframe=True):
        # Convert start_dt and end_dt to UTC if they are naive
        if start_dt.tzinfo is None:
            start_dt = pytz.utc.localize(start_dt)
        if end_dt.tzinfo is None:
            end_dt = pytz.utc.localize(end_dt)


        if start_dt > end_dt:
            raise AttributeError('start_dt must be <= end_dt')


        partitions = self.__dtrange_to_partition_ranges(start_dt,end_dt)
        sorted_pkeys = sorted(partitions.keys())

        # Start with an empty array
        result = numpy.ndarray(shape=0,dtype=self.__v_dtype())

        for p in sorted_pkeys:
            result = numpy.concatenate(
                (result,self.__fetch_rows_from_partition(p,start_dt,end_dt)))

        # Turn into a pandas DataFrame with a timeseries index
        if as_pandas_dataframe:
            result = pandas.DataFrame.from_records(result,
                                                   index=result['timestamp'].astype('datetime64[ms]'),
                                                   exclude=['timestamp'])

        return result

    def append(self,rows,convert_strings=False):
        # This part is specific to pandas support. If rows is a pandas DataFrame, convert it to a
        # format suitable for PyTables
        if rows.__class__ == pandas.core.frame.DataFrame:
            if rows.empty:
                return # Do nothing if we are appending nothing

            # pandas 0.20+ moved DatetimeIndex to pandas.core.indexes.datetimes
            if rows.index.__class__ != pandas.core.indexes.datetimes.DatetimeIndex:
                raise ValueError('when rows is a DataFrame, the index must be a DatetimeIndex.')

            # Convert to records
            records = rows.to_records(index=True)

            # Need to make two type conversions:
            # 1. Pandas stores strings internally as variable-length strings, which become objects in
            #    NumPy. PyTables can't store those in a StringCol, so this converts them to fixed-length
            #    strings if convert_strings is set to True.
            # 2. Need to convert the timestamp to datetime64[ms] (milliseconds)

            dest_dtype = self.__fetch_first_table().description._v_dtype

            new_descr = []
            existing_descr = records.dtype.descr

            for idx,d in enumerate(existing_descr):
                if existing_descr[idx][1] == '|O8' and dest_dtype[idx].char == 'S' and convert_strings:
                    # records dtype is something like |O8 and dest dtype is a string
                    new_descr.append((existing_descr[idx][0], dest_dtype[idx]))
                elif idx == 0:
                    # Make sure timestamp is in milliseconds
                    new_descr.append((existing_descr[idx][0], '<M8[ms]'))
                else:
                    # [reconstructed: the original lines from here through the ValueError below were
                    #  lost in the source dump; this keeps the remaining columns' dtypes unchanged]
                    new_descr.append(existing_descr[idx])

            rows = records.astype(numpy.dtype(new_descr))

        try:
            # [reconstructed, following the corresponding PyTables conversion code]
            wbufRA = numpy.rec.array(rows, dtype=self.__v_dtype())
        except Exception as exc:
            raise ValueError("rows parameter cannot be converted into a recarray object compliant "
                             "with table format '%s'. The error was: <%s>" % (str(self), exc))

        # Confirm that first column is Int64. This is an additional constraint of TsTables.
        if not wbufRA.dtype[0] == numpy.dtype('int64'):
            raise ValueError("first column must be of type numpy.int64.")

        # We also need to confirm that the rows are sorted by timestamp. This is an additional
        # constraint of TsTables.
        if not (numpy.diff(wbufRA['timestamp']) >= 0).all():
            raise ValueError("timestamp column must be sorted in ascending order.")

        # Array is confirmed sorted at this point, so min and max are easy to get
        min_ts = wbufRA[0][0]
        max_ts = wbufRA[-1][0]

        # Confirm that min is >= the TsTable's max_ts
        if min_ts < (self.__get_max_ts() or numpy.iinfo('int64').min):
            raise ValueError("rows start prior to the end of existing rows, so they cannot be "
                             "appended.")

        # wbufRA is ready to be inserted at this point. Chop it up into partitions.
        min_dt = self.__ts_to_dt(min_ts)
        max_dt = self.__ts_to_dt(max_ts)
        possible_partitions = self.__dtrange_to_partition_ranges(min_dt,max_dt)

        sorted_pkeys = sorted(possible_partitions.keys())

        # For each partition, we are splitting on the end date
        split_on_idx = []
        for p in sorted_pkeys:
            # p_max_ts is the maximum timestamp that should be included in this partition. We need
            # the index just past the last row whose timestamp is <= p_max_ts, which is exactly
            # what searchsorted with side='right' returns.
            p_max_ts = self.__dt_to_ts(possible_partitions[p][1])
            split_on = numpy.searchsorted(wbufRA['timestamp'], p_max_ts, side='right')
            split_on_idx.append(split_on)

        # Now, split the array
        split_wbufRA = numpy.split(wbufRA,split_on_idx)

        # Save each partition
        for idx,p in enumerate(sorted_pkeys):
            self.__append_rows_to_partition(p,split_wbufRA[idx])
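    # ------------------------------------------------------------------
    # Illustration (not part of the library): how the searchsorted/split
    # combination above behaves on a toy sorted timestamp array.
    #
    #   >>> import numpy
    #   >>> timestamps = numpy.array([10, 50, 99, 120, 150])
    #   >>> partition_ends = [99, 199]
    #   >>> [numpy.searchsorted(timestamps, p, side='right') for p in partition_ends]
    #   [3, 5]
    #   >>> numpy.split(timestamps, [3, 5])
    #   [array([ 10,  50,  99]), array([120, 150]), array([], dtype=int64)]
    #
    # Each partition gets the slice up to the index just past its last
    # valid timestamp. The final remainder is always empty (the last
    # split index equals the array length), and the loop above only
    # indexes the first len(sorted_pkeys) pieces anyway.
    # ------------------------------------------------------------------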
    @staticmethod
    def __partition_date_to_path_array(partition_dt):
        """Converts a partition date to an array of partition names
        """

        return [partition_dt.strftime('y%Y'),partition_dt.strftime('m%m'),partition_dt.strftime('d%d')]

    def __append_rows_to_partition(self,partition_dt,rows):
        """Appends rows to a partition (which might not exist yet, and will then be created)

        The rows argument is assumed to be sorted and *only* contain rows that have timestamps that
        are valid for this partition.
        """

        ts_data = self.__fetch_or_create_partition_table(partition_dt)
        ts_data.append(rows)

    def __fetch_partition_group(self,partition_dt):
        """Fetches a partition group, or returns `False` if the partition group does not exist
        """

        try:
            p_array = self.__partition_date_to_path_array(partition_dt)
            return self.root_group._f_get_child(p_array[0])._f_get_child(p_array[1])._f_get_child(p_array[2])
        except (KeyError,tables.NoSuchNodeError):
            return False

    def __create_partition(self,partition_dt):
        """Creates a partition, including parent groups (if they don't exist) and the data table
        """

        p_array = self.__partition_date_to_path_array(partition_dt)

        # For each component, fetch the group or create it
        # Year
        try:
            y_group = self.root_group._f_get_child(p_array[0])
        except tables.NoSuchNodeError:
            y_group = self.file.create_group(self.root_group,p_array[0])

        # Month
        try:
            m_group = y_group._f_get_child(p_array[1])
        except tables.NoSuchNodeError:
            m_group = self.file.create_group(y_group,p_array[1])

        # Day
        try:
            d_group = m_group._f_get_child(p_array[2])
        except tables.NoSuchNodeError:
            d_group = self.file.create_group(m_group,p_array[2])

        # We need to create the table in the day group
        ts_data = self.file.create_table(d_group,'ts_data',self.table_description,self.table_title,
            self.table_filters, self.table_expectedrows, self.table_chunkshape, self.table_byteorder)

        # Need to save this as an attribute because it doesn't seem to be saved anywhere
        ts_data.attrs._TS_TABLES_EXPECTEDROWS_PER_PARTITION = self.table_expectedrows

        return ts_data

    def __fetch_or_create_partition_table(self,partition_dt):
        group = self.__fetch_partition_group(partition_dt)
        if group:
            return group._f_get_child('ts_data')
        else:
            return self.__create_partition(partition_dt)
--------------------------------------------------------------------------------
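For reference, the partition layout that the code above builds (nested `yYYYY/mMM/dDD` groups, each holding one `ts_data` table) can be inspected with plain PyTables. A sketch, assuming the `bpi.h5` file from the examples:

```python
import tables

f = tables.open_file('bpi.h5', 'r')
# Each daily partition is one Table named 'ts_data' under its yYYYY/mMM/dDD group
for node in f.walk_nodes('/BPI', classname='Table'):
    print(node._v_pathname)   # e.g. /BPI/y2014/m01/d04/ts_data
f.close()
```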