├── .gitattributes ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── binary ├── adult │ ├── README.md │ ├── adult.trn.bz2 │ └── adult.tst.bz2 ├── covtype │ ├── README.md │ └── covtype.trn.bz2 ├── ijcnn1 │ ├── README.md │ ├── ijcnn1.trn.bz2 │ └── ijcnn1.tst.bz2 ├── kdd2010 │ ├── README.md │ ├── kdd2010.trn.bz2 │ └── kdd2010.tst.bz2 └── reuters │ ├── README.md │ ├── reuters.trn.bz2 │ └── reuters.tst.bz2 ├── hawkes └── bund │ ├── README.md │ └── bund.npz ├── lib ├── __init__.py ├── binary.py ├── compression.py ├── dataset_analysis.py ├── hawkes.py ├── preprocessing │ ├── __init__.py │ ├── clean_covtype.py │ ├── clean_kdd2010.py │ └── clean_reuters.py ├── regression.py └── tests │ ├── __init__.py │ ├── binary_test.py │ ├── hawkes_test.py │ └── regression_test.py ├── regression └── abalone │ ├── README.md │ └── abalone.trn.bz2 └── requirements.txt /.gitattributes: -------------------------------------------------------------------------------- 1 | *.bz2 filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | .hypothesis/ 46 | 47 | # Translations 48 | *.mo 49 | *.pot 50 | 51 | # Django stuff: 52 | *.log 53 | local_settings.py 54 | 55 | # Flask stuff: 56 | instance/ 57 | .webassets-cache 58 | 59 | # Scrapy stuff: 60 | .scrapy 61 | 62 | # Sphinx documentation 63 | docs/_build/ 64 | 65 | # PyBuilder 66 | target/ 67 | 68 | # IPython Notebook 69 | .ipynb_checkpoints 70 | 71 | # pyenv 72 | .python-version 73 | 74 | # celery beat schedule file 75 | celerybeat-schedule 76 | 77 | # dotenv 78 | .env 79 | 80 | # virtualenv 81 | venv/ 82 | ENV/ 83 | 84 | # Spyder project settings 85 | .spyderproject 86 | 87 | # Rope project settings 88 | .ropeproject 89 | 90 | 91 | #uncompressed data files 92 | *.trn 93 | *.tst 94 | 95 | # origin files before cleaning 96 | *_orig.bz2 97 | 98 | # IDE files 99 | .DS_Store 100 | .idea/* -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.4" 4 | # command to install dependencies 5 | before_install: 6 | - sudo apt-get -qq update 7 | - sudo apt-get -qq install python-numpy python-scipy 8 | install: 9 | - pip install -r requirements.txt 10 | # command to run tests 11 | script: python -m unittest discover -v . 
"*_test.py" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2016, X-DataInitiative 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/X-DataInitiative/tick-datasets.svg?branch=master)](https://travis-ci.org/X-DataInitiative/tick-datasets) 2 | 3 | # tick-datasets 4 | Hosting of ready-to-use machine learning datasets 5 | -------------------------------------------------------------------------------- /binary/adult/README.md: -------------------------------------------------------------------------------- 1 | ## Adult 2 | 3 | ### Characteristics 4 | 5 | Train dataset 6 | 7 | 8 | 9 | 10 | 11 |
<table>
<tr><td> Number of observations </td><td> 32561 </td></tr>
<tr><td> Number of features </td><td> 123 </td></tr>
<tr><td> Sparsity </td><td> 11.3% </td></tr>
<tr><td> Class balancing </td><td> 24.1% positive samples </td></tr>
</table>
12 | 13 | Test dataset 14 | 15 | 16 | 17 | 18 | 19 |
<table>
<tr><td> Number of observations </td><td> 16281 </td></tr>
<tr><td> Number of features </td><td> 123 </td></tr>
<tr><td> Sparsity </td><td> 11.4% </td></tr>
<tr><td> Class balancing </td><td> 23.6% positive samples </td></tr>
</table>
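These files are stored in bzip2-compressed svmlight format. A minimal loading
sketch with scikit-learn (the same loader used by `lib/binary.py`); the
relative paths are assumptions that depend on where the repository is checked
out:

```python
from sklearn.datasets import load_svmlight_file

# load_svmlight_file reads .bz2 files directly, no manual decompression needed
x_train, y_train = load_svmlight_file('binary/adult/adult.trn.bz2')
x_test, y_test = load_svmlight_file('binary/adult/adult.tst.bz2',
                                    n_features=x_train.shape[1])

print(x_train.shape)  # (32561, 123)
print(x_test.shape)   # (16281, 123)
```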
20 |
21 | ### Description
22 |
23 | Predict whether income exceeds $50K/yr based on census data. Also known
24 | as "Census Income" dataset.
25 | [https://archive.ics.uci.edu/ml/datasets/Adult](https://archive.ics.uci.edu/ml/datasets/Adult)
26 |
27 | ### Preprocessing
28 | The original Adult data set has 14 features, among which six are continuous and
29 | eight are categorical. In this data set, continuous features are discretized
30 | into quantiles, and each quantile is represented by a binary feature. Also,
31 | a categorical feature with m categories is converted to m binary features. This
32 | leads to a total of 123 binary features.
33 |
34 | John C. Platt.
35 | Fast training of support vector machines using sequential minimal optimization.
36 | In Bernhard Schölkopf, Christopher J. C. Burges, and
37 | Alexander J. Smola, editors, Advances in Kernel Methods -
38 | Support Vector Learning, Cambridge, MA, 1998. MIT Press.
39 |
40 | Note that since feature 122 did not occur in the test set, it has been added
41 | (with a zero value) to the first observation. Hence train and test data have
42 | the same number of features. The last line of the original test dataset
43 | (which only contained a label) has also been removed.
44 |
45 | ### Original download link
46 | [http://leon.bottou.org/_media/papers/lasvm-adult.tar.bz2](http://leon.bottou.org/_media/papers/lasvm-adult.tar.bz2)
--------------------------------------------------------------------------------
/binary/adult/adult.trn.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/binary/adult/adult.trn.bz2
--------------------------------------------------------------------------------
/binary/adult/adult.tst.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/binary/adult/adult.tst.bz2
--------------------------------------------------------------------------------
/binary/covtype/README.md:
--------------------------------------------------------------------------------
1 | ## Covtype
2 |
3 | ### Characteristics
4 |
<table>
<tr><td> Number of observations </td><td> 581012 </td></tr>
<tr><td> Number of features </td><td> 54 </td></tr>
<tr><td> Sparsity </td><td> 22.1% </td></tr>
<tr><td> Class balancing </td><td> 51.2% positive samples </td></tr>
</table>
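A quick sanity check of the file and of its label encoding (see Preprocessing
below); the path is an assumption:

```python
import numpy as np
from sklearn.datasets import load_svmlight_file

x, y = load_svmlight_file('binary/covtype/covtype.trn.bz2')
assert x.shape == (581012, 54)
assert set(np.unique(y)) == {-1, 1}  # labels remapped from the original {1, 2}
```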
11 | 12 | ### Description 13 | Forest CoverType dataset 14 | [https://archive.ics.uci.edu/ml/datasets/Covertype](https://archive.ics.uci.edu/ml/datasets/Covertype) 15 | 16 | ### Preprocessing 17 | Labels that were originally 1 and 2 have been changed to -1, 1. 18 | 19 | ### Original download link 20 | [http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/covtype.libsvm.binary.scale.bz2](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/covtype.libsvm.binary.scale.bz2) -------------------------------------------------------------------------------- /binary/covtype/covtype.trn.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/binary/covtype/covtype.trn.bz2 -------------------------------------------------------------------------------- /binary/ijcnn1/README.md: -------------------------------------------------------------------------------- 1 | ## ijcnn1 2 | 3 | ### Characteristics 4 | Train dataset 5 | 6 | 7 | 8 | 9 | 10 |
<table>
<tr><td> Number of observations </td><td> 49990 </td></tr>
<tr><td> Number of features </td><td> 22 </td></tr>
<tr><td> Sparsity </td><td> 59.1% </td></tr>
<tr><td> Class balancing </td><td> 9.76% positive samples </td></tr>
</table>
11 | 12 | 13 | Test dataset 14 | 15 | 16 | 17 | 18 | 19 |
<table>
<tr><td> Number of observations </td><td> 91701 </td></tr>
<tr><td> Number of features </td><td> 22 </td></tr>
<tr><td> Sparsity </td><td> 59.1% </td></tr>
<tr><td> Class balancing </td><td> 9.5% positive samples </td></tr>
</table>
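The train file is the concatenation of the original `tr` and `val` files (see
Original download links below). A sketch of the same operation with scipy;
the local file names are assumptions:

```python
import numpy as np
import scipy.sparse as sp
from sklearn.datasets import load_svmlight_file, dump_svmlight_file

x_tr, y_tr = load_svmlight_file('ijcnn1.tr.bz2', n_features=22)
x_val, y_val = load_svmlight_file('ijcnn1.val.bz2', n_features=22)

x = sp.vstack([x_tr, x_val])            # stack the two sparse matrices
y = np.concatenate([y_tr, y_val])
dump_svmlight_file(x, y, 'ijcnn1.trn')  # then bzip2-compressed for hosting
```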
20 |
21 | ### Description
22 |
23 | ### Preprocessing
24 | We use the winner's transformation presented in
25 |
26 | Chih-Chung Chang and Chih-Jen Lin.
27 | IJCNN 2001 challenge: Generalization ability and text decoding.
28 | In Proceedings of IJCNN. IEEE, 2001.
29 |
30 | ### Original download links
31 | Train is the concatenation of
32 |
33 | [http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/ijcnn1.tr.bz2](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/ijcnn1.tr.bz2)
34 | [http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/ijcnn1.val.bz2](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/ijcnn1.val.bz2)
35 |
36 | Test is
37 |
38 | [http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/ijcnn1.t.bz2](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/ijcnn1.t.bz2)
--------------------------------------------------------------------------------
/binary/ijcnn1/ijcnn1.trn.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/binary/ijcnn1/ijcnn1.trn.bz2
--------------------------------------------------------------------------------
/binary/ijcnn1/ijcnn1.tst.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/binary/ijcnn1/ijcnn1.tst.bz2
--------------------------------------------------------------------------------
/binary/kdd2010/README.md:
--------------------------------------------------------------------------------
1 | ## KDD 2010
2 |
3 | ### Characteristics
4 |
5 | Train dataset
6 |
<table>
<tr><td> Number of observations </td><td> 19,264,097 </td></tr>
<tr><td> Number of features </td><td> 1,163,024 </td></tr>
<tr><td> Sparsity </td><td> 0.000797% </td></tr>
<tr><td> Class balancing </td><td> 86.1% positive samples </td></tr>
</table>
12 |
13 | The maximum feature index in the train dataset is 1,129,522, but the total
14 | number of features is set to 1,163,024 so as to be compatible with the test
15 | dataset.
16 |
17 | Test dataset
18 |
<table>
<tr><td> Number of observations </td><td> 748,401 </td></tr>
<tr><td> Number of features </td><td> 1,163,024 </td></tr>
<tr><td> Sparsity </td><td> 0.000774% </td></tr>
<tr><td> Class balancing </td><td> 88.8% positive samples </td></tr>
</table>
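Since the largest feature index actually present in the train file (1,129,522)
is smaller than in the test file, passing `n_features` explicitly when loading
keeps both matrices aligned; the paths are assumptions:

```python
from sklearn.datasets import load_svmlight_file

n_features = 1163024
x_train, y_train = load_svmlight_file('binary/kdd2010/kdd2010.trn.bz2',
                                      n_features=n_features)
x_test, y_test = load_svmlight_file('binary/kdd2010/kdd2010.tst.bz2',
                                    n_features=n_features)
assert x_train.shape[1] == x_test.shape[1] == n_features
```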
24 | 25 | ### Description 26 | Juan, Yuchin, et al. "Field-aware factorization machines for CTR prediction." 27 | Proceedings of the 10th ACM Conference on Recommender Systems. ACM, 2016. 28 | 29 | ### Preprocessing 30 | Labels that were originally 0 and 1 have been changed to -1, 1. 31 | 32 | 33 | ### Original download links 34 | [https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kddb-raw-libsvm.bz2](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kddb-raw-libsvm.bz2) 35 | 36 | [https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kddb-raw-libsvm.t.bz2](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kddb-raw-libsvm.t.bz2) -------------------------------------------------------------------------------- /binary/kdd2010/kdd2010.trn.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/binary/kdd2010/kdd2010.trn.bz2 -------------------------------------------------------------------------------- /binary/kdd2010/kdd2010.tst.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/binary/kdd2010/kdd2010.tst.bz2 -------------------------------------------------------------------------------- /binary/reuters/README.md: -------------------------------------------------------------------------------- 1 | ## Reuters 2 | 3 | ### Characteristics 4 | 5 | Train dataset 6 | 7 | 8 | 9 | 10 | 11 |
<table>
<tr><td> Number of observations </td><td> 7770 </td></tr>
<tr><td> Number of features </td><td> 8315 </td></tr>
<tr><td> Sparsity </td><td> 0.526% </td></tr>
<tr><td> Class balancing </td><td> 6.92% positive samples </td></tr>
</table>
12 | 13 | Test dataset 14 | 15 | 16 | 17 | 18 | 19 |
<table>
<tr><td> Number of observations </td><td> 3299 </td></tr>
<tr><td> Number of features </td><td> 8315 </td></tr>
<tr><td> Sparsity </td><td> 0.499% </td></tr>
<tr><td> Class balancing </td><td> 5.43% positive samples </td></tr>
</table>
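The duplicate-feature cleanup mentioned under Preprocessing below amounts to
keeping one copy of each repeated `index:value` pair per line; a minimal
sketch with a hypothetical helper (the full script lives in
`lib/preprocessing/clean_reuters.py`):

```python
from collections import OrderedDict

def dedupe_svmlight_line(line):
    label, *features = line.strip().split(' ')
    kept = OrderedDict()
    for feature in features:
        index, value = feature.split(':')
        kept.setdefault(index, value)  # keep the first occurrence only
    return ' '.join([label] + ['%s:%s' % (i, v) for i, v in kept.items()])

print(dedupe_svmlight_line('-1 171:8.09E-02 171:8.09E-02 200:1'))
# -1 171:8.09E-02 200:1
```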
20 |
21 | ### Description
22 |
23 | ### Preprocessing
24 | The train set originally contained the same feature several times on a single
25 | line; these duplicate entries have been cleaned.
26 |
27 | ### Original download link
28 | [http://leon.bottou.org/_media/papers/lasvm-reuters.tar.bz2](http://leon.bottou.org/_media/papers/lasvm-reuters.tar.bz2)
--------------------------------------------------------------------------------
/binary/reuters/reuters.trn.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/binary/reuters/reuters.trn.bz2
--------------------------------------------------------------------------------
/binary/reuters/reuters.tst.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/binary/reuters/reuters.tst.bz2
--------------------------------------------------------------------------------
/hawkes/bund/README.md:
--------------------------------------------------------------------------------
1 | ## Bund Future trade data
2 |
3 | ### Description
4 |
5 | One month of data in April 2014 on the Bund Future traded at Eurex, with
6 | microsecond timestamp resolution.
7 |
8 | This data is meant to be fitted with Hawkes processes. For each day, it
9 | contains 4 time series representing:
10 |
11 | 1. Mid-price movement up
12 | 2. Mid-price movement down
13 | 3. Buyer-initiated trades that do not move the mid-price
14 | 4. Seller-initiated trades that do not move the mid-price
15 |
16 | ### Characteristics
17 |
<table>
<tr><td> Number of realizations </td><td> 20 </td></tr>
<tr><td> Average number of ticks node 0 </td><td> 7009.15 </td></tr>
<tr><td> Average number of ticks node 1 </td><td> 6998.15 </td></tr>
<tr><td> Average number of ticks node 2 </td><td> 257677.55 </td></tr>
<tr><td> Average number of ticks node 3 </td><td> 261423.6 </td></tr>
</table>
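The archive stores one entry per trading day, each entry holding the 4
per-node timestamp arrays. A minimal loading sketch; the path is an
assumption, and `allow_pickle=True` may be required with recent numpy
versions since the entries are object arrays:

```python
import numpy as np

timestamps_dict = np.load('hawkes/bund/bund.npz', allow_pickle=True)
for day in sorted(timestamps_dict.keys()):
    timestamps = timestamps_dict[day]  # array of 4 per-node timestamp arrays
    print(day, [len(node_ticks) for node_ticks in timestamps])
```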
25 |
26 | ### Preprocessing
27 |
28 | The market opens at 8AM, which corresponds to a timestamp of 28800. This
29 | timestamp has been subtracted from all timestamps so that each realization
30 | starts at time 0.
31 |
32 | Please note that as the market closes at 10PM, the end time of the shifted
33 | realizations is 50400.
34 |
35 |
--------------------------------------------------------------------------------
/hawkes/bund/bund.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/hawkes/bund/bund.npz
--------------------------------------------------------------------------------
/lib/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/lib/__init__.py
--------------------------------------------------------------------------------
/lib/binary.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | from sklearn.datasets import load_svmlight_file
5 |
6 | from lib.dataset_analysis import features_characteristics, print_characteristics
7 |
8 | # name, path, n_observations, n_features
9 | binary_datasets = [
10 |     ('Adult Train', '../binary/adult/adult.trn.bz2', 32561, 123),
11 |     ('Adult Test', '../binary/adult/adult.tst.bz2', 16281, 123),
12 |     ('Covtype Train', '../binary/covtype/covtype.trn.bz2', 581012, 54),
13 |     ('ijcnn1 Train', '../binary/ijcnn1/ijcnn1.trn.bz2', 49990, 22),
14 |     ('ijcnn1 Test', '../binary/ijcnn1/ijcnn1.tst.bz2', 91701, 22),
15 |     ('Reuters Train', '../binary/reuters/reuters.trn.bz2', 7770, 8315),
16 |     ('Reuters Test', '../binary/reuters/reuters.tst.bz2', 3299, 8315),
17 |     ('KDD 2010 Train', '../binary/kdd2010/kdd2010.trn.bz2',
18 |      19264097, 1129522),
19 |     ('KDD 2010 Test', '../binary/kdd2010/kdd2010.tst.bz2',
20 |      748401, 1163024),
21 | ]
22 |
23 |
24 | def iterate_binary_dataset_path():
25 |     for name, path, *args in binary_datasets:
26 |         path = os.path.join(os.path.dirname(__file__), path)
27 |         yield name, path
28 |
29 |
30 | def iterate_binary_dataset():
31 |     for name, path, n_observations, n_features in binary_datasets:
32 |         path = os.path.join(os.path.dirname(__file__), path)
33 |         x, y = load_svmlight_file(path)
34 |         yield name, x, y, n_observations, n_features
35 |
36 |
37 | def describe_binary_datasets():
38 |     for name, x, y, *args in iterate_binary_dataset():
39 |         n_rows = x.shape[0]
40 |         positive_ratio = np.sum(y == 1) / n_rows
41 |
42 |         characteristics = features_characteristics(x)
43 |         characteristics['Class balancing'] = '{:.3g}% positive samples'.format(
44 |             positive_ratio * 100)
45 |
46 |         print('\n{:}'.format(name))
47 |         print_characteristics(characteristics, html=False)
48 |
--------------------------------------------------------------------------------
/lib/compression.py:
--------------------------------------------------------------------------------
1 | import bz2
2 | import os
3 | from shutil import copyfileobj
4 | from itertools import chain
5 |
6 | from lib.binary import iterate_binary_dataset_path
7 | from lib.regression import iterate_regression_dataset_path
8 |
9 |
10 | def get_compressed_file_path(decompressed_file_path):
11 |     if decompressed_file_path[-4:] == '.bz2':
12 |         return decompressed_file_path
13 |     return '{:}.bz2'.format(decompressed_file_path)
14 |
15 |
16 | def 
get_decompressed_file_path(compressed_file_path):
17 |     return compressed_file_path.replace('.bz2', '')
18 |
19 |
20 | def compress_file(decompressed_file_path):
21 |     compressed_file_path = get_compressed_file_path(decompressed_file_path)
22 |     with open(decompressed_file_path, 'rb') as src:
23 |         with bz2.BZ2File(compressed_file_path, 'wb') as dst:
24 |             copyfileobj(src, dst)
25 |
26 |
27 | def decompress_file(compressed_file_path):
28 |     decompressed_file_path = get_decompressed_file_path(compressed_file_path)
29 |     with bz2.BZ2File(compressed_file_path, 'rb') as src:
30 |         with open(decompressed_file_path, 'wb') as dst:
31 |             copyfileobj(src, dst)
32 |
33 |
34 | all_datasets_path = list(chain(
35 |     iterate_binary_dataset_path(),
36 |     iterate_regression_dataset_path()
37 | ))  # materialized as a list: a bare chain() iterator is exhausted after one pass
38 |
39 |
40 | def compress_all_files(replace=False):
41 |     for name, path in all_datasets_path:
42 |         path = get_decompressed_file_path(path)
43 |         compressed_path = get_compressed_file_path(path)
44 |
45 |         if os.path.exists(compressed_path) and replace is False:
46 |             print('%s dataset already exists at %s'
47 |                   % (name, compressed_path))
48 |         else:
49 |             print('compressing', name)
50 |             compress_file(path)
51 |
52 |
53 | def decompress_all_files(replace=False):
54 |     for name, path in all_datasets_path:
55 |         path = get_compressed_file_path(path)
56 |         decompressed_path = get_decompressed_file_path(path)
57 |
58 |         if os.path.exists(decompressed_path) and replace is False:
59 |             print('%s dataset already exists at %s'
60 |                   % (name, decompressed_path))
61 |         else:
62 |             print('decompressing', name)
63 |             decompress_file(path)
64 |
--------------------------------------------------------------------------------
/lib/dataset_analysis.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 |
3 |
4 | def features_characteristics(features_matrix):
5 |     n_rows = features_matrix.shape[0]
6 |     n_columns = features_matrix.shape[1]
7 |
8 |     sparsity = features_matrix.getnnz() / (n_rows * n_columns)
9 |
10 |     characteristics = OrderedDict()
11 |     characteristics['Number of observations'] = n_rows
12 |     characteristics['Number of features'] = n_columns
13 |     characteristics['Sparsity'] = '{:.3g}%'.format(sparsity * 100)
14 |     return characteristics
15 |
16 |
17 | def print_characteristics(characteristics, html=False):
18 |     if html is False:
19 |         for k, v in characteristics.items():
20 |             print("{:<25} {:}".format(k, v))
21 |     else:
22 |         html_data = ''
23 |         html_data += '<table>\n'
24 |         for k, v in characteristics.items():
25 |             html_data += '    <tr>'
26 |             html_data += '<td> {:} </td><td> {:} </td></tr>'.format(k, v)
27 |             html_data += '\n'
28 |         html_data += '</table>
\n'
29 |
30 |     print(html_data)
--------------------------------------------------------------------------------
/lib/hawkes.py:
--------------------------------------------------------------------------------
1 | import os
2 | from collections import OrderedDict
3 | import numpy as np
4 |
5 | from lib.dataset_analysis import print_characteristics
6 |
7 | # name, path, number of realizations, number of nodes, end_time
8 | hawkes_datasets = [
9 |     ('Bund', '../hawkes/bund/bund.npz', 20, 4, 50400)
10 | ]
11 |
12 |
13 | def iterate_hawkes_dataset_path():
14 |     for name, path, *args in hawkes_datasets:
15 |         path = os.path.join(os.path.dirname(__file__), path)
16 |         yield name, path
17 |
18 |
19 | def iterate_hawkes_dataset():
20 |     for name, path, n_realizations, n_nodes, end_time in hawkes_datasets:
21 |         path = os.path.join(os.path.dirname(__file__), path)
22 |         timestamps_dict = np.load(path)
23 |         yield name, timestamps_dict, n_realizations, n_nodes, end_time
24 |
25 |
26 | def hawkes_characteristics(timestamps_list):
27 |     characteristics = OrderedDict()
28 |     n_realizations = len(timestamps_list)
29 |     characteristics['Number of realizations'] = n_realizations
30 |
31 |     n_nodes = len(timestamps_list[0])
32 |     n_jumps_per_node = np.zeros(n_nodes)
33 |     for timestamps in timestamps_list:
34 |         for i in range(n_nodes):
35 |             n_jumps_per_node[i] += len(timestamps[i])
36 |     n_jumps_per_node /= n_realizations
37 |     for i in range(n_nodes):
38 |         characteristics['Average number of ticks node %i' % i] = \
39 |             n_jumps_per_node[i]
40 |
41 |     return characteristics
42 |
43 |
44 | def describe_hawkes_datasets():
45 |     for name, timestamps_dict, *args in iterate_hawkes_dataset():
46 |         timestamps_list = [timestamps_dict[key]
47 |                            for key in timestamps_dict.keys()]
48 |         characteristics = hawkes_characteristics(timestamps_list)
49 |         print('\n{:}'.format(name))
50 |         print_characteristics(characteristics, html=False)
51 |
--------------------------------------------------------------------------------
/lib/preprocessing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/lib/preprocessing/__init__.py
--------------------------------------------------------------------------------
/lib/preprocessing/clean_covtype.py:
--------------------------------------------------------------------------------
1 | # The Covtype dataset has labels in {1, 2} instead of {-1, 1}
2 | import bz2
3 | from urllib.request import urlretrieve
4 |
5 |
6 | def clean_line_labels(line):
7 |     line = line.strip()
8 |     split_l = line.split(' ')
9 |     label = split_l[0]
10 |     features = ' '.join(split_l[1:])
11 |
12 |     if int(label) == 2:
13 |         label = 1
14 |     elif int(label) == 1:
15 |         label = -1
16 |     else:
17 |         raise ValueError('Unknown label %s' % label)
18 |
19 |     clean_line = '{:} {:}\n'.format(label, features)
20 |     return clean_line
21 |
22 |
23 | covtype_data_url = 'http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/' \
24 |                    'binary/covtype.libsvm.binary.scale.bz2'
25 | tmp_path, _ = urlretrieve(covtype_data_url)
26 |
27 | covtype_train_file_path = '../../binary/covtype/covtype.trn'
28 | with bz2.BZ2File(tmp_path) as train_file:
29 |     with open(covtype_train_file_path, 'w') as covtype_train_file:
30 |         for data in train_file.readlines():
31 |             cleaned_line = clean_line_labels(data.decode('utf-8'))
32 |             covtype_train_file.write(cleaned_line)
33 |
--------------------------------------------------------------------------------
/lib/preprocessing/clean_kdd2010.py:
--------------------------------------------------------------------------------
1 | # KDD 2010 datasets have labels in {0, 1} instead of {-1, 1}
2 | import bz2
3 |
4 | from lib.compression import compress_file
5 |
6 |
7 | def clean_line_labels(line):
8 |     line = line.strip()
9 |     split_l = line.split(' ')
10 |     label = split_l[0]
11 |     features = ' '.join(split_l[1:])
12 |
13 |     if int(label) == 1:
14 |         label = 1
15 |     elif int(label) == 0:
16 |         label = -1
17 |     else:
18 |         raise ValueError('Unknown label %s' % label)
19 |
20 |     clean_line = '{:} {:}\n'.format(label, features)
21 |     return clean_line
22 |
23 |
24 | original_files_path = [
25 |     '../../binary/kdd2010/kdd2010.trn_orig.bz2',
26 |     '../../binary/kdd2010/kdd2010.tst_orig.bz2'
27 | ]
28 |
29 | save_file_path = [
30 |     '../../binary/kdd2010/kdd2010.trn',
31 |     '../../binary/kdd2010/kdd2010.tst'
32 | ]
33 |
34 | for original_path, save_path in zip(original_files_path, save_file_path):
35 |     with bz2.BZ2File(original_path) as train_file:
36 |         with open(save_path, 'w') as kdd2010_train_file:
37 |             for i, data in enumerate(train_file.readlines()):
38 |                 if i % 100000 == 0:
39 |                     print(i)
40 |                 cleaned_line = clean_line_labels(data.decode('utf-8'))
41 |                 kdd2010_train_file.write(cleaned_line)
42 |
43 |     compress_file(save_path)
44 |
--------------------------------------------------------------------------------
/lib/preprocessing/clean_reuters.py:
--------------------------------------------------------------------------------
1 | # The Reuters dataset has lines with the same feature defined twice
2 | # For example line 3 starts with
3 | # -1 171:8.09199011447501E-02 171:8.09199011447501E-02
4 | # Hence we need to clean them
5 | import tarfile
6 | from urllib.request import urlretrieve
7 | from collections import OrderedDict
8 |
9 |
10 | def clean_line_duplicates(line):
11 |     line = line.strip()
12 |     split_l = line.split(' ')
13 |     label = split_l[0]
14 |     features = split_l[1:]
15 |     features_dict = OrderedDict()
16 |     for feature in features:
17 |         index = feature.split(':')[0].strip()
18 |         value = feature.split(':')[1].strip()
19 |         if index not in features_dict:
20 |             features_dict[index] = value
21 |         else:
22 |             if features_dict[index] != value:
23 |                 raise ValueError('index', index, features_dict[index],
24 |                                  value)
25 |     joined_features = [':'.join([index, value])
26 |                        for index, value in features_dict.items()]
27 |     cleaned_line = ' '.join([label] + joined_features) + '\n'
28 |     return cleaned_line
29 |
30 |
31 | reuters_data_url = 'http://leon.bottou.org/_media/papers/lasvm-reuters.tar.bz2'
32 | tmp_path, _ = urlretrieve(reuters_data_url)
33 | uncompressed_data = tarfile.open(name=tmp_path, mode="r:bz2")
34 |
35 | reuters_train_file_path = '../../binary/reuters/reuters.trn'
36 |
37 | with uncompressed_data.extractfile('reuters/money-fx.trn') as train_file:
38 |     with open(reuters_train_file_path, 'w') as reuters_train_file:
39 |         for data in train_file.readlines():
40 |             cleaned_line = clean_line_duplicates(data.decode('utf-8'))
41 |             reuters_train_file.write(cleaned_line)
42 |
43 |
44 |
--------------------------------------------------------------------------------
/lib/regression.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | from sklearn.datasets import load_svmlight_file
5 |
6 | from lib.dataset_analysis import features_characteristics, 
print_characteristics 7 | 8 | # name, path, n_observations, n_features 9 | regression_datasets = [ 10 | ('Abalone Train', '../regression/abalone/abalone.trn.bz2', 4177, 8), 11 | ] 12 | 13 | def iterate_regression_dataset_path(): 14 | for name, path, *args in regression_datasets: 15 | path = os.path.join(os.path.dirname(__file__), path) 16 | yield name, path 17 | 18 | 19 | def iterate_regression_dataset(): 20 | for name, path, n_observations, n_features in regression_datasets: 21 | path = os.path.join(os.path.dirname(__file__), path) 22 | x, y = load_svmlight_file(path) 23 | yield name, x, y, n_observations, n_features 24 | 25 | 26 | def describe_regression_datasets(): 27 | for name, x, y, *args in iterate_regression_dataset(): 28 | label_mean = np.mean(y) 29 | label_std = np.std(y) 30 | label_min = np.min(y) 31 | label_max = np.max(y) 32 | 33 | characteristics = features_characteristics(x) 34 | characteristics['label mean'] = '{:.3g}'.format(label_mean) 35 | characteristics['label std'] = '{:.3g}'.format(label_std) 36 | characteristics['label min'] = '{:.3g}'.format(label_min) 37 | characteristics['label max'] = '{:.3g}'.format(label_max) 38 | 39 | print('\n{:}'.format(name)) 40 | print_characteristics(characteristics, html=False) 41 | 42 | # describe_regression_datasets() -------------------------------------------------------------------------------- /lib/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/lib/tests/__init__.py -------------------------------------------------------------------------------- /lib/tests/binary_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | 5 | from lib.binary import iterate_binary_dataset 6 | 7 | 8 | class BinaryTests(unittest.TestCase): 9 | def test_data_set_consistency(self): 10 | """...Test binary datasets have the expected shape 11 | """ 12 | for name, x, y, n_observations, n_features in \ 13 | iterate_binary_dataset(): 14 | self.assertEqual(x.shape[0], n_observations, 15 | "Incorrect number of observations in %s" % name) 16 | self.assertEqual(y.shape[0], n_observations, 17 | "Incorrect number of labels in %s" % name) 18 | self.assertEqual(x.shape[1], n_features, 19 | "Incorrect number of features in %s" % name) 20 | 21 | self.assertEqual(set(np.unique(y)), {-1, 1}, 22 | "Incorrect labels encoding in %s" % name) 23 | 24 | 25 | if __name__ == '__main__': 26 | unittest.main() 27 | -------------------------------------------------------------------------------- /lib/tests/hawkes_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | from lib.hawkes import iterate_hawkes_dataset 5 | 6 | 7 | def detect_duplicates(timestamps): 8 | """Detect if one timestamp appears twice in the realisation 9 | 10 | Parameters 11 | ---------- 12 | timestamps : `list` of `np.ndarray` 13 | Hawkes realization 14 | 15 | Returns 16 | ------- 17 | output : `bool` 18 | True if a timestamp appears twice 19 | """ 20 | detected = False 21 | n_nodes = len(timestamps) 22 | 23 | # Search for timestamps appearing twice in the same node 24 | for i in range(n_nodes): 25 | diff = timestamps[i][1:] - timestamps[i][:-1] 26 | mask = np.hstack((True, diff != 0)) 27 | detected |= not np.alltrue(mask) 28 | 29 | # Search for timestamps appearing twice in two different 
nodes
30 |     for i in range(n_nodes):
31 |         for j in range(i + 1, n_nodes):
32 |             search_left_i = np.searchsorted(timestamps[j], timestamps[i],
33 |                                             side='left')
34 |             search_right_i = np.searchsorted(timestamps[j], timestamps[i],
35 |                                              side='right')
36 |
37 |             search_left_j = np.searchsorted(timestamps[i], timestamps[j],
38 |                                             side='left')
39 |             search_right_j = np.searchsorted(timestamps[i], timestamps[j],
40 |                                              side='right')
41 |
42 |             detected |= not np.alltrue(search_left_i == search_right_i)
43 |             detected |= not np.alltrue(search_left_j == search_right_j)
44 |
45 |     return detected
46 |
47 |
48 | class HawkesTests(unittest.TestCase):
49 |     def test_detect_duplicates(self):
50 |         """...Test that test function works as expected
51 |         """
52 |         timestamps = [np.array([1., 2., 3.])]
53 |         self.assertFalse(detect_duplicates(timestamps))
54 |
55 |         timestamps = [np.array([1., 2., 2., 3.])]
56 |         self.assertTrue(detect_duplicates(timestamps))
57 |
58 |         timestamps = [np.array([1., 2., 3.]),
59 |                       np.array([4., 5., 6., 7.])]
60 |         self.assertFalse(detect_duplicates(timestamps))
61 |
62 |         timestamps = [np.array([1., 2., 3.]),
63 |                       np.array([4., 5., 6., 6., 7.])]
64 |         self.assertTrue(detect_duplicates(timestamps))
65 |
66 |         timestamps = [np.array([1., 2., 4.]),
67 |                       np.array([4., 5., 6., 7.])]
68 |         self.assertTrue(detect_duplicates(timestamps))
69 |
70 |     def test_data_set_consistency(self):
71 |         """...Test hawkes datasets have the expected shape
72 |         """
73 |         for name, timestamps_list, n_realizations, n_nodes, end_time in \
74 |                 iterate_hawkes_dataset():
75 |             self.assertEqual(len(timestamps_list.keys()), n_realizations,
76 |                              "Incorrect number of realizations in %s" % name)
77 |
78 |             for name, timestamps in timestamps_list.items():
79 |                 self.assertEqual(timestamps.shape, (n_nodes,),
80 |                                  "Incorrect number of nodes in %s" % name)
81 |                 first_time = min(map(min, timestamps))
82 |                 self.assertGreaterEqual(first_time, 0,
83 |                                         "Incorrect first time in %s" % name)
84 |                 last_time = max(map(max, timestamps))
85 |                 self.assertLessEqual(last_time, end_time,
86 |                                      "Incorrect last time in %s" % name)
87 |
88 |                 self.assertFalse(detect_duplicates(timestamps))
89 |
90 |
91 | if __name__ == '__main__':
92 |     unittest.main()
93 |
--------------------------------------------------------------------------------
/lib/tests/regression_test.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from lib.regression import iterate_regression_dataset
4 |
5 |
6 | class RegressionTests(unittest.TestCase):
7 |     def test_data_set_consistency(self):
8 |         """...Test regression datasets have the expected shape
9 |         """
10 |         for name, x, y, n_observations, n_features in \
11 |                 iterate_regression_dataset():
12 |             self.assertEqual(x.shape[0], n_observations,
13 |                              "Incorrect number of observations in %s" % name)
14 |             self.assertEqual(y.shape[0], n_observations,
15 |                              "Incorrect number of labels in %s" % name)
16 |             self.assertEqual(x.shape[1], n_features,
17 |                              "Incorrect number of features in %s" % name)
18 |
19 |
20 | if __name__ == '__main__':
21 |     unittest.main()
22 |
--------------------------------------------------------------------------------
/regression/abalone/README.md:
--------------------------------------------------------------------------------
1 | ## Abalone
2 |
3 | ### Characteristics
4 |
<table>
<tr><td> Number of observations </td><td> 4177 </td></tr>
<tr><td> Number of features </td><td> 8 </td></tr>
<tr><td> Sparsity </td><td> 96% </td></tr>
<tr><td> label mean </td><td> 9.93 </td></tr>
<tr><td> label std </td><td> 3.22 </td></tr>
<tr><td> label min </td><td> 1 </td></tr>
<tr><td> label max </td><td> 29 </td></tr>
</table>
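A minimal sketch reproducing the label statistics above; the path is an
assumption:

```python
import numpy as np
from sklearn.datasets import load_svmlight_file

x, y = load_svmlight_file('regression/abalone/abalone.trn.bz2')
print(x.shape)                # (4177, 8)
print(np.mean(y), np.std(y))  # approx. 9.93 and 3.22
print(np.min(y), np.max(y))   # 1.0 and 29.0
```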
14 | 15 | ### Description 16 | 17 | Predict the age of abalone from physical measurements 18 | 19 | [https://archive.ics.uci.edu/ml/datasets/Abalone](https://archive.ics.uci.edu/ml/datasets/Abalone) 20 | 21 | ### Preprocessing 22 | None 23 | 24 | ### Original download link 25 | [http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/abalone_scale](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/abalone_scale) -------------------------------------------------------------------------------- /regression/abalone/abalone.trn.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/regression/abalone/abalone.trn.bz2 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | scikit-learn 4 | --------------------------------------------------------------------------------