├── .gitattributes
├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── binary
│   ├── adult
│   │   ├── README.md
│   │   ├── adult.trn.bz2
│   │   └── adult.tst.bz2
│   ├── covtype
│   │   ├── README.md
│   │   └── covtype.trn.bz2
│   ├── ijcnn1
│   │   ├── README.md
│   │   ├── ijcnn1.trn.bz2
│   │   └── ijcnn1.tst.bz2
│   ├── kdd2010
│   │   ├── README.md
│   │   ├── kdd2010.trn.bz2
│   │   └── kdd2010.tst.bz2
│   └── reuters
│       ├── README.md
│       ├── reuters.trn.bz2
│       └── reuters.tst.bz2
├── hawkes
│   └── bund
│       ├── README.md
│       └── bund.npz
├── lib
│   ├── __init__.py
│   ├── binary.py
│   ├── compression.py
│   ├── dataset_analysis.py
│   ├── hawkes.py
│   ├── preprocessing
│   │   ├── __init__.py
│   │   ├── clean_covtype.py
│   │   ├── clean_kdd2010.py
│   │   └── clean_reuters.py
│   ├── regression.py
│   └── tests
│       ├── __init__.py
│       ├── binary_test.py
│       ├── hawkes_test.py
│       └── regression_test.py
├── regression
│   └── abalone
│       ├── README.md
│       └── abalone.trn.bz2
└── requirements.txt
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.bz2 filter=lfs diff=lfs merge=lfs -text
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 |
26 | # PyInstaller
27 | # Usually these files are written by a python script from a template
28 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 |
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 |
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *.cover
45 | .hypothesis/
46 |
47 | # Translations
48 | *.mo
49 | *.pot
50 |
51 | # Django stuff:
52 | *.log
53 | local_settings.py
54 |
55 | # Flask stuff:
56 | instance/
57 | .webassets-cache
58 |
59 | # Scrapy stuff:
60 | .scrapy
61 |
62 | # Sphinx documentation
63 | docs/_build/
64 |
65 | # PyBuilder
66 | target/
67 |
68 | # IPython Notebook
69 | .ipynb_checkpoints
70 |
71 | # pyenv
72 | .python-version
73 |
74 | # celery beat schedule file
75 | celerybeat-schedule
76 |
77 | # dotenv
78 | .env
79 |
80 | # virtualenv
81 | venv/
82 | ENV/
83 |
84 | # Spyder project settings
85 | .spyderproject
86 |
87 | # Rope project settings
88 | .ropeproject
89 |
90 |
91 | # Uncompressed data files
92 | *.trn
93 | *.tst
94 |
95 | # origin files before cleaning
96 | *_orig.bz2
97 |
98 | # IDE files
99 | .DS_Store
100 | .idea/*
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - "3.4"
4 | # command to install dependencies
5 | before_install:
6 | - sudo apt-get -qq update
7 | - sudo apt-get -qq install python-numpy python-scipy
8 | install:
9 | - pip install -r requirements.txt
10 | # command to run tests
11 | script: python -m unittest discover -v . "*_test.py"
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2016, X-DataInitiative
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | * Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | * Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![Build Status](https://travis-ci.org/X-DataInitiative/tick-datasets.svg?branch=master)](https://travis-ci.org/X-DataInitiative/tick-datasets)
2 |
3 | # tick-datasets
4 | Hosting of ready-to-use machine learning datasets
5 |
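A minimal loading sketch (paths relative to the repository root; scikit-learn
decompresses the `.bz2` archives on the fly, as `lib/binary.py` does):

```python
from sklearn.datasets import load_svmlight_file

# Any of the svmlight-formatted .bz2 files can be loaded directly
x, y = load_svmlight_file('binary/adult/adult.trn.bz2')
print(x.shape)  # (32561, 123), a scipy.sparse CSR matrix
print(set(y))   # {-1.0, 1.0}
```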
--------------------------------------------------------------------------------
/binary/adult/README.md:
--------------------------------------------------------------------------------
1 | ## Adult
2 |
3 | ### Characteristics
4 |
5 | Train dataset
6 | <table>
7 | <tr> <td>Number of observations</td> <td>32561</td> </tr>
8 | <tr> <td>Number of features</td> <td>123</td> </tr>
9 | <tr> <td>Sparsity</td> <td>11.3%</td> </tr>
10 | <tr> <td>Class balancing</td> <td>24.1% positive samples</td> </tr>
11 | </table>
12 | 
13 | Test dataset
14 | <table>
15 | <tr> <td>Number of observations</td> <td>16281</td> </tr>
16 | <tr> <td>Number of features</td> <td>123</td> </tr>
17 | <tr> <td>Sparsity</td> <td>11.4%</td> </tr>
18 | <tr> <td>Class balancing</td> <td>23.6% positive samples</td> </tr>
19 | </table>
20 | 
21 | ### Description
22 |
23 | Predict whether income exceeds $50K/yr based on census data. Also known
24 | as "Census Income" dataset.
25 | [https://archive.ics.uci.edu/ml/datasets/Adult](https://archive.ics.uci.edu/ml/datasets/Adult)
26 |
27 | ### Preprocessing
28 | The original Adult data set has 14 features, among which six are continuous and
29 | eight are categorical. In this data set, continuous features are discretized
30 | into quintiles, and each quintile is represented by a binary feature. Also,
31 | a categorical feature with m categories is converted to m binary features.
32 | This leads to a total of 123 binary features.
33 |
34 | John C. Platt.
35 | Fast training of support vector machines using sequential minimal optimization.
36 | In Bernhard Schölkopf, Christopher J. C. Burges, and
37 | Alexander J. Smola, editors, Advances in Kernel Methods -
38 | Support Vector Learning, Cambridge, MA, 1998. MIT Press.
39 |
40 | Note that feature 122 did not occur in the test set, so it has been added
41 | (with a zero value) to the first observation; hence train and test data have
42 | the same number of features. The last line of the original test dataset
43 | (which only contained a label) has also been removed.
44 |
45 | ### Original download link
46 | [http://leon.bottou.org/_media/papers/lasvm-adult.tar.bz2](http://leon.bottou.org/_media/papers/lasvm-adult.tar.bz2)
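
For illustration only, a toy sketch of the discretization described above,
using pandas (not a dependency of this repository) and hypothetical column
names; the actual preprocessing is due to Platt, as cited above:

```python
import pandas as pd

# Hypothetical toy data: one continuous and one categorical feature
df = pd.DataFrame({'age': [25, 38, 52, 28, 45, 63],
                   'workclass': ['private', 'gov', 'private',
                                 'self', 'gov', 'private']})

# Continuous feature -> 5 quintile bins, one binary column per bin
age_bins = pd.get_dummies(pd.qcut(df['age'], 5), prefix='age')

# Categorical feature with m categories -> m binary columns
workclass_bins = pd.get_dummies(df['workclass'], prefix='workclass')

binary_features = pd.concat([age_bins, workclass_bins], axis=1)
print(binary_features.shape)  # (6, 8): 5 quintile + 3 category columns
```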
--------------------------------------------------------------------------------
/binary/adult/adult.trn.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/binary/adult/adult.trn.bz2
--------------------------------------------------------------------------------
/binary/adult/adult.tst.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/binary/adult/adult.tst.bz2
--------------------------------------------------------------------------------
/binary/covtype/README.md:
--------------------------------------------------------------------------------
1 | ## Covtype
2 |
3 | ### Characteristics
4 | 
5 | <table>
6 | <tr> <td>Number of observations</td> <td>581012</td> </tr>
7 | <tr> <td>Number of features</td> <td>54</td> </tr>
8 | <tr> <td>Sparsity</td> <td>22.1%</td> </tr>
9 | <tr> <td>Class balancing</td> <td>51.2% positive samples</td> </tr>
10 | </table>
11 | 
12 | ### Description
13 | Forest CoverType dataset
14 | [https://archive.ics.uci.edu/ml/datasets/Covertype](https://archive.ics.uci.edu/ml/datasets/Covertype)
15 |
16 | ### Preprocessing
17 | Labels that were originally 1 and 2 have been changed to -1 and 1.
18 |
19 | ### Original download link
20 | [http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/covtype.libsvm.binary.scale.bz2](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/covtype.libsvm.binary.scale.bz2)
--------------------------------------------------------------------------------
/binary/covtype/covtype.trn.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/binary/covtype/covtype.trn.bz2
--------------------------------------------------------------------------------
/binary/ijcnn1/README.md:
--------------------------------------------------------------------------------
1 | ## ijcnn1
2 |
3 | ### Characteristics
4 | Train dataset
5 | <table>
6 | <tr> <td>Number of observations</td> <td>49990</td> </tr>
7 | <tr> <td>Number of features</td> <td>22</td> </tr>
8 | <tr> <td>Sparsity</td> <td>59.1%</td> </tr>
9 | <tr> <td>Class balancing</td> <td>9.76% positive samples</td> </tr>
10 | </table>
11 | 
12 | Test dataset
13 | <table>
14 | <tr> <td>Number of observations</td> <td>91701</td> </tr>
15 | <tr> <td>Number of features</td> <td>22</td> </tr>
16 | <tr> <td>Sparsity</td> <td>59.1%</td> </tr>
17 | <tr> <td>Class balancing</td> <td>9.5% positive samples</td> </tr>
18 | </table>
19 | 
20 | 
21 | ### Description
22 |
23 | ### Preprocessing
24 | We use the competition winners' transformation, presented in
25 |
26 | Chih-Chung Chang and Chih-Jen Lin.
27 | IJCNN 2001 challenge: Generalization ability and text decoding.
28 | In Proceedings of IJCNN. IEEE, 2001.
29 |
30 | ### Original download links
31 | The train set is the concatenation of
32 |
33 | [http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/ijcnn1.tr.bz2](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/ijcnn1.tr.bz2)
34 | [http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/ijcnn1.val.bz2](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/ijcnn1.val.bz2)
35 |
36 | The test set is
37 |
38 | [http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/ijcnn1.t.bz2](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/ijcnn1.t.bz2)
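
A sketch of how this concatenation can be reproduced (output filename
hypothetical; URLs as listed above):

```python
import bz2
from urllib.request import urlretrieve

base_url = 'http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/'

# Concatenate the original train and validation parts into one train file
with open('ijcnn1.trn', 'wb') as output_file:
    for part in ('ijcnn1.tr.bz2', 'ijcnn1.val.bz2'):
        tmp_path, _ = urlretrieve(base_url + part)
        with bz2.BZ2File(tmp_path, 'rb') as part_file:
            output_file.write(part_file.read())
```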
--------------------------------------------------------------------------------
/binary/ijcnn1/ijcnn1.trn.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/binary/ijcnn1/ijcnn1.trn.bz2
--------------------------------------------------------------------------------
/binary/ijcnn1/ijcnn1.tst.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/binary/ijcnn1/ijcnn1.tst.bz2
--------------------------------------------------------------------------------
/binary/kdd2010/README.md:
--------------------------------------------------------------------------------
1 | ## KDD 2010
2 |
3 | ### Characteristics
4 |
5 | Train dataset
6 | <table>
7 | <tr> <td>Number of observations</td> <td>19,264,097</td> </tr>
8 | <tr> <td>Number of features</td> <td>1,163,024</td> </tr>
9 | <tr> <td>Sparsity</td> <td>0.000797%</td> </tr>
10 | <tr> <td>Class balancing</td> <td>86.1% positive samples</td> </tr>
11 | </table>
12 | 
13 | The maximum feature index in the train dataset is 1,129,522, but the total
14 | number of features is set to 1,163,024 to be compatible with the test dataset.
15 | 
16 | Test dataset
17 | <table>
18 | <tr> <td>Number of observations</td> <td>748,401</td> </tr>
19 | <tr> <td>Number of features</td> <td>1,163,024</td> </tr>
20 | <tr> <td>Sparsity</td> <td>0.000774%</td> </tr>
21 | <tr> <td>Class balancing</td> <td>88.8% positive samples</td> </tr>
22 | </table>
23 | 
24 | 
25 | ### Description
26 | Juan, Yuchin, et al. "Field-aware factorization machines for CTR prediction."
27 | Proceedings of the 10th ACM Conference on Recommender Systems. ACM, 2016.
28 |
29 | ### Preprocessing
30 | Labels that were originally 0 and 1 have been changed to -1 and 1.
31 |
32 |
33 | ### Original download links
34 | [https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kddb-raw-libsvm.bz2](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kddb-raw-libsvm.bz2)
35 |
36 | [https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kddb-raw-libsvm.t.bz2](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kddb-raw-libsvm.t.bz2)
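
Since the largest feature index occurring in the train file is smaller than
in the test file, passing a fixed `n_features` keeps both splits in the same
feature space; a minimal sketch with scikit-learn (note the files are large):

```python
from sklearn.datasets import load_svmlight_file

# Force both splits to the same feature-space width
x_train, y_train = load_svmlight_file('binary/kdd2010/kdd2010.trn.bz2',
                                      n_features=1163024)
x_test, y_test = load_svmlight_file('binary/kdd2010/kdd2010.tst.bz2',
                                    n_features=1163024)
assert x_train.shape[1] == x_test.shape[1]
```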
--------------------------------------------------------------------------------
/binary/kdd2010/kdd2010.trn.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/binary/kdd2010/kdd2010.trn.bz2
--------------------------------------------------------------------------------
/binary/kdd2010/kdd2010.tst.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/binary/kdd2010/kdd2010.tst.bz2
--------------------------------------------------------------------------------
/binary/reuters/README.md:
--------------------------------------------------------------------------------
1 | ## Reuters
2 |
3 | ### Characteristics
4 |
5 | Train dataset
6 | <table>
7 | <tr> <td>Number of observations</td> <td>7770</td> </tr>
8 | <tr> <td>Number of features</td> <td>8315</td> </tr>
9 | <tr> <td>Sparsity</td> <td>0.526%</td> </tr>
10 | <tr> <td>Class balancing</td> <td>6.92% positive samples</td> </tr>
11 | </table>
12 | 
13 | Test dataset
14 | <table>
15 | <tr> <td>Number of observations</td> <td>3299</td> </tr>
16 | <tr> <td>Number of features</td> <td>8315</td> </tr>
17 | <tr> <td>Sparsity</td> <td>0.499%</td> </tr>
18 | <tr> <td>Class balancing</td> <td>5.43% positive samples</td> </tr>
19 | </table>
20 | 
21 | ### Description
22 |
23 | ### Preprocessing
24 | The train set originally contained lines in which the same feature index
25 | appeared several times with the same value; these duplicates have been removed.
26 |
27 | ### Original download link
28 | [http://leon.bottou.org/_media/papers/lasvm-reuters.tar.bz2](http://leon.bottou.org/_media/papers/lasvm-reuters.tar.bz2)
--------------------------------------------------------------------------------
/binary/reuters/reuters.trn.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/binary/reuters/reuters.trn.bz2
--------------------------------------------------------------------------------
/binary/reuters/reuters.tst.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/binary/reuters/reuters.tst.bz2
--------------------------------------------------------------------------------
/hawkes/bund/README.md:
--------------------------------------------------------------------------------
1 | ## Bund Future trade data
2 |
3 | ### Description
4 |
5 | One month of data (April 2014) on the Bund Future traded at Eurex, with
6 | microsecond timestamp resolution.
7 | 
8 | This data is meant to be fitted with Hawkes processes. For each day it
9 | contains 4 time series representing:
10 |
11 | 1. Mid-price movement up
12 | 2. Mid-price movement down
13 | 3. Buyer initiated trades that do not move the mid-price
14 | 4. Seller initiated trades that do not move the mid-price
15 |
16 | ### Characteristics
17 |
18 | <table>
19 | <tr> <td>Number of realizations</td> <td>20</td> </tr>
20 | <tr> <td>Average number of ticks node 0</td> <td>7009.15</td> </tr>
21 | <tr> <td>Average number of ticks node 1</td> <td>6998.15</td> </tr>
22 | <tr> <td>Average number of ticks node 2</td> <td>257677.55</td> </tr>
23 | <tr> <td>Average number of ticks node 3</td> <td>261423.6</td> </tr>
24 | </table>
25 |
26 | ### Preprocessing
27 |
28 | The market opens at 8 AM, which corresponds to a timestamp of 28800. This
29 | offset has been subtracted from all timestamps so that each realization
30 | starts at time 0.
31 | 
32 | Please note that as the market closes at 10 PM, the end time of the
33 | shifted realizations is 50400.
34 |
35 |
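A minimal loading sketch, mirroring `lib/hawkes.py` (on recent NumPy versions
`allow_pickle=True` may be required, since each realization is stored as an
object array):

```python
import numpy as np

data = np.load('hawkes/bund/bund.npz', allow_pickle=True)
for day, timestamps in data.items():
    # Each realization holds 4 arrays: mid-price up, mid-price down,
    # buyer-initiated trades, seller-initiated trades
    print(day, [len(node_times) for node_times in timestamps])
```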
--------------------------------------------------------------------------------
/hawkes/bund/bund.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/hawkes/bund/bund.npz
--------------------------------------------------------------------------------
/lib/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/lib/__init__.py
--------------------------------------------------------------------------------
/lib/binary.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | from sklearn.datasets import load_svmlight_file
5 |
6 | from lib.dataset_analysis import features_characteristics, print_characteristics
7 |
8 | # name, path, n_observations, n_features
9 | binary_datasets = [
10 | ('Adult Train', '../binary/adult/adult.trn.bz2', 32561, 123),
11 | ('Adult Test', '../binary/adult/adult.tst.bz2', 16281, 123),
12 | ('Covtype Train', '../binary/covtype/covtype.trn.bz2', 581012, 54),
13 | ('ijcnn1 Train', '../binary/ijcnn1/ijcnn1.trn.bz2', 49990, 22),
14 | ('ijcnn1 Test', '../binary/ijcnn1/ijcnn1.tst.bz2', 91701, 22),
15 | ('Reuters Train', '../binary/reuters/reuters.trn.bz2', 7770, 8315),
16 | ('Reuters Test', '../binary/reuters/reuters.tst.bz2', 3299, 8315),
17 | ('KDD 2010 Train', '../binary/kdd2010/kdd2010.trn.bz2',
18 | 19264097, 1129522),
19 | ('KDD 2010 Test', '../binary/kdd2010/kdd2010.tst.bz2',
20 | 748401, 1163024),
21 | ]
22 |
23 |
24 | def iterate_binary_dataset_path():
25 | for name, path, *args in binary_datasets:
26 | path = os.path.join(os.path.dirname(__file__), path)
27 | yield name, path
28 |
29 |
30 | def iterate_binary_dataset():
31 | for name, path, n_observations, n_features in binary_datasets:
32 | path = os.path.join(os.path.dirname(__file__), path)
33 | x, y = load_svmlight_file(path)
34 | yield name, x, y, n_observations, n_features
35 |
36 |
37 | def describe_binary_datasets():
38 | for name, x, y, *args in iterate_binary_dataset():
39 | n_rows = x.shape[0]
40 | positive_ratio = np.sum(y == 1) / n_rows
41 |
42 | characteristics = features_characteristics(x)
43 | characteristics['Class balancing'] = '{:.3g}% positive samples'.format(
44 | positive_ratio * 100)
45 |
46 | print('\n{:}'.format(name))
47 | print_characteristics(characteristics, html=False)
48 |
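A usage sketch for these helpers; run with the repository root on the Python
path so that `lib` is importable (note that iterating all datasets loads the
large KDD 2010 files into memory):

```python
from lib.binary import describe_binary_datasets

# Prints, for each dataset, the characteristics listed in the READMEs
describe_binary_datasets()
```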
--------------------------------------------------------------------------------
/lib/compression.py:
--------------------------------------------------------------------------------
1 | import bz2
2 | import os
3 | from shutil import copyfileobj
4 | from itertools import chain
5 |
6 | from lib.binary import iterate_binary_dataset_path
7 | from lib.regression import iterate_regression_dataset_path
8 |
9 |
10 | def get_compressed_file_path(decompressed_file_path):
11 | if decompressed_file_path[-4:] == '.bz2':
12 | return decompressed_file_path
13 | return '{:}.bz2'.format(decompressed_file_path)
14 |
15 |
16 | def get_decompressed_file_path(compressed_file_path):
17 | return compressed_file_path.replace('.bz2', '')
18 |
19 |
20 | def compress_file(decompressed_file_path):
21 | compressed_file_path = get_compressed_file_path(decompressed_file_path)
22 | with open(decompressed_file_path, 'rb') as input:
23 | with bz2.BZ2File(compressed_file_path, 'wb') as output:
24 | copyfileobj(input, output)
25 |
26 |
27 | def decompress_file(compressed_file_path):
28 | decompressed_file_path = get_decompressed_file_path(compressed_file_path)
29 | with bz2.BZ2File(compressed_file_path, 'rb') as input:
30 | with open(decompressed_file_path, 'wb') as output:
31 | copyfileobj(input, output)
32 |
33 |
34 | # A list (not a one-shot generator) so both functions below can reuse it
35 | all_datasets_path = list(chain(
36 |     iterate_binary_dataset_path(),
37 |     iterate_regression_dataset_path()))
38 |
39 |
40 | def compress_all_files(replace=False):
41 | for name, path in all_datasets_path:
42 | path = get_decompressed_file_path(path)
43 | compressed_path = get_compressed_file_path(path)
44 |
45 | if os.path.exists(compressed_path) and replace is False:
46 | print('%s dataset already exists at %s'
47 | % (name, compressed_path))
48 | else:
49 | print('compressing', name)
50 | compress_file(path)
51 |
52 |
53 | def decompress_all_files(replace=False):
54 | for name, path in all_datasets_path:
55 | path = get_compressed_file_path(path)
56 | decompressed_path = get_decompressed_file_path(path)
57 |
58 | if os.path.exists(decompressed_path) and replace is False:
59 | print('%s dataset already exists at %s'
60 | % (name, decompressed_path))
61 | else:
62 | print('decompressing', name)
63 | decompress_file(path)
64 |
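A usage sketch (again assuming `lib` is importable): decompress every hosted
dataset into raw `.trn`/`.tst` files next to their archives, skipping files
that already exist:

```python
from lib.compression import decompress_all_files

# Writes e.g. binary/adult/adult.trn next to binary/adult/adult.trn.bz2
decompress_all_files(replace=False)
```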
--------------------------------------------------------------------------------
/lib/dataset_analysis.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 |
3 |
4 | def features_characteristics(features_matrix):
5 | n_rows = features_matrix.shape[0]
6 | n_columns = features_matrix.shape[1]
7 |
8 | sparsity = features_matrix.getnnz() / (n_rows * n_columns)
9 |
10 | characteristics = OrderedDict()
11 | characteristics['Number of observations'] = n_rows
12 | characteristics['Number of features'] = n_columns
13 | characteristics['Sparsity'] = '{:.3g}%'.format(sparsity * 100)
14 | return characteristics
15 |
16 |
17 | def print_characteristics(characteristics, html=False):
18 | if html is False:
19 | for k, v in characteristics.items():
20 | print("{:<25} {:}".format(k, v))
21 | else:
22 |         html_data = '<table>'
23 |         html_data += '\n'
24 |         for k, v in characteristics.items():
25 |             html_data += ' <tr>'
26 |             html_data += ' <td>{:}</td> <td>{:}</td> '.format(k, v)
27 |             html_data += '</tr>\n'
28 |         html_data += '</table>\n'
29 | 
30 |         print(html_data)
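A usage sketch of both output modes (the `html=True` branch emits the
`<table>` markup used in the dataset READMEs):

```python
from collections import OrderedDict

from lib.dataset_analysis import print_characteristics

characteristics = OrderedDict([('Number of observations', 4177),
                               ('Sparsity', '96%')])
print_characteristics(characteristics)             # aligned plain text
print_characteristics(characteristics, html=True)  # HTML table markup
```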
--------------------------------------------------------------------------------
/lib/hawkes.py:
--------------------------------------------------------------------------------
1 | import os
2 | from collections import OrderedDict
3 | import numpy as np
4 |
5 | from lib.dataset_analysis import print_characteristics
6 |
7 | # name, path, number of realization, number of nodes, end_time
8 | hawkes_datasets = [
9 | ('Bund', '../hawkes/bund/bund.npz', 20, 4, 50400)
10 | ]
11 |
12 |
13 | def iterate_hawkes_dataset_path():
14 | for name, path, *args in hawkes_datasets:
15 | path = os.path.join(os.path.dirname(__file__), path)
16 | yield name, path
17 |
18 |
19 | def iterate_hawkes_dataset():
20 | for name, path, n_realizations, n_nodes, end_time in hawkes_datasets:
21 | path = os.path.join(os.path.dirname(__file__), path)
22 | timestamps_dict = np.load(path)
23 | yield name, timestamps_dict, n_realizations, n_nodes, end_time
24 |
25 |
26 | def hawkes_characteristics(timestamps_list):
27 | characteristics = OrderedDict()
28 | n_realizations = len(timestamps_list)
29 | characteristics['Number of realizations'] = n_realizations
30 |
31 | n_nodes = len(timestamps_list[0])
32 | n_jumps_per_node = np.zeros(n_nodes)
33 | for timestamps in timestamps_list:
34 | for i in range(n_nodes):
35 | n_jumps_per_node[i] += len(timestamps[i])
36 | n_jumps_per_node /= n_realizations
37 | for i in range(n_nodes):
38 | characteristics['Average number of ticks node %i' % i] = \
39 | n_jumps_per_node[i]
40 |
41 | return characteristics
42 |
43 |
44 | def describe_hawkes_datasets():
45 | for name, timestamps_dict, *args in iterate_hawkes_dataset():
46 | timestamps_list = [timestamps_dict[key]
47 | for key in timestamps_dict.keys()]
48 | characteristics = hawkes_characteristics(timestamps_list)
49 | print('\n{:}'.format(name))
50 | print_characteristics(characteristics, html=False)
51 |
--------------------------------------------------------------------------------
/lib/preprocessing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/lib/preprocessing/__init__.py
--------------------------------------------------------------------------------
/lib/preprocessing/clean_covtype.py:
--------------------------------------------------------------------------------
1 | # The Covtype dataset has labels in {1, 2} instead of {-1, 1}
2 | import bz2
3 | from urllib.request import urlretrieve
4 |
5 |
6 | def clean_line_labels(line):
7 | line = line.strip()
8 | split_l = line.split(' ')
9 | label = split_l[0]
10 | features = ' '.join(split_l[1:])
11 |
12 | if int(label) == 2:
13 | label = 1
14 | elif int(label) == 1:
15 | label = -1
16 | else:
17 | raise ValueError('Unknown label %s' % label)
18 |
19 | clean_line = '{:} {:}\n'.format(label, features)
20 | return clean_line
21 |
22 |
23 | covtype_data_url = 'http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/' \
24 | 'binary/covtype.libsvm.binary.scale.bz2'
25 | tmp_path, _ = urlretrieve(covtype_data_url)
26 |
27 | covtype_train_file_path = '../../binary/covtype/covtype.trn'
28 | with bz2.BZ2File(tmp_path) as train_file:
29 | with open(covtype_train_file_path, 'w') as covtype_train_file:
30 | for data in train_file.readlines():
31 | cleaned_line = clean_line_labels(data.decode('utf-8'))
32 | covtype_train_file.write(cleaned_line)
33 |
--------------------------------------------------------------------------------
/lib/preprocessing/clean_kdd2010.py:
--------------------------------------------------------------------------------
1 | # KDD 2010 datasets have labels in {0, 1} instead of {-1, 1}
2 | import bz2
3 |
4 | from lib.compression import compress_file
5 |
6 |
7 | def clean_line_labels(line):
8 | line = line.strip()
9 | split_l = line.split(' ')
10 | label = split_l[0]
11 | features = ' '.join(split_l[1:])
12 |
13 | if int(label) == 1:
14 | label = 1
15 | elif int(label) == 0:
16 | label = -1
17 | else:
18 | raise ValueError('Unknown label %s' % label)
19 |
20 | clean_line = '{:} {:}\n'.format(label, features)
21 | return clean_line
22 |
23 |
24 | original_files_path = [
25 | '../../binary/kdd2010/kdd2010.trn_orig.bz2',
26 | '../../binary/kdd2010/kdd2010.tst_orig.bz2'
27 | ]
28 |
29 | save_file_path = [
30 | '../../binary/kdd2010/kdd2010.trn',
31 | '../../binary/kdd2010/kdd2010.tst'
32 | ]
33 |
34 | for original_path, save_path in zip(original_files_path, save_file_path):
35 | with bz2.BZ2File(original_path) as train_file:
36 | with open(save_path, 'w') as kdd2010_train_file:
37 | for i, data in enumerate(train_file.readlines()):
38 | if i % 100000 == 0:
39 | print(i)
40 | cleaned_line = clean_line_labels(data.decode('utf-8'))
41 | kdd2010_train_file.write(cleaned_line)
42 |
43 | compress_file(save_path)
44 |
--------------------------------------------------------------------------------
/lib/preprocessing/clean_reuters.py:
--------------------------------------------------------------------------------
1 | # The Reuters dataset has lines with the same feature defined twice
2 | # For example line 3 starts with
3 | # -1 171:8.09199011447501E-02 171:8.09199011447501E-02
4 | # Hence we need to clean that
5 | import tarfile
6 | from urllib.request import urlretrieve
7 | from collections import OrderedDict
8 |
9 |
10 | def clean_line_duplicates(line):
11 | line = line.strip()
12 | split_l = line.split(' ')
13 | label = split_l[0]
14 | features = split_l[1:]
15 | features_dict = OrderedDict()
16 | for feature in features:
17 | index = feature.split(':')[0].strip()
18 | value = feature.split(':')[1].strip()
19 | if index not in features_dict:
20 | features_dict[index] = value
21 | else:
22 | if features_dict[index] != value:
23 |                 raise ValueError('index', index, features_dict[index],
24 |                                  value)
25 | joined_features = [':'.join([index, value])
26 | for index, value in features_dict.items()]
27 | cleaned_line = ' '.join([label] + joined_features) + '\n'
28 | return cleaned_line
29 |
30 |
31 | reuters_data_url = 'http://leon.bottou.org/_media/papers/lasvm-reuters.tar.bz2'
32 | tmp_path, _ = urlretrieve(reuters_data_url)
33 | uncompressed_data = tarfile.open(name=tmp_path, mode="r:bz2")
34 |
35 | reuters_train_file_path = '../../binary/reuters/reuters.trn'
36 |
37 | with uncompressed_data.extractfile('reuters/money-fx.trn') as train_file:
38 | with open(reuters_train_file_path, 'w') as reuters_train_file:
39 | for data in train_file.readlines():
40 | cleaned_line = clean_line_duplicates(data.decode('utf-8'))
41 | reuters_train_file.write(cleaned_line)
42 |
43 |
44 |
45 |
--------------------------------------------------------------------------------
/lib/regression.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | from sklearn.datasets import load_svmlight_file
5 |
6 | from lib.dataset_analysis import features_characteristics, print_characteristics
7 |
8 | # name, path, n_observations, n_features
9 | regression_datasets = [
10 | ('Abalone Train', '../regression/abalone/abalone.trn.bz2', 4177, 8),
11 | ]
12 |
13 | def iterate_regression_dataset_path():
14 | for name, path, *args in regression_datasets:
15 | path = os.path.join(os.path.dirname(__file__), path)
16 | yield name, path
17 |
18 |
19 | def iterate_regression_dataset():
20 | for name, path, n_observations, n_features in regression_datasets:
21 | path = os.path.join(os.path.dirname(__file__), path)
22 | x, y = load_svmlight_file(path)
23 | yield name, x, y, n_observations, n_features
24 |
25 |
26 | def describe_regression_datasets():
27 | for name, x, y, *args in iterate_regression_dataset():
28 | label_mean = np.mean(y)
29 | label_std = np.std(y)
30 | label_min = np.min(y)
31 | label_max = np.max(y)
32 |
33 | characteristics = features_characteristics(x)
34 | characteristics['label mean'] = '{:.3g}'.format(label_mean)
35 | characteristics['label std'] = '{:.3g}'.format(label_std)
36 | characteristics['label min'] = '{:.3g}'.format(label_min)
37 | characteristics['label max'] = '{:.3g}'.format(label_max)
38 |
39 | print('\n{:}'.format(name))
40 | print_characteristics(characteristics, html=False)
41 |
42 | # describe_regression_datasets()
--------------------------------------------------------------------------------
/lib/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/lib/tests/__init__.py
--------------------------------------------------------------------------------
/lib/tests/binary_test.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import numpy as np
4 |
5 | from lib.binary import iterate_binary_dataset
6 |
7 |
8 | class BinaryTests(unittest.TestCase):
9 | def test_data_set_consistency(self):
10 | """...Test binary datasets have the expected shape
11 | """
12 | for name, x, y, n_observations, n_features in \
13 | iterate_binary_dataset():
14 | self.assertEqual(x.shape[0], n_observations,
15 | "Incorrect number of observations in %s" % name)
16 | self.assertEqual(y.shape[0], n_observations,
17 | "Incorrect number of labels in %s" % name)
18 | self.assertEqual(x.shape[1], n_features,
19 | "Incorrect number of features in %s" % name)
20 |
21 | self.assertEqual(set(np.unique(y)), {-1, 1},
22 | "Incorrect labels encoding in %s" % name)
23 |
24 |
25 | if __name__ == '__main__':
26 | unittest.main()
27 |
--------------------------------------------------------------------------------
/lib/tests/hawkes_test.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import numpy as np
3 |
4 | from lib.hawkes import iterate_hawkes_dataset
5 |
6 |
7 | def detect_duplicates(timestamps):
8 | """Detect if one timestamp appears twice in the realisation
9 |
10 | Parameters
11 | ----------
12 | timestamps : `list` of `np.ndarray`
13 | Hawkes realization
14 |
15 | Returns
16 | -------
17 | output : `bool`
18 | True if a timestamp appears twice
19 | """
20 | detected = False
21 | n_nodes = len(timestamps)
22 |
23 | # Search for timestamps appearing twice in the same node
24 | for i in range(n_nodes):
25 | diff = timestamps[i][1:] - timestamps[i][:-1]
26 | mask = np.hstack((True, diff != 0))
27 | detected |= not np.alltrue(mask)
28 |
29 | # Search for timestamps appearing twice in two different nodes
30 | for i in range(n_nodes):
31 | for j in range(i + 1, n_nodes):
32 | search_left_i = np.searchsorted(timestamps[j], timestamps[i],
33 | side='left')
34 | search_right_i = np.searchsorted(timestamps[j], timestamps[i],
35 | side='right')
36 |
37 | search_left_j = np.searchsorted(timestamps[i], timestamps[j],
38 | side='left')
39 | search_right_j = np.searchsorted(timestamps[i], timestamps[j],
40 | side='right')
41 |
42 | detected |= not np.alltrue(search_left_i == search_right_i)
43 | detected |= not np.alltrue(search_left_j == search_right_j)
44 |
45 | return detected
46 |
47 |
48 | class HawkesTests(unittest.TestCase):
49 | def test_detect_duplicates(self):
50 | """...Test that test function works as expected
51 | """
52 | timestamps = [np.array([1., 2., 3.])]
53 | self.assertFalse(detect_duplicates(timestamps))
54 |
55 | timestamps = [np.array([1., 2., 2., 3.])]
56 | self.assertTrue(detect_duplicates(timestamps))
57 |
58 | timestamps = [np.array([1., 2., 3.]),
59 | np.array([4., 5., 6., 7.])]
60 | self.assertFalse(detect_duplicates(timestamps))
61 |
62 | timestamps = [np.array([1., 2., 3.]),
63 | np.array([4., 5., 6., 6., 7.])]
64 | self.assertTrue(detect_duplicates(timestamps))
65 |
66 | timestamps = [np.array([1., 2., 4.]),
67 | np.array([4., 5., 6., 7.])]
68 | self.assertTrue(detect_duplicates(timestamps))
69 |
70 | def test_data_set_consistency(self):
71 | """...Test hawkes datasets have the expected shape
72 | """
73 | for name, timestamps_list, n_realizations, n_nodes, end_time in \
74 | iterate_hawkes_dataset():
75 | self.assertEqual(len(timestamps_list.keys()), n_realizations,
76 | "Incorrect number of realizations in %s" % name)
77 |
78 | for name, timestamps in timestamps_list.items():
79 | self.assertEqual(timestamps.shape, (n_nodes,),
80 | "Incorrect number of nodes in %s" % name)
81 |                 first_time = min(map(min, timestamps))
82 |                 self.assertGreaterEqual(first_time, 0,
83 | "Incorrect first time in %s" % name)
84 | last_time = max(map(max, timestamps))
85 | self.assertLessEqual(last_time, end_time,
86 | "Incorrect last time in %s" % name)
87 |
88 | self.assertFalse(detect_duplicates(timestamps))
89 |
90 |
91 | if __name__ == '__main__':
92 | unittest.main()
93 |
--------------------------------------------------------------------------------
/lib/tests/regression_test.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from lib.regression import iterate_regression_dataset
4 |
5 |
6 | class RegressionTests(unittest.TestCase):
7 | def test_data_set_consistency(self):
8 | """...Test regression datasets have the expected shape
9 | """
10 | for name, x, y, n_observations, n_features in \
11 | iterate_regression_dataset():
12 | self.assertEqual(x.shape[0], n_observations,
13 | "Incorrect number of observations in %s" % name)
14 | self.assertEqual(y.shape[0], n_observations,
15 | "Incorrect number of labels in %s" % name)
16 | self.assertEqual(x.shape[1], n_features,
17 | "Incorrect number of features in %s" % name)
18 |
19 |
20 | if __name__ == '__main__':
21 | unittest.main()
22 |
--------------------------------------------------------------------------------
/regression/abalone/README.md:
--------------------------------------------------------------------------------
1 | ## Abalone
2 |
3 | ### Characteristics
4 | 
5 | <table>
6 | <tr> <td>Number of observations</td> <td>4177</td> </tr>
7 | <tr> <td>Number of features</td> <td>8</td> </tr>
8 | <tr> <td>Sparsity</td> <td>96%</td> </tr>
9 | <tr> <td>label mean</td> <td>9.93</td> </tr>
10 | <tr> <td>label std</td> <td>3.22</td> </tr>
11 | <tr> <td>label min</td> <td>1</td> </tr>
12 | <tr> <td>label max</td> <td>29</td> </tr>
13 | </table>
14 |
15 | ### Description
16 |
17 | Predict the age of abalone from physical measurements
18 |
19 | [https://archive.ics.uci.edu/ml/datasets/Abalone](https://archive.ics.uci.edu/ml/datasets/Abalone)
20 |
21 | ### Preprocessing
22 | None
23 |
24 | ### Original download link
25 | [http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/abalone_scale](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/abalone_scale)
--------------------------------------------------------------------------------
/regression/abalone/abalone.trn.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/regression/abalone/abalone.trn.bz2
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | scipy
3 | scikit-learn
4 |
--------------------------------------------------------------------------------