├── .gitattributes
├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── binary
│   ├── adult
│   │   ├── README.md
│   │   ├── adult.trn.bz2
│   │   └── adult.tst.bz2
│   ├── covtype
│   │   ├── README.md
│   │   └── covtype.trn.bz2
│   ├── ijcnn1
│   │   ├── README.md
│   │   ├── ijcnn1.trn.bz2
│   │   └── ijcnn1.tst.bz2
│   ├── kdd2010
│   │   ├── README.md
│   │   ├── kdd2010.trn.bz2
│   │   └── kdd2010.tst.bz2
│   └── reuters
│       ├── README.md
│       ├── reuters.trn.bz2
│       └── reuters.tst.bz2
├── hawkes
│   └── bund
│       ├── README.md
│       └── bund.npz
├── lib
│   ├── __init__.py
│   ├── binary.py
│   ├── compression.py
│   ├── dataset_analysis.py
│   ├── hawkes.py
│   ├── preprocessing
│   │   ├── __init__.py
│   │   ├── clean_covtype.py
│   │   ├── clean_kdd2010.py
│   │   └── clean_reuters.py
│   ├── regression.py
│   └── tests
│       ├── __init__.py
│       ├── binary_test.py
│       ├── hawkes_test.py
│       └── regression_test.py
├── regression
│   └── abalone
│       ├── README.md
│       └── abalone.trn.bz2
└── requirements.txt
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.bz2 filter=lfs diff=lfs merge=lfs -text
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 |
26 | # PyInstaller
27 | # Usually these files are written by a python script from a template
28 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 |
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 |
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *.cover
45 | .hypothesis/
46 |
47 | # Translations
48 | *.mo
49 | *.pot
50 |
51 | # Django stuff:
52 | *.log
53 | local_settings.py
54 |
55 | # Flask stuff:
56 | instance/
57 | .webassets-cache
58 |
59 | # Scrapy stuff:
60 | .scrapy
61 |
62 | # Sphinx documentation
63 | docs/_build/
64 |
65 | # PyBuilder
66 | target/
67 |
68 | # IPython Notebook
69 | .ipynb_checkpoints
70 |
71 | # pyenv
72 | .python-version
73 |
74 | # celery beat schedule file
75 | celerybeat-schedule
76 |
77 | # dotenv
78 | .env
79 |
80 | # virtualenv
81 | venv/
82 | ENV/
83 |
84 | # Spyder project settings
85 | .spyderproject
86 |
87 | # Rope project settings
88 | .ropeproject
89 |
90 |
91 | # Uncompressed data files
92 | *.trn
93 | *.tst
94 |
95 | # origin files before cleaning
96 | *_orig.bz2
97 |
98 | # IDE files
99 | .DS_Store
100 | .idea/*
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - "3.4"
4 | # command to install dependencies
5 | before_install:
6 | - sudo apt-get -qq update
7 | - sudo apt-get -qq install python-numpy python-scipy
8 | install:
9 | - pip install -r requirements.txt
10 | # command to run tests
11 | script: python -m unittest discover -v . "*_test.py"
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2016, X-DataInitiative
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | * Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | * Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![Build Status](https://travis-ci.org/X-DataInitiative/tick-datasets.svg?branch=master)](https://travis-ci.org/X-DataInitiative/tick-datasets)
2 |
3 | # tick-datasets
4 | Hosting of ready-to-use machine learning datasets
5 |
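A minimal loading sketch (paths relative to the repository root; scikit-learn
decompresses the `.bz2` archives on the fly, as `lib/binary.py` does):

```python
from sklearn.datasets import load_svmlight_file

# Any of the svmlight-formatted .bz2 files can be loaded directly
x, y = load_svmlight_file('binary/adult/adult.trn.bz2')
print(x.shape)  # (32561, 123), a scipy.sparse CSR matrix
print(set(y))   # {-1.0, 1.0}
```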
--------------------------------------------------------------------------------
/binary/adult/README.md:
--------------------------------------------------------------------------------
1 | ## Adult
2 |
3 | ### Characteristics
4 |
5 | Train dataset
6 | <table>
7 | <tr> <td>Number of observations</td> <td>32561</td> </tr>
8 | <tr> <td>Number of features</td> <td>123</td> </tr>
9 | <tr> <td>Sparsity</td> <td>11.3%</td> </tr>
10 | <tr> <td>Class balancing</td> <td>24.1% positive samples</td> </tr>
11 | </table>
12 | 
13 | Test dataset
14 | <table>
15 | <tr> <td>Number of observations</td> <td>16281</td> </tr>
16 | <tr> <td>Number of features</td> <td>123</td> </tr>
17 | <tr> <td>Sparsity</td> <td>11.4%</td> </tr>
18 | <tr> <td>Class balancing</td> <td>23.6% positive samples</td> </tr>
19 | </table>
20 | 
21 | ### Description
22 |
23 | Predict whether income exceeds $50K/yr based on census data. Also known
24 | as "Census Income" dataset.
25 | [https://archive.ics.uci.edu/ml/datasets/Adult](https://archive.ics.uci.edu/ml/datasets/Adult)
26 |
27 | ### Preprocessing
28 | The original Adult data set has 14 features, among which six are continuous and
29 | eight are categorical. In this data set, continuous features are discretized
30 | into quintiles, and each quintile is represented by a binary feature. Also,
31 | a categorical feature with m categories is converted to m binary features.
32 | This leads to a total of 123 binary features.
33 |
34 | John C. Platt.
35 | Fast training of support vector machines using sequential minimal optimization.
36 | In Bernhard Schölkopf, Christopher J. C. Burges, and
37 | Alexander J. Smola, editors, Advances in Kernel Methods -
38 | Support Vector Learning, Cambridge, MA, 1998. MIT Press.
39 |
40 | Note that feature 122 did not occur in the test set, so it has been added
41 | (with a zero value) to the first observation; hence train and test data have
42 | the same number of features. The last line of the original test dataset
43 | (which only contained a label) has also been removed.
44 |
45 | ### Original download link
46 | [http://leon.bottou.org/_media/papers/lasvm-adult.tar.bz2](http://leon.bottou.org/_media/papers/lasvm-adult.tar.bz2)
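
For illustration only, a toy sketch of the discretization described above,
using pandas (not a dependency of this repository) and hypothetical column
names; the actual preprocessing is due to Platt, as cited above:

```python
import pandas as pd

# Hypothetical toy data: one continuous and one categorical feature
df = pd.DataFrame({'age': [25, 38, 52, 28, 45, 63],
                   'workclass': ['private', 'gov', 'private',
                                 'self', 'gov', 'private']})

# Continuous feature -> 5 quintile bins, one binary column per bin
age_bins = pd.get_dummies(pd.qcut(df['age'], 5), prefix='age')

# Categorical feature with m categories -> m binary columns
workclass_bins = pd.get_dummies(df['workclass'], prefix='workclass')

binary_features = pd.concat([age_bins, workclass_bins], axis=1)
print(binary_features.shape)  # (6, 8): 5 quintile + 3 category columns
```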
--------------------------------------------------------------------------------
/binary/adult/adult.trn.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/binary/adult/adult.trn.bz2
--------------------------------------------------------------------------------
/binary/adult/adult.tst.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/binary/adult/adult.tst.bz2
--------------------------------------------------------------------------------
/binary/covtype/README.md:
--------------------------------------------------------------------------------
1 | ## Covtype
2 |
3 | ### Characteristics
4 | 
5 | <table>
6 | <tr> <td>Number of observations</td> <td>581012</td> </tr>
7 | <tr> <td>Number of features</td> <td>54</td> </tr>
8 | <tr> <td>Sparsity</td> <td>22.1%</td> </tr>
9 | <tr> <td>Class balancing</td> <td>51.2% positive samples</td> </tr>
10 | </table>
11 | 
12 | ### Description
13 | Forest CoverType dataset
14 | [https://archive.ics.uci.edu/ml/datasets/Covertype](https://archive.ics.uci.edu/ml/datasets/Covertype)
15 |
16 | ### Preprocessing
17 | Labels that were originally 1 and 2 have been changed to -1 and 1.
18 |
19 | ### Original download link
20 | [http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/covtype.libsvm.binary.scale.bz2](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/covtype.libsvm.binary.scale.bz2)
--------------------------------------------------------------------------------
/binary/covtype/covtype.trn.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/binary/covtype/covtype.trn.bz2
--------------------------------------------------------------------------------
/binary/ijcnn1/README.md:
--------------------------------------------------------------------------------
1 | ## ijcnn1
2 |
3 | ### Characteristics
4 | Train dataset
5 | <table>
6 | <tr> <td>Number of observations</td> <td>49990</td> </tr>
7 | <tr> <td>Number of features</td> <td>22</td> </tr>
8 | <tr> <td>Sparsity</td> <td>59.1%</td> </tr>
9 | <tr> <td>Class balancing</td> <td>9.76% positive samples</td> </tr>
10 | </table>
11 | 
12 | Test dataset
13 | <table>
14 | <tr> <td>Number of observations</td> <td>91701</td> </tr>
15 | <tr> <td>Number of features</td> <td>22</td> </tr>
16 | <tr> <td>Sparsity</td> <td>59.1%</td> </tr>
17 | <tr> <td>Class balancing</td> <td>9.5% positive samples</td> </tr>
18 | </table>
19 | 
20 | 
21 | ### Description
22 |
23 | ### Preprocessing
24 | We use the competition winners' transformation, presented in
25 |
26 | Chih-Chung Chang and Chih-Jen Lin.
27 | IJCNN 2001 challenge: Generalization ability and text decoding.
28 | In Proceedings of IJCNN. IEEE, 2001.
29 |
30 | ### Original download links
31 | The train set is the concatenation of
32 |
33 | [http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/ijcnn1.tr.bz2](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/ijcnn1.tr.bz2)
34 | [http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/ijcnn1.val.bz2](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/ijcnn1.val.bz2)
35 |
36 | The test set is
37 |
38 | [http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/ijcnn1.t.bz2](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/ijcnn1.t.bz2)
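
A sketch of how this concatenation can be reproduced (output filename
hypothetical; URLs as listed above):

```python
import bz2
from urllib.request import urlretrieve

base_url = 'http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/'

# Concatenate the original train and validation parts into one train file
with open('ijcnn1.trn', 'wb') as output_file:
    for part in ('ijcnn1.tr.bz2', 'ijcnn1.val.bz2'):
        tmp_path, _ = urlretrieve(base_url + part)
        with bz2.BZ2File(tmp_path, 'rb') as part_file:
            output_file.write(part_file.read())
```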
--------------------------------------------------------------------------------
/binary/ijcnn1/ijcnn1.trn.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/binary/ijcnn1/ijcnn1.trn.bz2
--------------------------------------------------------------------------------
/binary/ijcnn1/ijcnn1.tst.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/binary/ijcnn1/ijcnn1.tst.bz2
--------------------------------------------------------------------------------
/binary/kdd2010/README.md:
--------------------------------------------------------------------------------
1 | ## KDD 2010
2 |
3 | ### Characteristics
4 |
5 | Train dataset
6 | <table>
7 | <tr> <td>Number of observations</td> <td>19,264,097</td> </tr>
8 | <tr> <td>Number of features</td> <td>1,163,024</td> </tr>
9 | <tr> <td>Sparsity</td> <td>0.000797%</td> </tr>
10 | <tr> <td>Class balancing</td> <td>86.1% positive samples</td> </tr>
11 | </table>
12 | 
13 | The maximum feature index in the train dataset is 1,129,522, but the total
14 | number of features is set to 1,163,024 to be compatible with the test dataset.
15 | 
16 | Test dataset
17 | <table>
18 | <tr> <td>Number of observations</td> <td>748,401</td> </tr>
19 | <tr> <td>Number of features</td> <td>1,163,024</td> </tr>
20 | <tr> <td>Sparsity</td> <td>0.000774%</td> </tr>
21 | <tr> <td>Class balancing</td> <td>88.8% positive samples</td> </tr>
22 | </table>
23 | 
24 | 
25 | ### Description
26 | Juan, Yuchin, et al. "Field-aware factorization machines for CTR prediction."
27 | Proceedings of the 10th ACM Conference on Recommender Systems. ACM, 2016.
28 |
29 | ### Preprocessing
30 | Labels that were originally 0 and 1 have been changed to -1 and 1.
31 |
32 |
33 | ### Original download links
34 | [https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kddb-raw-libsvm.bz2](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kddb-raw-libsvm.bz2)
35 |
36 | [https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kddb-raw-libsvm.t.bz2](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kddb-raw-libsvm.t.bz2)
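
Since the largest feature index occurring in the train file is smaller than
in the test file, passing a fixed `n_features` keeps both splits in the same
feature space; a minimal sketch with scikit-learn (note the files are large):

```python
from sklearn.datasets import load_svmlight_file

# Force both splits to the same feature-space width
x_train, y_train = load_svmlight_file('binary/kdd2010/kdd2010.trn.bz2',
                                      n_features=1163024)
x_test, y_test = load_svmlight_file('binary/kdd2010/kdd2010.tst.bz2',
                                    n_features=1163024)
assert x_train.shape[1] == x_test.shape[1]
```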
--------------------------------------------------------------------------------
/binary/kdd2010/kdd2010.trn.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/binary/kdd2010/kdd2010.trn.bz2
--------------------------------------------------------------------------------
/binary/kdd2010/kdd2010.tst.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/binary/kdd2010/kdd2010.tst.bz2
--------------------------------------------------------------------------------
/binary/reuters/README.md:
--------------------------------------------------------------------------------
1 | ## Reuters
2 |
3 | ### Characteristics
4 |
5 | Train dataset
6 | <table>
7 | <tr> <td>Number of observations</td> <td>7770</td> </tr>
8 | <tr> <td>Number of features</td> <td>8315</td> </tr>
9 | <tr> <td>Sparsity</td> <td>0.526%</td> </tr>
10 | <tr> <td>Class balancing</td> <td>6.92% positive samples</td> </tr>
11 | </table>
12 | 
13 | Test dataset
14 | <table>
15 | <tr> <td>Number of observations</td> <td>3299</td> </tr>
16 | <tr> <td>Number of features</td> <td>8315</td> </tr>
17 | <tr> <td>Sparsity</td> <td>0.499%</td> </tr>
18 | <tr> <td>Class balancing</td> <td>5.43% positive samples</td> </tr>
19 | </table>
20 | 
21 | ### Description
22 |
23 | ### Preprocessing
24 | The train set originally contained lines in which the same feature index
25 | appeared several times with the same value; these duplicates have been removed.
26 |
27 | ### Original download link
28 | [http://leon.bottou.org/_media/papers/lasvm-reuters.tar.bz2](http://leon.bottou.org/_media/papers/lasvm-reuters.tar.bz2)
--------------------------------------------------------------------------------
/binary/reuters/reuters.trn.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/binary/reuters/reuters.trn.bz2
--------------------------------------------------------------------------------
/binary/reuters/reuters.tst.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/binary/reuters/reuters.tst.bz2
--------------------------------------------------------------------------------
/hawkes/bund/README.md:
--------------------------------------------------------------------------------
1 | ## Bund Future trade data
2 |
3 | ### Description
4 |
5 | One month of data (April 2014) on the Bund Future traded at Eurex, with
6 | microsecond timestamp resolution.
7 | 
8 | This data is meant to be fitted with Hawkes processes. For each day it
9 | contains 4 time series representing:
10 |
11 | 1. Mid-price movement up
12 | 2. Mid-price movement down
13 | 3. Buyer initiated trades that do not move the mid-price
14 | 4. Seller initiated trades that do not move the mid-price
15 |
16 | ### Characteristics
17 |
18 | <table>
19 | <tr> <td>Number of realizations</td> <td>20</td> </tr>
20 | <tr> <td>Average number of ticks node 0</td> <td>7009.15</td> </tr>
21 | <tr> <td>Average number of ticks node 1</td> <td>6998.15</td> </tr>
22 | <tr> <td>Average number of ticks node 2</td> <td>257677.55</td> </tr>
23 | <tr> <td>Average number of ticks node 3</td> <td>261423.6</td> </tr>
24 | </table>
25 |
26 | ### Preprocessing
27 |
28 | The market opens at 8 AM, which corresponds to a timestamp of 28800. This
29 | offset has been subtracted from all timestamps so that each realization
30 | starts at time 0.
31 | 
32 | Please note that as the market closes at 10 PM, the end time of the
33 | shifted realizations is 50400.
34 |
35 |
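A minimal loading sketch, mirroring `lib/hawkes.py` (on recent NumPy versions
`allow_pickle=True` may be required, since each realization is stored as an
object array):

```python
import numpy as np

data = np.load('hawkes/bund/bund.npz', allow_pickle=True)
for day, timestamps in data.items():
    # Each realization holds 4 arrays: mid-price up, mid-price down,
    # buyer-initiated trades, seller-initiated trades
    print(day, [len(node_times) for node_times in timestamps])
```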
--------------------------------------------------------------------------------
/hawkes/bund/bund.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/hawkes/bund/bund.npz
--------------------------------------------------------------------------------
/lib/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/lib/__init__.py
--------------------------------------------------------------------------------
/lib/binary.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | from sklearn.datasets import load_svmlight_file
5 |
6 | from lib.dataset_analysis import features_characteristics, print_characteristics
7 |
8 | # name, path, n_observations, n_features
9 | binary_datasets = [
10 | ('Adult Train', '../binary/adult/adult.trn.bz2', 32561, 123),
11 | ('Adult Test', '../binary/adult/adult.tst.bz2', 16281, 123),
12 | ('Covtype Train', '../binary/covtype/covtype.trn.bz2', 581012, 54),
13 | ('ijcnn1 Train', '../binary/ijcnn1/ijcnn1.trn.bz2', 49990, 22),
14 | ('ijcnn1 Test', '../binary/ijcnn1/ijcnn1.tst.bz2', 91701, 22),
15 | ('Reuters Train', '../binary/reuters/reuters.trn.bz2', 7770, 8315),
16 | ('Reuters Test', '../binary/reuters/reuters.tst.bz2', 3299, 8315),
17 | ('KDD 2010 Train', '../binary/kdd2010/kdd2010.trn.bz2',
18 | 19264097, 1129522),
19 | ('KDD 2010 Test', '../binary/kdd2010/kdd2010.tst.bz2',
20 | 748401, 1163024),
21 | ]
22 |
23 |
24 | def iterate_binary_dataset_path():
25 | for name, path, *args in binary_datasets:
26 | path = os.path.join(os.path.dirname(__file__), path)
27 | yield name, path
28 |
29 |
30 | def iterate_binary_dataset():
31 | for name, path, n_observations, n_features in binary_datasets:
32 | path = os.path.join(os.path.dirname(__file__), path)
33 | x, y = load_svmlight_file(path)
34 | yield name, x, y, n_observations, n_features
35 |
36 |
37 | def describe_binary_datasets():
38 | for name, x, y, *args in iterate_binary_dataset():
39 | n_rows = x.shape[0]
40 | positive_ratio = np.sum(y == 1) / n_rows
41 |
42 | characteristics = features_characteristics(x)
43 | characteristics['Class balancing'] = '{:.3g}% positive samples'.format(
44 | positive_ratio * 100)
45 |
46 | print('\n{:}'.format(name))
47 | print_characteristics(characteristics, html=False)
48 |
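A usage sketch for these helpers; run with the repository root on the Python
path so that `lib` is importable (note that iterating all datasets loads the
large KDD 2010 files into memory):

```python
from lib.binary import describe_binary_datasets

# Prints, for each dataset, the characteristics listed in the READMEs
describe_binary_datasets()
```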
--------------------------------------------------------------------------------
/lib/compression.py:
--------------------------------------------------------------------------------
1 | import bz2
2 | import os
3 | from shutil import copyfileobj
4 | from itertools import chain
5 |
6 | from lib.binary import iterate_binary_dataset_path
7 | from lib.regression import iterate_regression_dataset_path
8 |
9 |
10 | def get_compressed_file_path(decompressed_file_path):
11 | if decompressed_file_path[-4:] == '.bz2':
12 | return decompressed_file_path
13 | return '{:}.bz2'.format(decompressed_file_path)
14 |
15 |
16 | def get_decompressed_file_path(compressed_file_path):
17 | return compressed_file_path.replace('.bz2', '')
18 |
19 |
20 | def compress_file(decompressed_file_path):
21 | compressed_file_path = get_compressed_file_path(decompressed_file_path)
22 | with open(decompressed_file_path, 'rb') as input:
23 | with bz2.BZ2File(compressed_file_path, 'wb') as output:
24 | copyfileobj(input, output)
25 |
26 |
27 | def decompress_file(compressed_file_path):
28 | decompressed_file_path = get_decompressed_file_path(compressed_file_path)
29 | with bz2.BZ2File(compressed_file_path, 'rb') as input:
30 | with open(decompressed_file_path, 'wb') as output:
31 | copyfileobj(input, output)
32 |
33 |
34 | # A list (not a one-shot generator) so both functions below can reuse it
35 | all_datasets_path = list(chain(
36 |     iterate_binary_dataset_path(),
37 |     iterate_regression_dataset_path()))
38 |
39 |
40 | def compress_all_files(replace=False):
41 | for name, path in all_datasets_path:
42 | path = get_decompressed_file_path(path)
43 | compressed_path = get_compressed_file_path(path)
44 |
45 | if os.path.exists(compressed_path) and replace is False:
46 | print('%s dataset already exists at %s'
47 | % (name, compressed_path))
48 | else:
49 | print('compressing', name)
50 | compress_file(path)
51 |
52 |
53 | def decompress_all_files(replace=False):
54 | for name, path in all_datasets_path:
55 | path = get_compressed_file_path(path)
56 | decompressed_path = get_decompressed_file_path(path)
57 |
58 | if os.path.exists(decompressed_path) and replace is False:
59 | print('%s dataset already exists at %s'
60 | % (name, decompressed_path))
61 | else:
62 | print('decompressing', name)
63 | decompress_file(path)
64 |
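A usage sketch (again assuming `lib` is importable): decompress every hosted
dataset into raw `.trn`/`.tst` files next to their archives, skipping files
that already exist:

```python
from lib.compression import decompress_all_files

# Writes e.g. binary/adult/adult.trn next to binary/adult/adult.trn.bz2
decompress_all_files(replace=False)
```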
--------------------------------------------------------------------------------
/lib/dataset_analysis.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 |
3 |
4 | def features_characteristics(features_matrix):
5 | n_rows = features_matrix.shape[0]
6 | n_columns = features_matrix.shape[1]
7 |
8 | sparsity = features_matrix.getnnz() / (n_rows * n_columns)
9 |
10 | characteristics = OrderedDict()
11 | characteristics['Number of observations'] = n_rows
12 | characteristics['Number of features'] = n_columns
13 | characteristics['Sparsity'] = '{:.3g}%'.format(sparsity * 100)
14 | return characteristics
15 |
16 |
17 | def print_characteristics(characteristics, html=False):
18 | if html is False:
19 | for k, v in characteristics.items():
20 | print("{:<25} {:}".format(k, v))
21 | else:
22 |         html_data = '<table>'
23 |         html_data += '\n'
24 |         for k, v in characteristics.items():
25 |             html_data += ' <tr>'
26 |             html_data += ' <td>{:}</td> <td>{:}</td> '.format(k, v)
27 |             html_data += '</tr>\n'
28 |         html_data += '</table>\n'
29 | 
30 |         print(html_data)
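A usage sketch of both output modes (the `html=True` branch emits the
`<table>` markup used in the dataset READMEs):

```python
from collections import OrderedDict

from lib.dataset_analysis import print_characteristics

characteristics = OrderedDict([('Number of observations', 4177),
                               ('Sparsity', '96%')])
print_characteristics(characteristics)             # aligned plain text
print_characteristics(characteristics, html=True)  # HTML table markup
```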
--------------------------------------------------------------------------------
/lib/hawkes.py:
--------------------------------------------------------------------------------
1 | import os
2 | from collections import OrderedDict
3 | import numpy as np
4 |
5 | from lib.dataset_analysis import print_characteristics
6 |
7 | # name, path, number of realization, number of nodes, end_time
8 | hawkes_datasets = [
9 | ('Bund', '../hawkes/bund/bund.npz', 20, 4, 50400)
10 | ]
11 |
12 |
13 | def iterate_hawkes_dataset_path():
14 | for name, path, *args in hawkes_datasets:
15 | path = os.path.join(os.path.dirname(__file__), path)
16 | yield name, path
17 |
18 |
19 | def iterate_hawkes_dataset():
20 | for name, path, n_realizations, n_nodes, end_time in hawkes_datasets:
21 | path = os.path.join(os.path.dirname(__file__), path)
22 | timestamps_dict = np.load(path)
23 | yield name, timestamps_dict, n_realizations, n_nodes, end_time
24 |
25 |
26 | def hawkes_characteristics(timestamps_list):
27 | characteristics = OrderedDict()
28 | n_realizations = len(timestamps_list)
29 | characteristics['Number of realizations'] = n_realizations
30 |
31 | n_nodes = len(timestamps_list[0])
32 | n_jumps_per_node = np.zeros(n_nodes)
33 | for timestamps in timestamps_list:
34 | for i in range(n_nodes):
35 | n_jumps_per_node[i] += len(timestamps[i])
36 | n_jumps_per_node /= n_realizations
37 | for i in range(n_nodes):
38 | characteristics['Average number of ticks node %i' % i] = \
39 | n_jumps_per_node[i]
40 |
41 | return characteristics
42 |
43 |
44 | def describe_hawkes_datasets():
45 | for name, timestamps_dict, *args in iterate_hawkes_dataset():
46 | timestamps_list = [timestamps_dict[key]
47 | for key in timestamps_dict.keys()]
48 | characteristics = hawkes_characteristics(timestamps_list)
49 | print('\n{:}'.format(name))
50 | print_characteristics(characteristics, html=False)
51 |
--------------------------------------------------------------------------------
/lib/preprocessing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/lib/preprocessing/__init__.py
--------------------------------------------------------------------------------
/lib/preprocessing/clean_covtype.py:
--------------------------------------------------------------------------------
1 | # The Covtype dataset has labels in {1, 2} instead of {-1, 1}
2 | import bz2
3 | from urllib.request import urlretrieve
4 |
5 |
6 | def clean_line_labels(line):
7 | line = line.strip()
8 | split_l = line.split(' ')
9 | label = split_l[0]
10 | features = ' '.join(split_l[1:])
11 |
12 | if int(label) == 2:
13 | label = 1
14 | elif int(label) == 1:
15 | label = -1
16 | else:
17 | raise ValueError('Unknown label %s' % label)
18 |
19 | clean_line = '{:} {:}\n'.format(label, features)
20 | return clean_line
21 |
22 |
23 | covtype_data_url = 'http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/' \
24 | 'binary/covtype.libsvm.binary.scale.bz2'
25 | tmp_path, _ = urlretrieve(covtype_data_url)
26 |
27 | covtype_train_file_path = '../../binary/covtype/covtype.trn'
28 | with bz2.BZ2File(tmp_path) as train_file:
29 | with open(covtype_train_file_path, 'w') as covtype_train_file:
30 | for data in train_file.readlines():
31 | cleaned_line = clean_line_labels(data.decode('utf-8'))
32 | covtype_train_file.write(cleaned_line)
33 |
--------------------------------------------------------------------------------
/lib/preprocessing/clean_kdd2010.py:
--------------------------------------------------------------------------------
1 | # KDD 2010 datasets have labels in {0, 1} instead of {-1, 1}
2 | import bz2
3 |
4 | from lib.compression import compress_file
5 |
6 |
7 | def clean_line_labels(line):
8 | line = line.strip()
9 | split_l = line.split(' ')
10 | label = split_l[0]
11 | features = ' '.join(split_l[1:])
12 |
13 | if int(label) == 1:
14 | label = 1
15 | elif int(label) == 0:
16 | label = -1
17 | else:
18 | raise ValueError('Unknown label %s' % label)
19 |
20 | clean_line = '{:} {:}\n'.format(label, features)
21 | return clean_line
22 |
23 |
24 | original_files_path = [
25 | '../../binary/kdd2010/kdd2010.trn_orig.bz2',
26 | '../../binary/kdd2010/kdd2010.tst_orig.bz2'
27 | ]
28 |
29 | save_file_path = [
30 | '../../binary/kdd2010/kdd2010.trn',
31 | '../../binary/kdd2010/kdd2010.tst'
32 | ]
33 |
34 | for original_path, save_path in zip(original_files_path, save_file_path):
35 | with bz2.BZ2File(original_path) as train_file:
36 | with open(save_path, 'w') as kdd2010_train_file:
37 | for i, data in enumerate(train_file.readlines()):
38 | if i % 100000 == 0:
39 | print(i)
40 | cleaned_line = clean_line_labels(data.decode('utf-8'))
41 | kdd2010_train_file.write(cleaned_line)
42 |
43 | compress_file(save_path)
44 |
--------------------------------------------------------------------------------
/lib/preprocessing/clean_reuters.py:
--------------------------------------------------------------------------------
1 | # The Reuters dataset has lines with the same feature defined twice
2 | # For example line 3 starts with
3 | # -1 171:8.09199011447501E-02 171:8.09199011447501E-02
4 | # Hence we need to clean that
5 | import tarfile
6 | from urllib.request import urlretrieve
7 | from collections import OrderedDict
8 |
9 |
10 | def clean_line_duplicates(line):
11 | line = line.strip()
12 | split_l = line.split(' ')
13 | label = split_l[0]
14 | features = split_l[1:]
15 | features_dict = OrderedDict()
16 | for feature in features:
17 | index = feature.split(':')[0].strip()
18 | value = feature.split(':')[1].strip()
19 | if index not in features_dict:
20 | features_dict[index] = value
21 | else:
22 | if features_dict[index] != value:
23 |                 raise ValueError('index', index, features_dict[index],
24 |                                  value)
25 | joined_features = [':'.join([index, value])
26 | for index, value in features_dict.items()]
27 | cleaned_line = ' '.join([label] + joined_features) + '\n'
28 | return cleaned_line
29 |
30 |
31 | reuters_data_url = 'http://leon.bottou.org/_media/papers/lasvm-reuters.tar.bz2'
32 | tmp_path, _ = urlretrieve(reuters_data_url)
33 | uncompressed_data = tarfile.open(name=tmp_path, mode="r:bz2")
34 |
35 | reuters_train_file_path = '../../binary/reuters/reuters.trn'
36 |
37 | with uncompressed_data.extractfile('reuters/money-fx.trn') as train_file:
38 | with open(reuters_train_file_path, 'w') as reuters_train_file:
39 | for data in train_file.readlines():
40 | cleaned_line = clean_line_duplicates(data.decode('utf-8'))
41 | reuters_train_file.write(cleaned_line)
42 |
43 |
44 |
45 |
--------------------------------------------------------------------------------
/lib/regression.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | from sklearn.datasets import load_svmlight_file
5 |
6 | from lib.dataset_analysis import features_characteristics, print_characteristics
7 |
8 | # name, path, n_observations, n_features
9 | regression_datasets = [
10 | ('Abalone Train', '../regression/abalone/abalone.trn.bz2', 4177, 8),
11 | ]
12 |
13 | def iterate_regression_dataset_path():
14 | for name, path, *args in regression_datasets:
15 | path = os.path.join(os.path.dirname(__file__), path)
16 | yield name, path
17 |
18 |
19 | def iterate_regression_dataset():
20 | for name, path, n_observations, n_features in regression_datasets:
21 | path = os.path.join(os.path.dirname(__file__), path)
22 | x, y = load_svmlight_file(path)
23 | yield name, x, y, n_observations, n_features
24 |
25 |
26 | def describe_regression_datasets():
27 | for name, x, y, *args in iterate_regression_dataset():
28 | label_mean = np.mean(y)
29 | label_std = np.std(y)
30 | label_min = np.min(y)
31 | label_max = np.max(y)
32 |
33 | characteristics = features_characteristics(x)
34 | characteristics['label mean'] = '{:.3g}'.format(label_mean)
35 | characteristics['label std'] = '{:.3g}'.format(label_std)
36 | characteristics['label min'] = '{:.3g}'.format(label_min)
37 | characteristics['label max'] = '{:.3g}'.format(label_max)
38 |
39 | print('\n{:}'.format(name))
40 | print_characteristics(characteristics, html=False)
41 |
42 | # describe_regression_datasets()
--------------------------------------------------------------------------------
/lib/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/lib/tests/__init__.py
--------------------------------------------------------------------------------
/lib/tests/binary_test.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import numpy as np
4 |
5 | from lib.binary import iterate_binary_dataset
6 |
7 |
8 | class BinaryTests(unittest.TestCase):
9 | def test_data_set_consistency(self):
10 | """...Test binary datasets have the expected shape
11 | """
12 | for name, x, y, n_observations, n_features in \
13 | iterate_binary_dataset():
14 | self.assertEqual(x.shape[0], n_observations,
15 | "Incorrect number of observations in %s" % name)
16 | self.assertEqual(y.shape[0], n_observations,
17 | "Incorrect number of labels in %s" % name)
18 | self.assertEqual(x.shape[1], n_features,
19 | "Incorrect number of features in %s" % name)
20 |
21 | self.assertEqual(set(np.unique(y)), {-1, 1},
22 | "Incorrect labels encoding in %s" % name)
23 |
24 |
25 | if __name__ == '__main__':
26 | unittest.main()
27 |
--------------------------------------------------------------------------------
/lib/tests/hawkes_test.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import numpy as np
3 |
4 | from lib.hawkes import iterate_hawkes_dataset
5 |
6 |
7 | def detect_duplicates(timestamps):
8 | """Detect if one timestamp appears twice in the realisation
9 |
10 | Parameters
11 | ----------
12 | timestamps : `list` of `np.ndarray`
13 | Hawkes realization
14 |
15 | Returns
16 | -------
17 | output : `bool`
18 | True if a timestamp appears twice
19 | """
20 | detected = False
21 | n_nodes = len(timestamps)
22 |
23 | # Search for timestamps appearing twice in the same node
24 | for i in range(n_nodes):
25 | diff = timestamps[i][1:] - timestamps[i][:-1]
26 | mask = np.hstack((True, diff != 0))
27 | detected |= not np.alltrue(mask)
28 |
29 | # Search for timestamps appearing twice in two different nodes
30 | for i in range(n_nodes):
31 | for j in range(i + 1, n_nodes):
32 | search_left_i = np.searchsorted(timestamps[j], timestamps[i],
33 | side='left')
34 | search_right_i = np.searchsorted(timestamps[j], timestamps[i],
35 | side='right')
36 |
37 | search_left_j = np.searchsorted(timestamps[i], timestamps[j],
38 | side='left')
39 | search_right_j = np.searchsorted(timestamps[i], timestamps[j],
40 | side='right')
41 |
42 | detected |= not np.alltrue(search_left_i == search_right_i)
43 | detected |= not np.alltrue(search_left_j == search_right_j)
44 |
45 | return detected
46 |
47 |
48 | class HawkesTests(unittest.TestCase):
49 | def test_detect_duplicates(self):
50 | """...Test that test function works as expected
51 | """
52 | timestamps = [np.array([1., 2., 3.])]
53 | self.assertFalse(detect_duplicates(timestamps))
54 |
55 | timestamps = [np.array([1., 2., 2., 3.])]
56 | self.assertTrue(detect_duplicates(timestamps))
57 |
58 | timestamps = [np.array([1., 2., 3.]),
59 | np.array([4., 5., 6., 7.])]
60 | self.assertFalse(detect_duplicates(timestamps))
61 |
62 | timestamps = [np.array([1., 2., 3.]),
63 | np.array([4., 5., 6., 6., 7.])]
64 | self.assertTrue(detect_duplicates(timestamps))
65 |
66 | timestamps = [np.array([1., 2., 4.]),
67 | np.array([4., 5., 6., 7.])]
68 | self.assertTrue(detect_duplicates(timestamps))
69 |
70 | def test_data_set_consistency(self):
71 | """...Test hawkes datasets have the expected shape
72 | """
73 | for name, timestamps_list, n_realizations, n_nodes, end_time in \
74 | iterate_hawkes_dataset():
75 | self.assertEqual(len(timestamps_list.keys()), n_realizations,
76 | "Incorrect number of realizations in %s" % name)
77 |
78 | for name, timestamps in timestamps_list.items():
79 | self.assertEqual(timestamps.shape, (n_nodes,),
80 | "Incorrect number of nodes in %s" % name)
81 |                 first_time = min(map(min, timestamps))
82 |                 self.assertGreaterEqual(first_time, 0,
83 | "Incorrect first time in %s" % name)
84 | last_time = max(map(max, timestamps))
85 | self.assertLessEqual(last_time, end_time,
86 | "Incorrect last time in %s" % name)
87 |
88 | self.assertFalse(detect_duplicates(timestamps))
89 |
90 |
91 | if __name__ == '__main__':
92 | unittest.main()
93 |
--------------------------------------------------------------------------------
/lib/tests/regression_test.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from lib.regression import iterate_regression_dataset
4 |
5 |
6 | class RegressionTests(unittest.TestCase):
7 | def test_data_set_consistency(self):
8 | """...Test regression datasets have the expected shape
9 | """
10 | for name, x, y, n_observations, n_features in \
11 | iterate_regression_dataset():
12 | self.assertEqual(x.shape[0], n_observations,
13 | "Incorrect number of observations in %s" % name)
14 | self.assertEqual(y.shape[0], n_observations,
15 | "Incorrect number of labels in %s" % name)
16 | self.assertEqual(x.shape[1], n_features,
17 | "Incorrect number of features in %s" % name)
18 |
19 |
20 | if __name__ == '__main__':
21 | unittest.main()
22 |
--------------------------------------------------------------------------------
/regression/abalone/README.md:
--------------------------------------------------------------------------------
1 | ## Abalone
2 |
3 | ### Characteristics
4 | 
5 | <table>
6 | <tr> <td>Number of observations</td> <td>4177</td> </tr>
7 | <tr> <td>Number of features</td> <td>8</td> </tr>
8 | <tr> <td>Sparsity</td> <td>96%</td> </tr>
9 | <tr> <td>label mean</td> <td>9.93</td> </tr>
10 | <tr> <td>label std</td> <td>3.22</td> </tr>
11 | <tr> <td>label min</td> <td>1</td> </tr>
12 | <tr> <td>label max</td> <td>29</td> </tr>
13 | </table>
14 |
15 | ### Description
16 |
17 | Predict the age of abalone from physical measurements
18 |
19 | [https://archive.ics.uci.edu/ml/datasets/Abalone](https://archive.ics.uci.edu/ml/datasets/Abalone)
20 |
21 | ### Preprocessing
22 | None
23 |
24 | ### Original download link
25 | [http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/abalone_scale](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/abalone_scale)
--------------------------------------------------------------------------------
/regression/abalone/abalone.trn.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-DataInitiative/tick-datasets/9d959b6e53e17145e93e9849ff1f9f6d2de8ae51/regression/abalone/abalone.trn.bz2
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | scipy
3 | scikit-learn
4 |
--------------------------------------------------------------------------------