├── datasets ├── nile │ ├── .gitignore │ ├── nile.png │ ├── README.md │ └── nile.json ├── businv │ ├── .gitignore │ ├── businv.png │ ├── README.md │ └── convert.py ├── centralia │ ├── .gitignore │ ├── centralia.png │ ├── from_wikipedia.txt │ ├── README.md │ ├── centralia.json │ └── convert.py ├── homeruns │ ├── .gitignore │ ├── homeruns.png │ ├── README.md │ ├── homeruns.json │ └── get_homeruns.py ├── ozone │ ├── .gitignore │ ├── ozone.png │ ├── README.md │ ├── convert.py │ ├── ozone.json │ └── ozone-depleting-substance-emissions.csv ├── run_log │ ├── .gitignore │ ├── run_log.png │ ├── README.md │ ├── LICENSE │ └── convert.py ├── seatbelts │ ├── .gitignore │ ├── seatbelts.png │ └── README.md ├── well_log │ ├── .gitignore │ ├── well_log.png │ ├── README.md │ └── convert.py ├── brent_spot │ ├── .gitignore │ ├── brent_spot.png │ ├── README.md │ └── convert.py ├── construction │ ├── .gitignore │ ├── privtime.xls │ ├── construction.png │ ├── README.md │ └── convert.py ├── global_co2 │ ├── .gitignore │ ├── global_co2.png │ ├── README.md │ └── get_global_co2.py ├── jfk_passengers │ ├── .gitignore │ ├── jfk_passengers.png │ ├── README.md │ └── convert.py ├── lga_passengers │ ├── .gitignore │ ├── lga_passengers.png │ ├── README.md │ └── convert.py ├── rail_lines │ ├── .gitignore │ ├── rail_lines.png │ ├── README.md │ └── rail_lines.json ├── us_population │ ├── .gitignore │ ├── us_population.png │ ├── README.md │ └── convert.py ├── quality_control_1 │ ├── .gitignore │ ├── quality_control_1.png │ └── README.md ├── quality_control_2 │ ├── .gitignore │ ├── quality_control_2.png │ └── README.md ├── quality_control_3 │ ├── .gitignore │ ├── quality_control_3.png │ └── README.md ├── quality_control_4 │ ├── .gitignore │ ├── quality_control_4.png │ └── README.md ├── quality_control_5 │ ├── .gitignore │ ├── quality_control_5.png │ └── README.md ├── shanghai_license │ ├── .gitignore │ ├── shanghai_license.png │ ├── README.md │ └── convert.py ├── unemployment_nl │ ├── .gitignore │ ├── unemployment_nl.png │ ├── README.md │ └── convert.py ├── gdp_iran │ ├── .gitignore │ ├── gdp_iran.png │ ├── README.md │ ├── convert.py │ └── gdp_iran.json ├── usd_isk │ ├── .gitignore │ ├── usd_isk.png │ ├── README.md │ ├── convert.py │ └── ert_bil_eur_m_Label.csv ├── gdp_argentina │ ├── .gitignore │ ├── gdp_argentina.png │ ├── README.md │ ├── convert.py │ └── gdp_argentina.json ├── gdp_croatia │ ├── .gitignore │ ├── gdp_croatia.png │ ├── README.md │ ├── gdp_croatia.json │ └── convert.py ├── apple │ ├── .gitignore │ ├── apple.png │ └── README.md ├── measles │ ├── .gitignore │ ├── measles.png │ ├── README.md │ └── get_measles.py ├── bee_waggle_6 │ ├── .gitignore │ ├── bee_waggle_6.png │ └── README.md ├── bitcoin │ ├── .gitignore │ ├── bitcoin.png │ ├── README.md │ └── get_bitcoin.py ├── occupancy │ ├── .gitignore │ ├── occupancy.png │ ├── README.md │ └── get_occupancy.py ├── ratner_stock │ ├── .gitignore │ ├── ratner_stock.png │ └── README.md ├── robocalls │ ├── .gitignore │ ├── robocalls.png │ └── README.md ├── scanline_42049 │ ├── .gitignore │ ├── scanline_42049.png │ ├── README.md │ └── get_scanline_42049.py ├── scanline_126007 │ ├── .gitignore │ ├── scanline_126007.png │ ├── README.md │ └── get_scanline_126007.py ├── bank │ ├── bank.png │ ├── README.md │ └── LICENSE ├── iceland_tourism │ ├── .gitignore │ ├── iceland_tourism.png │ ├── README.md │ └── get_iceland_tourism.py ├── gdp_japan │ ├── gdp_japan.png │ ├── README.md │ ├── gdp_japan.csv │ └── gdp_japan.json ├── co2_canada │ ├── co2_canada.png │ ├── README.md │ └── 
co2_canada.csv ├── debt_ireland │ ├── debt_ireland.png │ ├── debt_ireland.csv │ ├── README.md │ └── debt_ireland.json ├── uk_coal_employ │ ├── Coal_since_1853.xls │ ├── uk_coal_employ.png │ ├── README.md │ ├── employ_only.csv │ └── uk_coal_employ.json └── children_per_woman │ ├── children_per_woman.png │ ├── tfr-by-gapminder.xlsx │ └── README.md ├── .gitignore ├── CHANGELOG.md ├── .github └── workflows │ ├── action.yml │ └── validate.yml ├── requirements.txt ├── examples ├── R │ ├── README.md │ └── load_dataset.R └── python │ ├── README.md │ └── load_dataset.py ├── Dockerfile ├── LICENSE ├── Makefile ├── checksums.json ├── utils ├── check_checksums.py ├── plot_dataset.py └── validate_dataset.py └── schema.json /datasets/nile/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/businv/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/centralia/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/homeruns/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/ozone/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/run_log/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/seatbelts/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/well_log/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/brent_spot/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/construction/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/global_co2/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/jfk_passengers/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/lga_passengers/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/rail_lines/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | 
-------------------------------------------------------------------------------- /datasets/us_population/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/quality_control_1/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/quality_control_2/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/quality_control_3/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/quality_control_4/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/quality_control_5/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/shanghai_license/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/unemployment_nl/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/gdp_iran/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | original/ 3 | -------------------------------------------------------------------------------- /datasets/usd_isk/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | original/ 3 | -------------------------------------------------------------------------------- /datasets/gdp_argentina/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | original/ 3 | -------------------------------------------------------------------------------- /datasets/gdp_croatia/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | original/ 3 | -------------------------------------------------------------------------------- /datasets/apple/.gitignore: -------------------------------------------------------------------------------- 1 | AAPL.csv 2 | apple.json 3 | old/ 4 | -------------------------------------------------------------------------------- /datasets/measles/.gitignore: -------------------------------------------------------------------------------- 1 | ewmeas.dat 2 | measles.json 3 | old/ 4 | -------------------------------------------------------------------------------- /datasets/bee_waggle_6/.gitignore: -------------------------------------------------------------------------------- 1 | bee_waggle_6.json 2 | old/ 3 | psslds.zip 4 | -------------------------------------------------------------------------------- /datasets/bitcoin/.gitignore: -------------------------------------------------------------------------------- 1 | bitcoin.json 2 | market-price.csv 3 | old/ 
4 | -------------------------------------------------------------------------------- /datasets/occupancy/.gitignore: -------------------------------------------------------------------------------- 1 | datatraining.txt 2 | occupancy.json 3 | old/ 4 | -------------------------------------------------------------------------------- /datasets/ratner_stock/.gitignore: -------------------------------------------------------------------------------- 1 | SIG.csv 2 | old/ 3 | ratner_stock.json 4 | -------------------------------------------------------------------------------- /datasets/robocalls/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | robocalls.html 3 | robocalls.json 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | */__pycache__ 2 | */*/__pycache__ 3 | *.pyc 4 | venv/ 5 | export/ 6 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## Version 1.0.0 4 | 5 | * Initial release 6 | -------------------------------------------------------------------------------- /datasets/scanline_42049/.gitignore: -------------------------------------------------------------------------------- 1 | 42049.jpg 2 | old/ 3 | scanline_42049.json 4 | -------------------------------------------------------------------------------- /datasets/scanline_126007/.gitignore: -------------------------------------------------------------------------------- 1 | 126007.jpg 2 | old/ 3 | scanline_126007.json 4 | -------------------------------------------------------------------------------- /datasets/bank/bank.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/bank/bank.png -------------------------------------------------------------------------------- /datasets/nile/nile.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/nile/nile.png -------------------------------------------------------------------------------- /datasets/apple/apple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/apple/apple.png -------------------------------------------------------------------------------- /datasets/iceland_tourism/.gitignore: -------------------------------------------------------------------------------- 1 | iceland_tourism.json 2 | old/ 3 | visitors-to-iceland-2002-2019-oct.xlsx 4 | -------------------------------------------------------------------------------- /datasets/ozone/ozone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/ozone/ozone.png -------------------------------------------------------------------------------- /datasets/bitcoin/bitcoin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/bitcoin/bitcoin.png -------------------------------------------------------------------------------- /datasets/businv/businv.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/businv/businv.png -------------------------------------------------------------------------------- /datasets/measles/measles.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/measles/measles.png -------------------------------------------------------------------------------- /datasets/run_log/run_log.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/run_log/run_log.png -------------------------------------------------------------------------------- /datasets/usd_isk/usd_isk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/usd_isk/usd_isk.png -------------------------------------------------------------------------------- /datasets/gdp_iran/gdp_iran.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/gdp_iran/gdp_iran.png -------------------------------------------------------------------------------- /datasets/homeruns/homeruns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/homeruns/homeruns.png -------------------------------------------------------------------------------- /datasets/well_log/well_log.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/well_log/well_log.png -------------------------------------------------------------------------------- /datasets/centralia/centralia.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/centralia/centralia.png -------------------------------------------------------------------------------- /datasets/gdp_japan/gdp_japan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/gdp_japan/gdp_japan.png -------------------------------------------------------------------------------- /datasets/occupancy/occupancy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/occupancy/occupancy.png -------------------------------------------------------------------------------- /datasets/robocalls/robocalls.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/robocalls/robocalls.png -------------------------------------------------------------------------------- /datasets/seatbelts/seatbelts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/seatbelts/seatbelts.png -------------------------------------------------------------------------------- /datasets/brent_spot/brent_spot.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/brent_spot/brent_spot.png -------------------------------------------------------------------------------- /datasets/co2_canada/co2_canada.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/co2_canada/co2_canada.png -------------------------------------------------------------------------------- /datasets/construction/privtime.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/construction/privtime.xls -------------------------------------------------------------------------------- /datasets/global_co2/global_co2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/global_co2/global_co2.png -------------------------------------------------------------------------------- /datasets/rail_lines/rail_lines.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/rail_lines/rail_lines.png -------------------------------------------------------------------------------- /datasets/bee_waggle_6/bee_waggle_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/bee_waggle_6/bee_waggle_6.png -------------------------------------------------------------------------------- /datasets/construction/construction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/construction/construction.png -------------------------------------------------------------------------------- /datasets/debt_ireland/debt_ireland.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/debt_ireland/debt_ireland.png -------------------------------------------------------------------------------- /datasets/gdp_croatia/gdp_croatia.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/gdp_croatia/gdp_croatia.png -------------------------------------------------------------------------------- /datasets/ratner_stock/ratner_stock.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/ratner_stock/ratner_stock.png -------------------------------------------------------------------------------- /datasets/gdp_argentina/gdp_argentina.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/gdp_argentina/gdp_argentina.png -------------------------------------------------------------------------------- /datasets/us_population/us_population.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/us_population/us_population.png 
-------------------------------------------------------------------------------- /datasets/jfk_passengers/jfk_passengers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/jfk_passengers/jfk_passengers.png -------------------------------------------------------------------------------- /datasets/lga_passengers/lga_passengers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/lga_passengers/lga_passengers.png -------------------------------------------------------------------------------- /datasets/scanline_42049/scanline_42049.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/scanline_42049/scanline_42049.png -------------------------------------------------------------------------------- /datasets/uk_coal_employ/Coal_since_1853.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/uk_coal_employ/Coal_since_1853.xls -------------------------------------------------------------------------------- /datasets/uk_coal_employ/uk_coal_employ.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/uk_coal_employ/uk_coal_employ.png -------------------------------------------------------------------------------- /datasets/iceland_tourism/iceland_tourism.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/iceland_tourism/iceland_tourism.png -------------------------------------------------------------------------------- /datasets/scanline_126007/scanline_126007.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/scanline_126007/scanline_126007.png -------------------------------------------------------------------------------- /datasets/unemployment_nl/unemployment_nl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/unemployment_nl/unemployment_nl.png -------------------------------------------------------------------------------- /datasets/quality_control_1/quality_control_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/quality_control_1/quality_control_1.png -------------------------------------------------------------------------------- /datasets/quality_control_2/quality_control_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/quality_control_2/quality_control_2.png -------------------------------------------------------------------------------- /datasets/quality_control_3/quality_control_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/quality_control_3/quality_control_3.png 
-------------------------------------------------------------------------------- /datasets/quality_control_4/quality_control_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/quality_control_4/quality_control_4.png -------------------------------------------------------------------------------- /datasets/quality_control_5/quality_control_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/quality_control_5/quality_control_5.png -------------------------------------------------------------------------------- /datasets/shanghai_license/shanghai_license.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/shanghai_license/shanghai_license.png -------------------------------------------------------------------------------- /datasets/children_per_woman/children_per_woman.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/children_per_woman/children_per_woman.png -------------------------------------------------------------------------------- /datasets/children_per_woman/tfr-by-gapminder.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/children_per_woman/tfr-by-gapminder.xlsx -------------------------------------------------------------------------------- /.github/workflows/action.yml: -------------------------------------------------------------------------------- 1 | name: 'TCPD Docker' 2 | description: 'Runs the TCPD build script in a Docker container' 3 | runs: 4 | using: 'docker' 5 | image: '../../Dockerfile' 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Pillow>=6.2.1 2 | beautifulsoup4>=4.8.1 3 | clevercsv>=0.4.7 4 | numpy>=1.17 5 | requests>=2.22.0 6 | yfinance>=0.1.79 7 | jsonschema>=3.2.0 8 | openpyxl 9 | diff-match-patch 10 | -------------------------------------------------------------------------------- /datasets/quality_control_5/README.md: -------------------------------------------------------------------------------- 1 | # Quality Control no. 5 2 | 3 | This is a simple quality control dataset with no change points and Gaussian 4 | (0, 1) noise. 5 | 6 | ![Plot of quality_control_5 dataset](./quality_control_5.png) 7 | -------------------------------------------------------------------------------- /datasets/centralia/from_wikipedia.txt: -------------------------------------------------------------------------------- 1 | 1870 1342 2 | 1880 1886 3 | 1890 2761 4 | 1900 2048 5 | 1910 2429 6 | 1920 2336 7 | 1930 2446 8 | 1940 2449 9 | 1950 1986 10 | 1960 1435 11 | 1970 1165 12 | 1980 1017 13 | 1990 63 14 | 2000 21 15 | 2010 10 16 | -------------------------------------------------------------------------------- /datasets/quality_control_4/README.md: -------------------------------------------------------------------------------- 1 | # Quality Control no. 4 2 | 3 | This dataset has multiple periodic components with different amplitude and an 4 | offset change at time index 341. 
5 | 6 | ![Plot of quality_control_4 dataset](./quality_control_4.png) 7 | -------------------------------------------------------------------------------- /datasets/centralia/README.md: -------------------------------------------------------------------------------- 1 | # Population of Centralia, Pennsylvania 2 | 3 | Centralia is an abandoned mining town in the US. 4 | 5 | Source: 6 | [https://en.wikipedia.org/wiki/Centralia,_Pennsylvania#Demographics](https://en.wikipedia.org/wiki/Centralia,_Pennsylvania#Demographics) 7 | 8 | ![Plot of centralia dataset](./centralia.png) 9 | -------------------------------------------------------------------------------- /datasets/bank/README.md: -------------------------------------------------------------------------------- 1 | # Bank amounts 2 | 3 | This dataset represents the amount of money in someone's current account. 4 | Significant changes occur on days of large transactions. 5 | 6 | The ``bank.json`` file and this readme are licensed under the MIT license, see 7 | the LICENSE file. 8 | 9 | ![Plot of bank dataset](./bank.png) 10 | -------------------------------------------------------------------------------- /datasets/quality_control_1/README.md: -------------------------------------------------------------------------------- 1 | # Quality Control no. 1 2 | 3 | This is a quality control dataset with a known change point at time index 146. 4 | The series has a small trend, with Gaussian noise before the change point 5 | and an offset and uniform noise after it. 6 | 7 | ![Plot of quality_control_1 dataset](./quality_control_1.png) 8 | -------------------------------------------------------------------------------- /datasets/quality_control_2/README.md: -------------------------------------------------------------------------------- 1 | # Quality Control no. 2 2 | 3 | This is a quality control dataset with a known change point at time index 97. 4 | The data has constant Gaussian (0, 1) noise throughout, with a step change of 5 | size 1.5. It exemplifies the kind of datasets used in simulation studies of CP 6 | algorithms. 7 | 8 | ![Plot of quality_control_2 dataset](./quality_control_2.png) 9 | -------------------------------------------------------------------------------- /datasets/quality_control_3/README.md: -------------------------------------------------------------------------------- 1 | # Quality Control no. 3 2 | 3 | This is a quality control dataset with a slight seasonal pattern and a known 4 | change point at time index 179. The change is a shift in the Gaussian noise 5 | distribution from (0, 1) to (2, 2). The data also contains an 6 | outlier at index 42 (indexing from 0). 7 | 8 | ![Plot of quality_control_3 dataset](./quality_control_3.png) 9 | -------------------------------------------------------------------------------- /datasets/bitcoin/README.md: -------------------------------------------------------------------------------- 1 | # Bitcoin Market Price 2 | 3 | This data is obtained from: 4 | [https://www.blockchain.com/charts/market-price?timespan=all](https://www.blockchain.com/charts/market-price?timespan=all). 5 | As this data can not be redistributed, the Makefile will download it from the 6 | Internet Archive. 7 | 8 | The first 500 observations of the resulting time series are removed as they 9 | are quite uninteresting.
10 | 11 | ![Plot of bitcoin dataset](./bitcoin.png) 12 | -------------------------------------------------------------------------------- /datasets/scanline_42049/README.md: -------------------------------------------------------------------------------- 1 | # Scan line of image 42049 2 | 3 | This is a "scan line", a horizontal slice, from a grayscale image from the 4 | BSD300 dataset. The image and the exact index of the scan line have been 5 | selected because of the abrupt changes between black and white that occur. 6 | 7 | As it is not clear whether the BSD300 images can be redistributed freely, we 8 | download the image from the internet archive instead. 9 | 10 | ![Plot of scanline_42049 dataset](./scanline_42049.png) 11 | -------------------------------------------------------------------------------- /datasets/scanline_126007/README.md: -------------------------------------------------------------------------------- 1 | # Scan line of image 126007 2 | 3 | This is a "scan line", a horizontal slice, from a grayscale image from the 4 | BSD300 dataset. The image and the exact index of the scan line have been 5 | selected because of the abrupt changes between black and white that occur. 6 | 7 | As it is not clear whether the BSD300 images can be redistributed freely, we 8 | download the image from the internet archive instead. 9 | 10 | ![Plot of scanline_126007 dataset](./scanline_126007.png) 11 | -------------------------------------------------------------------------------- /datasets/debt_ireland/debt_ireland.csv: -------------------------------------------------------------------------------- 1 | Statistical Data Warehouse code,AME.A.IRL.1.0.319.0.UDGGL 2 | Country,Ireland 3 | 2000,36.0732199 4 | 2001,33.2394627 5 | 2002,30.5521068 6 | 2003,29.9296861 7 | 2004,28.2148891 8 | 2005,26.0766114 9 | 2006,23.618314 10 | 2007,23.9083721 11 | 2008,42.4036869 12 | 2009,61.5433048 13 | 2010,85.9938449 14 | 2011,110.861647 15 | 2012,119.8646655 16 | 2013,119.6837014 17 | 2014,104.1283774 18 | 2015,76.8191392 19 | 2016,73.4443135 20 | 2017,68.4403541 21 | 2018,63.8514846 22 | 2019,61.1353388 23 | 2020,56.0118623 24 | -------------------------------------------------------------------------------- /datasets/run_log/README.md: -------------------------------------------------------------------------------- 1 | # Interval Training Running Pace 2 | 3 | This dataset shows the pace of a runner during an interval training session, 4 | where a mobile application provides instructions on when to run and when to 5 | walk. 6 | 7 | Data obtained from the authors' RunDouble account for a run on 2018-07-31. 8 | 9 | See the LICENSE file for the license of the ``stats.csv`` file. To retrieve 10 | ``run_log.json`` from ``stats.csv``, run: 11 | 12 | ``` 13 | $ python convert.py stats.csv run_log.json 14 | ``` 15 | 16 | ![Plot of run_log dataset](./run_log.png) 17 | -------------------------------------------------------------------------------- /datasets/iceland_tourism/README.md: -------------------------------------------------------------------------------- 1 | # Iceland Tourism numbers by Month 2 | 3 | Source [Icelandic Tourist 4 | Board](https://www.ferdamalastofa.is/en/recearch-and-statistics/numbers-of-foreign-visitors). 5 | 6 | This dataset contains the monthly visitor numbers of tourists to Iceland, 7 | arriving through Keflavik airport. The data is obtained from the Icelandic 8 | Tourist Board. 
Since it is unclear if the data is in the public domain, we 9 | download it from an archive.org URL and do not redistribute it as part of this 10 | repository. 11 | 12 | ![Plot of iceland_tourism dataset](./iceland_tourism.png) 13 | -------------------------------------------------------------------------------- /datasets/robocalls/README.md: -------------------------------------------------------------------------------- 1 | # Robocalls in the US per month 2 | 3 | Data obtained from [RoboCallIndex](https://robocallindex.com/history/time). As 4 | it is not clear whether we can redistribute the data as part of this 5 | repository, we retrieve it locally instead. 6 | 7 | There is a potential change point in March 2018, when the Federal Appeals Court 8 | struck down FCC rules on robocalls. The full history of the relevant legislation 9 | can be found at 10 | [https://epic.org/amicus/tcpa/aca-international/](https://epic.org/amicus/tcpa/aca-international/). 11 | 12 | ![Plot of robocalls dataset](./robocalls.png) 13 | -------------------------------------------------------------------------------- /datasets/debt_ireland/README.md: -------------------------------------------------------------------------------- 1 | # Debt of Ireland 2 | 3 | Data obtained from 4 | [Eurostat](https://www.euro-area-statistics.org/macro-economic-indicators?cr=aut&lg=en&page=2&template=1). 5 | 6 | Specifically, the time series concerns the government debt ratio of Ireland for 7 | the period 2000-2020. Effects of the financial crisis are visible. 8 | 9 | Source: euro area statistics. 10 | Retrieved: 2019-03-27. 11 | 12 | The information page of the Euro Area Statistics website states that data can 13 | be redistributed under the condition that the source is quoted. 14 | 15 | ![Plot of debt_ireland dataset](./debt_ireland.png) 16 | -------------------------------------------------------------------------------- /datasets/children_per_woman/README.md: -------------------------------------------------------------------------------- 1 | # Children per Woman 2 | 3 | This is a dataset from GapMinder showing the number of children per woman on 4 | average, globally. 5 | 6 | The original data is obtained from GapMinder at: 7 | [https://www.gapminder.org/data/documentation/gd008/](https://www.gapminder.org/data/documentation/gd008/) 8 | 9 | The time series that we use is from the ``world_total`` tab in the XLSX file. 10 | 11 | GapMinder data is licensed under the CC BY 4.0 license, which allows us to 12 | redistribute the original xlsx file here. Attribution: Free data from 13 | www.gapminder.org. 14 | 15 | ![Plot of children_per_woman dataset](./children_per_woman.png) 16 | -------------------------------------------------------------------------------- /datasets/homeruns/README.md: -------------------------------------------------------------------------------- 1 | # Home Runs in the American League by Year 2 | 3 | Data retrieved from the [Baseball 4 | Databank](https://github.com/chadwickbureau/baseballdatabank). The file 5 | ``Batting.csv`` is obtained from that repository and can be redistributed 6 | here under the [CC BY-SA 3.0 7 | license](https://creativecommons.org/licenses/by-sa/3.0/). This implies that 8 | both ``Batting.csv`` and ``homeruns.json`` are licensed under this same 9 | license: http://creativecommons.org/licenses/by-sa/3.0/. 10 | 11 | This dataset lists the number of home runs in the American League of baseball 12 | by year.
13 | 14 | ![Plot of homeruns dataset](./homeruns.png) 15 | -------------------------------------------------------------------------------- /datasets/gdp_iran/README.md: -------------------------------------------------------------------------------- 1 | # GDP of Iran in constant LCU 2 | 3 | Potential change point around the Iranian Revolution. Obtained from the [World 4 | Bank](https://data.worldbank.org/indicator/NY.GDP.MKTP.KN?end=2018&locations=IR&start=1960). 5 | 6 | The dataset is licensed under the CC-BY 4.0 license. Data retrieved from the 7 | World Bank on 2019-08-28. No modifications to the original data file 8 | (``API_NY.GDP.MKTP.KN_DS2_en_csv_v2_126181.csv``) have been made. 9 | 10 | To retrieve the ``gdp_iran.json`` file from the csv file, simply run: 11 | 12 | ``` 13 | $ python convert.py ./API_NY.GDP.MKTP.KN_DS2_en_csv_v2_126181.csv ./gdp_iran.json 14 | ``` 15 | 16 | ![Plot of gdp_iran dataset](./gdp_iran.png) 17 | -------------------------------------------------------------------------------- /datasets/gdp_croatia/README.md: -------------------------------------------------------------------------------- 1 | # GDP of Croatia in constant LCU 2 | 3 | Apparent change point around the financial crisis. Obtained from the [World 4 | Bank](https://data.worldbank.org/indicator/NY.GDP.MKTP.KN?end=2018&locations=HR&start=1995). 5 | 6 | The dataset is licensed under the CC-BY 4.0 license. Data retrieved from the 7 | World Bank on 2019-08-28. No modifications to the original data file 8 | (``API_NY.GDP.MKTP.KN_DS2_en_csv_v2_126181.csv``) have been made. 9 | 10 | To retrieve the ``gdp_croatia.json`` file from the csv file, simply run: 11 | 12 | ``` 13 | $ python convert.py ./API_NY.GDP.MKTP.KN_DS2_en_csv_v2_126181.csv ./gdp_croatia.json 14 | ``` 15 | 16 | ![Plot of gdp_croatia dataset](./gdp_croatia.png) 17 | -------------------------------------------------------------------------------- /datasets/gdp_argentina/README.md: -------------------------------------------------------------------------------- 1 | # GDP of Argentina in constant LCU 2 | 3 | Potential change point around the financial crisis. Obtained from the [World 4 | Bank](https://data.worldbank.org/indicator/NY.GDP.MKTP.KN?end=2018&locations=AR&start=1960). 5 | 6 | The dataset is licensed under the CC-BY 4.0 license. Data retrieved from the 7 | World Bank on 2019-08-28. No modifications to the original data file 8 | (``API_NY.GDP.MKTP.KN_DS2_en_csv_v2_126181.csv``) have been made. 9 | 10 | To retrieve the ``gdp_argentina.json`` file from the csv file, simply run: 11 | 12 | ``` 13 | $ python convert.py ./API_NY.GDP.MKTP.KN_DS2_en_csv_v2_126181.csv ./gdp_argentina.json 14 | ``` 15 | 16 | ![Plot of gdp_argentina dataset](./gdp_argentina.png) 17 | -------------------------------------------------------------------------------- /datasets/seatbelts/README.md: -------------------------------------------------------------------------------- 1 | # UK Driver Deaths 2 | 3 | This dataset concerns the number of drivers killed or seriously injured in the 4 | UK around the period in which seatbelts were introduced. Seatbelts were compulsory 5 | equipment in all new cars in 1972 and were mandatory to be worn from 1983 6 | onwards. 7 | 8 | Data exported from R, where it is a builtin dataset called 9 | ``UKDriverDeaths`` in the ``datasets`` package. Since the ``datasets`` package 10 | is part of R, it is licensed under version 2 of the [GNU Public 11 | License](https://www.r-project.org/COPYING).
The data file produced from this 12 | data (``seatbelts.json``) is therefore licensed under GPLv2 as well. 13 | 14 | ![Plot of seatbelts dataset](./seatbelts.png) 15 | -------------------------------------------------------------------------------- /datasets/rail_lines/README.md: -------------------------------------------------------------------------------- 1 | # Rail Lines (total route-km) 2 | 3 | Data on the total kilometers of rail lines in the world, obtained from 4 | the [World 5 | Bank](https://data.worldbank.org/indicator/IS.RRS.TOTL.KM?locations=1W). 6 | 7 | The dataset is licensed under [CC BY 8 | 4.0](https://creativecommons.org/licenses/by/4.0/) and can therefore be 9 | redistributed as part of this repository. No modifications to the data have 10 | been made during the conversion to the JSON format. 11 | 12 | - ``./API_IS.RRS.TOTL.KM_DS2_en_csv_v2_10520532.csv`` contains the original 13 | dataset retrieved from the World Bank. 14 | - ``./rail_lines.json`` contains the data from the entire world in JSON 15 | format. 16 | 17 | ![Plot of rail_lines dataset](./rail_lines.png) 18 | -------------------------------------------------------------------------------- /datasets/unemployment_nl/README.md: -------------------------------------------------------------------------------- 1 | # Unemployment in the Netherlands 2 | 3 | This data shows the percentage of unemployed people in the labor population. 4 | The original data is retrieved from [Statistics 5 | Netherlands](https://opendata.cbs.nl/statline/#/CBS/nl/dataset/71882ned/table?ts=1554392218500 6 | ) and can be redistributed as part of this repository. 7 | 8 | In the time series we use the data for both genders, with the corrected 9 | value for the year 2001. 10 | 11 | To retrieve the ``unemployment_nl.json`` file from the original source file, 12 | simply run: 13 | 14 | ``` 15 | $ python convert.py Beroepsbevolking__vanaf_1800__12_uursgrens___1800_2013_04042019_154346.csv unemployment_nl.json 16 | ``` 17 | 18 | ![Plot of unemployment_nl dataset](./unemployment_nl.png) 19 | -------------------------------------------------------------------------------- /datasets/co2_canada/README.md: -------------------------------------------------------------------------------- 1 | # CO2 emissions (tonnes per person) for Canada 2 | 3 | This series describes carbon dioxide emissions from the burning of fossil 4 | fuels (metric tonnes of CO2 per person) in Canada. The data is retrieved from 5 | [GapMinder](https://www.gapminder.org/tools/#$state$marker$axis_y$which=co2_emissions_tonnes_per_person&domainMin:null&domainMax:null&zoomedMin:null&zoomedMax:null&scaleType=genericLog&spaceRef:null;;;&chart-type=bubbles). 6 | 7 | We isolate Canada because the series has a long history and the behaviour looks 8 | interesting. 9 | 10 | GapMinder data is licensed under the CC BY 4.0 license, which allows us to 11 | redistribute the original data here. Attribution: Free data from 12 | www.gapminder.org. 13 | 14 | ![Plot of co2_canada dataset](./co2_canada.png) 15 | -------------------------------------------------------------------------------- /datasets/gdp_japan/README.md: -------------------------------------------------------------------------------- 1 | # Historic GDP of Japan in the Local Currency Unit (LCU) 2 | 3 | Data obtained from the [World 4 | Bank](https://data.worldbank.org/indicator/NY.GDP.MKTP.CN?locations=JP).
5 | 6 | There is a known structural break in the growth rate of Japan, associated with the 7 | [lost decade](https://en.wikipedia.org/wiki/Lost_Decade_(Japan)). 8 | 9 | The dataset is licensed under the CC-BY 4.0 license. Data retrieved from the 10 | World Bank on 2019-03-27. No modifications to the original data file 11 | (``gdp.csv``) have been made. The file ``gdp_japan.csv`` is a subset of the 12 | ``gdp.csv`` file that contains only the data for Japan. The ``gdp_japan.json`` 13 | file is manually constructed from the ``gdp_japan.csv`` file. 14 | 15 | ![Plot of gdp_japan dataset](./gdp_japan.png) 16 | -------------------------------------------------------------------------------- /datasets/uk_coal_employ/README.md: -------------------------------------------------------------------------------- 1 | # Historic Employment in UK Coal Mines 2 | 3 | This is historic data obtained from [the UK 4 | government](https://www.gov.uk/government/statistical-data-sets/historical-coal-data-coal-production-availability-and-consumption). 5 | As the dataset is licensed under the [Open Government 6 | License](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/), 7 | we distribute it as part of this repository. 8 | 9 | We use the employment column, which gives the number of workers employed in British 10 | coal mines; it is extracted to ``employ_only.csv`` from ``Coal_since_1853.xls`` and 11 | converted to the ``uk_coal_employ.json`` file. Missing values in the data are 12 | indicated by a ``null`` value in the JSON file. 13 | 14 | ![Plot of uk_coal_employ dataset](./uk_coal_employ.png) 15 | -------------------------------------------------------------------------------- /datasets/jfk_passengers/README.md: -------------------------------------------------------------------------------- 1 | # JFK Airline Passengers 2 | 3 | This dataset gives the number of passengers arriving and departing at JFK. 4 | 5 | The data is obtained from New York State's official Kaggle page for this 6 | dataset: https://www.kaggle.com/new-york-state/nys-air-passenger-traffic,-port-authority-of-ny-nj#air-passenger-traffic-per-month-port-authority-of-ny-nj-beginning-1977.csv 7 | 8 | This page shows that the data is under a "CC0: Public Domain" license, so we 9 | redistribute it here as part of our repository. 10 | 11 | To create the ``jfk_passengers.json`` file from the raw csv file, simply run: 12 | 13 | ``` 14 | $ python convert.py ./air-passenger-traffic-per-month-port-authority-of-ny-nj-beginning-1977.csv ./jfk_passengers.json 15 | ``` 16 | 17 | ![Plot of jfk_passengers dataset](./jfk_passengers.png) 18 | -------------------------------------------------------------------------------- /datasets/lga_passengers/README.md: -------------------------------------------------------------------------------- 1 | # LGA Airline Passengers 2 | 3 | This dataset gives the number of passengers arriving and departing at LGA. 4 | 5 | The data is obtained from New York State's official Kaggle page for this 6 | dataset: https://www.kaggle.com/new-york-state/nys-air-passenger-traffic,-port-authority-of-ny-nj#air-passenger-traffic-per-month-port-authority-of-ny-nj-beginning-1977.csv 7 | 8 | This page shows that the data is under a "CC0: Public Domain" license, so we 9 | redistribute it here as part of our repository.
10 | 11 | To create the ``lga_passengers.json`` file from the raw csv file, simply run: 12 | 13 | ``` 14 | $ python convert.py ./air-passenger-traffic-per-month-port-authority-of-ny-nj-beginning-1977.csv ./lga_passengers.json 15 | ``` 16 | 17 | ![Plot of lga_passengers dataset](./lga_passengers.png) 18 | -------------------------------------------------------------------------------- /datasets/measles/README.md: -------------------------------------------------------------------------------- 1 | # Weekly Measles Case Reports England & Wales 2 | 3 | This data is available from [prof. Ben Bolker's 4 | webpage](https://ms.mcmaster.ca/~bolker/measdata.html). 5 | 6 | The original file is ``ewmeas.dat``. 7 | 8 | For the data format, see: 9 | [https://ms.mcmaster.ca/~bolker/measdata/formats.html](https://ms.mcmaster.ca/~bolker/measdata/formats.html). 10 | 11 | The time difference between observations is not exactly constant, but for the 12 | annotation we will consider it as such. Detection algorithms that can take a 13 | specific time axis should be provided with the true axis. 14 | 15 | While this data is provided "as is", it is not clearly licensed for 16 | redistribution. We therefore download it locally instead of distributing it 17 | with this repository. 18 | 19 | ![Plot of measles dataset](./measles.png) 20 | -------------------------------------------------------------------------------- /datasets/apple/README.md: -------------------------------------------------------------------------------- 1 | # Apple Stock 2 | 3 | This dataset concerns the daily close price and volume of Apple stock around 4 | the year 2000. The dataset is sampled every 3 observations to reduce the 5 | length of the time series. 6 | 7 | Data retrieved from [Yahoo 8 | Finance](https://finance.yahoo.com/quote/AAPL/history?period1=850348800&period2=1084579200&interval=1d&filter=history&frequency=1d). 9 | We use the Python package ``yfinance`` to download the data as it can not be 10 | redistributed as part of this repository. 11 | 12 | Since the original data has observations only on trading days, there are 13 | arguably gaps in this time series on non-trading days. However we consider 14 | these to be consecutive, and thus also consider the sampled time series to 15 | have consecutive observations. 16 | 17 | ![Plot of apple dataset](./apple.png) 18 | -------------------------------------------------------------------------------- /datasets/businv/README.md: -------------------------------------------------------------------------------- 1 | # Total Business Inventories 2 | 3 | Monthly total business inventories from the US Census. Data retrieved from 4 | [this direct 5 | url](https://www.census.gov/mtis/www/data/text/mtis-inventory.txt) on 6 | 2019-09-11. We use the unadjusted time series to maintain the seasonal 7 | component. 8 | 9 | According to [this 10 | page](https://web.archive.org/web/20191120160410/https://ask.census.gov/prweb/PRServletCustom/YACFBFye-rFIz_FoGtyvDRUGg1Uzu5Mn*/!STANDARD?pyActivity=pyMobileSnapStart&ArticleID=KCP-4726) 11 | on the US Census website, we are allowed to redistribute the data as part of 12 | this repository. 13 | 14 | Source: United States Census Bureau, URL: https://www.census.gov, Retrieved: 15 | 2019-09-11. 
16 | 17 | To create the ``businv.json`` file from the raw ``mtis-inventory.txt`` file, 18 | simply run: 19 | 20 | ``` 21 | $ python convert.py mtis-inventory.txt businv.json 22 | ``` 23 | 24 | ![Plot of businv dataset](./businv.png) 25 | -------------------------------------------------------------------------------- /datasets/us_population/README.md: -------------------------------------------------------------------------------- 1 | # US Population 2 | 3 | This time series contains the population numbers in the US. A potential change 4 | point occurs around index 459 (1990s). 5 | 6 | Data obtained from 7 | [Kaggle](https://www.kaggle.com/census/population-time-series-data#POP.csv). 8 | 9 | The original source of the data is the US Census Bureau. According to [this 10 | page](https://web.archive.org/web/20191120160410/https://ask.census.gov/prweb/PRServletCustom/YACFBFye-rFIz_FoGtyvDRUGg1Uzu5Mn*/!STANDARD?pyActivity=pyMobileSnapStart&ArticleID=KCP-4726) 11 | on the US Census website, we are allowed to redistribute the data as part of 12 | this repository. 13 | 14 | Source: United States Census Bureau, URL: https://www.census.gov, Retrieved: 15 | 2019-08-28. 16 | 17 | To obtain ``./us_population.json`` from ``POP.csv``, simply run: 18 | 19 | ``` 20 | $ python convert.py POP.csv us_population.json 21 | ``` 22 | 23 | ![Plot of us_population dataset](./us_population.png) 24 | -------------------------------------------------------------------------------- /datasets/bee_waggle_6/README.md: -------------------------------------------------------------------------------- 1 | # Bee Waggle Dataset sequence 6 2 | 3 | The movement of honey bees switches between a left turn, a right turn, and a 4 | waggle. This is a three-dimensional dataset of the position 5 | (x, y) and heading angle (theta) of a single bee. 6 | 7 | Source: [Parametric Segmental Switching Linear Dynamic Systems 8 | (PS-SLDS)](https://www.cc.gatech.edu/~borg/ijcv_psslds/). 9 | 10 | When using this time series, please cite the original authors: 11 | 12 | ```bibtex 13 | @article{oh2008learning, 14 | title={Learning and inferring motion patterns using parametric segmental switching linear dynamic systems}, 15 | author={Oh, S. M. and Rehg, J. M. and Balch, T.
and Dellaert, F.}, 16 | journal={International Journal of Computer Vision}, 17 | volume={77}, 18 | number={1-3}, 19 | pages={103--124}, 20 | year={2008}, 21 | publisher={Springer} 22 | } 23 | ``` 24 | 25 | ![Plot of bee_waggle_6 dataset](./bee_waggle_6.png) 26 | -------------------------------------------------------------------------------- /datasets/centralia/centralia.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "centralia", 3 | "longname": "Centralia Pennsylvania Population", 4 | "n_obs": 15, 5 | "n_dim": 1, 6 | "time": { 7 | "type": "string", 8 | "format": "%Y", 9 | "index": [ 10 | 0, 11 | 1, 12 | 2, 13 | 3, 14 | 4, 15 | 5, 16 | 6, 17 | 7, 18 | 8, 19 | 9, 20 | 10, 21 | 11, 22 | 12, 23 | 13, 24 | 14 25 | ], 26 | "raw": [ 27 | "1870", 28 | "1880", 29 | "1890", 30 | "1900", 31 | "1910", 32 | "1920", 33 | "1930", 34 | "1940", 35 | "1950", 36 | "1960", 37 | "1970", 38 | "1980", 39 | "1990", 40 | "2000", 41 | "2010" 42 | ] 43 | }, 44 | "series": [ 45 | { 46 | "label": "Population", 47 | "type": "int", 48 | "raw": [ 49 | 1342, 50 | 1886, 51 | 2761, 52 | 2048, 53 | 2429, 54 | 2336, 55 | 2446, 56 | 2449, 57 | 1986, 58 | 1435, 59 | 1165, 60 | 1017, 61 | 63, 62 | 21, 63 | 10 64 | ] 65 | } 66 | ] 67 | } -------------------------------------------------------------------------------- /datasets/ratner_stock/README.md: -------------------------------------------------------------------------------- 1 | # Ratner Group Stock Price 2 | 3 | The Ratner Group's stock price [is 4 | known](https://en.wikipedia.org/wiki/Gerald_Ratner#The_speech) for an event 5 | that can be considered a change point. 6 | 7 | Historical stock market data for SIG retrieved from [Yahoo finance (daily 8 | frequency)](https://finance.yahoo.com/quote/SIG/history?period1=584841600&period2=1567036800&interval=1d&filter=history&frequency=1d). 9 | We use the Python package ``yfinance`` to download the data as it can not be 10 | redistributed as part of this repository. 11 | 12 | The data has been sampled every 3 observations to reduce the length of the 13 | series. 14 | 15 | Since the original data has observations only on trading days, there are 16 | arguably gaps in this time series (on non-trading days). However we consider 17 | these to be consecutive, and thus also consider the sampled time series to 18 | have consecutive observations. 19 | 20 | ![Plot of ratner_stock dataset](./ratner_stock.png) 21 | -------------------------------------------------------------------------------- /examples/R/README.md: -------------------------------------------------------------------------------- 1 | # Loading a TCPD dataset into R 2 | 3 | The file ``load_dataset.R`` contains the function ``load.dataset`` that reads 4 | the JSON file into an R dataframe. The 5 | [RJSONIO](https://cran.r-project.org/web/packages/RJSONIO/index.html) package 6 | is required: 7 | 8 | ```R 9 | > install.packages('RJSONIO') 10 | ``` 11 | 12 | Simply run: 13 | 14 | ```R 15 | > source('./load_dataset.R') 16 | > df <- load.dataset('../../datasets/ozone/ozone.json') 17 | > df 18 | t Total Emissions 19 | 1 0 380000 20 | 2 1 400000 21 | 3 2 440000 22 | 4 3 480000 23 | 5 4 510000 24 | 6 5 540000 25 | 7 6 580000 26 | 8 7 630000 27 | ``` 28 | 29 | Notice that the time axis in TCPD is always 0-based. This needs to be taken 30 | into account when comparing detection results to the human annotations. (This 31 | is an unfortunate consequence of the differences between indexing in R and 32 | Python.) 
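33 | 34 | As a minimal illustration (the detector output below is hypothetical), a 35 | change point at time index 146 in the annotations corresponds to row 147 of 36 | the R data frame, so 1-based row numbers from R map back to the 0-based TCPD 37 | time axis by subtracting one: 38 | 39 | ```R 40 | detected.rows <- c(98, 147) # hypothetical detector output (1-based rows) 41 | tcpd.indices <- detected.rows - 1 # 0-based time indices used by TCPD 42 | ```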
33 | 34 | Missing observations in time series are represented with a ``NA`` value. 35 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:24.04 2 | 3 | # Install necessary packages and ensure python means python3 4 | RUN apt-get update && \ 5 | DEBIAN_FRONTEND=noninteractive apt-get remove -y python && \ 6 | apt-get install -y --no-install-recommends \ 7 | git \ 8 | build-essential \ 9 | libcurl4-openssl-dev \ 10 | libssl-dev \ 11 | python3 \ 12 | python3-dev \ 13 | python3-pip \ 14 | python3-venv \ 15 | python3-wheel && \ 16 | echo "alias python='python3'" >> /root/.bash_aliases && \ 17 | echo "alias pip='pip3'" >> /root/.bash_aliases && \ 18 | cd /usr/local/bin && ln -s /usr/bin/python3 python && \ 19 | cd /usr/local/bin && ln -s /usr/bin/pip3 pip 20 | 21 | # Make bash the default shell 22 | RUN mv /bin/sh /bin/sh.old && cp /bin/bash /bin/sh 23 | 24 | # Clone the dataset repo 25 | RUN git clone https://github.com/alan-turing-institute/TCPD 26 | 27 | # Change working dir 28 | WORKDIR TCPD 29 | 30 | # Create virtualenv 31 | RUN make venv 32 | 33 | # Build the dataset when container is run. 34 | CMD ["make", "export"] 35 | -------------------------------------------------------------------------------- /examples/R/load_dataset.R: -------------------------------------------------------------------------------- 1 | #' --- 2 | #' title: Example code to load a TCPD time series 3 | #' author: G.J.J. van den Burg 4 | #' date: 2020-01-06 5 | #' license: See the LICENSE file. 6 | #' copyright: 2019, The Alan Turing Institute 7 | #' --- 8 | 9 | library(RJSONIO) 10 | 11 | load.dataset <- function(filename) 12 | { 13 | data <- fromJSON(filename) 14 | 15 | # reformat the data into a data frame with a time index and the data values 16 | tidx <- data$time$index 17 | 18 | cols <- c() 19 | 20 | mat <- NULL 21 | for (j in 1:data$n_dim) { 22 | s <- data$series[[j]] 23 | v <- NULL 24 | for (i in 1:data$n_obs) { 25 | val <- s$raw[[i]] 26 | if (is.null(val)) { 27 | v <- c(v, NA) 28 | } else { 29 | v <- c(v, val) 30 | } 31 | } 32 | cols <- c(cols, s$label) 33 | mat <- cbind(mat, v) 34 | } 35 | 36 | mat <- cbind(tidx, mat) 37 | colnames(mat) <- c('t', cols) 38 | 39 | df <- as.data.frame(mat) 40 | return(df) 41 | } 42 | -------------------------------------------------------------------------------- /datasets/occupancy/README.md: -------------------------------------------------------------------------------- 1 | # Room occupancy data 2 | 3 | Dataset on detecting room occupancy based on several variables. For our 4 | dataset we use the Temperature, Humidity, Light, and CO2 variables from the 5 | training dataset. 6 | 7 | This dataset is obtained from the [UCI 8 | repository](https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+) 9 | on 2019-06-10. As it is unclear whether the data can be redistributed as part 10 | of this repository, we download it locally instead. 11 | 12 | The data is sampled at every 16 observations to reduce the length of the 13 | series. 14 | 15 | When using this particular time series, please cite: 16 | 17 | ```bib 18 | @article{candanedo2016accurate, 19 | title={Accurate occupancy detection of an office room from light, temperature, humidity and $\text{CO}_2$ measurements using statistical learning models}, 20 | author={Candanedo, L. M. 
and Feldheim, V.}, 21 | journal={Energy and Buildings}, 22 | volume={112}, 23 | pages={28--39}, 24 | year={2016}, 25 | publisher={Elsevier} 26 | } 27 | ``` 28 | 29 | ![Plot of occupancy dataset](./occupancy.png) 30 | -------------------------------------------------------------------------------- /datasets/nile/README.md: -------------------------------------------------------------------------------- 1 | # Volume of Nile River at Aswan 2 | 3 | This is a dataset on the volume of the Nile river at Aswan for the period 4 | 1871-1970. The data is obtained from the website of the book [Time Series 5 | Analysis by State Space Methods](http://www.ssfpack.com/DKbook.html) by Durbin 6 | and Koopman. The data is also available in the R ``datasets`` package, which 7 | is part of R and is therefore licensed under version 2 of the [GNU General Public 8 | License](https://www.r-project.org/COPYING). The data file produced from this 9 | data (``nile.json``) is therefore licensed under GPLv2 as well. 10 | 11 | A potential change point occurs in 1898 with the introduction of a dam. 12 | 13 | Note that this is not the Nile dataset used in other change point papers, 14 | which covers a period from 622 to 1284 AD (see e.g. Whitcher et al. (2002)). 15 | That dataset, [available 16 | here](https://web.archive.org/web/20000815223740/http://lib.stat.cmu.edu/S/beran), 17 | has been used in many papers to detect the introduction of a nilometer in the 18 | year 715 AD. 19 | 20 | ![Plot of nile dataset](./nile.png) 21 | -------------------------------------------------------------------------------- /datasets/brent_spot/README.md: -------------------------------------------------------------------------------- 1 | # Brent Spot Price 2 | 3 | This is the USD price for Brent Crude oil, measured daily. We include the time 4 | series from 2000 onwards. The data is sampled at every 10 original 5 | observations to reduce the length of the series. 6 | 7 | The data is obtained from the [U.S. Energy Information 8 | Administration](https://www.eia.gov/opendata/qb.php?sdid=PET.RBRTE.D). Since 9 | the data is in the public domain, we distribute it as part of this repository. 10 | Source: U.S. Energy Information Administration (Sep. 2019). 11 | 12 | Since the original data has observations only on trading days, there are 13 | arguably gaps in this time series (on non-trading days). However, we consider 14 | these to be consecutive, and thus also consider the sampled time series to 15 | have consecutive observations.
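
For reference, the subsampling step in ``convert.py`` (reproduced in full later in this repository) simply keeps every 10th row; a minimal self-contained sketch:

```python
SAMPLE = 10

# Stand-in for the parsed CSV rows of daily prices; the real script
# reads these from the EIA CSV file.
rows = list(range(100))

# Keep every SAMPLE-th observation to reduce the length of the series.
subsampled = [r for i, r in enumerate(rows) if i % SAMPLE == 0]
print(len(subsampled))  # 10
```
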
16 | 17 | To obtain the ``brent_spot.json`` file from the original 18 | ``Europe_Brent_Spot_Price_FOB_Daily.csv`` file, simply run: 19 | 20 | ``` 21 | $ python convert.py Europe_Brent_Spot_Price_FOB_Daily.csv brent_spot.json 22 | ``` 23 | 24 | ![Plot of brent_spot dataset](./brent_spot.png) 25 | -------------------------------------------------------------------------------- /.github/workflows/validate.yml: -------------------------------------------------------------------------------- 1 | name: Check TCPD 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | schedule: 11 | - cron: 16 17 */10 * * 12 | 13 | jobs: 14 | tcpd-ubuntu: 15 | name: check TCPD (direct) 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | - name: Install dependencies 20 | run: sudo apt-get update && sudo apt-get install build-essential 21 | shell: bash 22 | 23 | - name: Install Python 3.12 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: '3.12' 27 | 28 | - name: Checkout code 29 | uses: actions/checkout@v4 30 | 31 | - name: Ensure clean 32 | run: make clean 33 | shell: bash 34 | 35 | - name: Build and verify 36 | run: make test 37 | shell: bash 38 | 39 | tcpd-docker: 40 | name: check TCPD (docker) 41 | runs-on: ubuntu-latest 42 | 43 | steps: 44 | - name: Checkout 45 | uses: actions/checkout@v4 46 | 47 | - name: TCPD docker check 48 | uses: ./.github/workflows/ 49 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2020 The Alan Turing Institute 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /datasets/bank/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2019 G.J.J. 
van den Burg 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /datasets/well_log/README.md: -------------------------------------------------------------------------------- 1 | # Well-log dataset 2 | 3 | This is the well-known well-log dataset used in many changepoint papers. This 4 | version is obtained from [this Github 5 | repository](https://raw.githubusercontent.com/alan-turing-institute/bocpdms/a4042b45004592f5b9fa912b346dd22a212b7ea0/Data/well.txt), 6 | and licensed under the MIT license. 7 | 8 | The dataset is sampled at every 6 observations to reduce the length of the 9 | series. To obtain the json dataset from the original file, simply run: 10 | 11 | ``` 12 | $ python convert.py well_log.txt well_log.json 13 | ``` 14 | 15 | [Here](https://web.archive.org/web/20191128143944/https://raw.githubusercontent.com/alan-turing-institute/bocpdms/a4042b45004592f5b9fa912b346dd22a212b7ea0/Data/well.txt) 16 | is an archive.org url for the dataset. 17 | 18 | When using this series, please cite the original source: 19 | 20 | ```bib 21 | @book{oruanaidh1996numerical, 22 | title={Numerical {Bayesian} Methods Applied to Signal Processing}, 23 | author={{\'O Ruanaidh}, J. J. K. and Fitzgerald, W. J.}, 24 | year={1996}, 25 | publisher={Springer} 26 | } 27 | ``` 28 | 29 | ![Plot of well_log dataset](./well_log.png) 30 | -------------------------------------------------------------------------------- /datasets/construction/README.md: -------------------------------------------------------------------------------- 1 | # Total Private Construction Spending 2 | 3 | This dataset is retrieved from the US Census and concerns the total private 4 | construction spending. Potential change points occur at recessions. The data 5 | is obtained [from this Census 6 | page](https://www.census.gov/construction/c30/historical_data.html) using the 7 | "Private" series from the "Not Seasonally Adjusted" column in the "Monthly" 8 | table. Alternatively, use [this direct 9 | URL](https://www.census.gov/construction/c30/xls/privtime.xls). 10 | 11 | According to [this 12 | page](https://web.archive.org/web/20191120160410/https://ask.census.gov/prweb/PRServletCustom/YACFBFye-rFIz_FoGtyvDRUGg1Uzu5Mn*/!STANDARD?pyActivity=pyMobileSnapStart&ArticleID=KCP-4726) 13 | on the US Census website, we are allowed to redistribute the data as part of 14 | this repository. 
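
The conversion script itself is not reproduced in this section, but a rough, hypothetical sketch of inspecting the raw spreadsheet is shown below (it assumes ``pandas`` with the ``xlrd`` engine for legacy ``.xls`` files, and is not necessarily how ``convert.py`` parses it):

```python
# Hypothetical inspection snippet, not the actual convert.py logic.
import pandas as pd

# Legacy .xls files need the xlrd engine.
df = pd.read_excel("privtime.xls", engine="xlrd")
print(df.head())
```
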
15 | 16 | Source: United States Census Bureau, URL: https://www.census.gov, Retrieved: 17 | 2019-09-11. 18 | 19 | To create the ``construction.json`` file from the raw ``privtime.xls`` file, 20 | simply run: 21 | 22 | ``` 23 | $ python convert.py privtime.xls construction.json 24 | ``` 25 | 26 | ![Plot of construction dataset](./construction.png) 27 | -------------------------------------------------------------------------------- /datasets/run_log/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2019 Gerrit J.J. van den Burg 2 | 3 | This LICENSE file covers the stats.csv file only. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /datasets/well_log/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: G.J.J.
van den Burg 8 | 9 | """ 10 | 11 | import json 12 | import argparse 13 | 14 | SAMPLE = 6 15 | 16 | 17 | def parse_args(): 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("input_file", help="File to convert") 20 | parser.add_argument("output_file", help="File to write to") 21 | return parser.parse_args() 22 | 23 | 24 | def main(): 25 | args = parse_args() 26 | 27 | with open(args.input_file, "r") as fp: 28 | rows = [l.strip() for l in fp] 29 | 30 | rows = [r for i, r in enumerate(rows) if i % SAMPLE == 0] 31 | 32 | values = list(map(float, rows)) 33 | name = "well_log" 34 | longname = "Well Log" 35 | 36 | series = [{"label": "V1", "type": "float", "raw": values}] 37 | 38 | data = { 39 | "name": name, 40 | "longname": longname, 41 | "n_obs": len(values), 42 | "n_dim": len(series), 43 | "time": {"index": list(range(len(values)))}, 44 | "series": series, 45 | } 46 | 47 | with open(args.output_file, "w") as fp: 48 | json.dump(data, fp, indent="\t") 49 | 50 | 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /datasets/debt_ireland/debt_ireland.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "debt_ireland", 3 | "longname": "Debt Ireland", 4 | "n_obs": 21, 5 | "n_dim": 1, 6 | "time": { 7 | "type": "string", 8 | "format": "%Y", 9 | "index": [ 10 | 0, 11 | 1, 12 | 2, 13 | 3, 14 | 4, 15 | 5, 16 | 6, 17 | 7, 18 | 8, 19 | 9, 20 | 10, 21 | 11, 22 | 12, 23 | 13, 24 | 14, 25 | 15, 26 | 16, 27 | 17, 28 | 18, 29 | 19, 30 | 20 31 | ], 32 | "raw": [ 33 | "2000", 34 | "2001", 35 | "2002", 36 | "2003", 37 | "2004", 38 | "2005", 39 | "2006", 40 | "2007", 41 | "2008", 42 | "2009", 43 | "2010", 44 | "2011", 45 | "2012", 46 | "2013", 47 | "2014", 48 | "2015", 49 | "2016", 50 | "2017", 51 | "2018", 52 | "2019", 53 | "2020" 54 | ] 55 | }, 56 | "series": [ 57 | { 58 | "label": "V1", 59 | "type": "float", 60 | "raw": [ 61 | 36.0732199, 62 | 33.2394627, 63 | 30.5521068, 64 | 29.9296861, 65 | 28.2148891, 66 | 26.0766114, 67 | 23.618314, 68 | 23.9083721, 69 | 42.4036869, 70 | 61.5433048, 71 | 85.9938449, 72 | 110.861647, 73 | 119.8646655, 74 | 119.6837014, 75 | 104.1283774, 76 | 76.8191392, 77 | 73.4443135, 78 | 68.4403541, 79 | 63.8514846, 80 | 61.1353388, 81 | 56.0118623 82 | ] 83 | } 84 | ] 85 | } 86 | -------------------------------------------------------------------------------- /datasets/ozone/README.md: -------------------------------------------------------------------------------- 1 | # Ozone-depleting substance emissions 2 | 3 | This dataset contains "Global emissions of ozone-depleting substances, 4 | measured in tonnes of chlorofluorocarbon-11 equivalents 5 | (CFC11-equivalents) per year." It is obtained from [Our World in 6 | Data](https://ourworldindata.org/ozone-layer), who have scraped the data from: 7 | 8 | Hegglin, M. I., Fahey, D. W., McFarland, M., Montzka, S. A., & Nash, E. R. 9 | (2014). [Twenty questions and answers about the ozone layer: 2014 10 | update](https://www.wmo.int/pages/prog/arep/gaw/ozone_2014/documents/2014%20Twenty%20Questions_Final.pdf). 11 | World Meteorological Organization, UNEP, NOAA, NASA, and European Commission. 12 | 13 | A change is expected after the signing of the [Montreal 14 | Protocol](https://en.wikipedia.org/wiki/Montreal_Protocol). 
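
The conversion script (reproduced in full later in this repository) keeps only the rows labelled "Total emissions" from the CSV; the core of that filtering step amounts to the following, here with stand-in rows for illustration:

```python
# Stand-in rows illustrating the CSV layout (entity, code, year, value);
# the real script reads these from the Our World in Data CSV file.
rows = [
    ["Total emissions", "OWID_WRL", "1961", "380000"],
    ["Natural emissions", "OWID_WRL", "1961", "50000"],
]

# Keep only the "Total emissions" rows, then split out years and values.
total = [r for r in rows if r[0] == "Total emissions"]
time = [r[2] for r in total]
values = [int(r[-1]) for r in total]
print(time, values)  # ['1961'] [380000]
```
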
15 | 16 | The chart in the article by [Our World in 17 | Data](https://ourworldindata.org/ozone-layer) is licensed under [CC BY 18 | 4.0](https://creativecommons.org/licenses/by/4.0/deed.en_US). No changes to 19 | the data were made. The original data sourced from Hegglin et al., 2014 (cited 20 | above) is in the public domain. 21 | 22 | The ``ozone.json`` file can be obtained from the original 23 | ``./ozone-depleting-substance-emissions.csv`` by running: 24 | 25 | ``` 26 | $ python convert.py ./ozone-depleting-substance-emissions.csv ./ozone.json 27 | ``` 28 | 29 | ![Plot of ozone dataset](./ozone.png) 30 | -------------------------------------------------------------------------------- /datasets/shanghai_license/README.md: -------------------------------------------------------------------------------- 1 | # Shanghai License Plate Applicants 2 | 3 | Source: 4 | [Kaggle](https://www.kaggle.com/bogof666/shanghai-car-license-plate-auction-price). 5 | Data licensed under [CC0: Public 6 | Domain](https://creativecommons.org/publicdomain/zero/1.0/), so we can 7 | redistribute it as part of this repository. 8 | 9 | There seems to be a clear sudden growth in the number of applicants. 10 | 11 | Note: according to [this discussion on 12 | Kaggle](https://www.kaggle.com/bogof666/shanghai-car-license-plate-auction-price/discussion/73140), 13 | the record for 2008-02 is missing because the license plates for January and 14 | February were auctioned off simultaneously in January. As this represents an 15 | uneven measurement and a missing value, we choose to split the observation for 16 | January and February 2008 in two, dividing the amount equally between the 17 | months. An alternative would be to introduce a missing value in 2008-02, but 18 | since many of the algorithms we wish to evaluate are not able to handle 19 | missing values (and any imputation method would be incorrect), we believe this 20 | is a reasonable way to deal with this issue.
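
Concretely, the splitting step in ``convert.py`` (reproduced later in this repository) halves the January 2008 value and inserts the halved copy as February 2008; here with stand-in numbers:

```python
# Stand-in monthly labels and applicant counts, purely for illustration.
time = ["2007-12", "2008-01", "2008-03"]
values = [9000, 16000, 9500]

# Halve January 2008 and insert the other half as February 2008.
jan08idx = time.index("2008-01")
values[jan08idx] /= 2
time.insert(jan08idx + 1, "2008-02")
values.insert(jan08idx + 1, values[jan08idx])
print(time, values)
# ['2007-12', '2008-01', '2008-02', '2008-03'] [9000, 8000.0, 8000.0, 9500]
```
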
21 | 22 | To obtain the ``shanghai_license.json`` file from the 23 | ``Shanghai_license_plate_price_-_Sheet3.csv`` file, simply run: 24 | 25 | ``` 26 | $ python convert.py Shanghai_license_plate_price_-_Sheet3.csv shanghai_license.json 27 | ``` 28 | 29 | ![Plot of shanghai_license dataset](./shanghai_license.png) 30 | -------------------------------------------------------------------------------- /datasets/gdp_japan/gdp_japan.csv: -------------------------------------------------------------------------------- 1 | "Country Name","Country Code","Indicator Name","Indicator Code","1960","1961","1962","1963","1964","1965","1966","1967","1968","1969","1970","1971","1972","1973","1974","1975","1976","1977","1978","1979","1980","1981","1982","1983","1984","1985","1986","1987","1988","1989","1990","1991","1992","1993","1994","1995","1996","1997","1998","1999","2000","2001","2002","2003","2004","2005","2006","2007","2008","2009","2010","2011","2012","2013","2014","2015","2016","2017","2018" 2 | "Japan","JPN","GDP (current LCU)","NY.GDP.MKTP.CN","15950643462144","19263102386176","21860286726144","25019327447040","29429642297344","32742100172800","38026105323520","44561476878336","52776386166784","61993511813120","76539307651500","84215883490900","96418343539100","117397596102100","140090360740400","154787118329600","173827764691400","193706278803100","213306268936200","231195355873400","250636100000000","268830700000000","282582000000000","295303900000000","313145300000000","333686000000000","350344800000000","366339100000000","393641400000000","421469400000000","453608500000000","482845400000000","495055800000000","495291000000000","501537700000000","512541700000000","525806900000000","534142500000000","527876900000000","519651800000000","526706000000000","523005000000000","515986200000000","515400700000000","520965400000000","524132800000000","526879700000000","531688200000000","520715700000000","489501000000000","500353900000000","491408500000000","494957200000000","503175600000000","513876000000000","531985800000000","538445800000000","546488800000000","" 3 | -------------------------------------------------------------------------------- /datasets/gdp_croatia/gdp_croatia.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "gdp_croatia", 3 | "longname": "GDP Croatia", 4 | "n_obs": 24, 5 | "n_dim": 1, 6 | "time": { 7 | "type": "string", 8 | "format": "%Y", 9 | "index": [ 10 | 0, 11 | 1, 12 | 2, 13 | 3, 14 | 4, 15 | 5, 16 | 6, 17 | 7, 18 | 8, 19 | 9, 20 | 10, 21 | 11, 22 | 12, 23 | 13, 24 | 14, 25 | 15, 26 | 16, 27 | 17, 28 | 18, 29 | 19, 30 | 20, 31 | 21, 32 | 22, 33 | 23 34 | ], 35 | "raw": [ 36 | "1995", 37 | "1996", 38 | "1997", 39 | "1998", 40 | "1999", 41 | "2000", 42 | "2001", 43 | "2002", 44 | "2003", 45 | "2004", 46 | "2005", 47 | "2006", 48 | "2007", 49 | "2008", 50 | "2009", 51 | "2010", 52 | "2011", 53 | "2012", 54 | "2013", 55 | "2014", 56 | "2015", 57 | "2016", 58 | "2017", 59 | "2018" 60 | ] 61 | }, 62 | "series": [ 63 | { 64 | "label": "GDP (constant LCU)", 65 | "type": "int", 66 | "raw": [ 67 | 217509197000, 68 | 230285030900, 69 | 245588563100, 70 | 250161129600, 71 | 247820872300, 72 | 257170176300, 73 | 266048737900, 74 | 280025245200, 75 | 295653977300, 76 | 307227520000, 77 | 319853234600, 78 | 335423106200, 79 | 353145697500, 80 | 360337370500, 81 | 334063620100, 82 | 329143142600, 83 | 328023429300, 84 | 320476781700, 85 | 318900841000, 86 | 318621760200, 87 | 326270568300, 88 | 337807918300, 89 | 347676515100, 90 | 
356819802800 91 | ] 92 | } 93 | ] 94 | } -------------------------------------------------------------------------------- /datasets/global_co2/README.md: -------------------------------------------------------------------------------- 1 | # Global Monthly CO2 levels 2 | 3 | This dataset concerns monthly global hemispheric means of carbon dioxide in 4 | air. The data is part of the CMIP6 dataset, developed by Meinshausen et al. 5 | 6 | When using this data, please cite: 7 | 8 | ```bib 9 | @article{meinshausen2017historical, 10 | title={Historical greenhouse gas concentrations for climate modelling ({CMIP6})}, 11 | author={Meinshausen, M. and Vogel, E. and Nauels, A. and Lorbacher, K. and Meinshausen, N. and Etheridge, D. M. and Fraser, P. J. and Montzka, S. A. and Rayner, P. J. and Trudinger, C. M. and Krummel, P. B. and Beyerle, U. and Canadell, J. G. and Daniel, J. S. and Enting, I. G. and Law, R. M. and Lunder, C. R. and O'Doherty, S. and Prinn, R. G. and Reimann, S. and Rubino, M. and Velders, G. J. M. and Vollmer, M. K. and Wang, R. H. J. and Weiss, R.}, 12 | journal={Geoscientific Model Development}, 13 | volume={10}, 14 | pages={2057--2116}, 15 | year={2017}, 16 | publisher={Copernicus} 17 | } 18 | ``` 19 | 20 | It seems that the work of Meinshausen et al. is licensed under [CC BY 21 | 3.0](https://creativecommons.org/licenses/by/3.0/), judging from [the 22 | publication](https://www.geosci-model-dev.net/10/2057/2017/). This allows us 23 | to redistribute this time series as part of the dataset, provided that the 24 | above source is cited. We thus include the source csv in the repository. 25 | 26 | Note that the original data is sampled every 4 years and cropped to recent 27 | history to reduce the length of the series. 28 | 29 | ![Plot of global_co2 dataset](./global_co2.png) 30 | -------------------------------------------------------------------------------- /datasets/centralia/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: Gertjan van den Burg 8 | 9 | """ 10 | 11 | import json 12 | import argparse 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument( 18 | "-s", 19 | "--subsample", 20 | help="Number of observations to skip during subsampling", 21 | type=int, 22 | ) 23 | parser.add_argument("input_file", help="File to convert") 24 | parser.add_argument("output_file", help="File to write to") 25 | return parser.parse_args() 26 | 27 | 28 | def main(): 29 | args = parse_args() 30 | 31 | with open(args.input_file, "r") as fp: 32 | rows = [l.strip().split("\t") for l in fp] 33 | 34 | time = [] 35 | values = [] 36 | for year, pop in rows: 37 | time.append(year) 38 | values.append(int(pop)) 39 | 40 | name = "centralia" 41 | longname = "Centralia Pennsylvania Population" 42 | time_fmt = "%Y" 43 | series = [{"label": "Population", "type": "int", "raw": values}] 44 | 45 | data = { 46 | "name": name, 47 | "longname": longname, 48 | "n_obs": len(time), 49 | "n_dim": len(series), 50 | "time": { 51 | "type": "string", 52 | "format": time_fmt, 53 | "index": list(range(len(time))), 54 | "raw": time, 55 | }, 56 | "series": series, 57 | } 58 | 59 | with open(args.output_file, "w") as fp: 60 | json.dump(data, fp, indent="\t") 61 | 62 | 63 | if __name__ == "__main__": 64 | main() 65 | -------------------------------------------------------------------------------- /datasets/ozone/convert.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: G.J.J. van den Burg 8 | 9 | """ 10 | 11 | import argparse 12 | import clevercsv 13 | import json 14 | 15 | 16 | def parse_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("input_file", help="File to convert") 19 | parser.add_argument("output_file", help="File to write to") 20 | return parser.parse_args() 21 | 22 | 23 | def main(): 24 | args = parse_args() 25 | 26 | with open(args.input_file, "r", newline="", encoding="ascii") as fp: 27 | reader = clevercsv.reader( 28 | fp, delimiter=",", quotechar="", escapechar="" 29 | ) 30 | rows = list(reader) 31 | 32 | header = rows.pop(0) 33 | 34 | total = [r for r in rows if r[0] == "Total emissions"] 35 | time = [r[2] for r in total] 36 | values = [int(r[-1]) for r in total] 37 | 38 | name = "ozone" 39 | longname = "Ozone-Depleting Emissions" 40 | time_fmt = "%Y" 41 | 42 | series = [{"label": "Total Emissions", "type": "int", "raw": values}] 43 | 44 | data = { 45 | "name": name, 46 | "longname": longname, 47 | "n_obs": len(time), 48 | "n_dim": len(series), 49 | "time": { 50 | "type": "string", 51 | "format": time_fmt, 52 | "index": list(range(len(time))), 53 | "raw": time, 54 | }, 55 | "series": series, 56 | } 57 | 58 | with open(args.output_file, "w") as fp: 59 | json.dump(data, fp, indent="\t") 60 | 61 | 62 | if __name__ == "__main__": 63 | main() 64 | -------------------------------------------------------------------------------- /examples/python/README.md: -------------------------------------------------------------------------------- 1 | # Loading a TCPD time series in Python 2 | 3 | The ``load_dataset.py`` file contains example code to load a time series as a 4 | ``TimeSeries`` object. 5 | 6 | ```python 7 | >>> from load_dataset import TimeSeries 8 | >>> ts = TimeSeries.from_json('../../datasets/ozone/ozone.json') 9 | ``` 10 | 11 | To export the time series as a [pandas 12 | DataFrame](https://pandas.pydata.org/pandas-docs/stable/getting_started/dsintro.html#dataframe), 13 | simply use: 14 | 15 | ```python 16 | >>> ts.df 17 | t Total Emissions 18 | 0 0 380000.0 19 | 1 1 400000.0 20 | 2 2 440000.0 21 | 3 3 480000.0 22 | 4 4 510000.0 23 | 5 5 540000.0 24 | ... 25 | ``` 26 | 27 | The ``TimeSeries`` instance ``ts`` has an integer time axis at ``ts.t`` and 28 | the observations at ``ts.y``. The time axis is zero-based by default. If you 29 | prefer to use a one-based indexing, simply run: 30 | 31 | ```python 32 | >>> ts.make_one_based() 33 | >>> ts.df 34 | t Total Emissions 35 | 0 1 380000.0 36 | 1 2 400000.0 37 | 2 3 440000.0 38 | 3 4 480000.0 39 | 4 5 510000.0 40 | 5 6 540000.0 41 | ... 42 | ``` 43 | 44 | Many of the time series in TCPD have date or datetime labels for the time 45 | axis. This axis can be retrieved using: 46 | 47 | ```python 48 | >>> ts.datestr 49 | array(['1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', 50 | ... 
51 |        '2009', '2010', '2011', '2012', '2013', '2014'], dtype='&lt;U4') 52 | ``` 53 | 54 | The date format string (in Python ``strftime`` notation) is available as well: 55 | 56 | ```python 57 | >>> ts.datefmt 58 | '%Y' 59 | ``` 60 | -------------------------------------------------------------------------------- /datasets/us_population/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: Gertjan van den Burg 8 | 9 | """ 10 | 11 | import json 12 | import argparse 13 | import clevercsv 14 | 15 | 16 | def parse_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("input_file", help="File to convert") 19 | parser.add_argument("output_file", help="File to write to") 20 | return parser.parse_args() 21 | 22 | 23 | def main(): 24 | args = parse_args() 25 | 26 | with open(args.input_file, "r", newline="", encoding="ascii") as fp: 27 | reader = clevercsv.reader( 28 | fp, delimiter=",", quotechar="", escapechar="" 29 | ) 30 | rows = list(reader) 31 | 32 | rows.pop(0) 33 | 34 | # the time format is monthly, so we convert that here 35 | time = [r[2][:-3] for r in rows] 36 | time_fmt = "%Y-%m" 37 | 38 | # source is in thousands, so we correct that here 39 | values = [float(r[3]) * 1000 for r in rows] 40 | 41 | name = "us_population" 42 | longname = "US Population" 43 | series = [{"label": "Population", "type": "int", "raw": values}] 44 | 45 | data = { 46 | "name": name, 47 | "longname": longname, 48 | "n_obs": len(time), 49 | "n_dim": len(series), 50 | "time": { 51 | "type": "string", 52 | "format": time_fmt, 53 | "index": list(range(len(time))), 54 | "raw": time, 55 | }, 56 | "series": series, 57 | } 58 | 59 | with open(args.output_file, "w") as fp: 60 | json.dump(data, fp, indent="\t") 61 | 62 | 63 | if __name__ == "__main__": 64 | main() 65 | -------------------------------------------------------------------------------- /datasets/run_log/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: Gertjan van den Burg 8 | 9 | """ 10 | 11 | import argparse 12 | import clevercsv 13 | import json 14 | 15 | 16 | def parse_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("input_file", help="File to convert") 19 | parser.add_argument("output_file", help="File to write to") 20 | return parser.parse_args() 21 | 22 | 23 | def main(): 24 | args = parse_args() 25 | 26 | with open(args.input_file, "r", newline="", encoding="ascii") as fp: 27 | reader = clevercsv.reader( 28 | fp, delimiter=",", quotechar="", escapechar="" 29 | ) 30 | rows = list(reader) 31 | 32 | header = rows.pop(0) 33 | 34 | name = "run_log" 35 | longname = "Run Log" 36 | 37 | time = [r[0].rstrip("Z").replace("T", " ") for r in rows] 38 | time_fmt = "%Y-%m-%d %H:%M:%S" 39 | pace = [float(r[3]) for r in rows] 40 | distance = [float(r[4]) for r in rows] 41 | 42 | series = [ 43 | {"label": "Pace", "type": "float", "raw": pace}, 44 | {"label": "Distance", "type": "float", "raw": distance}, 45 | ] 46 | 47 | data = { 48 | "name": name, 49 | "longname": longname, 50 | "n_obs": len(time), 51 | "n_dim": len(series), 52 | "time": { 53 | "type": "string", 54 | "format": time_fmt, 55 | "index": list(range(len(time))), 56 | "raw": time, 57 | }, 58 | "series": series, 59 | } 60 | 61 | with open(args.output_file, "w") as fp: 62 | json.dump(data, fp, indent="\t") 63 | 64 | 65 | if __name__ == "__main__": 66 | main() 67 |
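#
# Example invocation (a sketch; stats.csv is the raw export that the
# accompanying LICENSE file refers to):
#
#     $ python convert.py stats.csv run_log.json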
-------------------------------------------------------------------------------- /datasets/uk_coal_employ/employ_only.csv: -------------------------------------------------------------------------------- 1 | Year ,Employment 2 | 1913,1107000 3 | 1914,1038000 4 | 1915,935000 5 | 1916,981000 6 | 1917,1002000 7 | 1918,990000 8 | 1919,1136000 9 | 1920,1191000 10 | 1921, 11 | 1922,1085000 12 | 1923,1151000 13 | 1924,1163000 14 | 1925,1078000 15 | 1926, 16 | 1927,991000 17 | 1928,915000 18 | 1929,925000 19 | 1930,910000 20 | 1931,843000 21 | 1932,796000 22 | 1933,767000 23 | 1934,768000 24 | 1935,753000 25 | 1936,750000 26 | 1937,773000 27 | 1938,776000 28 | 1939,761000 29 | 1940,744000 30 | 1941,692000 31 | 1942,704000 32 | 1943,701000 33 | 1944,704000 34 | 1945,702000 35 | 1946,693000 36 | 1947,707000 37 | 1948,720000 38 | 1949,716000 39 | 1950,693000 40 | 1951,695000 41 | 1952,712000 42 | 1953,713000 43 | 1954,707000 44 | 1955,704000 45 | 1956,703000 46 | 1957,710000 47 | 1958,699000 48 | 1959,665000 49 | 1960,607000 50 | 1961,575000 51 | 1962,556000 52 | 1963,528000 53 | 1964,502000 54 | 1965,454700 55 | 1966,422000 56 | 1967,389500 57 | 1968,330900 58 | 1969,305700 59 | 1970,290000 60 | 1971,286100 61 | 1972,273600 62 | 1973,251800 63 | 1974,252800 64 | 1975,252000 65 | 1976,249700 66 | 1977,247900 67 | 1978,240400 68 | 1979,241600 69 | 1980,236900 70 | 1981,172000 71 | 1982,164000 72 | 1983,148000 73 | 1984,139000 74 | 1985,114000 75 | 1986,91000 76 | 1987,75000 77 | 1988,69000 78 | 1989,56000 79 | 1990,49000 80 | 1991,38000 81 | 1992,28000 82 | 1993,10000 83 | 1994,7000 84 | 1995,11657 85 | 1996,10315 86 | 1997,13768 87 | 1998,11113 88 | 1999,11973 89 | 2000,10939 90 | 2001,11439 91 | 2002,9578 92 | 2003,8250 93 | 2004,7772 94 | 2005,6054 95 | 2006,5431 96 | 2007,5538 97 | 2008,6157 98 | 2009,5912 99 | 2010,6014 100 | 2011,5972 101 | 2012,5827 102 | 2013,3715 103 | 2014,3601 104 | 2015,1975 105 | 2016,831 106 | 2017,620 107 | -------------------------------------------------------------------------------- /datasets/usd_isk/README.md: -------------------------------------------------------------------------------- 1 | # USD - ISK exchange rate 2 | 3 | Due to the financial crisis the USD/ISK exchange rate shows potential change 4 | point behaviour in the years around 2008. Since it is difficult to find freely 5 | available (and permissively licensed) historical exchange rate data, we 6 | instead use the monthly average Euro/ECU exchange rates of both USD and ISK 7 | and compute the USD/ISK rate from there. 8 | 9 | The Euro/ECU exchange rate can be obtained from [this direct 10 | link](https://appsso.eurostat.ec.europa.eu/nui/show.do?query=BOOKMARK_DS-054904_QID_-3F48645A_UID_-3F171EB0&layout=TIME,C,X,0;CURRENCY,L,Y,0;UNIT,L,Z,0;STATINFO,L,Z,1;INDICATORS,C,Z,2;&zSelection=DS-054904UNIT,NAC;DS-054904INDICATORS,OBS_FLAG;DS-054904STATINFO,AVG;&rankName1=UNIT_1_2_-1_2&rankName2=INDICATORS_1_2_-1_2&rankName3=STATINFO_1_2_-1_2&rankName4=TIME_1_0_0_0&rankName5=CURRENCY_1_2_0_1&sortC=ASC_-1_FIRST&rStp=&cStp=&rDCh=&cDCh=&rDM=true&cDM=true&footnes=false&empty=false&wai=false&time_mode=NONE&time_most_recent=false&lang=EN&cfo=%23%23%23%2C%23%23%23.%23%23%23). 11 | The data is provided by Eurostat, and is Copyrighted to the European Union. 12 | Redistribution of the data in this repository is allowed according to [this 13 | copyright license](https://ec.europa.eu/eurostat/about/policies/copyright). No 14 | modification of the source data in ``ert_bil_eur_m_1_Data.csv`` has been made. 
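
Computing the cross rate from the two Euro-based series is then simple division; a minimal sketch of the arithmetic, with made-up rates for illustration (both series quote units of national currency per euro):

```python
# Made-up rates for a single month, purely for illustration.
usd_per_eur = 1.47   # USD per EUR
isk_per_eur = 127.0  # ISK per EUR

# ISK per EUR divided by USD per EUR gives ISK per USD, i.e. the USD/ISK rate.
usd_isk = isk_per_eur / usd_per_eur
print(round(usd_isk, 2))  # 86.39
```
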
15 | 16 | The conversion script expects the CSV file format, with the following 17 | configuration: 18 | 19 | - Full extraction (check) 20 | + single file 21 | - Flags and footnotes (check) 22 | - Cell formatting 23 | + 1 234.56 24 | 25 | This file is included in the repository as ``ert_bil_eur_m_1_Data.csv``. 26 | Additional metadata is stored in ``ert_bil_eur_m_Label.csv``. The file 27 | ``usd_isk.json`` can be obtained from the original data by running: 28 | 29 | ``` 30 | $ python convert.py ert_bil_eur_m_1_Data.csv usd_isk.json 31 | ``` 32 | 33 | ![Plot of usd_isk dataset](./usd_isk.png) 34 | -------------------------------------------------------------------------------- /datasets/brent_spot/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: G.J.J. van den Burg 8 | 9 | """ 10 | 11 | import argparse 12 | import clevercsv 13 | import json 14 | 15 | SAMPLE = 10 16 | 17 | def date_to_iso(datestr): 18 | mm, dd, yyyy = list(map(int, datestr.split("/"))) 19 | return f"{yyyy}-{mm:02d}-{dd:02d}" 20 | 21 | 22 | def parse_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("input_file", help="File to convert") 25 | parser.add_argument("output_file", help="File to write to") 26 | return parser.parse_args() 27 | 28 | 29 | def main(): 30 | args = parse_args() 31 | 32 | with open(args.input_file, "r", newline="", encoding="ascii") as fp: 33 | reader = clevercsv.reader( 34 | fp, delimiter=",", quotechar="", escapechar="" 35 | ) 36 | rows = list(reader) 37 | 38 | rows = rows[5:] 39 | rows = list(reversed(rows)) 40 | 41 | rows = [r for i, r in enumerate(rows) if i % SAMPLE == 0] 42 | 43 | idx2000 = next((i for i, x in enumerate(rows) if x[0].endswith("2000"))) 44 | rows = rows[idx2000:] 45 | 46 | name = "brent_spot" 47 | longname = "Brent Spot Price" 48 | time = [date_to_iso(r[0]) for r in rows] 49 | time_fmt = "%Y-%m-%d" 50 | values = [float(r[1]) for r in rows] 51 | 52 | series = [{"label": "Dollars/Barrel", "type": "float", "raw": values}] 53 | 54 | data = { 55 | "name": name, 56 | "longname": longname, 57 | "n_obs": len(time), 58 | "n_dim": len(series), 59 | "time": { 60 | "type": "string", 61 | "format": time_fmt, 62 | "index": list(range(len(time))), 63 | "raw": time, 64 | }, 65 | "series": series, 66 | } 67 | 68 | with open(args.output_file, "w") as fp: 69 | json.dump(data, fp, indent="\t") 70 | 71 | 72 | if __name__ == "__main__": 73 | main() 74 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for the Turing Change Point Dataset 2 | # 3 | # Author: G.J.J. van den Burg 4 | # Copyright (c) 2019, The Alan Turing Institute 5 | # License: See LICENSE file. 
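#
# Typical entry points (see the targets below): `make collect` downloads and
# builds the datasets, `make test` runs checksum verification and schema
# validation, and `make export` copies the dataset JSON files to ./export.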
6 | # 7 | 8 | SHELL := bash 9 | .SHELLFLAGS := -eu -o pipefail -c 10 | MAKEFLAGS += --warn-undefined-variables 11 | MAKEFLAGS += --no-builtin-rules 12 | 13 | DATA_DIR=./datasets 14 | UTIL_DIR=./utils 15 | VENV_DIR=./venv 16 | EXPORT_DIR=./export 17 | 18 | .PHONY: all clean collect verify validate test export 19 | 20 | all: test 21 | 22 | ################ 23 | # Main targets # 24 | ################ 25 | 26 | collect: venv 27 | source $(VENV_DIR)/bin/activate && python build_tcpd.py -v collect 28 | 29 | ############## 30 | # Validation # 31 | ############## 32 | 33 | test: verify validate 34 | 35 | verify: venv collect $(UTIL_DIR)/check_checksums.py ./checksums.json 36 | @echo "Verifying datasets ..." 37 | source $(VENV_DIR)/bin/activate && \ 38 | python $(UTIL_DIR)/check_checksums.py -v -c ./checksums.json -d $(DATA_DIR) 39 | 40 | validate: venv collect $(UTIL_DIR)/validate_dataset.py ./schema.json 41 | @echo "Validating datasets" 42 | source $(VENV_DIR)/bin/activate && \ 43 | python $(UTIL_DIR)/validate_dataset.py -v -s ./schema.json -d $(DATA_DIR) 44 | 45 | #################### 46 | # Utility commands # 47 | #################### 48 | 49 | export: test 50 | mkdir -p $(EXPORT_DIR) 51 | cp -v $(DATA_DIR)/*/*.json $(EXPORT_DIR) 52 | 53 | ########### 54 | # Cleanup # 55 | ########### 56 | 57 | clean: 58 | if [ -d $(VENV_DIR) ] ; then \ 59 | source $(VENV_DIR)/bin/activate && python build_tcpd.py -v clean ; \ 60 | fi 61 | rm -rf $(VENV_DIR) 62 | rm -rf $(EXPORT_DIR) 63 | 64 | ############## 65 | # Virtualenv # 66 | ############## 67 | 68 | venv: $(VENV_DIR)/bin/activate 69 | 70 | $(VENV_DIR)/bin/activate: 71 | test -d $(VENV_DIR) || python -m venv $(VENV_DIR) 72 | source $(VENV_DIR)/bin/activate && \ 73 | pip install wheel && \ 74 | pip install -r ./requirements.txt 75 | touch $(VENV_DIR)/bin/activate 76 | -------------------------------------------------------------------------------- /datasets/businv/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: G.J.J. 
van den Burg 8 | 9 | """ 10 | 11 | import argparse 12 | import json 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("input_file", help="File to convert") 18 | parser.add_argument("output_file", help="File to write to") 19 | return parser.parse_args() 20 | 21 | 22 | def main(): 23 | args = parse_args() 24 | 25 | with open(args.input_file, "r") as fp: 26 | lines = [l.strip() for l in fp] 27 | 28 | # header data should be first three lines 29 | # we use some asserts to ensure things are what we expect them to be 30 | header = lines[:3] 31 | assert header[-1] == "Total Business" 32 | 33 | lines = lines[4:] 34 | assert lines[0].startswith("1992") 35 | 36 | by_month = {} 37 | for line in lines: 38 | # stop on first empty line 39 | if not line.strip(): 40 | break 41 | parts = [x for x in line.split(" ") if x.strip()] 42 | assert len(parts) == 13 # year + 12 months 43 | year = parts.pop(0) 44 | for midx, v in enumerate(parts, start=1): 45 | if v == ".": 46 | break 47 | by_month[f"{year}-{midx:02}"] = int(v) 48 | 49 | name = "businv" 50 | longname = "Business Inventory" 51 | time = sorted(by_month.keys()) 52 | time_fmt = "%Y-%m" 53 | values = [by_month[t] for t in time] 54 | 55 | series = [{"label": "Business Inventory", "type": "int", "raw": values}] 56 | 57 | data = { 58 | "name": name, 59 | "longname": longname, 60 | "n_obs": len(time), 61 | "n_dim": len(series), 62 | "time": { 63 | "type": "string", 64 | "format": time_fmt, 65 | "index": list(range(len(time))), 66 | "raw": time, 67 | }, 68 | "series": series, 69 | } 70 | 71 | with open(args.output_file, "w") as fp: 72 | json.dump(data, fp, indent="\t") 73 | 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /datasets/gdp_croatia/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: Gertjan van den Burg 8 | 9 | """ 10 | 11 | import json 12 | import argparse 13 | import clevercsv 14 | 15 | 16 | def parse_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("input_file", help="File to convert") 19 | parser.add_argument("output_file", help="File to write to") 20 | return parser.parse_args() 21 | 22 | 23 | def main(): 24 | args = parse_args() 25 | 26 | with open(args.input_file, "r", newline="", encoding="UTF-8-SIG") as fp: 27 | reader = clevercsv.reader( 28 | fp, delimiter=",", quotechar='"', escapechar="" 29 | ) 30 | rows = list(reader) 31 | rows = rows[4:] 32 | header = rows.pop(0) 33 | 34 | as_dicts = [] 35 | for row in rows: 36 | as_dicts.append({h: v for h, v in zip(header, row)}) 37 | 38 | croatia = next( 39 | (d for d in as_dicts if d["Country Name"] == "Croatia"), None 40 | ) 41 | 42 | tuples = [] 43 | for key in croatia: 44 | try: 45 | ikey = int(key) 46 | except ValueError: 47 | continue 48 | if not croatia[key]: 49 | continue 50 | tuples.append((ikey, int(croatia[key]))) 51 | 52 | name = "gdp_croatia" 53 | longname = "GDP Croatia" 54 | time = [str(t[0]) for t in tuples] 55 | time_fmt = "%Y" 56 | series = [ 57 | { 58 | "label": "GDP (constant LCU)", 59 | "type": "int", 60 | "raw": [t[1] for t in tuples], 61 | } 62 | ] 63 | 64 | data = { 65 | "name": name, 66 | "longname": longname, 67 | "n_obs": len(time), 68 | "n_dim": len(series), 69 | "time": { 70 | "type": "string", 71 | "format": time_fmt, 72 | "index": list(range(len(time))), 73 | 
"raw": time, 74 | }, 75 | "series": series, 76 | } 77 | 78 | with open(args.output_file, "w") as fp: 79 | json.dump(data, fp, indent="\t") 80 | 81 | 82 | if __name__ == "__main__": 83 | main() 84 | -------------------------------------------------------------------------------- /datasets/gdp_iran/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: Gertjan van den Burg 8 | 9 | """ 10 | 11 | import json 12 | import argparse 13 | import clevercsv 14 | 15 | 16 | def parse_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("input_file", help="File to convert") 19 | parser.add_argument("output_file", help="File to write to") 20 | return parser.parse_args() 21 | 22 | 23 | def main(): 24 | args = parse_args() 25 | 26 | with open(args.input_file, "r", newline="", encoding="UTF-8-SIG") as fp: 27 | reader = clevercsv.reader( 28 | fp, delimiter=",", quotechar='"', escapechar="" 29 | ) 30 | rows = list(reader) 31 | rows = rows[4:] 32 | header = rows.pop(0) 33 | 34 | as_dicts = [] 35 | for row in rows: 36 | as_dicts.append({h: v for h, v in zip(header, row)}) 37 | 38 | iran = next( 39 | (d for d in as_dicts if d["Country Name"] == "Iran, Islamic Rep."), 40 | None, 41 | ) 42 | 43 | tuples = [] 44 | for key in iran: 45 | try: 46 | ikey = int(key) 47 | except ValueError: 48 | continue 49 | if not iran[key]: 50 | continue 51 | tuples.append((ikey, float(iran[key]))) 52 | 53 | name = "gdp_iran" 54 | longname = "GDP Iran" 55 | time = [str(t[0]) for t in tuples] 56 | time_fmt = "%Y" 57 | series = [ 58 | { 59 | "label": "GDP (constant LCU)", 60 | "type": "float", 61 | "raw": [t[1] for t in tuples], 62 | } 63 | ] 64 | 65 | data = { 66 | "name": name, 67 | "longname": longname, 68 | "n_obs": len(time), 69 | "n_dim": len(series), 70 | "time": { 71 | "type": "string", 72 | "format": time_fmt, 73 | "index": list(range(len(time))), 74 | "raw": time, 75 | }, 76 | "series": series, 77 | } 78 | 79 | with open(args.output_file, "w") as fp: 80 | json.dump(data, fp, indent="\t") 81 | 82 | 83 | if __name__ == "__main__": 84 | main() 85 | -------------------------------------------------------------------------------- /datasets/gdp_argentina/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: Gertjan van den Burg 8 | 9 | """ 10 | 11 | import json 12 | import argparse 13 | import clevercsv 14 | 15 | 16 | def parse_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("input_file", help="File to convert") 19 | parser.add_argument("output_file", help="File to write to") 20 | return parser.parse_args() 21 | 22 | 23 | def main(): 24 | args = parse_args() 25 | 26 | with open(args.input_file, "r", newline="", encoding="UTF-8-SIG") as fp: 27 | reader = clevercsv.reader( 28 | fp, delimiter=",", quotechar='"', escapechar="" 29 | ) 30 | rows = list(reader) 31 | rows = rows[4:] 32 | header = rows.pop(0) 33 | 34 | as_dicts = [] 35 | for row in rows: 36 | as_dicts.append({h: v for h, v in zip(header, row)}) 37 | 38 | argentina = next( 39 | (d for d in as_dicts if d["Country Name"] == "Argentina"), None 40 | ) 41 | 42 | tuples = [] 43 | for key in argentina: 44 | try: 45 | ikey = int(key) 46 | except ValueError: 47 | continue 48 | if not argentina[key]: 49 | continue 50 | tuples.append((ikey, 
float(argentina[key]))) 51 | 52 | name = "gdp_argentina" 53 | longname = "GDP Argentina" 54 | time = [str(t[0]) for t in tuples] 55 | time_fmt = "%Y" 56 | series = [ 57 | { 58 | "label": "GDP (constant LCU)", 59 | "type": "float", 60 | "raw": [t[1] for t in tuples], 61 | } 62 | ] 63 | 64 | data = { 65 | "name": name, 66 | "longname": longname, 67 | "n_obs": len(time), 68 | "n_dim": len(series), 69 | "time": { 70 | "type": "string", 71 | "format": time_fmt, 72 | "index": list(range(len(time))), 73 | "raw": time, 74 | }, 75 | "series": series, 76 | } 77 | 78 | with open(args.output_file, "w") as fp: 79 | json.dump(data, fp, indent="\t") 80 | 81 | 82 | if __name__ == "__main__": 83 | main() 84 | -------------------------------------------------------------------------------- /datasets/jfk_passengers/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: G.J.J. van den Burg 8 | 9 | """ 10 | 11 | import json 12 | import argparse 13 | import clevercsv 14 | 15 | 16 | def month2index(month): 17 | return { 18 | "Jan": "01", 19 | "Feb": "02", 20 | "Mar": "03", 21 | "Apr": "04", 22 | "May": "05", 23 | "Jun": "06", 24 | "Jul": "07", 25 | "Aug": "08", 26 | "Sep": "09", 27 | "Oct": "10", 28 | "Nov": "11", 29 | "Dec": "12", 30 | }[month] 31 | 32 | 33 | def parse_args(): 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument("input_file", help="File to convert") 36 | parser.add_argument("output_file", help="File to write to") 37 | return parser.parse_args() 38 | 39 | 40 | def main(): 41 | args = parse_args() 42 | 43 | with open(args.input_file, "r", newline="", encoding="ascii") as fp: 44 | reader = clevercsv.DictReader( 45 | fp, delimiter=",", quotechar="", escapechar="" 46 | ) 47 | items = list(reader) 48 | 49 | for it in items: 50 | it["time"] = f"{it['Year']}-{month2index(it['Month'])}" 51 | it["value"] = int(it["Total Passengers"]) 52 | 53 | 54 | jfks = [it for it in items if it["Airport Code"] == "JFK"] 55 | pairs = [(it["time"], it["value"]) for it in jfks] 56 | # with this date format string sort is date sort 57 | pairs.sort() 58 | 59 | name = "jfk_passengers" 60 | longname = "JFK Passengers" 61 | time_fmt = "%Y-%m" 62 | time = [p[0] for p in pairs] 63 | values = [p[1] for p in pairs] 64 | 65 | series = [{"label": "Number of Passengers", "type": "int", "raw": values}] 66 | 67 | data = { 68 | "name": name, 69 | "longname": longname, 70 | "n_obs": len(time), 71 | "n_dim": len(series), 72 | "time": { 73 | "type": "string", 74 | "format": time_fmt, 75 | "index": list(range(len(time))), 76 | "raw": time, 77 | }, 78 | "series": series, 79 | } 80 | 81 | with open(args.output_file, "w") as fp: 82 | json.dump(data, fp, indent="\t") 83 | 84 | 85 | if __name__ == "__main__": 86 | main() 87 | -------------------------------------------------------------------------------- /datasets/lga_passengers/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: G.J.J. 
van den Burg 8 | 9 | """ 10 | 11 | import json 12 | import argparse 13 | import clevercsv 14 | 15 | 16 | def month2index(month): 17 | return { 18 | "Jan": "01", 19 | "Feb": "02", 20 | "Mar": "03", 21 | "Apr": "04", 22 | "May": "05", 23 | "Jun": "06", 24 | "Jul": "07", 25 | "Aug": "08", 26 | "Sep": "09", 27 | "Oct": "10", 28 | "Nov": "11", 29 | "Dec": "12", 30 | }[month] 31 | 32 | 33 | def parse_args(): 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument("input_file", help="File to convert") 36 | parser.add_argument("output_file", help="File to write to") 37 | return parser.parse_args() 38 | 39 | 40 | def main(): 41 | args = parse_args() 42 | 43 | with open(args.input_file, "r", newline="", encoding="ascii") as fp: 44 | reader = clevercsv.DictReader( 45 | fp, delimiter=",", quotechar="", escapechar="" 46 | ) 47 | items = list(reader) 48 | 49 | for it in items: 50 | it["time"] = f"{it['Year']}-{month2index(it['Month'])}" 51 | it["value"] = int(it["Total Passengers"]) 52 | 53 | lgas = [it for it in items if it["Airport Code"] == "LGA"] 54 | pairs = [(it["time"], it["value"]) for it in lgas] 55 | # with this date format string sort is date sort 56 | pairs.sort() 57 | 58 | name = "lga_passengers" 59 | longname = "LaGuardia Passengers" 60 | time_fmt = "%Y-%m" 61 | time = [p[0] for p in pairs] 62 | values = [p[1] for p in pairs] 63 | 64 | series = [{"label": "Number of Passengers", "type": "int", "raw": values}] 65 | 66 | data = { 67 | "name": name, 68 | "longname": longname, 69 | "n_obs": len(time), 70 | "n_dim": len(series), 71 | "time": { 72 | "type": "string", 73 | "format": time_fmt, 74 | "index": list(range(len(time))), 75 | "raw": time, 76 | }, 77 | "series": series, 78 | } 79 | 80 | with open(args.output_file, "w") as fp: 81 | json.dump(data, fp, indent="\t") 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /datasets/unemployment_nl/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | 6 | Author: Gertjan van den Burg 7 | 8 | """ 9 | 10 | import argparse 11 | import clevercsv 12 | import json 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("input_file", help="File to convert") 18 | parser.add_argument("output_file", help="File to write to") 19 | return parser.parse_args() 20 | 21 | 22 | def main(): 23 | args = parse_args() 24 | with open(args.input_file, "r", newline="", encoding="UTF-8-SIG") as fp: 25 | reader = clevercsv.reader( 26 | fp, delimiter=";", quotechar='"', escapechar="" 27 | ) 28 | rows = list(reader) 29 | 30 | # remove rows we don't need 31 | title = rows.pop(0) 32 | meta = rows.pop(0) 33 | meta = rows.pop(0) 34 | 35 | # filter out rows we want 36 | header = rows.pop(0) 37 | eligible_population = rows.pop(0) 38 | working_population = rows.pop(0) 39 | unemployed_population = rows.pop(0) 40 | 41 | years = header[3:] 42 | eligible = list(map(int, eligible_population[3:])) 43 | unemployed = list(map(int, unemployed_population[3:])) 44 | 45 | # compute the percentage unemployed 46 | by_year = { 47 | y: (u / e * 100) for y, e, u in zip(years, eligible, unemployed) 48 | } 49 | 50 | # remove value of 2001 before revision 51 | del by_year["2001 voor revisie"] 52 | # rename value of 2001 after revision as simply '2001' 53 | by_year["2001"] = by_year["2001 na revisie"] 54 | del by_year["2001 na revisie"] 55 | 56 | time = 
sorted(by_year.keys()) 57 | values = [by_year[t] for t in time] 58 | series = [{"label": "V1", "type": "float", "raw": values}] 59 | 60 | data = { 61 | "name": "unemployment_nl", 62 | "longname": "Unemployment rate (NL)", 63 | "n_obs": len(time), 64 | "n_dim": len(series), 65 | "time": { 66 | "type": "string", 67 | "format": "%Y", 68 | "index": list(range(len(time))), 69 | "raw": time, 70 | }, 71 | "series": series, 72 | } 73 | 74 | with open(args.output_file, "w") as fp: 75 | json.dump(data, fp, indent="\t") 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /datasets/shanghai_license/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: Gertjan van den Burg 8 | 9 | """ 10 | 11 | import json 12 | import argparse 13 | import clevercsv 14 | 15 | 16 | def reformat_time(mmmyy): 17 | """ From MMM-YY to %Y-%m """ 18 | MONTHS = { 19 | "Jan": 1, 20 | "Feb": 2, 21 | "Mar": 3, 22 | "Apr": 4, 23 | "May": 5, 24 | "Jun": 6, 25 | "Jul": 7, 26 | "Aug": 8, 27 | "Sep": 9, 28 | "Oct": 10, 29 | "Nov": 11, 30 | "Dec": 12, 31 | } 32 | mmm, yy = mmmyy.split("-") 33 | Y = int(yy) + 2000 34 | m = MONTHS.get(mmm) 35 | return "%i-%02i" % (Y, m) 36 | 37 | 38 | def parse_args(): 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument("input_file", help="File to convert") 41 | parser.add_argument("output_file", help="File to write to") 42 | return parser.parse_args() 43 | 44 | 45 | def main(): 46 | args = parse_args() 47 | 48 | with open(args.input_file, "r", newline="", encoding="ascii") as fp: 49 | reader = clevercsv.reader( 50 | fp, delimiter=",", quotechar="", escapechar="" 51 | ) 52 | rows = list(reader) 53 | 54 | rows.pop(0) 55 | 56 | time = [reformat_time(r[0]) for r in rows] 57 | values = [int(r[-1]) for r in rows] 58 | 59 | # Manually split Jan-08 into two, see readme for details. 60 | jan08idx = time.index("2008-01") 61 | values[jan08idx] /= 2 62 | time.insert(jan08idx + 1, "2008-02") 63 | values.insert(jan08idx + 1, values[jan08idx]) 64 | 65 | name = "shanghai_license" 66 | longname = "Shanghai License" 67 | time_fmt = "%Y-%m" 68 | series = [{"label": "No. 
of Applicants", "type": "int", "raw": values}] 69 | 70 | data = { 71 | "name": name, 72 | "longname": longname, 73 | "n_obs": len(time), 74 | "n_dim": len(series), 75 | "time": { 76 | "type": "string", 77 | "format": time_fmt, 78 | "index": list(range(len(time))), 79 | "raw": time, 80 | }, 81 | "series": series, 82 | } 83 | 84 | with open(args.output_file, "w") as fp: 85 | json.dump(data, fp, indent="\t") 86 | 87 | 88 | if __name__ == "__main__": 89 | main() 90 | -------------------------------------------------------------------------------- /datasets/rail_lines/rail_lines.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "rail_lines", 3 | "longname": "Rail Lines", 4 | "n_obs": 37, 5 | "n_dim": 1, 6 | "time": { 7 | "format": "%Y", 8 | "index": [ 9 | 0, 10 | 1, 11 | 2, 12 | 3, 13 | 4, 14 | 5, 15 | 6, 16 | 7, 17 | 8, 18 | 9, 19 | 10, 20 | 11, 21 | 12, 22 | 13, 23 | 14, 24 | 15, 25 | 16, 26 | 17, 27 | 18, 28 | 19, 29 | 20, 30 | 21, 31 | 22, 32 | 23, 33 | 24, 34 | 25, 35 | 26, 36 | 27, 37 | 28, 38 | 29, 39 | 30, 40 | 31, 41 | 32, 42 | 33, 43 | 34, 44 | 35, 45 | 36 46 | ], 47 | "raw": [ 48 | "1980", 49 | "1981", 50 | "1982", 51 | "1983", 52 | "1984", 53 | "1985", 54 | "1986", 55 | "1987", 56 | "1988", 57 | "1989", 58 | "1990", 59 | "1991", 60 | "1992", 61 | "1993", 62 | "1994", 63 | "1995", 64 | "1996", 65 | "1997", 66 | "1998", 67 | "1999", 68 | "2000", 69 | "2001", 70 | "2002", 71 | "2003", 72 | "2004", 73 | "2005", 74 | "2006", 75 | "2007", 76 | "2008", 77 | "2009", 78 | "2010", 79 | "2011", 80 | "2012", 81 | "2013", 82 | "2014", 83 | "2015", 84 | "2016" 85 | ] 86 | }, 87 | "series": [ 88 | { 89 | "label": "V1", 90 | "type": "float", 91 | "raw": [ 92 | 1000507.33548387, 93 | 996153.287096774, 94 | 994910, 95 | 992092.841935484, 96 | 983302.464516129, 97 | 975342.625806452, 98 | 963878.138709677, 99 | 949388.514516129, 100 | 941808.890967742, 101 | 938664.31516129, 102 | 977074.383, 103 | 973210.707096774, 104 | 964581.342580645, 105 | 973468.754193548, 106 | 970988.050967742, 107 | 968160.260322581, 108 | 971143.774885996, 109 | 970154.472375981, 110 | 964146.915693211, 111 | 963074.49690094, 112 | 968935.935483871, 113 | 956609.233390473, 114 | 962635.639187575, 115 | 959885.943535401, 116 | 958123.943535401, 117 | 989329.943535401, 118 | 1062032.93548387, 119 | 1060970.93548387, 120 | 1060664.93548387, 121 | 1056107.93548387, 122 | 1076589.93548387, 123 | 1057710.93548387, 124 | 1051859.93548387, 125 | 1051798.67548387, 126 | 1055263.93548387, 127 | 1051968.08548387, 128 | 1051767.60548387 129 | ] 130 | } 131 | ] 132 | } 133 | -------------------------------------------------------------------------------- /datasets/usd_isk/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | 6 | Author: Gertjan van den Burg 7 | 8 | """ 9 | 10 | import clevercsv 11 | import json 12 | import sys 13 | 14 | 15 | def format_month(ymm): 16 | year, month = ymm.split("M") 17 | return f"{year}-{month}" 18 | 19 | 20 | def main(input_filename, output_filename): 21 | with open(input_filename, "r", newline="", encoding="ascii") as fp: 22 | reader = clevercsv.DictReader( 23 | fp, delimiter=",", quotechar='"', escapechar="" 24 | ) 25 | rows = list(reader) 26 | 27 | by_currency = {} 28 | for row in rows: 29 | cur = row["CURRENCY"] 30 | if not cur in by_currency: 31 | by_currency[cur] = [] 32 | by_currency[cur].append(row) 33 | 34 | by_month = {} 35 | for cur 
in by_currency: 36 | for item in by_currency[cur]: 37 | if item["Value"] == ":": 38 | continue 39 | month = item["TIME"] 40 | if not month in by_month: 41 | by_month[month] = {} 42 | by_month[month][cur] = item 43 | 44 | to_delete = [] 45 | for month in by_month: 46 | if not len(by_month[month]) == 2: 47 | to_delete.append(month) 48 | for month in to_delete: 49 | del by_month[month] 50 | 51 | ratio = {} 52 | for month in sorted(by_month.keys()): 53 | usd = by_month[month]["US dollar"] 54 | isk = by_month[month]["Icelandic krona"] 55 | ratio[format_month(month)] = float(usd["Value"]) / float(isk["Value"]) 56 | 57 | tuples = [(m, ratio[m]) for m in ratio] 58 | 59 | name = "usd_isk" 60 | longname = "USD-ISK exhange rate" 61 | 62 | data = { 63 | "name": name, 64 | "longname": longname, 65 | "n_obs": len(tuples), 66 | "n_dim": 1, 67 | "time": { 68 | "format": "%Y-%m", 69 | "index": list(range(len(tuples))), 70 | "raw": [t[0] for t in tuples], 71 | }, 72 | "series": [ 73 | { 74 | "label": "Exchange rate", 75 | "type": "float", 76 | "raw": [t[1] for t in tuples], 77 | } 78 | ], 79 | } 80 | 81 | with open(output_filename, "w") as fp: 82 | json.dump(data, fp, indent="\t") 83 | 84 | 85 | if __name__ == "__main__": 86 | main(sys.argv[1], sys.argv[2]) 87 | -------------------------------------------------------------------------------- /checksums.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "md5", 3 | "checksums": { 4 | "apple.json": "22edb48471bd3711f7a6e15de6413643", 5 | "bank.json": "5207135ea53fc6fa2a8119908da73abf", 6 | "bee_waggle_6.json": [ 7 | "4f03feafecb3be0b069b3cb0d6b17d4f", 8 | "71311783488ee5f1122545d24c15429b" 9 | ], 10 | "bitcoin.json": "f90ff14ed1fc0c3d47d4394d25cbce93", 11 | "brent_spot.json": "79892116ef8a0aa16e2450123655b31d", 12 | "businv.json": "d2ab178da17b2e659a10a102a4b9f332", 13 | "centralia.json": "addb9b70ac1294eba6da958f3ab26595", 14 | "children_per_woman.json": "826e0a2328b8a8a085050115768eef98", 15 | "co2_canada.json": "de8d2cac911d2a8e3ce5addacbaed8e0", 16 | "construction.json": "fb0347dc9fd353b11b35e99f7d531f13", 17 | "debt_ireland.json": "a5a9e752c338d2ffcceb614bb2064cc9", 18 | "gdp_argentina.json": "694212b5682ebd808d740ffd83d4bc16", 19 | "gdp_croatia.json": "4f902ba68bf710fa245e5eb0ab35fea7", 20 | "gdp_iran.json": "889e9fc6292125189fd3188396167431", 21 | "gdp_japan.json": "17026e80ab363d9f668d69900824d9ae", 22 | "global_co2.json": "7c8edd8887f51a6f841cc9d806ab4e56", 23 | "homeruns.json": "987bbab63e2c72acba1c07325303720c", 24 | "iceland_tourism.json": "8bbac4ca95319a865f2d58ff564f063d", 25 | "jfk_passengers.json": "9655295214078f2a45a4c18e6c4e6d0d", 26 | "lga_passengers.json": "3e7bf55fac17f59b400f8a558d3f0337", 27 | "measles.json": "e42afd03be893fc7deb98514c94fa4c7", 28 | "nile.json": "5b08800e3ec692bfa5385b978658199b", 29 | "occupancy.json": "bc6cd9adaf496fe30bf0e417d2c3b0c6", 30 | "ozone.json": "348b1f85c3ec3da3b8989afe04c33b80", 31 | "quality_control_1.json": "fcfd5b0323a0dbd499c22b32c77f6a43", 32 | "quality_control_2.json": "919a55440bd00d635db80fe83e921c7d", 33 | "quality_control_3.json": "94f55ddedd03197bc3e660f6e1d840ee", 34 | "quality_control_4.json": "1efedb9a52cd0b9a9250cf9781c5f7ef", 35 | "quality_control_5.json": "2ebb10acafae18ebabf0217218717970", 36 | "rail_lines.json": "fa7d19c61264f0d6b9d74cd145a50012", 37 | "ratner_stock.json": "f7086ff916f35b88463bf8fd1857815e", 38 | "robocalls.json": "f67ec0ccb50f2a835912e5c51932c083", 39 | "run_log.json": "2c78a8fa0b4a2f8e2d22ba3ad4dfd49f", 40 | 
"scanline_126007.json": "057d5741b623308af00c42e2c8e525c3", 41 | "scanline_42049.json": "39921dfa959576bd0b3d6c95558f17f4", 42 | "seatbelts.json": "976ef4318e7b6381ff37dd4ac8029718", 43 | "shanghai_license.json": "b4ac173eb6c0a1a4d10268abc109eda1", 44 | "uk_coal_employ.json": "a7c72746e46d6e09f516bd87e0e68bef", 45 | "unemployment_nl.json": "26d8c0359de7f733a6fb51d4d60b5af6", 46 | "us_population.json": "77037fc5ff0338516a56ae686aa4dcba", 47 | "usd_isk.json": "5cac2807a0e280c8ffd7321662e339ac", 48 | "well_log.json": "7c80d2cbd5864b923e6a653aad115de6" 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /utils/check_checksums.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Validate the datasets by checksum 6 | 7 | Author: G.J.J. van den Burg 8 | License: This file is part of TCPD, see the top-level LICENSE file. 9 | Copyright: 2019, The Alan Turing Institute 10 | 11 | """ 12 | 13 | import argparse 14 | import hashlib 15 | import os 16 | import json 17 | 18 | 19 | def parse_args(): 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument( 22 | "-c", "--checksum-file", help="Checksum file (json)", required=True 23 | ) 24 | parser.add_argument( 25 | "-d", "--dataset-dir", help="Dataset directory", required=True 26 | ) 27 | parser.add_argument( 28 | "-v", "--verbose", help="Enable verbose mode", action="store_true" 29 | ) 30 | return parser.parse_args() 31 | 32 | 33 | def md5sum(filename): 34 | with open(filename, "rb") as fp: 35 | data = fp.read() 36 | return hashlib.md5(data).hexdigest() 37 | 38 | 39 | def load_checksums(checksum_file): 40 | with open(checksum_file, "r") as fp: 41 | checksums = json.load(fp) 42 | assert checksums["kind"] == "md5" 43 | return checksums["checksums"] 44 | 45 | 46 | def find_datafiles(dataset_dir): 47 | data_files = {} 48 | 49 | datadirs = os.listdir(dataset_dir) 50 | for ddir in datadirs: 51 | pth = os.path.join(dataset_dir, ddir) 52 | files = os.listdir(pth) 53 | json_files = [f for f in files if f.endswith(".json")] 54 | for jf in json_files: 55 | jfpath = os.path.join(pth, jf) 56 | if jf in data_files: 57 | raise KeyError("Duplicate data file '%s'?" 
% jfpath) 58 | data_files[jf] = jfpath 59 | 60 | return data_files 61 | 62 | 63 | def main(): 64 | args = parse_args() 65 | 66 | log = lambda *a, **kw: print(*a, **kw) if args.verbose else None 67 | 68 | checksums = load_checksums(args.checksum_file) 69 | data_files = find_datafiles(args.dataset_dir) 70 | 71 | for fname in checksums: 72 | log("Checking %s" % fname) 73 | if not fname in data_files: 74 | raise FileNotFoundError("Missing data file: %s" % fname) 75 | md5 = md5sum(data_files[fname]) 76 | if isinstance(checksums[fname], list): 77 | if not md5 in checksums[fname]: 78 | raise ValueError( 79 | "Checksums don't match for file: %s" % (data_files[fname]) 80 | ) 81 | else: 82 | if not md5 == checksums[fname]: 83 | raise ValueError( 84 | "Checksums don't match for file: %s" % (data_files[fname]) 85 | ) 86 | 87 | log("All ok.") 88 | 89 | 90 | if __name__ == "__main__": 91 | main() 92 | -------------------------------------------------------------------------------- /datasets/ozone/ozone.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ozone", 3 | "longname": "Ozone-Depleting Emissions", 4 | "n_obs": 54, 5 | "n_dim": 1, 6 | "time": { 7 | "type": "string", 8 | "format": "%Y", 9 | "index": [ 10 | 0, 11 | 1, 12 | 2, 13 | 3, 14 | 4, 15 | 5, 16 | 6, 17 | 7, 18 | 8, 19 | 9, 20 | 10, 21 | 11, 22 | 12, 23 | 13, 24 | 14, 25 | 15, 26 | 16, 27 | 17, 28 | 18, 29 | 19, 30 | 20, 31 | 21, 32 | 22, 33 | 23, 34 | 24, 35 | 25, 36 | 26, 37 | 27, 38 | 28, 39 | 29, 40 | 30, 41 | 31, 42 | 32, 43 | 33, 44 | 34, 45 | 35, 46 | 36, 47 | 37, 48 | 38, 49 | 39, 50 | 40, 51 | 41, 52 | 42, 53 | 43, 54 | 44, 55 | 45, 56 | 46, 57 | 47, 58 | 48, 59 | 49, 60 | 50, 61 | 51, 62 | 52, 63 | 53 64 | ], 65 | "raw": [ 66 | "1961", 67 | "1962", 68 | "1963", 69 | "1964", 70 | "1965", 71 | "1966", 72 | "1967", 73 | "1968", 74 | "1969", 75 | "1970", 76 | "1971", 77 | "1972", 78 | "1973", 79 | "1974", 80 | "1975", 81 | "1976", 82 | "1977", 83 | "1978", 84 | "1979", 85 | "1980", 86 | "1981", 87 | "1982", 88 | "1983", 89 | "1984", 90 | "1985", 91 | "1986", 92 | "1987", 93 | "1988", 94 | "1989", 95 | "1990", 96 | "1991", 97 | "1992", 98 | "1993", 99 | "1994", 100 | "1995", 101 | "1996", 102 | "1997", 103 | "1998", 104 | "1999", 105 | "2000", 106 | "2001", 107 | "2002", 108 | "2003", 109 | "2004", 110 | "2005", 111 | "2006", 112 | "2007", 113 | "2008", 114 | "2009", 115 | "2010", 116 | "2011", 117 | "2012", 118 | "2013", 119 | "2014" 120 | ] 121 | }, 122 | "series": [ 123 | { 124 | "label": "Total Emissions", 125 | "type": "int", 126 | "raw": [ 127 | 380000, 128 | 400000, 129 | 440000, 130 | 480000, 131 | 510000, 132 | 540000, 133 | 580000, 134 | 630000, 135 | 660000, 136 | 720000, 137 | 770000, 138 | 840000, 139 | 910000, 140 | 980000, 141 | 1040000, 142 | 1050000, 143 | 1070000, 144 | 1070000, 145 | 1110000, 146 | 1080000, 147 | 1040000, 148 | 1100000, 149 | 1090000, 150 | 1150000, 151 | 1180000, 152 | 1280000, 153 | 1360000, 154 | 1460000, 155 | 1410000, 156 | 1320000, 157 | 1190000, 158 | 1080000, 159 | 960000, 160 | 820000, 161 | 760000, 162 | 700000, 163 | 640000, 164 | 600000, 165 | 590000, 166 | 560000, 167 | 530000, 168 | 490000, 169 | 480000, 170 | 470000, 171 | 450000, 172 | 430000, 173 | 410000, 174 | 400000, 175 | 390000, 176 | 380000, 177 | 370000, 178 | 350000, 179 | 340000, 180 | 320000 181 | ] 182 | } 183 | ] 184 | } -------------------------------------------------------------------------------- /datasets/construction/convert.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: G.J.J. van den Burg 8 | 9 | """ 10 | 11 | import argparse 12 | import json 13 | import xlrd 14 | 15 | MONTHS = { 16 | "Jan": 1, 17 | "Feb": 2, 18 | "Mar": 3, 19 | "Apr": 4, 20 | "May": 5, 21 | "Jun": 6, 22 | "Jul": 7, 23 | "Aug": 8, 24 | "Sep": 9, 25 | "Oct": 10, 26 | "Nov": 11, 27 | "Dec": 12, 28 | } 29 | 30 | 31 | def format_date(datestr): 32 | """ expects: mmm-yyx with x an extraneous character or empty """ 33 | mmm, yyx = datestr.split("-") 34 | midx = MONTHS[mmm] 35 | if len(yyx) == 3: 36 | yy = yyx[:2] 37 | elif len(yyx) == 2: 38 | yy = yyx 39 | else: 40 | raise ValueError 41 | 42 | # this will break in 71 years 43 | if yy.startswith("9"): 44 | yyyy = 1900 + int(yy) 45 | else: 46 | yyyy = 2000 + int(yy) 47 | return f"{yyyy}-{midx:02}" 48 | 49 | 50 | def parse_args(): 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument("input_file", help="File to convert") 53 | parser.add_argument("output_file", help="File to write to") 54 | return parser.parse_args() 55 | 56 | 57 | def main(): 58 | args = parse_args() 59 | 60 | wb = xlrd.open_workbook(args.input_file) 61 | ws = wb.sheet_by_index(0) 62 | header = ws.row(3) 63 | assert header[0].value == "Date" 64 | 65 | by_month = {} 66 | ridx = 4 67 | while True: 68 | # stop if date cell is empty 69 | if ws.row(ridx)[0].ctype == xlrd.XL_CELL_EMPTY: 70 | break 71 | 72 | date_value = ws.row(ridx)[0].value 73 | construct_value = ws.row(ridx)[1].value 74 | 75 | date = format_date(date_value) 76 | construct = int(construct_value) 77 | 78 | by_month[date] = construct 79 | ridx += 1 80 | 81 | name = "construction" 82 | longname = "US Construction Spending" 83 | time = sorted(by_month.keys()) 84 | time_fmt = "%Y-%m" 85 | values = [by_month[t] for t in time] 86 | 87 | series = [ 88 | { 89 | "label": "Total Private Construction Spending", 90 | "type": "int", 91 | "raw": values, 92 | } 93 | ] 94 | 95 | data = { 96 | "name": name, 97 | "longname": longname, 98 | "n_obs": len(time), 99 | "n_dim": len(series), 100 | "time": { 101 | "type": "string", 102 | "format": time_fmt, 103 | "index": list(range(len(time))), 104 | "raw": time, 105 | }, 106 | "series": series, 107 | } 108 | 109 | with open(args.output_file, "w") as fp: 110 | json.dump(data, fp, indent="\t") 111 | 112 | 113 | if __name__ == "__main__": 114 | main() 115 | -------------------------------------------------------------------------------- /examples/python/load_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Example code for loading a dataset to a TimeSeries object. 6 | 7 | Note that this code requires Pandas to be available. 8 | 9 | Author: Gertjan van den Burg 10 | Copyright: The Alan Turing Institute, 2019 11 | License: See LICENSE file. 
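A minimal usage sketch (assuming the dataset JSON files have already been
generated, e.g. datasets/nile/nile.json):

    ts = TimeSeries.from_json("datasets/nile/nile.json")
    print(ts.shape)  # (n_obs, n_dim)
    df = ts.df  # DataFrame with a "t" column plus one column per series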
12 | 13 | """ 14 | 15 | import json 16 | import numpy as np 17 | import pandas as pd 18 | 19 | 20 | class TimeSeries: 21 | def __init__( 22 | self, 23 | t, 24 | y, 25 | name=None, 26 | longname=None, 27 | datestr=None, 28 | datefmt=None, 29 | columns=None, 30 | ): 31 | self.t = t 32 | self.y = y 33 | 34 | self.name = name 35 | self.longname = longname 36 | self.datestr = datestr 37 | self.datefmt = datefmt 38 | self.columns = columns 39 | 40 | # whether the series is stored as zero-based or one-based 41 | self.zero_based = True 42 | 43 | @property 44 | def n_obs(self): 45 | return len(self.t) 46 | 47 | @property 48 | def n_dim(self): 49 | return self.y.shape[1] 50 | 51 | @property 52 | def shape(self): 53 | return (self.n_obs, self.n_dim) 54 | 55 | @classmethod 56 | def from_json(cls, filename): 57 | with open(filename, "rb") as fp: 58 | data = json.load(fp) 59 | 60 | tidx = np.array(data["time"]["index"]) 61 | tidx = np.squeeze(tidx) 62 | 63 | if "format" in data["time"]: 64 | datefmt = data["time"]["format"] 65 | datestr = np.array(data["time"]["raw"]) 66 | else: 67 | datefmt = None 68 | datestr = None 69 | 70 | y = np.zeros((data["n_obs"], data["n_dim"])) 71 | columns = [] 72 | 73 | for idx, series in enumerate(data["series"]): 74 | columns.append(series.get("label", "V%i" % (idx + 1))) 75 | thetype = np.int if series["type"] == "integer" else np.float64 76 | vec = np.array(series["raw"], dtype=thetype) 77 | y[:, idx] = vec 78 | 79 | ts = cls( 80 | tidx, 81 | y, 82 | name=data["name"], 83 | longname=data["longname"], 84 | datefmt=datefmt, 85 | datestr=datestr, 86 | columns=columns, 87 | ) 88 | return ts 89 | 90 | @property 91 | def df(self): 92 | d = {"t": self.t} 93 | for i in range(len(self.columns)): 94 | col = self.columns[i] 95 | val = self.y[:, i] 96 | d[col] = val 97 | return pd.DataFrame(d) 98 | 99 | def make_one_based(self): 100 | """ Convert the time index to a one-based time index. 
""" 101 | if self.zero_based: 102 | self.t = [t + 1 for t in self.t] 103 | self.zero_based = False 104 | 105 | def __repr__(self): 106 | return "TimeSeries(name=%s, n_obs=%s, n_dim=%s)" % ( 107 | self.name, 108 | self.n_obs, 109 | self.n_dim, 110 | ) 111 | 112 | def __str__(self): 113 | return repr(self) 114 | -------------------------------------------------------------------------------- /datasets/co2_canada/co2_canada.csv: -------------------------------------------------------------------------------- 1 | country,Canada 2 | 1800,0.00568 3 | 1801,0.00561 4 | 1802,0.00555 5 | 1803,0.00548 6 | 1804,0.00542 7 | 1805,0.00536 8 | 1806,0.00529 9 | 1807,0.00523 10 | 1808,0.00517 11 | 1809,0.00511 12 | 1810,0.00504 13 | 1811,0.00497 14 | 1812,0.0049 15 | 1813,0.00483 16 | 1814,0.00475 17 | 1815,0.00466 18 | 1816,0.00457 19 | 1817,0.00447 20 | 1818,0.00438 21 | 1819,0.00427 22 | 1820,0.00417 23 | 1821,0.00406 24 | 1822,0.00395 25 | 1823,0.00384 26 | 1824,0.00373 27 | 1825,0.00362 28 | 1826,0.0035 29 | 1827,0.00339 30 | 1828,0.00327 31 | 1829,0.00316 32 | 1830,0.00305 33 | 1831,0.00294 34 | 1832,0.00283 35 | 1833,0.00273 36 | 1834,0.00263 37 | 1835,0.00253 38 | 1836,0.00244 39 | 1837,0.00235 40 | 1838,0.00227 41 | 1839,0.00218 42 | 1840,0.0021 43 | 1841,0.00202 44 | 1842,0.00195 45 | 1843,0.00188 46 | 1844,0.00181 47 | 1845,0.00175 48 | 1846,0.0118 49 | 1847,0.0147 50 | 1848,0.0174 51 | 1849,0.0198 52 | 1850,0.0236 53 | 1851,0.0271 54 | 1852,0.0318 55 | 1853,0.0376 56 | 1854,0.0445 57 | 1855,0.0522 58 | 1856,0.0608 59 | 1857,0.0715 60 | 1858,0.0853 61 | 1859,0.0996 62 | 1860,0.118 63 | 1861,0.139 64 | 1862,0.167 65 | 1863,0.206 66 | 1864,0.242 67 | 1865,0.288 68 | 1866,0.346 69 | 1867,0.436 70 | 1868,0.255 71 | 1869,0.182 72 | 1870,0.321 73 | 1871,0.461 74 | 1872,0.471 75 | 1873,0.406 76 | 1874,0.404 77 | 1875,0.461 78 | 1876,0.45 79 | 1877,0.484 80 | 1878,0.456 81 | 1879,0.508 82 | 1880,1.2 83 | 1881,1.27 84 | 1882,1.48 85 | 1883,1.64 86 | 1884,1.87 87 | 1885,1.78 88 | 1886,1.87 89 | 1887,2.14 90 | 1888,2.77 91 | 1889,2.33 92 | 1890,2.53 93 | 1891,2.81 94 | 1892,2.78 95 | 1893,2.94 96 | 1894,2.78 97 | 1895,2.62 98 | 1896,2.81 99 | 1897,2.8 100 | 1898,2.94 101 | 1899,3.53 102 | 1900,3.73 103 | 1901,4.24 104 | 1902,4.49 105 | 1903,4.78 106 | 1904,5.5 107 | 1905,5.72 108 | 1906,5.87 109 | 1907,7.18 110 | 1908,7.02 111 | 1909,6.51 112 | 1910,7.2 113 | 1911,8.18 114 | 1912,8.86 115 | 1913,10.1 116 | 1914,8.53 117 | 1915,7.51 118 | 1916,9.33 119 | 1917,10.3 120 | 1918,10.7 121 | 1919,8.91 122 | 1920,9.64 123 | 1921,8.98 124 | 1922,7.6 125 | 1923,10.1 126 | 1924,8.27 127 | 1925,7.91 128 | 1926,8.75 129 | 1927,9.32 130 | 1928,9.32 131 | 1929,9.66 132 | 1930,9.04 133 | 1931,7.15 134 | 1932,6.53 135 | 1933,6.39 136 | 1934,7.28 137 | 1935,7.11 138 | 1936,7.7 139 | 1937,8.27 140 | 1938,7.5 141 | 1939,8.25 142 | 1940,9.33 143 | 1941,10.2 144 | 1942,11 145 | 1943,11.4 146 | 1944,11.5 147 | 1945,10.5 148 | 1946,11 149 | 1947,11.2 150 | 1948,12.1 151 | 1949,10.8 152 | 1950,11.2 153 | 1951,11.5 154 | 1952,11 155 | 1953,10.8 156 | 1954,10.7 157 | 1955,10.8 158 | 1956,11.7 159 | 1957,11 160 | 1958,10.7 161 | 1959,10.5 162 | 1960,10.8 163 | 1961,10.6 164 | 1962,11.1 165 | 1963,11.1 166 | 1964,12.3 167 | 1965,12.8 168 | 1966,12.9 169 | 1967,13.8 170 | 1968,14.6 171 | 1969,14.6 172 | 1970,15.9 173 | 1971,16.2 174 | 1972,17.2 175 | 1973,17 176 | 1974,17.1 177 | 1975,17.2 178 | 1976,17 179 | 1977,17.2 180 | 1978,17.3 181 | 1979,18.2 182 | 1980,18.1 183 | 1981,17.3 184 | 1982,16.6 185 | 1983,16.2 186 | 1984,16.6 187 | 
1985,16.3 188 | 1986,15.5 189 | 1987,16.2 190 | 1988,16.9 191 | 1989,17 192 | 1990,15.7 193 | 1991,15.2 194 | 1992,15.5 195 | 1993,15.5 196 | 1994,15.7 197 | 1995,15.9 198 | 1996,16.2 199 | 1997,16.6 200 | 1998,16.8 201 | 1999,16.9 202 | 2000,17.4 203 | 2001,17 204 | 2002,16.6 205 | 2003,17.5 206 | 2004,17.3 207 | 2005,17.3 208 | 2006,16.7 209 | 2007,16.8 210 | 2008,16.8 211 | 2009,15.9 212 | 2010,15.6 213 | 2011,15.6 214 | 2012,14.8 215 | 2013,14.7 216 | 2014,15.1 217 | -------------------------------------------------------------------------------- /datasets/gdp_japan/gdp_japan.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "gdp_japan", 3 | "longname": "GDP Japan", 4 | "n_obs": 58, 5 | "n_dim": 1, 6 | "time": { 7 | "type": "string", 8 | "format": "%Y", 9 | "index": [ 10 | 0, 11 | 1, 12 | 2, 13 | 3, 14 | 4, 15 | 5, 16 | 6, 17 | 7, 18 | 8, 19 | 9, 20 | 10, 21 | 11, 22 | 12, 23 | 13, 24 | 14, 25 | 15, 26 | 16, 27 | 17, 28 | 18, 29 | 19, 30 | 20, 31 | 21, 32 | 22, 33 | 23, 34 | 24, 35 | 25, 36 | 26, 37 | 27, 38 | 28, 39 | 29, 40 | 30, 41 | 31, 42 | 32, 43 | 33, 44 | 34, 45 | 35, 46 | 36, 47 | 37, 48 | 38, 49 | 39, 50 | 40, 51 | 41, 52 | 42, 53 | 43, 54 | 44, 55 | 45, 56 | 46, 57 | 47, 58 | 48, 59 | 49, 60 | 50, 61 | 51, 62 | 52, 63 | 53, 64 | 54, 65 | 55, 66 | 56, 67 | 57 68 | ], 69 | "raw": [ 70 | "1960", 71 | "1961", 72 | "1962", 73 | "1963", 74 | "1964", 75 | "1965", 76 | "1966", 77 | "1967", 78 | "1968", 79 | "1969", 80 | "1970", 81 | "1971", 82 | "1972", 83 | "1973", 84 | "1974", 85 | "1975", 86 | "1976", 87 | "1977", 88 | "1978", 89 | "1979", 90 | "1980", 91 | "1981", 92 | "1982", 93 | "1983", 94 | "1984", 95 | "1985", 96 | "1986", 97 | "1987", 98 | "1988", 99 | "1989", 100 | "1990", 101 | "1991", 102 | "1992", 103 | "1993", 104 | "1994", 105 | "1995", 106 | "1996", 107 | "1997", 108 | "1998", 109 | "1999", 110 | "2000", 111 | "2001", 112 | "2002", 113 | "2003", 114 | "2004", 115 | "2005", 116 | "2006", 117 | "2007", 118 | "2008", 119 | "2009", 120 | "2010", 121 | "2011", 122 | "2012", 123 | "2013", 124 | "2014", 125 | "2015", 126 | "2016", 127 | "2017" 128 | ] 129 | }, 130 | "series": [ 131 | { 132 | "label": "V1", 133 | "type": "float", 134 | "raw": [ 135 | 15950643462144, 136 | 19263102386176, 137 | 21860286726144, 138 | 25019327447040, 139 | 29429642297344, 140 | 32742100172800, 141 | 38026105323520, 142 | 44561476878336, 143 | 52776386166784, 144 | 61993511813120, 145 | 76539307651500, 146 | 84215883490900, 147 | 96418343539100, 148 | 117397596102100, 149 | 140090360740400, 150 | 154787118329600, 151 | 173827764691400, 152 | 193706278803100, 153 | 213306268936200, 154 | 231195355873400, 155 | 250636100000000, 156 | 268830700000000, 157 | 282582000000000, 158 | 295303900000000, 159 | 313145300000000, 160 | 333686000000000, 161 | 350344800000000, 162 | 366339100000000, 163 | 393641400000000, 164 | 421469400000000, 165 | 453608500000000, 166 | 482845400000000, 167 | 495055800000000, 168 | 495291000000000, 169 | 501537700000000, 170 | 512541700000000, 171 | 525806900000000, 172 | 534142500000000, 173 | 527876900000000, 174 | 519651800000000, 175 | 526706000000000, 176 | 523005000000000, 177 | 515986200000000, 178 | 515400700000000, 179 | 520965400000000, 180 | 524132800000000, 181 | 526879700000000, 182 | 531688200000000, 183 | 520715700000000, 184 | 489501000000000, 185 | 500353900000000, 186 | 491408500000000, 187 | 494957200000000, 188 | 503175600000000, 189 | 513876000000000, 190 | 531985800000000, 191 | 538445800000000, 
192 | 546488800000000 193 | ] 194 | } 195 | ] 196 | } 197 | -------------------------------------------------------------------------------- /datasets/gdp_argentina/gdp_argentina.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "gdp_argentina", 3 | "longname": "GDP Argentina", 4 | "n_obs": 59, 5 | "n_dim": 1, 6 | "time": { 7 | "type": "string", 8 | "format": "%Y", 9 | "index": [ 10 | 0, 11 | 1, 12 | 2, 13 | 3, 14 | 4, 15 | 5, 16 | 6, 17 | 7, 18 | 8, 19 | 9, 20 | 10, 21 | 11, 22 | 12, 23 | 13, 24 | 14, 25 | 15, 26 | 16, 27 | 17, 28 | 18, 29 | 19, 30 | 20, 31 | 21, 32 | 22, 33 | 23, 34 | 24, 35 | 25, 36 | 26, 37 | 27, 38 | 28, 39 | 29, 40 | 30, 41 | 31, 42 | 32, 43 | 33, 44 | 34, 45 | 35, 46 | 36, 47 | 37, 48 | 38, 49 | 39, 50 | 40, 51 | 41, 52 | 42, 53 | 43, 54 | 44, 55 | 45, 56 | 46, 57 | 47, 58 | 48, 59 | 49, 60 | 50, 61 | 51, 62 | 52, 63 | 53, 64 | 54, 65 | 55, 66 | 56, 67 | 57, 68 | 58 69 | ], 70 | "raw": [ 71 | "1960", 72 | "1961", 73 | "1962", 74 | "1963", 75 | "1964", 76 | "1965", 77 | "1966", 78 | "1967", 79 | "1968", 80 | "1969", 81 | "1970", 82 | "1971", 83 | "1972", 84 | "1973", 85 | "1974", 86 | "1975", 87 | "1976", 88 | "1977", 89 | "1978", 90 | "1979", 91 | "1980", 92 | "1981", 93 | "1982", 94 | "1983", 95 | "1984", 96 | "1985", 97 | "1986", 98 | "1987", 99 | "1988", 100 | "1989", 101 | "1990", 102 | "1991", 103 | "1992", 104 | "1993", 105 | "1994", 106 | "1995", 107 | "1996", 108 | "1997", 109 | "1998", 110 | "1999", 111 | "2000", 112 | "2001", 113 | "2002", 114 | "2003", 115 | "2004", 116 | "2005", 117 | "2006", 118 | "2007", 119 | "2008", 120 | "2009", 121 | "2010", 122 | "2011", 123 | "2012", 124 | "2013", 125 | "2014", 126 | "2015", 127 | "2016", 128 | "2017", 129 | "2018" 130 | ] 131 | }, 132 | "series": [ 133 | { 134 | "label": "GDP (constant LCU)", 135 | "type": "float", 136 | "raw": [ 137 | 182932009386.44, 138 | 192861271432.271, 139 | 191218051889.207, 140 | 181067821327.096, 141 | 199410530596.604, 142 | 220487093701.636, 143 | 219032482639.5, 144 | 226023992089.437, 145 | 236924000874.523, 146 | 259857121158.53, 147 | 267771442208.049, 148 | 282922301411.503, 149 | 287529454895.857, 150 | 295614075962.835, 151 | 311972780690.209, 152 | 311884142663.154, 153 | 305589536223.584, 154 | 326779566190.307, 155 | 312054471108.779, 156 | 343955061571.789, 157 | 349178995840.106, 158 | 331057342234.647, 159 | 328621888566.258, 160 | 342913961037.103, 161 | 348300243338.463, 162 | 330226858893.014, 163 | 350546962683.162, 164 | 360028798197.3, 165 | 356104724285.913, 166 | 330618488447.736, 167 | 322461423553.512, 168 | 351912181900.0, 169 | 379844477800.0, 170 | 411018234600.0, 171 | 435006083700.0, 172 | 422629248800.0, 173 | 445986656500.0, 174 | 482160842800.0, 175 | 500724897600.0, 176 | 483773071300.0, 177 | 479956106900.0, 178 | 458795611600.0, 179 | 408812193300.0, 180 | 444939093600.0, 181 | 485115195200.0, 182 | 528055942500.0, 183 | 570549404200.0, 184 | 621942502600.0, 185 | 647176159700.0, 186 | 608872876400.0, 187 | 670523679400.0, 188 | 710781597200.0, 189 | 703485989500.0, 190 | 720407105300.0, 191 | 702306046000.0, 192 | 721487146600.0, 193 | 706477848600.0, 194 | 725330848500.0, 195 | 707091754400.0 196 | ] 197 | } 198 | ] 199 | } -------------------------------------------------------------------------------- /datasets/gdp_iran/gdp_iran.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "gdp_iran", 3 | "longname": "GDP Iran", 4 | "n_obs": 58, 5 | 
"n_dim": 1, 6 | "time": { 7 | "type": "string", 8 | "format": "%Y", 9 | "index": [ 10 | 0, 11 | 1, 12 | 2, 13 | 3, 14 | 4, 15 | 5, 16 | 6, 17 | 7, 18 | 8, 19 | 9, 20 | 10, 21 | 11, 22 | 12, 23 | 13, 24 | 14, 25 | 15, 26 | 16, 27 | 17, 28 | 18, 29 | 19, 30 | 20, 31 | 21, 32 | 22, 33 | 23, 34 | 24, 35 | 25, 36 | 26, 37 | 27, 38 | 28, 39 | 29, 40 | 30, 41 | 31, 42 | 32, 43 | 33, 44 | 34, 45 | 35, 46 | 36, 47 | 37, 48 | 38, 49 | 39, 50 | 40, 51 | 41, 52 | 42, 53 | 43, 54 | 44, 55 | 45, 56 | 46, 57 | 47, 58 | 48, 59 | 49, 60 | 50, 61 | 51, 62 | 52, 63 | 53, 64 | 54, 65 | 55, 66 | 56, 67 | 57 68 | ], 69 | "raw": [ 70 | "1960", 71 | "1961", 72 | "1962", 73 | "1963", 74 | "1964", 75 | "1965", 76 | "1966", 77 | "1967", 78 | "1968", 79 | "1969", 80 | "1970", 81 | "1971", 82 | "1972", 83 | "1973", 84 | "1974", 85 | "1975", 86 | "1976", 87 | "1977", 88 | "1978", 89 | "1979", 90 | "1980", 91 | "1981", 92 | "1982", 93 | "1983", 94 | "1984", 95 | "1985", 96 | "1986", 97 | "1987", 98 | "1988", 99 | "1989", 100 | "1990", 101 | "1991", 102 | "1992", 103 | "1993", 104 | "1994", 105 | "1995", 106 | "1996", 107 | "1997", 108 | "1998", 109 | "1999", 110 | "2000", 111 | "2001", 112 | "2002", 113 | "2003", 114 | "2004", 115 | "2005", 116 | "2006", 117 | "2007", 118 | "2008", 119 | "2009", 120 | "2010", 121 | "2011", 122 | "2012", 123 | "2013", 124 | "2014", 125 | "2015", 126 | "2016", 127 | "2017" 128 | ] 129 | }, 130 | "series": [ 131 | { 132 | "label": "GDP (constant LCU)", 133 | "type": "float", 134 | "raw": [ 135 | 835372833322822.0, 136 | 931253594724320.0, 137 | 1011252310515580.0, 138 | 1084815415588060.0, 139 | 1184552678329550.0, 140 | 1388505087423820.0, 141 | 1544572336125510.0, 142 | 1724250326139720.0, 143 | 1979578637059070.0, 144 | 2302920188960200.0, 145 | 2559064529414210.0, 146 | 2928086546857670.0, 147 | 3350180391061100.0, 148 | 3637066434078650.0, 149 | 3873361335537510.0, 150 | 3779651859177620.0, 151 | 4431023105415290.0, 152 | 4250906946640460.0, 153 | 3653562255307690.0, 154 | 3269427305222360.0, 155 | 2369469274184260.0, 156 | 2246824634803560.0, 157 | 2858194886307970.0, 158 | 3102928534616680.0, 159 | 2830330279459450.0, 160 | 2891245015552970.0, 161 | 2596157639741250.0, 162 | 2612994168295680.0, 163 | 2471991485967540.0, 164 | 2620826986239520.0, 165 | 2983386426044710.0, 166 | 3352830570382000.0, 167 | 3447874188131500.0, 168 | 3483732006730800.0, 169 | 3431072340489400.0, 170 | 3509392093199400.0, 171 | 3690942894824200.0, 172 | 3708734969540800.0, 173 | 3789504657647000.0, 174 | 3821922301414700.0, 175 | 4045813055230300.0, 176 | 4077315322748900.0, 177 | 4373554752266400.0, 178 | 4755566584910500.0, 179 | 4963599286320800.0, 180 | 5121928379232500.0, 181 | 5378014311650700.0, 182 | 5816632978975000.0, 183 | 5831224383995400.0, 184 | 5889967290457600.0, 185 | 6231463959948300.0, 186 | 6396330918494400.0, 187 | 5920152415454900.0, 188 | 5908662970172800.0, 189 | 6180663476890700.0, 190 | 6099038846316000.0, 191 | 6916081000000000.0, 192 | 7175792800000000.0 193 | ] 194 | } 195 | ] 196 | } -------------------------------------------------------------------------------- /schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "definitions": {}, 3 | "$schema": "http://json-schema.org/draft-07/schema#", 4 | "$id": "http://example.com/root.json", 5 | "type": "object", 6 | "title": "Dataset Schema", 7 | "default": null, 8 | "required": [ 9 | "name", 10 | "n_obs", 11 | "n_dim", 12 | "time", 13 | "series" 14 | ], 15 | "properties": { 16 | 
"name": { 17 | "$id": "#/properties/name", 18 | "type": "string", 19 | "title": "The Name Schema", 20 | "default": "", 21 | "pattern": "^[a-z0-9\\_]+$" 22 | }, 23 | "longname": { 24 | "$id": "#/properties/longname", 25 | "type": "string", 26 | "title": "The Longname Schema", 27 | "default": "", 28 | "pattern": "^(.+)$" 29 | }, 30 | "n_obs": { 31 | "$id": "#/properties/n_obs", 32 | "type": "integer", 33 | "title": "The N_obs Schema", 34 | "default": 0 35 | }, 36 | "n_dim": { 37 | "$id": "#/properties/n_dim", 38 | "type": "integer", 39 | "title": "The N_dim Schema", 40 | "default": 0 41 | }, 42 | "demo": { 43 | "$id": "#/properties/demo", 44 | "type": "object", 45 | "title": "The Demo Schema", 46 | "properties": { 47 | "true_CPs": { 48 | "$id": "#/properties/demo/properties/true_CPs", 49 | "type": "array", 50 | "items": { 51 | "$id": "#/properties/demo/properties/true_CPs/items", 52 | "type": "integer", 53 | "title": "The Items Schema", 54 | "default": null 55 | } 56 | } 57 | } 58 | }, 59 | "time": { 60 | "$id": "#/properties/time", 61 | "type": "object", 62 | "title": "The Time Schema", 63 | "default": null, 64 | "required": [ 65 | "index" 66 | ], 67 | "properties": { 68 | "format": { 69 | "$id": "#/properties/time/properties/format", 70 | "type": "string", 71 | "title": "The Format Schema", 72 | "default": "", 73 | "pattern": "^(.*)$" 74 | }, 75 | "index": { 76 | "$id": "#/properties/time/properties/index", 77 | "type": "array", 78 | "title": "Integer index of the series, starting from 0.", 79 | "items": { 80 | "$id": "#/properties/time/properties/index/items", 81 | "type": "integer", 82 | "title": "The index items schema", 83 | "default": null 84 | } 85 | }, 86 | "raw": { 87 | "$id": "#/properties/time/properties/raw", 88 | "type": "array", 89 | "title": "The Raw Schema", 90 | "items": { 91 | "$id": "#/properties/time/properties/raw/items", 92 | "type": "string", 93 | "title": "The Items Schema", 94 | "default": "" 95 | } 96 | } 97 | } 98 | }, 99 | "series": { 100 | "$id": "#/properties/series", 101 | "type": "array", 102 | "title": "The Series Schema", 103 | "items": { 104 | "$id": "#/properties/series/items", 105 | "type": "object", 106 | "title": "The Variable Schema", 107 | "default": null, 108 | "properties": { 109 | "label": { 110 | "$id": "#/properties/series/items/properties/label", 111 | "type": "string", 112 | "title": "The Label Schema", 113 | "default": "", 114 | "pattern": "^(.+)$" 115 | }, 116 | "type": { 117 | "$id": "#/properties/series/items/properties/type", 118 | "type": "string", 119 | "title": "The Type Schema", 120 | "default": "", 121 | "pattern": "^(.+)$" 122 | }, 123 | "raw": { 124 | "$id": "#/properties/series/items/properties/raw", 125 | "type": "array", 126 | "title": "The Raw Schema", 127 | "items": { 128 | "$id": "#/properties/series/items/properties/raw/items", 129 | "title": "The Items Schema", 130 | "default": 0 131 | } 132 | } 133 | }, 134 | "required": [ 135 | "type", 136 | "raw" 137 | ] 138 | } 139 | } 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /datasets/ozone/ozone-depleting-substance-emissions.csv: -------------------------------------------------------------------------------- 1 | Entity,Code,Year,Ozone-depleting substance emissions (Scientific Assessment 2014) (tonnes CFC11-equivalents) 2 | Natural emissions,,1961,165000 3 | Natural emissions,,1962,165000 4 | Natural emissions,,1963,165000 5 | Natural emissions,,1964,165000 6 | Natural emissions,,1965,165000 7 | Natural 
emissions,,1966,165000 8 | Natural emissions,,1967,165000 9 | Natural emissions,,1968,165000 10 | Natural emissions,,1969,165000 11 | Natural emissions,,1970,165000 12 | Natural emissions,,1971,165000 13 | Natural emissions,,1972,165000 14 | Natural emissions,,1973,165000 15 | Natural emissions,,1974,165000 16 | Natural emissions,,1975,165000 17 | Natural emissions,,1976,165000 18 | Natural emissions,,1977,165000 19 | Natural emissions,,1978,165000 20 | Natural emissions,,1979,165000 21 | Natural emissions,,1980,165000 22 | Natural emissions,,1981,165000 23 | Natural emissions,,1982,165000 24 | Natural emissions,,1983,165000 25 | Natural emissions,,1984,165000 26 | Natural emissions,,1985,165000 27 | Natural emissions,,1986,165000 28 | Natural emissions,,1987,165000 29 | Natural emissions,,1988,165000 30 | Natural emissions,,1989,165000 31 | Natural emissions,,1990,165000 32 | Natural emissions,,1991,165000 33 | Natural emissions,,1992,165000 34 | Natural emissions,,1993,165000 35 | Natural emissions,,1994,165000 36 | Natural emissions,,1995,165000 37 | Natural emissions,,1996,165000 38 | Natural emissions,,1997,165000 39 | Natural emissions,,1998,165000 40 | Natural emissions,,1999,165000 41 | Natural emissions,,2000,165000 42 | Natural emissions,,2001,165000 43 | Natural emissions,,2002,165000 44 | Natural emissions,,2003,165000 45 | Natural emissions,,2004,165000 46 | Natural emissions,,2005,165000 47 | Natural emissions,,2006,165000 48 | Natural emissions,,2007,165000 49 | Natural emissions,,2008,165000 50 | Natural emissions,,2009,165000 51 | Natural emissions,,2010,165000 52 | Natural emissions,,2011,165000 53 | Natural emissions,,2012,165000 54 | Natural emissions,,2013,165000 55 | Natural emissions,,2014,165000 56 | Total emissions,,1961,380000 57 | Total emissions,,1962,400000 58 | Total emissions,,1963,440000 59 | Total emissions,,1964,480000 60 | Total emissions,,1965,510000 61 | Total emissions,,1966,540000 62 | Total emissions,,1967,580000 63 | Total emissions,,1968,630000 64 | Total emissions,,1969,660000 65 | Total emissions,,1970,720000 66 | Total emissions,,1971,770000 67 | Total emissions,,1972,840000 68 | Total emissions,,1973,910000 69 | Total emissions,,1974,980000 70 | Total emissions,,1975,1040000 71 | Total emissions,,1976,1050000 72 | Total emissions,,1977,1070000 73 | Total emissions,,1978,1070000 74 | Total emissions,,1979,1110000 75 | Total emissions,,1980,1080000 76 | Total emissions,,1981,1040000 77 | Total emissions,,1982,1100000 78 | Total emissions,,1983,1090000 79 | Total emissions,,1984,1150000 80 | Total emissions,,1985,1180000 81 | Total emissions,,1986,1280000 82 | Total emissions,,1987,1360000 83 | Total emissions,,1988,1460000 84 | Total emissions,,1989,1410000 85 | Total emissions,,1990,1320000 86 | Total emissions,,1991,1190000 87 | Total emissions,,1992,1080000 88 | Total emissions,,1993,960000 89 | Total emissions,,1994,820000 90 | Total emissions,,1995,760000 91 | Total emissions,,1996,700000 92 | Total emissions,,1997,640000 93 | Total emissions,,1998,600000 94 | Total emissions,,1999,590000 95 | Total emissions,,2000,560000 96 | Total emissions,,2001,530000 97 | Total emissions,,2002,490000 98 | Total emissions,,2003,480000 99 | Total emissions,,2004,470000 100 | Total emissions,,2005,450000 101 | Total emissions,,2006,430000 102 | Total emissions,,2007,410000 103 | Total emissions,,2008,400000 104 | Total emissions,,2009,390000 105 | Total emissions,,2010,380000 106 | Total emissions,,2011,370000 107 | Total emissions,,2012,350000 108 | Total 
emissions,,2013,340000 109 | Total emissions,,2014,320000 -------------------------------------------------------------------------------- /utils/plot_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Utility script to plot datasets and annotations. 6 | 7 | Author: G.J.J. van den Burg 8 | Copyright (c) 2020 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | import argparse 14 | import datetime 15 | import json 16 | import matplotlib.pyplot as plt 17 | import pandas as pd 18 | 19 | 20 | def parse_args(): 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument( 23 | "-r", 24 | "--result-file", 25 | help="JSON file with results from a change point detection method", 26 | ) 27 | parser.add_argument( 28 | "-o", "--output-file", help="Output file to save the figure to" 29 | ) 30 | parser.add_argument("input", help="Input dataset file (in JSON format)") 31 | return parser.parse_args() 32 | 33 | 34 | def frac_to_dt(number): 35 | number = float(number) 36 | year = int(float(number)) 37 | remainder = number - year 38 | begin = datetime.datetime(year, 1, 1) 39 | end = datetime.datetime(year + 1, 1, 1) 40 | seconds = remainder * (end - begin).total_seconds() 41 | return begin + datetime.timedelta(seconds=seconds) 42 | 43 | 44 | def load_data(filename): 45 | with open(filename, "rb") as fid: 46 | data = json.load(fid) 47 | title = data["name"] 48 | y = data["series"][0]["raw"] 49 | if "time" in data and "format" in data["time"]: 50 | fmt = data["time"]["format"] 51 | if fmt == "%Y.%F": 52 | x = list(map(frac_to_dt, data["time"]["raw"])) 53 | else: 54 | try: 55 | x = pd.to_datetime( 56 | data["time"]["raw"], format=data["time"]["format"] 57 | ) 58 | except ValueError: 59 | x = list(range(1, len(y) + 1)) 60 | else: 61 | x = list(range(1, len(y) + 1)) 62 | as_dict = {"x": x} 63 | for idx, series in enumerate(data["series"]): 64 | as_dict["y" + str(idx)] = series["raw"] 65 | 66 | df = pd.DataFrame(as_dict) 67 | return df, title 68 | 69 | 70 | def load_result(filename): 71 | with open(filename, "r") as fp: 72 | data = json.load(fp) 73 | if not data["status"] == "SUCCESS": 74 | print("Detection wasn't successful.") 75 | return None 76 | return data["result"]["cplocations"] 77 | 78 | 79 | def main(): 80 | args = parse_args() 81 | df, title = load_data(args.input) 82 | 83 | results = None 84 | if args.result_file: 85 | results = load_result(args.result_file) 86 | 87 | has_date = False 88 | try: 89 | _ = df["x"].dt 90 | has_date = True 91 | except AttributeError: 92 | pass 93 | 94 | fig, axes = plt.subplots(df.shape[1] - 1, 1, squeeze=False) 95 | for idx, col in enumerate(df.columns[1:]): 96 | if has_date: 97 | axes[idx, 0].plot_date(df["x"], df[col], ".", color="tab:blue") 98 | axes[idx, 0].plot_date(df["x"], df[col], "-", color="tab:blue") 99 | if results: 100 | for loc in results: 101 | if loc == 0: 102 | continue 103 | if loc == df.shape[0]: 104 | continue 105 | pos = df["x"].values[loc] 106 | axes[idx, 0].axvline(x=pos, linestyle="--", color="red") 107 | else: 108 | axes[idx, 0].scatter(df["x"], df[col], color="tab:blue") 109 | axes[idx, 0].plot(df["x"], df[col], color="tab:blue") 110 | if results: 111 | for loc in results: 112 | if loc == 0: 113 | continue 114 | if loc == df.shape[0]: 115 | continue 116 | pos = df["x"].values[loc] 117 | axes[idx, 0].axvline(x=pos, linestyle="--", color="red") 118 | fig.suptitle(title) 119 | if args.output_file: 120 | 
plt.savefig(args.output_file, transparent=True) 121 | else: 122 | plt.show() 123 | 124 | 125 | if __name__ == "__main__": 126 | main() 127 | -------------------------------------------------------------------------------- /datasets/nile/nile.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "nile", 3 | "longname": "Nile Volume at Aswan", 4 | "n_obs": 100, 5 | "n_dim": 1, 6 | "time": { 7 | "type": "string", 8 | "format": "%Y", 9 | "index": [ 10 | 0, 11 | 1, 12 | 2, 13 | 3, 14 | 4, 15 | 5, 16 | 6, 17 | 7, 18 | 8, 19 | 9, 20 | 10, 21 | 11, 22 | 12, 23 | 13, 24 | 14, 25 | 15, 26 | 16, 27 | 17, 28 | 18, 29 | 19, 30 | 20, 31 | 21, 32 | 22, 33 | 23, 34 | 24, 35 | 25, 36 | 26, 37 | 27, 38 | 28, 39 | 29, 40 | 30, 41 | 31, 42 | 32, 43 | 33, 44 | 34, 45 | 35, 46 | 36, 47 | 37, 48 | 38, 49 | 39, 50 | 40, 51 | 41, 52 | 42, 53 | 43, 54 | 44, 55 | 45, 56 | 46, 57 | 47, 58 | 48, 59 | 49, 60 | 50, 61 | 51, 62 | 52, 63 | 53, 64 | 54, 65 | 55, 66 | 56, 67 | 57, 68 | 58, 69 | 59, 70 | 60, 71 | 61, 72 | 62, 73 | 63, 74 | 64, 75 | 65, 76 | 66, 77 | 67, 78 | 68, 79 | 69, 80 | 70, 81 | 71, 82 | 72, 83 | 73, 84 | 74, 85 | 75, 86 | 76, 87 | 77, 88 | 78, 89 | 79, 90 | 80, 91 | 81, 92 | 82, 93 | 83, 94 | 84, 95 | 85, 96 | 86, 97 | 87, 98 | 88, 99 | 89, 100 | 90, 101 | 91, 102 | 92, 103 | 93, 104 | 94, 105 | 95, 106 | 96, 107 | 97, 108 | 98, 109 | 99 110 | ], 111 | "raw": [ 112 | "1871", 113 | "1872", 114 | "1873", 115 | "1874", 116 | "1875", 117 | "1876", 118 | "1877", 119 | "1878", 120 | "1879", 121 | "1880", 122 | "1881", 123 | "1882", 124 | "1883", 125 | "1884", 126 | "1885", 127 | "1886", 128 | "1887", 129 | "1888", 130 | "1889", 131 | "1890", 132 | "1891", 133 | "1892", 134 | "1893", 135 | "1894", 136 | "1895", 137 | "1896", 138 | "1897", 139 | "1898", 140 | "1899", 141 | "1900", 142 | "1901", 143 | "1902", 144 | "1903", 145 | "1904", 146 | "1905", 147 | "1906", 148 | "1907", 149 | "1908", 150 | "1909", 151 | "1910", 152 | "1911", 153 | "1912", 154 | "1913", 155 | "1914", 156 | "1915", 157 | "1916", 158 | "1917", 159 | "1918", 160 | "1919", 161 | "1920", 162 | "1921", 163 | "1922", 164 | "1923", 165 | "1924", 166 | "1925", 167 | "1926", 168 | "1927", 169 | "1928", 170 | "1929", 171 | "1930", 172 | "1931", 173 | "1932", 174 | "1933", 175 | "1934", 176 | "1935", 177 | "1936", 178 | "1937", 179 | "1938", 180 | "1939", 181 | "1940", 182 | "1941", 183 | "1942", 184 | "1943", 185 | "1944", 186 | "1945", 187 | "1946", 188 | "1947", 189 | "1948", 190 | "1949", 191 | "1950", 192 | "1951", 193 | "1952", 194 | "1953", 195 | "1954", 196 | "1955", 197 | "1956", 198 | "1957", 199 | "1958", 200 | "1959", 201 | "1960", 202 | "1961", 203 | "1962", 204 | "1963", 205 | "1964", 206 | "1965", 207 | "1966", 208 | "1967", 209 | "1968", 210 | "1969", 211 | "1970" 212 | ] 213 | }, 214 | "series": [ 215 | { 216 | "label": "Volume at Aswan", 217 | "type": "int", 218 | "raw": [ 219 | 1120, 220 | 1160, 221 | 963, 222 | 1210, 223 | 1160, 224 | 1160, 225 | 813, 226 | 1230, 227 | 1370, 228 | 1140, 229 | 995, 230 | 935, 231 | 1110, 232 | 994, 233 | 1020, 234 | 960, 235 | 1180, 236 | 799, 237 | 958, 238 | 1140, 239 | 1100, 240 | 1210, 241 | 1150, 242 | 1250, 243 | 1260, 244 | 1220, 245 | 1030, 246 | 1100, 247 | 774, 248 | 840, 249 | 874, 250 | 694, 251 | 940, 252 | 833, 253 | 701, 254 | 916, 255 | 692, 256 | 1020, 257 | 1050, 258 | 969, 259 | 831, 260 | 726, 261 | 456, 262 | 824, 263 | 702, 264 | 1120, 265 | 1100, 266 | 832, 267 | 764, 268 | 821, 269 | 768, 270 | 845, 271 | 864, 272 | 862, 
273 | 698, 274 | 845, 275 | 744, 276 | 796, 277 | 1040, 278 | 759, 279 | 781, 280 | 865, 281 | 845, 282 | 944, 283 | 984, 284 | 897, 285 | 822, 286 | 1010, 287 | 771, 288 | 676, 289 | 649, 290 | 846, 291 | 812, 292 | 742, 293 | 801, 294 | 1040, 295 | 860, 296 | 874, 297 | 848, 298 | 890, 299 | 744, 300 | 749, 301 | 838, 302 | 1050, 303 | 918, 304 | 986, 305 | 797, 306 | 923, 307 | 975, 308 | 815, 309 | 1020, 310 | 906, 311 | 901, 312 | 1170, 313 | 912, 314 | 746, 315 | 919, 316 | 718, 317 | 714, 318 | 740 319 | ] 320 | } 321 | ] 322 | } 323 | -------------------------------------------------------------------------------- /datasets/scanline_42049/get_scanline_42049.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Collect the scanline_42049 dataset. 6 | 7 | See the README file for more information. 8 | 9 | Author: Gertjan van den Burg 10 | License: This file is part of TCPD, see the top-level LICENSE file. 11 | Copyright: 2019, The Alan Turing Institute 12 | 13 | """ 14 | 15 | import argparse 16 | import hashlib 17 | import os 18 | import numpy as np 19 | import json 20 | import sys 21 | import time 22 | 23 | from PIL import Image 24 | from functools import wraps 25 | from urllib.request import urlretrieve 26 | from urllib.error import URLError 27 | 28 | IMG_URL = "https://web.archive.org/web/20070611230044im_/http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/segbench/BSDS300/html/images/plain/normal/gray/42049.jpg" 29 | 30 | MD5_IMG = "75a3d395b4f3f506abb9edadacaa4d55" 31 | MD5_JSON = "39921dfa959576bd0b3d6c95558f17f4" 32 | 33 | NAME_IMG = "42049.jpg" 34 | NAME_JSON = "scanline_42049.json" 35 | 36 | 37 | class ValidationError(Exception): 38 | def __init__(self, filename): 39 | super().__init__( 40 | "Validating the file '%s' failed. \n" 41 | "Please raise an issue on the GitHub page for this project \n" 42 | "if the error persists." % filename 43 | ) 44 | 45 | 46 | def check_md5sum(filename, checksum): 47 | with open(filename, "rb") as fp: 48 | data = fp.read() 49 | h = hashlib.md5(data).hexdigest() 50 | return h == checksum 51 | 52 | 53 | def validate(checksum): 54 | """Decorator that validates the target file.""" 55 | 56 | def validate_decorator(func): 57 | @wraps(func) 58 | def wrapper(*args, **kwargs): 59 | target = kwargs.get("target_path", None) 60 | if os.path.exists(target) and check_md5sum(target, checksum): 61 | return 62 | out = func(*args, **kwargs) 63 | if not os.path.exists(target): 64 | raise FileNotFoundError("Target file expected at: %s" % target) 65 | if not check_md5sum(target, checksum): 66 | raise ValidationError(target) 67 | return out 68 | 69 | return wrapper 70 | 71 | return validate_decorator 72 | 73 | 74 | @validate(MD5_IMG) 75 | def download_img(target_path=None): 76 | count = 0 77 | while count < 5: 78 | count += 1 79 | try: 80 | urlretrieve(IMG_URL, target_path) 81 | return 82 | except URLError as err: 83 | print( 84 | "Error occurred (%r) when trying to download img. 
Retrying in 5 seconds" 85 | % err, 86 | sys.stderr, 87 | ) 88 | time.sleep(5) 89 | 90 | 91 | @validate(MD5_JSON) 92 | def write_json(img_path, target_path=None): 93 | name = "scanline_42049" 94 | longname = "Scanline 42049" 95 | index = 170 96 | 97 | im = Image.open(img_path) 98 | arr = np.array(im) 99 | line = list(map(int, list(arr[index, :]))) 100 | 101 | series = [{"label": "Line %s" % index, "type": "int", "raw": line}] 102 | 103 | data = { 104 | "name": name, 105 | "longname": longname, 106 | "n_obs": len(line), 107 | "n_dim": len(series), 108 | "time": {"index": list(range(len(line)))}, 109 | "series": series, 110 | } 111 | 112 | with open(target_path, "w") as fp: 113 | json.dump(data, fp, indent="\t") 114 | 115 | 116 | def collect(output_dir="."): 117 | img_path = os.path.join(output_dir, NAME_IMG) 118 | json_path = os.path.join(output_dir, NAME_JSON) 119 | 120 | download_img(target_path=img_path) 121 | write_json(img_path, target_path=json_path) 122 | 123 | 124 | def clean(output_dir="."): 125 | img_path = os.path.join(output_dir, NAME_IMG) 126 | json_path = os.path.join(output_dir, NAME_JSON) 127 | 128 | if os.path.exists(img_path): 129 | os.unlink(img_path) 130 | if os.path.exists(json_path): 131 | os.unlink(json_path) 132 | 133 | 134 | def parse_args(): 135 | parser = argparse.ArgumentParser() 136 | parser.add_argument( 137 | "-o", "--output-dir", help="output directory to use", default="." 138 | ) 139 | parser.add_argument( 140 | "action", 141 | choices=["collect", "clean"], 142 | help="Action to perform", 143 | default="collect", 144 | nargs="?", 145 | ) 146 | return parser.parse_args() 147 | 148 | 149 | def main(output_dir="."): 150 | args = parse_args() 151 | if args.action == "collect": 152 | collect(output_dir=args.output_dir) 153 | elif args.action == "clean": 154 | clean(output_dir=args.output_dir) 155 | 156 | 157 | if __name__ == "__main__": 158 | main() 159 | -------------------------------------------------------------------------------- /datasets/scanline_126007/get_scanline_126007.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Collect the scanline_126007 dataset. 6 | 7 | See the README file for more information. 8 | 9 | Author: Gertjan van den Burg 10 | License: This file is part of TCPD, see the top-level LICENSE file. 11 | Copyright: 2019, The Alan Turing Institute 12 | 13 | """ 14 | 15 | import argparse 16 | import hashlib 17 | import os 18 | import numpy as np 19 | import json 20 | import sys 21 | import time 22 | 23 | from PIL import Image 24 | from functools import wraps 25 | from urllib.request import urlretrieve 26 | from urllib.error import URLError 27 | 28 | IMG_URL = "https://web.archive.org/web/20070611200633im_/http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/segbench/BSDS300/html/images/plain/normal/gray/126007.jpg" 29 | 30 | MD5_IMG = "0ca6db4848b6d319d94a37e697930fb4" 31 | MD5_JSON = "057d5741b623308af00c42e2c8e525c3" 32 | 33 | NAME_IMG = "126007.jpg" 34 | NAME_JSON = "scanline_126007.json" 35 | 36 | 37 | class ValidationError(Exception): 38 | def __init__(self, filename): 39 | self.message = ( 40 | "Validating the file '%s' failed. \n" 41 | "Please raise an issue on the GitHub page for this project \n" 42 | "if the error persists." 
% filename 43 | ) 44 | 45 | 46 | def check_md5sum(filename, checksum): 47 | with open(filename, "rb") as fp: 48 | data = fp.read() 49 | h = hashlib.md5(data).hexdigest() 50 | return h == checksum 51 | 52 | 53 | def validate(checksum): 54 | """Decorator that validates the target file.""" 55 | 56 | def validate_decorator(func): 57 | @wraps(func) 58 | def wrapper(*args, **kwargs): 59 | target = kwargs.get("target_path", None) 60 | if os.path.exists(target) and check_md5sum(target, checksum): 61 | return 62 | out = func(*args, **kwargs) 63 | if not os.path.exists(target): 64 | raise FileNotFoundError("Target file expected at: %s" % target) 65 | if not check_md5sum(target, checksum): 66 | raise ValidationError(target) 67 | return out 68 | 69 | return wrapper 70 | 71 | return validate_decorator 72 | 73 | 74 | @validate(MD5_IMG) 75 | def download_img(target_path=None): 76 | count = 0 77 | while count < 5: 78 | count += 1 79 | try: 80 | urlretrieve(IMG_URL, target_path) 81 | return 82 | except URLError as err: 83 | print( 84 | "Error occurred (%r) when trying to download img. Retrying in 5 seconds" 85 | % err, 86 | file=sys.stderr, 87 | ) 88 | time.sleep(5) 89 | 90 | 91 | 92 | @validate(MD5_JSON) 93 | def write_json(img_path, target_path=None): 94 | name = "scanline_126007" 95 | longname = "Scanline 126007" 96 | index = 200 97 | 98 | im = Image.open(img_path) 99 | arr = np.array(im) 100 | line = list(map(int, list(arr[index, :]))) 101 | 102 | series = [{"label": "Line %s" % index, "type": "int", "raw": line}] 103 | 104 | data = { 105 | "name": name, 106 | "longname": longname, 107 | "n_obs": len(line), 108 | "n_dim": len(series), 109 | "time": {"index": list(range(len(line)))}, 110 | "series": series, 111 | } 112 | 113 | with open(target_path, "w") as fp: 114 | json.dump(data, fp, indent="\t") 115 | 116 | 117 | def collect(output_dir="."): 118 | img_path = os.path.join(output_dir, NAME_IMG) 119 | json_path = os.path.join(output_dir, NAME_JSON) 120 | 121 | download_img(target_path=img_path) 122 | write_json(img_path, target_path=json_path) 123 | 124 | 125 | def clean(output_dir="."): 126 | img_path = os.path.join(output_dir, NAME_IMG) 127 | json_path = os.path.join(output_dir, NAME_JSON) 128 | 129 | if os.path.exists(img_path): 130 | os.unlink(img_path) 131 | if os.path.exists(json_path): 132 | os.unlink(json_path) 133 | 134 | 135 | def parse_args(): 136 | parser = argparse.ArgumentParser() 137 | parser.add_argument( 138 | "-o", "--output-dir", help="output directory to use", default="." 139 | ) 140 | parser.add_argument( 141 | "action", 142 | choices=["collect", "clean"], 143 | help="Action to perform", 144 | default="collect", 145 | nargs="?", 146 | ) 147 | return parser.parse_args() 148 | 149 | 150 | def main(output_dir="."): 151 | args = parse_args() 152 | if args.action == "collect": 153 | collect(output_dir=args.output_dir) 154 | elif args.action == "clean": 155 | clean(output_dir=args.output_dir) 156 | 157 | 158 | if __name__ == "__main__": 159 | main() 160 | -------------------------------------------------------------------------------- /datasets/measles/get_measles.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Collect the measles dataset. 6 | 7 | See the README file for more information. 8 | 9 | Author: G.J.J. van den Burg 10 | License: This file is part of TCPD, see the top-level LICENSE file. 
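Note: the time values from the source file are written out as-is, with time format "%Y-%F" (year with fractional part); see write_json below.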
11 | Copyright: 2019, The Alan Turing Institute 12 | 13 | """ 14 | 15 | import argparse 16 | import clevercsv 17 | import hashlib 18 | import json 19 | import os 20 | import sys 21 | import time 22 | 23 | from functools import wraps 24 | from urllib.request import urlretrieve 25 | from urllib.error import URLError 26 | 27 | DAT_URL = "https://web.archive.org/web/20191128124615if_/https://ms.mcmaster.ca/~bolker/measdata/ewmeas.dat" 28 | 29 | MD5_DAT = "143d1dacd791df963674468c8b005bf9" 30 | MD5_JSON = "e42afd03be893fc7deb98514c94fa4c7" 31 | 32 | NAME_DAT = "ewmeas.dat" 33 | NAME_JSON = "measles.json" 34 | 35 | 36 | class ValidationError(Exception): 37 | def __init__(self, filename): 38 | message = ( 39 | "Validating the file '%s' failed. \n" 40 | "Please raise an issue on the GitHub page for this project " 41 | "if the error persists." % filename 42 | ) 43 | super().__init__(message) 44 | 45 | 46 | def check_md5sum(filename, checksum): 47 | with open(filename, "rb") as fp: 48 | data = fp.read() 49 | h = hashlib.md5(data).hexdigest() 50 | return h == checksum 51 | 52 | 53 | def validate(checksum): 54 | """Decorator that validates the target file.""" 55 | 56 | def validate_decorator(func): 57 | @wraps(func) 58 | def wrapper(*args, **kwargs): 59 | target = kwargs.get("target_path", None) 60 | if os.path.exists(target) and check_md5sum(target, checksum): 61 | return 62 | out = func(*args, **kwargs) 63 | if not os.path.exists(target): 64 | raise FileNotFoundError("Target file expected at: %s" % target) 65 | if not check_md5sum(target, checksum): 66 | raise ValidationError(target) 67 | return out 68 | 69 | return wrapper 70 | 71 | return validate_decorator 72 | 73 | 74 | @validate(MD5_DAT) 75 | def download_zip(target_path=None): 76 | count = 0 77 | while count < 5: 78 | count += 1 79 | try: 80 | urlretrieve(DAT_URL, target_path) 81 | return 82 | except URLError as err: 83 | print( 84 | "Error occurred (%r) when trying to download zip. 
Retrying in 5 seconds" 85 | % err, 86 | file=sys.stderr, 87 | ) 88 | time.sleep(5) 89 | 90 | 91 | 92 | @validate(MD5_JSON) 93 | def write_json(dat_path, target_path=None): 94 | with open(dat_path, "r", newline="", encoding="ascii") as fp: 95 | reader = clevercsv.reader( 96 | fp, delimiter=" ", quotechar="", escapechar="" 97 | ) 98 | rows = list(reader) 99 | 100 | as_dicts = {t: int(x) for t, x in rows}  # rows are (time, count) pairs 101 | 102 | time = sorted(as_dicts.keys()) 103 | values = [as_dicts[t] for t in time] 104 | series = [{"label": "V1", "type": "int", "raw": values}] 105 | 106 | data = { 107 | "name": "measles", 108 | "longname": "Measles cases (England & Wales)", 109 | "n_obs": len(time), 110 | "n_dim": len(series), 111 | "time": { 112 | "type": "string", 113 | "format": "%Y-%F", 114 | "index": list(range(len(time))), 115 | "raw": time, 116 | }, 117 | "series": series, 118 | } 119 | 120 | with open(target_path, "w") as fp: 121 | json.dump(data, fp, indent="\t") 122 | 123 | 124 | def collect(output_dir="."): 125 | dat_path = os.path.join(output_dir, NAME_DAT) 126 | json_path = os.path.join(output_dir, NAME_JSON) 127 | 128 | download_zip(target_path=dat_path) 129 | write_json(dat_path, target_path=json_path) 130 | 131 | 132 | def clean(output_dir="."): 133 | dat_path = os.path.join(output_dir, NAME_DAT) 134 | json_path = os.path.join(output_dir, NAME_JSON) 135 | 136 | if os.path.exists(dat_path): 137 | os.unlink(dat_path) 138 | if os.path.exists(json_path): 139 | os.unlink(json_path) 140 | 141 | 142 | def parse_args(): 143 | parser = argparse.ArgumentParser() 144 | parser.add_argument( 145 | "-o", "--output-dir", help="output directory to use", default="." 146 | ) 147 | parser.add_argument( 148 | "action", 149 | choices=["collect", "clean"], 150 | help="Action to perform", 151 | default="collect", 152 | nargs="?", 153 | ) 154 | return parser.parse_args() 155 | 156 | 157 | def main(output_dir="."): 158 | args = parse_args() 159 | if args.action == "collect": 160 | collect(output_dir=args.output_dir) 161 | elif args.action == "clean": 162 | clean(output_dir=args.output_dir) 163 | 164 | 165 | if __name__ == "__main__": 166 | main() 167 |
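Each of these get_*.py collection scripts is also importable, which is useful because main() takes an output_dir argument that it never uses (it re-parses sys.argv); programmatic callers should call collect() and clean() directly. A minimal sketch, assuming the working directory is datasets/measles/ so that get_measles.py is importable:

# Sketch only: drive the collection pipeline from Python instead of the CLI.
import get_measles

get_measles.collect(output_dir=".")  # no-op when both MD5 checksums already match
get_measles.clean(output_dir=".")    # removes ewmeas.dat and measles.json again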
-------------------------------------------------------------------------------- /datasets/bitcoin/get_bitcoin.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Retrieve the bitcoin dataset. 6 | 7 | See the README file for more information. 8 | 9 | Author: G.J.J. van den Burg 10 | License: This file is part of TCPD, see the top-level LICENSE file. 11 | Copyright: 2019, The Alan Turing Institute 12 | 13 | """ 14 | 15 | import argparse 16 | import clevercsv 17 | import hashlib 18 | import json 19 | import os 20 | import sys 21 | import time 22 | 23 | from functools import wraps 24 | from urllib.request import urlretrieve 25 | from urllib.error import URLError 26 | 27 | CSV_URL = "https://web.archive.org/web/20191114131838if_/https://api.blockchain.info/charts/market-price?timespan=all&format=csv" 28 | 29 | MD5_CSV = "9bd4f7b06d78347415f6aafe1d9eb680" 30 | MD5_JSON = "f90ff14ed1fc0c3d47d4394d25cbce93" 31 | 32 | NAME_CSV = "market-price.csv" 33 | NAME_JSON = "bitcoin.json" 34 | 35 | 36 | class ValidationError(Exception): 37 | def __init__(self, filename): 38 | message = ( 39 | "Validating the file '%s' failed. \n" 40 | "Please raise an issue on the GitHub page for this project " 41 | "if the error persists." % filename 42 | ) 43 | super().__init__(message) 44 | 45 | 46 | def check_md5sum(filename, checksum): 47 | with open(filename, "rb") as fp: 48 | data = fp.read() 49 | h = hashlib.md5(data).hexdigest() 50 | return h == checksum 51 | 52 | 53 | def validate(checksum): 54 | """Decorator that validates the target file.""" 55 | 56 | def validate_decorator(func): 57 | @wraps(func) 58 | def wrapper(*args, **kwargs): 59 | target = kwargs.get("target_path", None) 60 | if os.path.exists(target) and check_md5sum(target, checksum): 61 | return 62 | out = func(*args, **kwargs) 63 | if not os.path.exists(target): 64 | raise FileNotFoundError("Target file expected at: %s" % target) 65 | if not check_md5sum(target, checksum): 66 | raise ValidationError(target) 67 | return out 68 | 69 | return wrapper 70 | 71 | return validate_decorator 72 | 73 | 74 | @validate(MD5_CSV) 75 | def get_market_price(target_path=None): 76 | count = 0 77 | while count < 5: 78 | count += 1 79 | try: 80 | urlretrieve(CSV_URL, target_path) 81 | return 82 | except URLError as err: 83 | print( 84 | "Error occurred (%r) when trying to download csv. Retrying in 5 seconds" 85 | % err, 86 | file=sys.stderr, 87 | ) 88 | time.sleep(5) 89 | 90 | 91 | @validate(MD5_JSON) 92 | def write_json(csv_path, target_path=None): 93 | rows = clevercsv.read_table(csv_path) 94 | 95 | rows = rows[500:] 96 | last_idx = next( 97 | (i for i, r in enumerate(rows) if r[0] == "2019-06-19 00:00:00"), None 98 | ) 99 | rows = rows[: (last_idx + 1)]  # assumes the archived CSV contains the 2019-06-19 row 100 | 101 | name = "bitcoin" 102 | longname = "Bitcoin Price" 103 | values = [float(r[1]) for r in rows] 104 | time = [r[0].split(" ")[0] for r in rows] 105 | time_fmt = "%Y-%m-%d" 106 | series = [{"label": "USD/Bitcoin", "type": "float", "raw": values}] 107 | 108 | data = { 109 | "name": name, 110 | "longname": longname, 111 | "n_obs": len(time), 112 | "n_dim": len(series), 113 | "time": { 114 | "type": "string", 115 | "format": time_fmt, 116 | "index": list(range(0, len(time))), 117 | "raw": time, 118 | }, 119 | "series": series, 120 | } 121 | 122 | with open(target_path, "w") as fp: 123 | json.dump(data, fp, indent="\t") 124 | 125 | 126 | def collect(output_dir="."): 127 | csv_path = os.path.join(output_dir, NAME_CSV) 128 | json_path = os.path.join(output_dir, NAME_JSON) 129 | 130 | get_market_price(target_path=csv_path) 131 | write_json(csv_path, target_path=json_path) 132 | 133 | 134 | def clean(output_dir="."): 135 | csv_path = os.path.join(output_dir, NAME_CSV) 136 | json_path = os.path.join(output_dir, NAME_JSON) 137 | 138 | if os.path.exists(csv_path): 139 | os.unlink(csv_path) 140 | if os.path.exists(json_path): 141 | os.unlink(json_path) 142 | 143 | 144 | def parse_args(): 145 | parser = argparse.ArgumentParser() 146 | parser.add_argument( 147 | "-o", "--output-dir", help="output directory to use", default="." 
148 | ) 149 | parser.add_argument( 150 | "action", 151 | choices=["collect", "clean"], 152 | help="Action to perform", 153 | default="collect", 154 | nargs="?", 155 | ) 156 | return parser.parse_args() 157 | 158 | 159 | def main(output_dir="."): 160 | args = parse_args() 161 | if args.action == "collect": 162 | collect(output_dir=args.output_dir) 163 | elif args.action == "clean": 164 | clean(output_dir=args.output_dir) 165 | 166 | 167 | if __name__ == "__main__": 168 | main() 169 | -------------------------------------------------------------------------------- /datasets/uk_coal_employ/uk_coal_employ.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "uk_coal_employ", 3 | "longname": "Coal Mining Employees (UK)", 4 | "n_obs": 105, 5 | "n_dim": 1, 6 | "time": { 7 | "format": "%Y", 8 | "index": [ 9 | 0, 10 | 1, 11 | 2, 12 | 3, 13 | 4, 14 | 5, 15 | 6, 16 | 7, 17 | 8, 18 | 9, 19 | 10, 20 | 11, 21 | 12, 22 | 13, 23 | 14, 24 | 15, 25 | 16, 26 | 17, 27 | 18, 28 | 19, 29 | 20, 30 | 21, 31 | 22, 32 | 23, 33 | 24, 34 | 25, 35 | 26, 36 | 27, 37 | 28, 38 | 29, 39 | 30, 40 | 31, 41 | 32, 42 | 33, 43 | 34, 44 | 35, 45 | 36, 46 | 37, 47 | 38, 48 | 39, 49 | 40, 50 | 41, 51 | 42, 52 | 43, 53 | 44, 54 | 45, 55 | 46, 56 | 47, 57 | 48, 58 | 49, 59 | 50, 60 | 51, 61 | 52, 62 | 53, 63 | 54, 64 | 55, 65 | 56, 66 | 57, 67 | 58, 68 | 59, 69 | 60, 70 | 61, 71 | 62, 72 | 63, 73 | 64, 74 | 65, 75 | 66, 76 | 67, 77 | 68, 78 | 69, 79 | 70, 80 | 71, 81 | 72, 82 | 73, 83 | 74, 84 | 75, 85 | 76, 86 | 77, 87 | 78, 88 | 79, 89 | 80, 90 | 81, 91 | 82, 92 | 83, 93 | 84, 94 | 85, 95 | 86, 96 | 87, 97 | 88, 98 | 89, 99 | 90, 100 | 91, 101 | 92, 102 | 93, 103 | 94, 104 | 95, 105 | 96, 106 | 97, 107 | 98, 108 | 99, 109 | 100, 110 | 101, 111 | 102, 112 | 103, 113 | 104 114 | ], 115 | "raw": [ 116 | "1913", 117 | "1914", 118 | "1915", 119 | "1916", 120 | "1917", 121 | "1918", 122 | "1919", 123 | "1920", 124 | "1921", 125 | "1922", 126 | "1923", 127 | "1924", 128 | "1925", 129 | "1926", 130 | "1927", 131 | "1928", 132 | "1929", 133 | "1930", 134 | "1931", 135 | "1932", 136 | "1933", 137 | "1934", 138 | "1935", 139 | "1936", 140 | "1937", 141 | "1938", 142 | "1939", 143 | "1940", 144 | "1941", 145 | "1942", 146 | "1943", 147 | "1944", 148 | "1945", 149 | "1946", 150 | "1947", 151 | "1948", 152 | "1949", 153 | "1950", 154 | "1951", 155 | "1952", 156 | "1953", 157 | "1954", 158 | "1955", 159 | "1956", 160 | "1957", 161 | "1958", 162 | "1959", 163 | "1960", 164 | "1961", 165 | "1962", 166 | "1963", 167 | "1964", 168 | "1965", 169 | "1966", 170 | "1967", 171 | "1968", 172 | "1969", 173 | "1970", 174 | "1971", 175 | "1972", 176 | "1973", 177 | "1974", 178 | "1975", 179 | "1976", 180 | "1977", 181 | "1978", 182 | "1979", 183 | "1980", 184 | "1981", 185 | "1982", 186 | "1983", 187 | "1984", 188 | "1985", 189 | "1986", 190 | "1987", 191 | "1988", 192 | "1989", 193 | "1990", 194 | "1991", 195 | "1992", 196 | "1993", 197 | "1994", 198 | "1995", 199 | "1996", 200 | "1997", 201 | "1998", 202 | "1999", 203 | "2000", 204 | "2001", 205 | "2002", 206 | "2003", 207 | "2004", 208 | "2005", 209 | "2006", 210 | "2007", 211 | "2008", 212 | "2009", 213 | "2010", 214 | "2011", 215 | "2012", 216 | "2013", 217 | "2014", 218 | "2015", 219 | "2016", 220 | "2017" 221 | ] 222 | }, 223 | "series": [ 224 | { 225 | "label": "V1", 226 | "type": "int", 227 | "raw": [ 228 | 1107000, 229 | 1038000, 230 | 935000, 231 | 981000, 232 | 1002000, 233 | 990000, 234 | 1136000, 235 | 1191000, 236 | null, 237 | 1085000, 238 
| 1151000, 239 | 1163000, 240 | 1078000, 241 | null, 242 | 991000, 243 | 915000, 244 | 925000, 245 | 910000, 246 | 843000, 247 | 796000, 248 | 767000, 249 | 768000, 250 | 753000, 251 | 750000, 252 | 773000, 253 | 776000, 254 | 761000, 255 | 744000, 256 | 692000, 257 | 704000, 258 | 701000, 259 | 704000, 260 | 702000, 261 | 693000, 262 | 707000, 263 | 720000, 264 | 716000, 265 | 693000, 266 | 695000, 267 | 712000, 268 | 713000, 269 | 707000, 270 | 704000, 271 | 703000, 272 | 710000, 273 | 699000, 274 | 665000, 275 | 607000, 276 | 575000, 277 | 556000, 278 | 528000, 279 | 502000, 280 | 454700, 281 | 422000, 282 | 389500, 283 | 330900, 284 | 305700, 285 | 290000, 286 | 286100, 287 | 273600, 288 | 251800, 289 | 252800, 290 | 252000, 291 | 249700, 292 | 247900, 293 | 240400, 294 | 241600, 295 | 236900, 296 | 172000, 297 | 164000, 298 | 148000, 299 | 139000, 300 | 114000, 301 | 91000, 302 | 75000, 303 | 69000, 304 | 56000, 305 | 49000, 306 | 38000, 307 | 28000, 308 | 10000, 309 | 7000, 310 | 11657, 311 | 10315, 312 | 13768, 313 | 11113, 314 | 11973, 315 | 10939, 316 | 11439, 317 | 9578, 318 | 8250, 319 | 7772, 320 | 6054, 321 | 5431, 322 | 5538, 323 | 6157, 324 | 5912, 325 | 6014, 326 | 5972, 327 | 5827, 328 | 3715, 329 | 3601, 330 | 1975, 331 | 831, 332 | 620 333 | ] 334 | } 335 | ] 336 | } 337 | -------------------------------------------------------------------------------- /utils/validate_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Validate the dataset schema of a given file. 6 | 7 | Note that this script requires the ``jsonschema`` package. 8 | 9 | Author: G.J.J. van den Burg 10 | License: This file is part of TCPD. See the LICENSE file. 11 | Copyright: 2019, The Alan Turing Institute 12 | 13 | """ 14 | 15 | import argparse 16 | import json 17 | import jsonschema 18 | import os 19 | import sys 20 | 21 | 22 | def parse_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument( 25 | "-s", 26 | "--schema-file", 27 | help="Schema file to use", 28 | default="./schema.json", 29 | ) 30 | parser.add_argument("-d", "--dataset-dir", help="Dataset directory") 31 | parser.add_argument( 32 | "datafile", help="JSON file with a TCPD time series", nargs="?" 33 | ) 34 | parser.add_argument( 35 | "-v", "--verbose", help="Enable verbose mode", action="store_true" 36 | ) 37 | return parser.parse_args() 38 | 39 | 40 | def load_schema(schema_file): 41 | if not os.path.exists(schema_file): 42 | raise FileNotFoundError(schema_file) 43 | with open(schema_file, "rb") as fp: 44 | schema = json.load(fp) 45 | return schema 46 | 47 | 48 | def find_datafiles(dataset_dir): 49 | data_files = {} 50 | 51 | datadirs = os.listdir(dataset_dir) 52 | for ddir in datadirs: 53 | pth = os.path.join(dataset_dir, ddir) 54 | files = os.listdir(pth) 55 | json_files = [f for f in files if f.endswith(".json")] 56 | for jf in json_files: 57 | jfpath = os.path.join(pth, jf) 58 | if jf in data_files: 59 | raise KeyError("Duplicate data file '%s'?" % jfpath) 60 | data_files[jf] = jfpath 61 | 62 | return data_files 63 | 64 | 65 | def validate_dataset(filename, schema_file=None): 66 | """Validate a dataset file against the schema and other requirements 67 | """ 68 | if not os.path.exists(filename): 69 | return "File not found." 
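# The checks below run in a fixed order: parse the JSON, validate it against
# schema.json, then enforce invariants the schema cannot express on its own:
# the series count must equal n_dim, any time index must be zero-based and
# aligned with n_obs, missing time points are encoded as 'NaN' (never null),
# and missing series values as null (never NaN).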
70 | 71 | with open(filename, "rb") as fp: 72 | try: 73 | data = json.load(fp) 74 | except json.JSONDecodeError as err: 75 | return "JSON decoding error: %s" % err.msg 76 | 77 | try: 78 | schema = load_schema(schema_file) 79 | except FileNotFoundError: 80 | return "Schema file not found." 81 | 82 | try: 83 | jsonschema.validate(instance=data, schema=schema) 84 | except jsonschema.ValidationError as err: 85 | return "JSONSchema validation error: %s" % err.message 86 | 87 | if len(data["series"]) != data["n_dim"]: 88 | return "Number of dimensions and number of series don't match" 89 | 90 | if "time" in data.keys(): 91 | if "format" not in data["time"] and "raw" in data["time"]: 92 | return "'raw' must be accompanied by format" 93 | if "format" in data["time"] and "raw" not in data["time"]: 94 | return "Format must be accompanied by 'raw'" 95 | if "index" in data["time"]: 96 | if data["time"]["index"][0] != 0: 97 | return "Index should start at zero." 98 | if len(data["time"]["index"]) != data["n_obs"]: 99 | return "Number of indices must match number of observations" 100 | if "raw" in data["time"]: 101 | if len(data["time"]["raw"]) != data["n_obs"]: 102 | return "Number of time points doesn't match number of observations" 103 | if None in data["time"]["raw"]: 104 | return "Null is not supported in time axis. Use 'NaN' instead." 105 | 106 | has_missing = False 107 | for var in data["series"]: 108 | if len(var["raw"]) != data["n_obs"]: 109 | return "Number of observations doesn't match for %s" % var["label"] 110 | if any(x != x for x in var["raw"]):  # NaN is the only value that differs from itself 111 | return "NaN is not supported in series. Use null instead." 112 | has_missing = has_missing or any(x is None for x in var["raw"]) 113 | 114 | # this doesn't exist yet, so let's not implement it until we need it. 115 | if data["n_dim"] > 1 and has_missing: 116 | return "Missing values are not yet supported for multidimensional data" 117 | 118 | return None 119 | 120 | 121 | def main(): 122 | args = parse_args() 123 | 124 | log = lambda *a, **kw: print(*a, **kw) if args.verbose else None 125 | 126 | if args.dataset_dir: 127 | datafiles = find_datafiles(args.dataset_dir) 128 | for dset in datafiles: 129 | log("Validating %s" % dset) 130 | result = validate_dataset( 131 | datafiles[dset], schema_file=args.schema_file 132 | ) 133 | if result is not None: 134 | print( 135 | "Dataset: %s. 
Error: %s" % (dset, result), file=sys.stderr 136 | ) 137 | raise SystemExit(1) 138 | else: 139 | result = validate_dataset(args.datafile, schema_file=args.schema_file) 140 | if not result is None: 141 | print("Error: %s" % result, file=sys.stderr) 142 | raise SystemExit(1) 143 | log("Validation passed.") 144 | 145 | 146 | if __name__ == "__main__": 147 | main() 148 | -------------------------------------------------------------------------------- /datasets/usd_isk/ert_bil_eur_m_Label.csv: -------------------------------------------------------------------------------- 1 | "DATASET: Euro/ECU exchange rates - monthly data [ert_bil_eur_m]" 2 | 3 | "LAST UPDATE: 09.08.19 02:12:13" 4 | 5 | "EXTRACTION DATE: 22.08.19 15:35:13" 6 | 7 | "SOURCE OF DATA: Eurostat" 8 | 9 | "TIME" 10 | "1995M01" 11 | "1995M02" 12 | "1995M03" 13 | "1995M04" 14 | "1995M05" 15 | "1995M06" 16 | "1995M07" 17 | "1995M08" 18 | "1995M09" 19 | "1995M10" 20 | "1995M11" 21 | "1995M12" 22 | "1996M01" 23 | "1996M02" 24 | "1996M03" 25 | "1996M04" 26 | "1996M05" 27 | "1996M06" 28 | "1996M07" 29 | "1996M08" 30 | "1996M09" 31 | "1996M10" 32 | "1996M11" 33 | "1996M12" 34 | "1997M01" 35 | "1997M02" 36 | "1997M03" 37 | "1997M04" 38 | "1997M05" 39 | "1997M06" 40 | "1997M07" 41 | "1997M08" 42 | "1997M09" 43 | "1997M10" 44 | "1997M11" 45 | "1997M12" 46 | "1998M01" 47 | "1998M02" 48 | "1998M03" 49 | "1998M04" 50 | "1998M05" 51 | "1998M06" 52 | "1998M07" 53 | "1998M08" 54 | "1998M09" 55 | "1998M10" 56 | "1998M11" 57 | "1998M12" 58 | "1999M01" 59 | "1999M02" 60 | "1999M03" 61 | "1999M04" 62 | "1999M05" 63 | "1999M06" 64 | "1999M07" 65 | "1999M08" 66 | "1999M09" 67 | "1999M10" 68 | "1999M11" 69 | "1999M12" 70 | "2000M01" 71 | "2000M02" 72 | "2000M03" 73 | "2000M04" 74 | "2000M05" 75 | "2000M06" 76 | "2000M07" 77 | "2000M08" 78 | "2000M09" 79 | "2000M10" 80 | "2000M11" 81 | "2000M12" 82 | "2001M01" 83 | "2001M02" 84 | "2001M03" 85 | "2001M04" 86 | "2001M05" 87 | "2001M06" 88 | "2001M07" 89 | "2001M08" 90 | "2001M09" 91 | "2001M10" 92 | "2001M11" 93 | "2001M12" 94 | "2002M01" 95 | "2002M02" 96 | "2002M03" 97 | "2002M04" 98 | "2002M05" 99 | "2002M06" 100 | "2002M07" 101 | "2002M08" 102 | "2002M09" 103 | "2002M10" 104 | "2002M11" 105 | "2002M12" 106 | "2003M01" 107 | "2003M02" 108 | "2003M03" 109 | "2003M04" 110 | "2003M05" 111 | "2003M06" 112 | "2003M07" 113 | "2003M08" 114 | "2003M09" 115 | "2003M10" 116 | "2003M11" 117 | "2003M12" 118 | "2004M01" 119 | "2004M02" 120 | "2004M03" 121 | "2004M04" 122 | "2004M05" 123 | "2004M06" 124 | "2004M07" 125 | "2004M08" 126 | "2004M09" 127 | "2004M10" 128 | "2004M11" 129 | "2004M12" 130 | "2005M01" 131 | "2005M02" 132 | "2005M03" 133 | "2005M04" 134 | "2005M05" 135 | "2005M06" 136 | "2005M07" 137 | "2005M08" 138 | "2005M09" 139 | "2005M10" 140 | "2005M11" 141 | "2005M12" 142 | "2006M01" 143 | "2006M02" 144 | "2006M03" 145 | "2006M04" 146 | "2006M05" 147 | "2006M06" 148 | "2006M07" 149 | "2006M08" 150 | "2006M09" 151 | "2006M10" 152 | "2006M11" 153 | "2006M12" 154 | "2007M01" 155 | "2007M02" 156 | "2007M03" 157 | "2007M04" 158 | "2007M05" 159 | "2007M06" 160 | "2007M07" 161 | "2007M08" 162 | "2007M09" 163 | "2007M10" 164 | "2007M11" 165 | "2007M12" 166 | "2008M01" 167 | "2008M02" 168 | "2008M03" 169 | "2008M04" 170 | "2008M05" 171 | "2008M06" 172 | "2008M07" 173 | "2008M08" 174 | "2008M09" 175 | "2008M10" 176 | "2008M11" 177 | "2008M12" 178 | "2009M01" 179 | "2009M02" 180 | "2009M03" 181 | "2009M04" 182 | "2009M05" 183 | "2009M06" 184 | "2009M07" 185 | "2009M08" 186 | "2009M09" 187 | "2009M10" 188 | "2009M11" 
189 | "2009M12" 190 | "2010M01" 191 | "2010M02" 192 | "2010M03" 193 | "2010M04" 194 | "2010M05" 195 | "2010M06" 196 | "2010M07" 197 | "2010M08" 198 | "2010M09" 199 | "2010M10" 200 | "2010M11" 201 | "2010M12" 202 | "2011M01" 203 | "2011M02" 204 | "2011M03" 205 | "2011M04" 206 | "2011M05" 207 | "2011M06" 208 | "2011M07" 209 | "2011M08" 210 | "2011M09" 211 | "2011M10" 212 | "2011M11" 213 | "2011M12" 214 | "2012M01" 215 | "2012M02" 216 | "2012M03" 217 | "2012M04" 218 | "2012M05" 219 | "2012M06" 220 | "2012M07" 221 | "2012M08" 222 | "2012M09" 223 | "2012M10" 224 | "2012M11" 225 | "2012M12" 226 | "2013M01" 227 | "2013M02" 228 | "2013M03" 229 | "2013M04" 230 | "2013M05" 231 | "2013M06" 232 | "2013M07" 233 | "2013M08" 234 | "2013M09" 235 | "2013M10" 236 | "2013M11" 237 | "2013M12" 238 | "2014M01" 239 | "2014M02" 240 | "2014M03" 241 | "2014M04" 242 | "2014M05" 243 | "2014M06" 244 | "2014M07" 245 | "2014M08" 246 | "2014M09" 247 | "2014M10" 248 | "2014M11" 249 | "2014M12" 250 | "2015M01" 251 | "2015M02" 252 | "2015M03" 253 | "2015M04" 254 | "2015M05" 255 | "2015M06" 256 | "2015M07" 257 | "2015M08" 258 | "2015M09" 259 | "2015M10" 260 | "2015M11" 261 | "2015M12" 262 | "2016M01" 263 | "2016M02" 264 | "2016M03" 265 | "2016M04" 266 | "2016M05" 267 | "2016M06" 268 | "2016M07" 269 | "2016M08" 270 | "2016M09" 271 | "2016M10" 272 | "2016M11" 273 | "2016M12" 274 | "2017M01" 275 | "2017M02" 276 | "2017M03" 277 | "2017M04" 278 | "2017M05" 279 | "2017M06" 280 | "2017M07" 281 | "2017M08" 282 | "2017M09" 283 | "2017M10" 284 | "2017M11" 285 | "2017M12" 286 | "2018M01" 287 | "2018M02" 288 | "2018M03" 289 | "2018M04" 290 | "2018M05" 291 | "2018M06" 292 | "2018M07" 293 | "2018M08" 294 | "2018M09" 295 | "2018M10" 296 | "2018M11" 297 | "2018M12" 298 | "2019M01" 299 | "2019M02" 300 | "2019M03" 301 | "2019M04" 302 | "2019M05" 303 | "2019M06" 304 | "2019M07" 305 | 306 | "CURRENCY" 307 | "Icelandic krona" 308 | "US dollar" 309 | 310 | "UNIT" 311 | "National currency" 312 | 313 | "STATINFO" 314 | "Average" 315 | 316 | "No footnotes available" 317 | 318 | "Available flags:" 319 | "b","break in time series" 320 | "c","confidential" 321 | "d","definition differs, see metadata" 322 | "e","estimated" 323 | "f","forecast" 324 | "n","not significant" 325 | "p","provisional" 326 | "r","revised" 327 | "s","Eurostat estimate" 328 | "u","low reliability" 329 | "z","not applicable" 330 | 331 | "Special value:" 332 | ":","not available" 333 | 334 | -------------------------------------------------------------------------------- /datasets/occupancy/get_occupancy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Collect the occupancy dataset. 6 | 7 | See the README file for more information. 8 | 9 | Author: G.J.J. van den Burg 10 | License: This file is part of TCPD, see the top-level LICENSE file. 
11 | Copyright: 2019, The Alan Turing Institute 12 | 13 | """ 14 | 15 | import argparse 16 | import clevercsv 17 | import hashlib 18 | import json 19 | import os 20 | import sys 21 | import time 22 | 23 | from functools import wraps 24 | from urllib.request import urlretrieve 25 | from urllib.error import URLError 26 | 27 | SAMPLE = 16 28 | 29 | TXT_URL = "https://web.archive.org/web/20191128145102if_/https://raw.githubusercontent.com/LuisM78/Occupancy-detection-data/master/datatraining.txt" 30 | 31 | MD5_TXT = "e656cd731300cb444bd10fcd28071e37" 32 | MD5_JSON = "bc6cd9adaf496fe30bf0e417d2c3b0c6" 33 | 34 | NAME_TXT = "datatraining.txt" 35 | NAME_JSON = "occupancy.json" 36 | 37 | 38 | class ValidationError(Exception): 39 | def __init__(self, filename): 40 | message = ( 41 | "Validating the file '%s' failed. \n" 42 | "Please raise an issue on the GitHub page for this project " 43 | "if the error persists." % filename 44 | ) 45 | super().__init__(message) 46 | 47 | 48 | def check_md5sum(filename, checksum): 49 | with open(filename, "rb") as fp: 50 | data = fp.read() 51 | h = hashlib.md5(data).hexdigest() 52 | return h == checksum 53 | 54 | 55 | def validate(checksum): 56 | """Decorator that validates the target file.""" 57 | 58 | def validate_decorator(func): 59 | @wraps(func) 60 | def wrapper(*args, **kwargs): 61 | target = kwargs.get("target_path", None) 62 | if os.path.exists(target) and check_md5sum(target, checksum): 63 | return 64 | out = func(*args, **kwargs) 65 | if not os.path.exists(target): 66 | raise FileNotFoundError("Target file expected at: %s" % target) 67 | if not check_md5sum(target, checksum): 68 | raise ValidationError(target) 69 | return out 70 | 71 | return wrapper 72 | 73 | return validate_decorator 74 | 75 | 76 | @validate(MD5_TXT) 77 | def download_txt(target_path=None): 78 | count = 0 79 | while count < 5: 80 | count += 1 81 | try: 82 | urlretrieve(TXT_URL, target_path) 83 | return 84 | except URLError as err: 85 | print( 86 | "Error occurred (%r) when trying to download txt. 
Retrying in 5 seconds" 87 | % err, 88 | file=sys.stderr, 89 | ) 90 | time.sleep(5) 91 | 92 | 93 | @validate(MD5_JSON) 94 | def write_json(txt_path, target_path=None): 95 | with open(txt_path, "r", newline="", encoding="ascii") as fp: 96 | reader = clevercsv.reader( 97 | fp, delimiter=",", quotechar='"', escapechar="" 98 | ) 99 | rows = list(reader) 100 | 101 | header = rows.pop(0) 102 | header.insert(0, "id") 103 | as_dicts = [dict(zip(header, r)) for r in rows] 104 | 105 | var_include = ["Temperature", "Humidity", "Light", "CO2"] 106 | 107 | time = [x["date"] for x in as_dicts] 108 | time = [time[i] for i in range(0, len(time), SAMPLE)]  # keep every 16th reading 109 | 110 | data = { 111 | "name": "occupancy", 112 | "longname": "Occupancy", 113 | "n_obs": len(time), 114 | "n_dim": len(var_include), 115 | "time": { 116 | "type": "string", 117 | "format": "%Y-%m-%d %H:%M:%S", 118 | "index": list(range(len(time))), 119 | "raw": time, 120 | }, 121 | "series": [], 122 | } 123 | for idx, var in enumerate(var_include, start=1): 124 | lbl = "V%i" % idx 125 | obs = [float(x[var]) for x in as_dicts] 126 | obs = [obs[i] for i in range(0, len(obs), SAMPLE)] 127 | data["series"].append({"label": lbl, "type": "float", "raw": obs}) 128 | 129 | with open(target_path, "w") as fp: 130 | json.dump(data, fp, indent="\t") 131 | 132 | 133 | def collect(output_dir="."): 134 | txt_path = os.path.join(output_dir, NAME_TXT) 135 | json_path = os.path.join(output_dir, NAME_JSON) 136 | 137 | download_txt(target_path=txt_path) 138 | write_json(txt_path, target_path=json_path) 139 | 140 | 141 | def clean(output_dir="."): 142 | txt_path = os.path.join(output_dir, NAME_TXT) 143 | json_path = os.path.join(output_dir, NAME_JSON) 144 | 145 | if os.path.exists(txt_path): 146 | os.unlink(txt_path) 147 | if os.path.exists(json_path): 148 | os.unlink(json_path) 149 | 150 | 151 | def parse_args(): 152 | parser = argparse.ArgumentParser() 153 | parser.add_argument( 154 | "-o", "--output-dir", help="output directory to use", default="." 
155 | ) 156 | parser.add_argument( 157 | "action", 158 | choices=["collect", "clean"], 159 | help="Action to perform", 160 | default="collect", 161 | nargs="?", 162 | ) 163 | return parser.parse_args() 164 | 165 | 166 | def main(output_dir="."): 167 | args = parse_args() 168 | if args.action == "collect": 169 | collect(output_dir=args.output_dir) 170 | elif args.action == "clean": 171 | clean(output_dir=args.output_dir) 172 | 173 | 174 | if __name__ == "__main__": 175 | main() 176 | -------------------------------------------------------------------------------- /datasets/homeruns/homeruns.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "homeruns", 3 | "longname": "Homeruns", 4 | "n_obs": 118, 5 | "n_dim": 1, 6 | "time": { 7 | "type": "string", 8 | "format": "%Y", 9 | "index": [ 10 | 0, 11 | 1, 12 | 2, 13 | 3, 14 | 4, 15 | 5, 16 | 6, 17 | 7, 18 | 8, 19 | 9, 20 | 10, 21 | 11, 22 | 12, 23 | 13, 24 | 14, 25 | 15, 26 | 16, 27 | 17, 28 | 18, 29 | 19, 30 | 20, 31 | 21, 32 | 22, 33 | 23, 34 | 24, 35 | 25, 36 | 26, 37 | 27, 38 | 28, 39 | 29, 40 | 30, 41 | 31, 42 | 32, 43 | 33, 44 | 34, 45 | 35, 46 | 36, 47 | 37, 48 | 38, 49 | 39, 50 | 40, 51 | 41, 52 | 42, 53 | 43, 54 | 44, 55 | 45, 56 | 46, 57 | 47, 58 | 48, 59 | 49, 60 | 50, 61 | 51, 62 | 52, 63 | 53, 64 | 54, 65 | 55, 66 | 56, 67 | 57, 68 | 58, 69 | 59, 70 | 60, 71 | 61, 72 | 62, 73 | 63, 74 | 64, 75 | 65, 76 | 66, 77 | 67, 78 | 68, 79 | 69, 80 | 70, 81 | 71, 82 | 72, 83 | 73, 84 | 74, 85 | 75, 86 | 76, 87 | 77, 88 | 78, 89 | 79, 90 | 80, 91 | 81, 92 | 82, 93 | 83, 94 | 84, 95 | 85, 96 | 86, 97 | 87, 98 | 88, 99 | 89, 100 | 90, 101 | 91, 102 | 92, 103 | 93, 104 | 94, 105 | 95, 106 | 96, 107 | 97, 108 | 98, 109 | 99, 110 | 100, 111 | 101, 112 | 102, 113 | 103, 114 | 104, 115 | 105, 116 | 106, 117 | 107, 118 | 108, 119 | 109, 120 | 110, 121 | 111, 122 | 112, 123 | 113, 124 | 114, 125 | 115, 126 | 116, 127 | 117 128 | ], 129 | "raw": [ 130 | "1901", 131 | "1902", 132 | "1903", 133 | "1904", 134 | "1905", 135 | "1906", 136 | "1907", 137 | "1908", 138 | "1909", 139 | "1910", 140 | "1911", 141 | "1912", 142 | "1913", 143 | "1914", 144 | "1915", 145 | "1916", 146 | "1917", 147 | "1918", 148 | "1919", 149 | "1920", 150 | "1921", 151 | "1922", 152 | "1923", 153 | "1924", 154 | "1925", 155 | "1926", 156 | "1927", 157 | "1928", 158 | "1929", 159 | "1930", 160 | "1931", 161 | "1932", 162 | "1933", 163 | "1934", 164 | "1935", 165 | "1936", 166 | "1937", 167 | "1938", 168 | "1939", 169 | "1940", 170 | "1941", 171 | "1942", 172 | "1943", 173 | "1944", 174 | "1945", 175 | "1946", 176 | "1947", 177 | "1948", 178 | "1949", 179 | "1950", 180 | "1951", 181 | "1952", 182 | "1953", 183 | "1954", 184 | "1955", 185 | "1956", 186 | "1957", 187 | "1958", 188 | "1959", 189 | "1960", 190 | "1961", 191 | "1962", 192 | "1963", 193 | "1964", 194 | "1965", 195 | "1966", 196 | "1967", 197 | "1968", 198 | "1969", 199 | "1970", 200 | "1971", 201 | "1972", 202 | "1973", 203 | "1974", 204 | "1975", 205 | "1976", 206 | "1977", 207 | "1978", 208 | "1979", 209 | "1980", 210 | "1981", 211 | "1982", 212 | "1983", 213 | "1984", 214 | "1985", 215 | "1986", 216 | "1987", 217 | "1988", 218 | "1989", 219 | "1990", 220 | "1991", 221 | "1992", 222 | "1993", 223 | "1994", 224 | "1995", 225 | "1996", 226 | "1997", 227 | "1998", 228 | "1999", 229 | "2000", 230 | "2001", 231 | "2002", 232 | "2003", 233 | "2004", 234 | "2005", 235 | "2006", 236 | "2007", 237 | "2008", 238 | "2009", 239 | "2010", 240 | "2011", 241 | "2012", 242 | "2013", 243 
| "2014", 244 | "2015", 245 | "2016", 246 | "2017", 247 | "2018" 248 | ] 249 | }, 250 | "series": [ 251 | { 252 | "label": "American League Home Runs", 253 | "type": "int", 254 | "raw": [ 255 | 228, 256 | 258, 257 | 184, 258 | 156, 259 | 156, 260 | 137, 261 | 104, 262 | 116, 263 | 109, 264 | 147, 265 | 198, 266 | 156, 267 | 159, 268 | 148, 269 | 160, 270 | 144, 271 | 133, 272 | 96, 273 | 240, 274 | 369, 275 | 477, 276 | 525, 277 | 442, 278 | 397, 279 | 533, 280 | 424, 281 | 439, 282 | 483, 283 | 595, 284 | 673, 285 | 576, 286 | 707, 287 | 607, 288 | 688, 289 | 663, 290 | 758, 291 | 806, 292 | 864, 293 | 796, 294 | 883, 295 | 734, 296 | 533, 297 | 473, 298 | 459, 299 | 430, 300 | 653, 301 | 679, 302 | 710, 303 | 769, 304 | 973, 305 | 839, 306 | 794, 307 | 879, 308 | 823, 309 | 961, 310 | 1075, 311 | 1024, 312 | 1057, 313 | 1091, 314 | 1086, 315 | 1534, 316 | 1552, 317 | 1489, 318 | 1551, 319 | 1370, 320 | 1365, 321 | 1197, 322 | 1104, 323 | 1649, 324 | 1746, 325 | 1484, 326 | 1175, 327 | 1552, 328 | 1369, 329 | 1465, 330 | 1122, 331 | 2013, 332 | 1680, 333 | 2006, 334 | 1844, 335 | 1062, 336 | 2080, 337 | 1903, 338 | 1980, 339 | 2178, 340 | 2290, 341 | 2634, 342 | 1901, 343 | 1718, 344 | 1796, 345 | 1953, 346 | 1776, 347 | 2074, 348 | 1774, 349 | 2164, 350 | 2742, 351 | 2477, 352 | 2499, 353 | 2635, 354 | 2688, 355 | 2506, 356 | 2464, 357 | 2499, 358 | 2605, 359 | 2437, 360 | 2546, 361 | 2252, 362 | 2270, 363 | 2560, 364 | 2209, 365 | 2271, 366 | 2500, 367 | 2504, 368 | 2161, 369 | 2634, 370 | 2953, 371 | 3170, 372 | 2900 373 | ] 374 | } 375 | ] 376 | } -------------------------------------------------------------------------------- /datasets/homeruns/get_homeruns.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Collect the homeruns dataset 6 | 7 | See the README file for more information. 8 | 9 | Author: G.J.J. van den Burg 10 | License: This file is part of TCPD, see the top-level LICENSE file. 11 | Copyright: 2019, The Alan Turing Institute 12 | 13 | """ 14 | 15 | import argparse 16 | import clevercsv 17 | import hashlib 18 | import json 19 | import os 20 | import sys 21 | import time 22 | 23 | from functools import wraps 24 | from urllib.request import urlretrieve 25 | from urllib.error import URLError 26 | 27 | # Original source of the batting csv file 28 | CSV_URL = "https://web.archive.org/web/20191128150525if_/https://raw.githubusercontent.com/chadwickbureau/baseballdatabank/242285f8f5e8981327cf50c07355fb034833ce4a/core/Batting.csv" 29 | 30 | MD5_CSV = "43d8f8135e76dcd8b77d0709e33d2221" 31 | MD5_JSON = "987bbab63e2c72acba1c07325303720c" 32 | 33 | NAME_CSV = "Batting.csv" 34 | NAME_JSON = "homeruns.json" 35 | 36 | 37 | class ValidationError(Exception): 38 | def __init__(self, filename): 39 | self.message = ( 40 | "Validating the file '%s' failed. \n" 41 | "Please raise an issue on the GitHub page for this project \n" 42 | "if the error persists." 
% filename 43 | ) 44 | super().__init__(self.message) 45 | 46 | def check_md5sum(filename, checksum): 47 | with open(filename, "rb") as fp: 48 | data = fp.read() 49 | h = hashlib.md5(data).hexdigest() 50 | return h == checksum 51 | 52 | 53 | def validate(checksum): 54 | """Decorator that validates the target file.""" 55 | 56 | def validate_decorator(func): 57 | @wraps(func) 58 | def wrapper(*args, **kwargs): 59 | target = kwargs.get("target_path", None) 60 | if os.path.exists(target) and check_md5sum(target, checksum): 61 | return 62 | out = func(*args, **kwargs) 63 | if not os.path.exists(target): 64 | raise FileNotFoundError("Target file expected at: %s" % target) 65 | if not check_md5sum(target, checksum): 66 | raise ValidationError(target) 67 | return out 68 | 69 | return wrapper 70 | 71 | return validate_decorator 72 | 73 | 74 | @validate(MD5_CSV) 75 | def download_csv(target_path=None): 76 | count = 0 77 | while count < 5: 78 | count += 1 79 | try: 80 | urlretrieve(CSV_URL, target_path) 81 | return 82 | except URLError as err: 83 | print( 84 | "Error occurred (%r) when trying to download csv. Retrying in 5 seconds" 85 | % err, 86 | file=sys.stderr, 87 | ) 88 | time.sleep(5) 89 | 90 | 91 | def read_csv(csv_file): 92 | with open(csv_file, "r", newline="", encoding="ascii") as fp: 93 | reader = clevercsv.reader( 94 | fp, delimiter=",", quotechar="", escapechar="" 95 | ) 96 | rows = list(reader) 97 | 98 | header = rows.pop(0) 99 | dicts = [dict(zip(header, row)) for row in rows] 100 | 101 | AL = [d for d in dicts if d["lgID"] == "AL"]  # American League rows only 102 | years = sorted(set((d["yearID"] for d in AL))) 103 | by_year = { 104 | int(y): sum(int(d["HR"]) for d in [x for x in AL if x["yearID"] == y]) 105 | for y in years 106 | } 107 | return by_year 108 | 109 | 110 | @validate(MD5_JSON) 111 | def write_json(csv_path, target_path=None): 112 | by_year = read_csv(csv_path) 113 | 114 | name = "homeruns" 115 | longname = "Homeruns" 116 | time_fmt = "%Y" 117 | 118 | time = sorted(by_year.keys()) 119 | values = [by_year[t] for t in time] 120 | 121 | series = [ 122 | {"label": "American League Home Runs", "type": "int", "raw": values}, 123 | ] 124 | 125 | data = { 126 | "name": name, 127 | "longname": longname, 128 | "n_obs": len(time), 129 | "n_dim": len(series), 130 | "time": { 131 | "type": "string", 132 | "format": time_fmt, 133 | "index": list(range(0, len(time))), 134 | "raw": list(map(str, time)), 135 | }, 136 | "series": series, 137 | } 138 | 139 | with open(target_path, "w") as fp: 140 | json.dump(data, fp, indent="\t") 141 | 142 | 143 | def collect(output_dir="."): 144 | csv_path = os.path.join(output_dir, NAME_CSV) 145 | json_path = os.path.join(output_dir, NAME_JSON) 146 | 147 | download_csv(target_path=csv_path) 148 | write_json(csv_path, target_path=json_path) 149 | 150 | 151 | def clean(output_dir="."): 152 | csv_path = os.path.join(output_dir, NAME_CSV) 153 | json_path = os.path.join(output_dir, NAME_JSON) 154 | 155 | if os.path.exists(csv_path): 156 | os.unlink(csv_path) 157 | if os.path.exists(json_path): 158 | os.unlink(json_path) 159 | 160 | 161 | def parse_args(): 162 | parser = argparse.ArgumentParser() 163 | parser.add_argument( 164 | "-o", "--output-dir", help="output directory to use", default="." 
165 | ) 166 | parser.add_argument( 167 | "action", 168 | choices=["collect", "clean"], 169 | help="Action to perform", 170 | default="collect", 171 | nargs="?", 172 | ) 173 | return parser.parse_args() 174 | 175 | 176 | def main(output_dir="."): 177 | args = parse_args() 178 | if args.action == "collect": 179 | collect(output_dir=args.output_dir) 180 | elif args.action == "clean": 181 | clean(output_dir=args.output_dir) 182 | 183 | 184 | if __name__ == "__main__": 185 | main() 186 | -------------------------------------------------------------------------------- /datasets/global_co2/get_global_co2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Collect the global_co2 dataset 6 | 7 | See the README file for more information. 8 | 9 | Author: G.J.J. van den Burg 10 | License: This file is part of TCPD, see the top-level LICENSE file. 11 | Copyright: 2019, The Alan Turing Institute 12 | 13 | """ 14 | 15 | 16 | import argparse 17 | import clevercsv 18 | import hashlib 19 | import json 20 | import os 21 | 22 | from functools import wraps 23 | from urllib.request import urlretrieve 24 | 25 | 26 | CSV_URL = "ftp://data.iac.ethz.ch/CMIP6/input4MIPs/UoM/GHGConc/CMIP/mon/atmos/UoM-CMIP-1-1-0/GHGConc/gr3-GMNHSH/v20160701/mole_fraction_of_carbon_dioxide_in_air_input4MIPs_GHGConcentrations_CMIP_UoM-CMIP-1-1-0_gr3-GMNHSH_000001-201412.csv" 27 | 28 | MD5_CSV = "a3d42f5e339f4c652b8ae80e830b6941" 29 | MD5_JSON = "7c8edd8887f51a6f841cc9d806ab4e56" 30 | 31 | NAME_CSV = "mole_fraction_of_carbon_dioxide_in_air_input4MIPs_GHGConcentrations_CMIP_UoM-CMIP-1-1-0_gr3-GMNHSH_000001-201412.csv" 32 | NAME_JSON = "global_co2.json" 33 | 34 | SAMPLE = 48 35 | 36 | 37 | class ValidationError(Exception): 38 | def __init__(self, filename): 39 | message = ( 40 | "Validating the file '%s' failed. \n" 41 | "Please raise an issue on the GitHub page for this project " 42 | "if the error persists." 
% filename 43 | ) 44 | super().__init__(message) 45 | 46 | 47 | def check_md5sum(filename, checksum): 48 | with open(filename, "rb") as fp: 49 | data = fp.read() 50 | h = hashlib.md5(data).hexdigest() 51 | return h == checksum 52 | 53 | 54 | def validate(checksum): 55 | """Decorator that validates the target file.""" 56 | 57 | def validate_decorator(func): 58 | @wraps(func) 59 | def wrapper(*args, **kwargs): 60 | target = kwargs.get("target_path", None) 61 | if os.path.exists(target) and check_md5sum(target, checksum): 62 | return 63 | out = func(*args, **kwargs) 64 | if not os.path.exists(target): 65 | raise FileNotFoundError("Target file expected at: %s" % target) 66 | if not check_md5sum(target, checksum): 67 | raise ValidationError(target) 68 | return out 69 | 70 | return wrapper 71 | 72 | return validate_decorator 73 | 74 | 75 | @validate(MD5_CSV) 76 | def get_csv(target_path=None): 77 | urlretrieve(CSV_URL, target_path) 78 | 79 | 80 | def reformat_time(datestr): 81 | """Convert 'DD-MMM-YYYY ...' date strings to '%Y-%m-%d'.""" 82 | MONTHS = { 83 | "Jan": 1, 84 | "Feb": 2, 85 | "Mar": 3, 86 | "Apr": 4, 87 | "May": 5, 88 | "Jun": 6, 89 | "Jul": 7, 90 | "Aug": 8, 91 | "Sep": 9, 92 | "Oct": 10, 93 | "Nov": 11, 94 | "Dec": 12, 95 | } 96 | dd, mmm, rest = datestr.split("-") 97 | yyyy = rest.split(" ")[0] 98 | m = MONTHS.get(mmm) 99 | return "%s-%02d-%s" % (yyyy, m, dd) 100 | 101 | 102 | @validate(MD5_JSON) 103 | def write_json(csv_path, target_path=None): 104 | with open(csv_path, "r", newline="", encoding="ascii") as fp: 105 | reader = clevercsv.reader( 106 | fp, delimiter=",", quotechar="", escapechar="" 107 | ) 108 | rows = list(reader) 109 | 110 | header = rows.pop(0) 111 | rows = [r for i, r in enumerate(rows) if i % SAMPLE == 0]  # every 48th monthly record 112 | 113 | as_dicts = [{h: v for h, v in zip(header, row)} for row in rows] 114 | by_date = { 115 | reformat_time(d["datetime"]): float(d["data_mean_global"]) 116 | for d in as_dicts 117 | } 118 | 119 | # trim off anything before 1600 120 | by_date = {k: v for k, v in by_date.items() if k.split("-")[0] >= "1600"} 121 | 122 | time = sorted(by_date.keys()) 123 | values = [by_date[t] for t in time] 124 | 125 | name = "global_co2" 126 | longname = "Global CO2" 127 | time_fmt = "%Y-%m-%d" 128 | series = [{"label": "Mean", "type": "float", "raw": values}] 129 | 130 | data = { 131 | "name": name, 132 | "longname": longname, 133 | "n_obs": len(values), 134 | "n_dim": len(series), 135 | "time": { 136 | "type": "string", 137 | "format": time_fmt, 138 | "index": list(range(len(time))), 139 | "raw": time, 140 | }, 141 | "series": series, 142 | } 143 | if time is None:  # defensive; time is always a list at this point 144 | del data["time"] 145 | 146 | with open(target_path, "w") as fp: 147 | json.dump(data, fp, indent="\t") 148 | 149 | 150 | def collect(output_dir="."): 151 | csv_path = os.path.join(output_dir, NAME_CSV,) 152 | json_path = os.path.join(output_dir, NAME_JSON) 153 | 154 | get_csv(target_path=csv_path) 155 | write_json(csv_path, target_path=json_path) 156 | 157 | 158 | def clean(output_dir="."): 159 | csv_path = os.path.join(output_dir, NAME_CSV,) 160 | json_path = os.path.join(output_dir, NAME_JSON) 161 | 162 | if os.path.exists(csv_path): 163 | os.unlink(csv_path) 164 | if os.path.exists(json_path): 165 | os.unlink(json_path) 166 | 167 | 168 | def parse_args(): 169 | parser = argparse.ArgumentParser() 170 | parser.add_argument( 171 | "-o", "--output-dir", help="output directory to use", default="." 172 | ) 173 | parser.add_argument( 174 | "action", 175 | choices=["collect", "clean"], 176 | help="Action to perform", 177 | default="collect", 178 | nargs="?", 179 | ) 180 | return parser.parse_args() 181 | 182 | 183 | def main(output_dir="."): 184 | args = parse_args() 185 | if args.action == "collect": 186 | collect(output_dir=args.output_dir) 187 | elif args.action == "clean": 188 | clean(output_dir=args.output_dir) 189 | 190 | 191 | if __name__ == "__main__": 192 | main() 193 |
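The date handling in get_global_co2.py is easy to misread, so here is a worked example of reformat_time (the timestamp is illustrative, but it has the 'DD-MMM-YYYY ...' shape that the split("-") logic requires):

# Sketch: assumes reformat_time from get_global_co2.py above is in scope.
# Zero-padding the month makes the output sort lexicographically, which is
# why sorted(by_date.keys()) yields chronological order.
assert reformat_time("15-Jan-1850 00:00:00") == "1850-01-15"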
-------------------------------------------------------------------------------- /datasets/iceland_tourism/get_iceland_tourism.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Collect the iceland_tourism dataset 6 | 7 | See the README file for more information. 8 | 9 | Author: G.J.J. van den Burg 10 | License: This file is part of TCPD, see the top-level LICENSE file. 11 | Copyright: 2019, The Alan Turing Institute 12 | 13 | """ 14 | 15 | import argparse 16 | import hashlib 17 | import json 18 | import openpyxl 19 | import os 20 | import sys 21 | import time 22 | 23 | from functools import wraps 24 | from urllib.request import urlretrieve 25 | from urllib.error import URLError 26 | 27 | XLSX_URL = "https://web.archive.org/web/20191121170223if_/https://www.ferdamalastofa.is/static/files/ferdamalastofa/Frettamyndir/2019/nov/visitors-to-iceland-2002-2019-oct.xlsx" 28 | 29 | MD5_XLSX = "ec777afd95b01ca901aa00475fc284e5" 30 | MD5_JSON = "8bbac4ca95319a865f2d58ff564f063d" 31 | 32 | NAME_XLSX = "visitors-to-iceland-2002-2019-oct.xlsx" 33 | NAME_JSON = "iceland_tourism.json" 34 | 35 | MONTHS = { 36 | "January": 1, 37 | "February": 2, 38 | "March": 3, 39 | "April": 4, 40 | "May": 5, 41 | "June": 6, 42 | "July": 7, 43 | "August": 8, 44 | "September": 9, 45 | "October": 10, 46 | "November": 11, 47 | "December": 12, 48 | } 49 | 50 | 51 | class ValidationError(Exception): 52 | def __init__(self, filename): 53 | self.message = ( 54 | "Validating the file '%s' failed. \n" 55 | "Please raise an issue on the GitHub page for this project \n" 56 | "if the error persists." % filename 57 | ) 58 | super().__init__(self.message) 59 | 60 | def check_md5sum(filename, checksum): 61 | with open(filename, "rb") as fp: 62 | data = fp.read() 63 | h = hashlib.md5(data).hexdigest() 64 | return h == checksum 65 | 66 | 67 | def validate(checksum): 68 | """Decorator that validates the target file.""" 69 | 70 | def validate_decorator(func): 71 | @wraps(func) 72 | def wrapper(*args, **kwargs): 73 | target = kwargs.get("target_path", None) 74 | if os.path.exists(target) and check_md5sum(target, checksum): 75 | return 76 | out = func(*args, **kwargs) 77 | if not os.path.exists(target): 78 | raise FileNotFoundError("Target file expected at: %s" % target) 79 | if not check_md5sum(target, checksum): 80 | raise ValidationError(target) 81 | return out 82 | 83 | return wrapper 84 | 85 | return validate_decorator 86 | 87 | 88 | @validate(MD5_XLSX) 89 | def download_xlsx(target_path=None): 90 | count = 0 91 | while count < 5: 92 | count += 1 93 | try: 94 | urlretrieve(XLSX_URL, target_path) 95 | return 96 | except URLError as err: 97 | print( 98 | "Error occurred (%r) when trying to download xlsx. Retrying in 5 seconds" 99 | % err, 100 | file=sys.stderr, 101 | ) 102 | time.sleep(5) 103 |
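A quick check of the month formatting used by write_json below (the arguments are illustrative):

# Sketch: assumes format_ym from get_iceland_tourism.py is in scope.
assert format_ym(2003, "March") == "2003-03"  # zero-padded to match "%Y-%m"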
104 | 105 | def format_ym(year, month): 106 | midx = MONTHS[month] 107 | return "%i-%02d" % (int(year), midx) 108 | 109 | 110 | @validate(MD5_JSON) 111 | def write_json(xlsx_path, target_path=None): 112 | wb = openpyxl.load_workbook(xlsx_path) 113 | ws = wb.worksheets[2] 114 | 115 | rows = list(ws.rows) 116 | 117 | # hardcoding these row indices, not worth doing it nicely 118 | header = rows[2] 119 | 120 | column_idx = [ 121 | i 122 | for i, c in enumerate(header) 123 | if c.data_type == "n" and c.value and 2003 <= c.value < 2020 124 | ] 125 | 126 | visitors = [] 127 | 128 | r_offset = 4 129 | for c in column_idx: 130 | for r in range(r_offset, r_offset + 12): 131 | cell = ws.cell(r, c + 1)  # openpyxl rows and columns are 1-indexed 132 | if cell.value is None or str(cell.value) == "": 133 | continue 134 | year = header[c].value 135 | month = ws.cell(r, 1).value 136 | datestr = format_ym(year, month) 137 | # eliminate some observations that were not in the original dataset 138 | if datestr in ["2019-08", "2019-09", "2019-10"]: 139 | continue 140 | item = {"time": datestr, "value": int(cell.value)} 141 | visitors.append(item) 142 | 143 | name = "iceland_tourism" 144 | longname = "Iceland Tourism" 145 | 146 | data = { 147 | "name": name, 148 | "longname": longname, 149 | "n_obs": len(visitors), 150 | "n_dim": 1, 151 | "time": { 152 | "format": "%Y-%m", 153 | "index": list(range(len(visitors))), 154 | "raw": [v["time"] for v in visitors], 155 | }, 156 | "series": [ 157 | { 158 | "label": "Visitor Number", 159 | "type": "int", 160 | "raw": [v["value"] for v in visitors], 161 | } 162 | ], 163 | } 164 | 165 | with open(target_path, "w") as fp: 166 | json.dump(data, fp, indent="\t") 167 | 168 | 169 | def collect(output_dir="."): 170 | xlsx_path = os.path.join(output_dir, NAME_XLSX) 171 | json_path = os.path.join(output_dir, NAME_JSON) 172 | 173 | download_xlsx(target_path=xlsx_path) 174 | write_json(xlsx_path, target_path=json_path) 175 | 176 | 177 | def clean(output_dir="."): 178 | xlsx_path = os.path.join(output_dir, NAME_XLSX) 179 | json_path = os.path.join(output_dir, NAME_JSON) 180 | 181 | if os.path.exists(xlsx_path): 182 | os.unlink(xlsx_path) 183 | if os.path.exists(json_path): 184 | os.unlink(json_path) 185 | 186 | 187 | def parse_args(): 188 | parser = argparse.ArgumentParser() 189 | parser.add_argument( 190 | "-o", "--output-dir", help="output directory to use", default="." 191 | ) 192 | parser.add_argument( 193 | "action", 194 | choices=["collect", "clean"], 195 | help="Action to perform", 196 | default="collect", 197 | nargs="?", 198 | ) 199 | return parser.parse_args() 200 | 201 | 202 | def main(output_dir="."): 203 | args = parse_args() 204 | if args.action == "collect": 205 | collect(output_dir=args.output_dir) 206 | elif args.action == "clean": 207 | clean(output_dir=args.output_dir) 208 | 209 | 210 | if __name__ == "__main__": 211 | main() 212 | --------------------------------------------------------------------------------
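All of the dataset JSON files above share the structure that utils/validate_dataset.py enforces: name, longname, n_obs, n_dim, an optional time block, and a list of series. A minimal loading sketch using only the standard library; this is not the bundled examples/python/load_dataset.py, and "measles.json" is just an example filename:

# Sketch: read a collected TCPD dataset and pair time labels with values.
import json

with open("measles.json") as fp:
    data = json.load(fp)

print("%s: %d observations, %d series" % (data["longname"], data["n_obs"], data["n_dim"]))
t = data.get("time", {})
labels = t.get("raw") or t.get("index") or list(range(data["n_obs"]))
for series in data["series"]:
    # Missing values, if any, arrive as None (JSON null) after parsing.
    print(series["label"], list(zip(labels, series["raw"]))[:3], "...")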