├── datasets ├── nile │ ├── .gitignore │ ├── nile.png │ ├── README.md │ └── nile.json ├── businv │ ├── .gitignore │ ├── businv.png │ ├── README.md │ └── convert.py ├── centralia │ ├── .gitignore │ ├── centralia.png │ ├── from_wikipedia.txt │ ├── README.md │ ├── centralia.json │ └── convert.py ├── homeruns │ ├── .gitignore │ ├── homeruns.png │ ├── README.md │ ├── homeruns.json │ └── get_homeruns.py ├── ozone │ ├── .gitignore │ ├── ozone.png │ ├── README.md │ ├── convert.py │ ├── ozone.json │ └── ozone-depleting-substance-emissions.csv ├── run_log │ ├── .gitignore │ ├── run_log.png │ ├── README.md │ ├── LICENSE │ └── convert.py ├── seatbelts │ ├── .gitignore │ ├── seatbelts.png │ └── README.md ├── well_log │ ├── .gitignore │ ├── well_log.png │ ├── README.md │ └── convert.py ├── brent_spot │ ├── .gitignore │ ├── brent_spot.png │ ├── README.md │ └── convert.py ├── construction │ ├── .gitignore │ ├── privtime.xls │ ├── construction.png │ ├── README.md │ └── convert.py ├── global_co2 │ ├── .gitignore │ ├── global_co2.png │ ├── README.md │ └── get_global_co2.py ├── jfk_passengers │ ├── .gitignore │ ├── jfk_passengers.png │ ├── README.md │ └── convert.py ├── lga_passengers │ ├── .gitignore │ ├── lga_passengers.png │ ├── README.md │ └── convert.py ├── rail_lines │ ├── .gitignore │ ├── rail_lines.png │ ├── README.md │ └── rail_lines.json ├── us_population │ ├── .gitignore │ ├── us_population.png │ ├── README.md │ └── convert.py ├── quality_control_1 │ ├── .gitignore │ ├── quality_control_1.png │ └── README.md ├── quality_control_2 │ ├── .gitignore │ ├── quality_control_2.png │ └── README.md ├── quality_control_3 │ ├── .gitignore │ ├── quality_control_3.png │ └── README.md ├── quality_control_4 │ ├── .gitignore │ ├── quality_control_4.png │ └── README.md ├── quality_control_5 │ ├── .gitignore │ ├── quality_control_5.png │ └── README.md ├── shanghai_license │ ├── .gitignore │ ├── shanghai_license.png │ ├── README.md │ └── convert.py ├── unemployment_nl │ ├── .gitignore │ ├── unemployment_nl.png │ ├── README.md │ └── convert.py ├── gdp_iran │ ├── .gitignore │ ├── gdp_iran.png │ ├── README.md │ ├── convert.py │ └── gdp_iran.json ├── usd_isk │ ├── .gitignore │ ├── usd_isk.png │ ├── README.md │ ├── convert.py │ └── ert_bil_eur_m_Label.csv ├── gdp_argentina │ ├── .gitignore │ ├── gdp_argentina.png │ ├── README.md │ ├── convert.py │ └── gdp_argentina.json ├── gdp_croatia │ ├── .gitignore │ ├── gdp_croatia.png │ ├── README.md │ ├── gdp_croatia.json │ └── convert.py ├── apple │ ├── .gitignore │ ├── apple.png │ └── README.md ├── measles │ ├── .gitignore │ ├── measles.png │ ├── README.md │ └── get_measles.py ├── bee_waggle_6 │ ├── .gitignore │ ├── bee_waggle_6.png │ └── README.md ├── bitcoin │ ├── .gitignore │ ├── bitcoin.png │ ├── README.md │ └── get_bitcoin.py ├── occupancy │ ├── .gitignore │ ├── occupancy.png │ ├── README.md │ └── get_occupancy.py ├── ratner_stock │ ├── .gitignore │ ├── ratner_stock.png │ └── README.md ├── robocalls │ ├── .gitignore │ ├── robocalls.png │ └── README.md ├── scanline_42049 │ ├── .gitignore │ ├── scanline_42049.png │ ├── README.md │ └── get_scanline_42049.py ├── scanline_126007 │ ├── .gitignore │ ├── scanline_126007.png │ ├── README.md │ └── get_scanline_126007.py ├── bank │ ├── bank.png │ ├── README.md │ └── LICENSE ├── iceland_tourism │ ├── .gitignore │ ├── iceland_tourism.png │ ├── README.md │ └── get_iceland_tourism.py ├── gdp_japan │ ├── gdp_japan.png │ ├── README.md │ ├── gdp_japan.csv │ └── gdp_japan.json ├── co2_canada │ ├── co2_canada.png │ ├── README.md │ └── 
co2_canada.csv ├── debt_ireland │ ├── debt_ireland.png │ ├── debt_ireland.csv │ ├── README.md │ └── debt_ireland.json ├── uk_coal_employ │ ├── Coal_since_1853.xls │ ├── uk_coal_employ.png │ ├── README.md │ ├── employ_only.csv │ └── uk_coal_employ.json └── children_per_woman │ ├── children_per_woman.png │ ├── tfr-by-gapminder.xlsx │ └── README.md ├── .gitignore ├── CHANGELOG.md ├── .github └── workflows │ ├── action.yml │ └── validate.yml ├── requirements.txt ├── examples ├── R │ ├── README.md │ └── load_dataset.R └── python │ ├── README.md │ └── load_dataset.py ├── Dockerfile ├── LICENSE ├── Makefile ├── checksums.json ├── utils ├── check_checksums.py ├── plot_dataset.py └── validate_dataset.py └── schema.json /datasets/nile/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/businv/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/centralia/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/homeruns/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/ozone/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/run_log/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/seatbelts/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/well_log/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/brent_spot/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/construction/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/global_co2/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/jfk_passengers/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/lga_passengers/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/rail_lines/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | 
-------------------------------------------------------------------------------- /datasets/us_population/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/quality_control_1/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/quality_control_2/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/quality_control_3/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/quality_control_4/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/quality_control_5/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/shanghai_license/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/unemployment_nl/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | -------------------------------------------------------------------------------- /datasets/gdp_iran/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | original/ 3 | -------------------------------------------------------------------------------- /datasets/usd_isk/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | original/ 3 | -------------------------------------------------------------------------------- /datasets/gdp_argentina/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | original/ 3 | -------------------------------------------------------------------------------- /datasets/gdp_croatia/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | original/ 3 | -------------------------------------------------------------------------------- /datasets/apple/.gitignore: -------------------------------------------------------------------------------- 1 | AAPL.csv 2 | apple.json 3 | old/ 4 | -------------------------------------------------------------------------------- /datasets/measles/.gitignore: -------------------------------------------------------------------------------- 1 | ewmeas.dat 2 | measles.json 3 | old/ 4 | -------------------------------------------------------------------------------- /datasets/bee_waggle_6/.gitignore: -------------------------------------------------------------------------------- 1 | bee_waggle_6.json 2 | old/ 3 | psslds.zip 4 | -------------------------------------------------------------------------------- /datasets/bitcoin/.gitignore: -------------------------------------------------------------------------------- 1 | bitcoin.json 2 | market-price.csv 3 | old/ 
4 | -------------------------------------------------------------------------------- /datasets/occupancy/.gitignore: -------------------------------------------------------------------------------- 1 | datatraining.txt 2 | occupancy.json 3 | old/ 4 | -------------------------------------------------------------------------------- /datasets/ratner_stock/.gitignore: -------------------------------------------------------------------------------- 1 | SIG.csv 2 | old/ 3 | ratner_stock.json 4 | -------------------------------------------------------------------------------- /datasets/robocalls/.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | robocalls.html 3 | robocalls.json 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | */__pycache__ 2 | */*/__pycache__ 3 | *.pyc 4 | venv/ 5 | export/ 6 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## Version 1.0.0 4 | 5 | * Initial release 6 | -------------------------------------------------------------------------------- /datasets/scanline_42049/.gitignore: -------------------------------------------------------------------------------- 1 | 42049.jpg 2 | old/ 3 | scanline_42049.json 4 | -------------------------------------------------------------------------------- /datasets/scanline_126007/.gitignore: -------------------------------------------------------------------------------- 1 | 126007.jpg 2 | old/ 3 | scanline_126007.json 4 | -------------------------------------------------------------------------------- /datasets/bank/bank.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/bank/bank.png -------------------------------------------------------------------------------- /datasets/nile/nile.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/nile/nile.png -------------------------------------------------------------------------------- /datasets/apple/apple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/apple/apple.png -------------------------------------------------------------------------------- /datasets/iceland_tourism/.gitignore: -------------------------------------------------------------------------------- 1 | iceland_tourism.json 2 | old/ 3 | visitors-to-iceland-2002-2019-oct.xlsx 4 | -------------------------------------------------------------------------------- /datasets/ozone/ozone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/ozone/ozone.png -------------------------------------------------------------------------------- /datasets/bitcoin/bitcoin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/bitcoin/bitcoin.png -------------------------------------------------------------------------------- /datasets/businv/businv.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/businv/businv.png -------------------------------------------------------------------------------- /datasets/measles/measles.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/measles/measles.png -------------------------------------------------------------------------------- /datasets/run_log/run_log.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/run_log/run_log.png -------------------------------------------------------------------------------- /datasets/usd_isk/usd_isk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/usd_isk/usd_isk.png -------------------------------------------------------------------------------- /datasets/gdp_iran/gdp_iran.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/gdp_iran/gdp_iran.png -------------------------------------------------------------------------------- /datasets/homeruns/homeruns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/homeruns/homeruns.png -------------------------------------------------------------------------------- /datasets/well_log/well_log.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/well_log/well_log.png -------------------------------------------------------------------------------- /datasets/centralia/centralia.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/centralia/centralia.png -------------------------------------------------------------------------------- /datasets/gdp_japan/gdp_japan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/gdp_japan/gdp_japan.png -------------------------------------------------------------------------------- /datasets/occupancy/occupancy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/occupancy/occupancy.png -------------------------------------------------------------------------------- /datasets/robocalls/robocalls.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/robocalls/robocalls.png -------------------------------------------------------------------------------- /datasets/seatbelts/seatbelts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/seatbelts/seatbelts.png -------------------------------------------------------------------------------- /datasets/brent_spot/brent_spot.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/brent_spot/brent_spot.png -------------------------------------------------------------------------------- /datasets/co2_canada/co2_canada.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/co2_canada/co2_canada.png -------------------------------------------------------------------------------- /datasets/construction/privtime.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/construction/privtime.xls -------------------------------------------------------------------------------- /datasets/global_co2/global_co2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/global_co2/global_co2.png -------------------------------------------------------------------------------- /datasets/rail_lines/rail_lines.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/rail_lines/rail_lines.png -------------------------------------------------------------------------------- /datasets/bee_waggle_6/bee_waggle_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/bee_waggle_6/bee_waggle_6.png -------------------------------------------------------------------------------- /datasets/construction/construction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/construction/construction.png -------------------------------------------------------------------------------- /datasets/debt_ireland/debt_ireland.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/debt_ireland/debt_ireland.png -------------------------------------------------------------------------------- /datasets/gdp_croatia/gdp_croatia.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/gdp_croatia/gdp_croatia.png -------------------------------------------------------------------------------- /datasets/ratner_stock/ratner_stock.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/ratner_stock/ratner_stock.png -------------------------------------------------------------------------------- /datasets/gdp_argentina/gdp_argentina.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/gdp_argentina/gdp_argentina.png -------------------------------------------------------------------------------- /datasets/us_population/us_population.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/us_population/us_population.png 
-------------------------------------------------------------------------------- /datasets/jfk_passengers/jfk_passengers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/jfk_passengers/jfk_passengers.png -------------------------------------------------------------------------------- /datasets/lga_passengers/lga_passengers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/lga_passengers/lga_passengers.png -------------------------------------------------------------------------------- /datasets/scanline_42049/scanline_42049.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/scanline_42049/scanline_42049.png -------------------------------------------------------------------------------- /datasets/uk_coal_employ/Coal_since_1853.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/uk_coal_employ/Coal_since_1853.xls -------------------------------------------------------------------------------- /datasets/uk_coal_employ/uk_coal_employ.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/uk_coal_employ/uk_coal_employ.png -------------------------------------------------------------------------------- /datasets/iceland_tourism/iceland_tourism.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/iceland_tourism/iceland_tourism.png -------------------------------------------------------------------------------- /datasets/scanline_126007/scanline_126007.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/scanline_126007/scanline_126007.png -------------------------------------------------------------------------------- /datasets/unemployment_nl/unemployment_nl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/unemployment_nl/unemployment_nl.png -------------------------------------------------------------------------------- /datasets/quality_control_1/quality_control_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/quality_control_1/quality_control_1.png -------------------------------------------------------------------------------- /datasets/quality_control_2/quality_control_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/quality_control_2/quality_control_2.png -------------------------------------------------------------------------------- /datasets/quality_control_3/quality_control_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/quality_control_3/quality_control_3.png 
-------------------------------------------------------------------------------- /datasets/quality_control_4/quality_control_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/quality_control_4/quality_control_4.png -------------------------------------------------------------------------------- /datasets/quality_control_5/quality_control_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/quality_control_5/quality_control_5.png -------------------------------------------------------------------------------- /datasets/shanghai_license/shanghai_license.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/shanghai_license/shanghai_license.png -------------------------------------------------------------------------------- /datasets/children_per_woman/children_per_woman.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/children_per_woman/children_per_woman.png -------------------------------------------------------------------------------- /datasets/children_per_woman/tfr-by-gapminder.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/TCPD/HEAD/datasets/children_per_woman/tfr-by-gapminder.xlsx -------------------------------------------------------------------------------- /.github/workflows/action.yml: -------------------------------------------------------------------------------- 1 | name: 'TCPD Docker' 2 | description: 'Runs the TCPD build script in a Docker container' 3 | runs: 4 | using: 'docker' 5 | image: '../../Dockerfile' 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Pillow>=6.2.1 2 | beautifulsoup4>=4.8.1 3 | clevercsv>=0.4.7 4 | numpy>=1.17 5 | requests>=2.22.0 6 | yfinance>=0.1.79 7 | jsonschema>=3.2.0 8 | openpyxl 9 | diff-match-patch 10 | -------------------------------------------------------------------------------- /datasets/quality_control_5/README.md: -------------------------------------------------------------------------------- 1 | # Quality Control no. 5 2 | 3 | This is a simple quality control dataset with no change points and Gaussian 4 | (0, 1) noise. 5 | 6 | ![Plot of quality_control_5 dataset](./quality_control_5.png) 7 | -------------------------------------------------------------------------------- /datasets/centralia/from_wikipedia.txt: -------------------------------------------------------------------------------- 1 | 1870 1342 2 | 1880 1886 3 | 1890 2761 4 | 1900 2048 5 | 1910 2429 6 | 1920 2336 7 | 1930 2446 8 | 1940 2449 9 | 1950 1986 10 | 1960 1435 11 | 1970 1165 12 | 1980 1017 13 | 1990 63 14 | 2000 21 15 | 2010 10 16 | -------------------------------------------------------------------------------- /datasets/quality_control_4/README.md: -------------------------------------------------------------------------------- 1 | # Quality Control no. 4 2 | 3 | This dataset has multiple periodic components with different amplitude and an 4 | offset change at time index 341. 
5 | 6 | ![Plot of quality_control_4 dataset](./quality_control_4.png) 7 | -------------------------------------------------------------------------------- /datasets/centralia/README.md: -------------------------------------------------------------------------------- 1 | # Population of Centralia, Pennsylvania 2 | 3 | Centralia is an abandoned mining town in the US. 4 | 5 | Source: 6 | [https://en.wikipedia.org/wiki/Centralia,_Pennsylvania#Demographics](https://en.wikipedia.org/wiki/Centralia,_Pennsylvania#Demographics) 7 | 8 | ![Plot of centralia dataset](./centralia.png) 9 | -------------------------------------------------------------------------------- /datasets/bank/README.md: -------------------------------------------------------------------------------- 1 | # Bank amounts 2 | 3 | This dataset represents the amount of money in someone's current account. 4 | Significant changes occur on days of large transactions. 5 | 6 | The ``bank.json`` file and this readme are licensed under the MIT license, see 7 | the LICENSE file. 8 | 9 | ![Plot of bank dataset](./bank.png) 10 | -------------------------------------------------------------------------------- /datasets/quality_control_1/README.md: -------------------------------------------------------------------------------- 1 | # Quality Control no. 1 2 | 3 | This is a quality control dataset with a known change point at time index 146. 4 | The series has a small trend, with Gaussian noise before the change point 5 | and an offset and uniform noise after it. 6 | 7 | ![Plot of quality_control_1 dataset](./quality_control_1.png) 8 | -------------------------------------------------------------------------------- /datasets/quality_control_2/README.md: -------------------------------------------------------------------------------- 1 | # Quality Control no. 2 2 | 3 | This is a quality control dataset with a known change point at time index 97. 4 | The data has constant Gaussian (0, 1) noise throughout, with a step change of 5 | size 1.5. It exemplifies the kind of datasets used in simulation studies of CP 6 | algorithms. 7 | 8 | ![Plot of quality_control_2 dataset](./quality_control_2.png) 9 | -------------------------------------------------------------------------------- /datasets/quality_control_3/README.md: -------------------------------------------------------------------------------- 1 | # Quality Control no. 3 2 | 3 | This is a quality control dataset with a slight seasonal pattern and a known 4 | change point at time index 179. The change is a shift in the Gaussian noise 5 | distribution from (0, 1) to (2, 2). The data also contains an 6 | outlier at index 42 (indexing from 0). 7 | 8 | ![Plot of quality_control_3 dataset](./quality_control_3.png) 9 | -------------------------------------------------------------------------------- /datasets/bitcoin/README.md: -------------------------------------------------------------------------------- 1 | # Bitcoin Market Price 2 | 3 | This data is obtained from: 4 | [https://www.blockchain.com/charts/market-price?timespan=all](https://www.blockchain.com/charts/market-price?timespan=all). 5 | As this data can not be redistributed, the Makefile will download it from the 6 | Internet Archive. 7 | 8 | The first 500 observations of the resulting time series are removed as they 9 | are quite uninteresting.
10 | 11 | ![Plot of bitcoin dataset](./bitcoin.png) 12 | -------------------------------------------------------------------------------- /datasets/scanline_42049/README.md: -------------------------------------------------------------------------------- 1 | # Scan line of image 42049 2 | 3 | This is a "scan line", a horizontal slice, from a grayscale image from the 4 | BSD300 dataset. The image and the exact index of the scan line have been 5 | selected because of the abrupt changes between black and white that occur. 6 | 7 | As it is not clear whether the BSD300 images can be redistributed freely, we 8 | download the image from the internet archive instead. 9 | 10 | ![Plot of scanline_42049 dataset](./scanline_42049.png) 11 | -------------------------------------------------------------------------------- /datasets/scanline_126007/README.md: -------------------------------------------------------------------------------- 1 | # Scan line of image 126007 2 | 3 | This is a "scan line", a horizontal slice, from a grayscale image from the 4 | BSD300 dataset. The image and the exact index of the scan line have been 5 | selected because of the abrupt changes between black and white that occur. 6 | 7 | As it is not clear whether the BSD300 images can be redistributed freely, we 8 | download the image from the internet archive instead. 9 | 10 | ![Plot of scanline_126007 dataset](./scanline_126007.png) 11 | -------------------------------------------------------------------------------- /datasets/debt_ireland/debt_ireland.csv: -------------------------------------------------------------------------------- 1 | Statistical Data Warehouse code,AME.A.IRL.1.0.319.0.UDGGL 2 | Country,Ireland 3 | 2000,36.0732199 4 | 2001,33.2394627 5 | 2002,30.5521068 6 | 2003,29.9296861 7 | 2004,28.2148891 8 | 2005,26.0766114 9 | 2006,23.618314 10 | 2007,23.9083721 11 | 2008,42.4036869 12 | 2009,61.5433048 13 | 2010,85.9938449 14 | 2011,110.861647 15 | 2012,119.8646655 16 | 2013,119.6837014 17 | 2014,104.1283774 18 | 2015,76.8191392 19 | 2016,73.4443135 20 | 2017,68.4403541 21 | 2018,63.8514846 22 | 2019,61.1353388 23 | 2020,56.0118623 24 | -------------------------------------------------------------------------------- /datasets/run_log/README.md: -------------------------------------------------------------------------------- 1 | # Interval Training Running Pace 2 | 3 | This dataset shows the pace of a runner during an interval training session, 4 | where a mobile application provides instructions on when to run and when to 5 | walk. 6 | 7 | Data obtained from the authors' RunDouble account for a run on 2018-07-31. 8 | 9 | See the LICENSE file for the license of the ``stats.csv`` file. To retrieve 10 | ``run_log.json`` from ``stats.csv``, run: 11 | 12 | ``` 13 | $ python convert.py stats.csv run_log.json 14 | ``` 15 | 16 | ![Plot of run_log dataset](./run_log.png) 17 | -------------------------------------------------------------------------------- /datasets/iceland_tourism/README.md: -------------------------------------------------------------------------------- 1 | # Iceland Tourism numbers by Month 2 | 3 | Source [Icelandic Tourist 4 | Board](https://www.ferdamalastofa.is/en/recearch-and-statistics/numbers-of-foreign-visitors). 5 | 6 | This dataset contains the monthly visitor numbers of tourists to Iceland, 7 | arriving through Keflavik airport. The data is obtained from the Icelandic 8 | Tourist Board. 
Since it is unclear if the data is in the public domain, we 9 | download it from an archive.org URL and do not redistribute it as part of this 10 | repository. 11 | 12 | ![Plot of iceland_tourism dataset](./iceland_tourism.png) 13 | -------------------------------------------------------------------------------- /datasets/robocalls/README.md: -------------------------------------------------------------------------------- 1 | # Robocalls in the US per month 2 | 3 | Data obtained from [RoboCallIndex](https://robocallindex.com/history/time). As 4 | it is not clear whether we can redistribute the data as part of this 5 | repository, we retrieve it locally instead. 6 | 7 | There is a potential change point in March 2018, when the Federal Appeals Court 8 | struck down FCC rules on robocalls. The full history of the relevant legislation 9 | can be found at 10 | [https://epic.org/amicus/tcpa/aca-international/](https://epic.org/amicus/tcpa/aca-international/). 11 | 12 | ![Plot of robocalls dataset](./robocalls.png) 13 | -------------------------------------------------------------------------------- /datasets/debt_ireland/README.md: -------------------------------------------------------------------------------- 1 | # Debt of Ireland 2 | 3 | Data obtained from 4 | [Eurostat](https://www.euro-area-statistics.org/macro-economic-indicators?cr=aut&lg=en&page=2&template=1). 5 | 6 | Specifically, the time series concerns the government debt ratio of Ireland for 7 | the period 2000-2020. Effects of the financial crisis are visible. 8 | 9 | Source: euro area statistics. 10 | Retrieved: 2019-03-27. 11 | 12 | The information page of the Euro Area Statistics website states that data can 13 | be redistributed under the condition that the source is quoted. 14 | 15 | ![Plot of debt_ireland dataset](./debt_ireland.png) 16 | -------------------------------------------------------------------------------- /datasets/children_per_woman/README.md: -------------------------------------------------------------------------------- 1 | # Children per Woman 2 | 3 | This is a dataset from GapMinder showing the number of children per woman on 4 | average, globally. 5 | 6 | The original data is obtained from GapMinder at: 7 | [https://www.gapminder.org/data/documentation/gd008/](https://www.gapminder.org/data/documentation/gd008/) 8 | 9 | The time series that we use is from the ``world_total`` tab in the XLSX file. 10 | 11 | GapMinder data is licensed under the CC BY 4.0 license, which allows us to 12 | redistribute the original xlsx file here. Attribution: Free data from 13 | www.gapminder.org. 14 | 15 | ![Plot of children_per_woman dataset](./children_per_woman.png) 16 | -------------------------------------------------------------------------------- /datasets/homeruns/README.md: -------------------------------------------------------------------------------- 1 | # Home Runs in the American League by Year 2 | 3 | Data retrieved from the [Baseball 4 | Databank](https://github.com/chadwickbureau/baseballdatabank). The file 5 | ``Batting.csv`` is obtained from that repository and can be redistributed 6 | here under the [CC BY-SA 3.0 7 | license](https://creativecommons.org/licenses/by-sa/3.0/). This implies that 8 | both ``Batting.csv`` and ``homeruns.json`` are licensed under this same 9 | license: http://creativecommons.org/licenses/by-sa/3.0/. 10 | 11 | This dataset lists the number of home runs in the American League of baseball 12 | by year.
13 | 14 | ![Plot of homeruns dataset](./homeruns.png) 15 | -------------------------------------------------------------------------------- /datasets/gdp_iran/README.md: -------------------------------------------------------------------------------- 1 | # GDP of Iran in constant LCU 2 | 3 | Potential change point around the Iranian Revolution. Obtained from the [World 4 | Bank](https://data.worldbank.org/indicator/NY.GDP.MKTP.KN?end=2018&locations=IR&start=1960). 5 | 6 | The dataset is licensed under the CC-BY 4.0 license. Data retrieved from the 7 | World Bank on 2019-08-28. No modifications to the original data file 8 | (``API_NY.GDP.MKTP.KN_DS2_en_csv_v2_126181.csv``) have been made. 9 | 10 | To retrieve the ``gdp_iran.json`` file from the csv file, simply run: 11 | 12 | ``` 13 | $ python convert.py ./API_NY.GDP.MKTP.KN_DS2_en_csv_v2_126181.csv ./gdp_iran.json 14 | ``` 15 | 16 | ![Plot of gdp_iran dataset](./gdp_iran.png) 17 | -------------------------------------------------------------------------------- /datasets/gdp_croatia/README.md: -------------------------------------------------------------------------------- 1 | # GDP of Croatia in constant LCU 2 | 3 | Apparent change point around the financial crisis. Obtained from the [World 4 | Bank](https://data.worldbank.org/indicator/NY.GDP.MKTP.KN?end=2018&locations=HR&start=1995). 5 | 6 | The dataset is licensed under the CC-BY 4.0 license. Data retrieved from the 7 | World Bank on 2019-08-28. No modifications to the original data file 8 | (``API_NY.GDP.MKTP.KN_DS2_en_csv_v2_126181.csv``) have been made. 9 | 10 | To retrieve the ``gdp_croatia.json`` file from the csv file, simply run: 11 | 12 | ``` 13 | $ python convert.py ./API_NY.GDP.MKTP.KN_DS2_en_csv_v2_126181.csv ./gdp_croatia.json 14 | ``` 15 | 16 | ![Plot of gdp_croatia dataset](./gdp_croatia.png) 17 | -------------------------------------------------------------------------------- /datasets/gdp_argentina/README.md: -------------------------------------------------------------------------------- 1 | # GDP of Argentina in constant LCU 2 | 3 | Potential change point around the financial crisis. Obtained from the [World 4 | Bank](https://data.worldbank.org/indicator/NY.GDP.MKTP.KN?end=2018&locations=AR&start=1960). 5 | 6 | The dataset is licensed under the CC-BY 4.0 license. Data retrieved from the 7 | World Bank on 2019-08-28. No modifications to the original data file 8 | (``API_NY.GDP.MKTP.KN_DS2_en_csv_v2_126181.csv``) have been made. 9 | 10 | To retrieve the ``gdp_argentina.json`` file from the csv file, simply run: 11 | 12 | ``` 13 | $ python convert.py ./API_NY.GDP.MKTP.KN_DS2_en_csv_v2_126181.csv ./gdp_argentina.json 14 | ``` 15 | 16 | ![Plot of gdp_argentina dataset](./gdp_argentina.png) 17 | -------------------------------------------------------------------------------- /datasets/seatbelts/README.md: -------------------------------------------------------------------------------- 1 | # UK Driver Deaths 2 | 3 | This dataset concerns the number of drivers killed or seriously injured in the 4 | UK around the period in which seatbelts were introduced. Seatbelts were compulsory 5 | equipment in all new cars in 1972 and were mandatory to be worn from 1983 6 | onwards. 7 | 8 | Data exported from R, where it is a builtin dataset called 9 | ``UKDriverDeaths`` in the ``datasets`` package. Since the ``datasets`` package 10 | is part of R, it is licensed under version 2 of the [GNU Public 11 | License](https://www.r-project.org/COPYING).
The data file produced from this 12 | data (``seatbelts.json``) is therefore licensed under GPLv2 as well. 13 | 14 | ![Plot of seatbelts dataset](./seatbelts.png) 15 | -------------------------------------------------------------------------------- /datasets/rail_lines/README.md: -------------------------------------------------------------------------------- 1 | # Rail Lines (total route-km) 2 | 3 | Data on the total kilometers of rail lines in the world, obtained from 4 | the [World 5 | Bank](https://data.worldbank.org/indicator/IS.RRS.TOTL.KM?locations=1W). 6 | 7 | The dataset is licensed under [CC BY 8 | 4.0](https://creativecommons.org/licenses/by/4.0/) and can therefore be 9 | redistributed as part of this repository. No modifications to the data have 10 | been made during the conversion to the JSON format. 11 | 12 | - ``./API_IS.RRS.TOTL.KM_DS2_en_csv_v2_10520532.csv`` contains the original 13 | dataset retrieved from the World Bank. 14 | - ``./rail_lines.json`` contains the data from the entire world in JSON 15 | format. 16 | 17 | ![Plot of rail_lines dataset](./rail_lines.png) 18 | -------------------------------------------------------------------------------- /datasets/unemployment_nl/README.md: -------------------------------------------------------------------------------- 1 | # Unemployment in the Netherlands 2 | 3 | This data shows the percentage of unemployed people in the labor population. 4 | The original data is retrieved from [Statistics 5 | Netherlands](https://opendata.cbs.nl/statline/#/CBS/nl/dataset/71882ned/table?ts=1554392218500 6 | ) and can be redistributed as part of this repository. 7 | 8 | In the time series we use the data for both genders, with the corrected 9 | value for the year 2001. 10 | 11 | To retrieve the ``unemployment_nl.json`` file from the original source file, 12 | simply run: 13 | 14 | ``` 15 | $ python convert.py Beroepsbevolking__vanaf_1800__12_uursgrens___1800_2013_04042019_154346.csv unemployment_nl.json 16 | ``` 17 | 18 | ![Plot of unemployment_nl dataset](./unemployment_nl.png) 19 | -------------------------------------------------------------------------------- /datasets/co2_canada/README.md: -------------------------------------------------------------------------------- 1 | # CO2 emissions (tonnes per person) for Canada 2 | 3 | This series describes carbon dioxide emissions from the burning of fossil 4 | fuels (metric tonnes of CO2 per person) in Canada. The data is retrieved from 5 | [GapMinder](https://www.gapminder.org/tools/#$state$marker$axis_y$which=co2_emissions_tonnes_per_person&domainMin:null&domainMax:null&zoomedMin:null&zoomedMax:null&scaleType=genericLog&spaceRef:null;;;&chart-type=bubbles). 6 | 7 | We isolate Canada because the series has a long history and the behaviour looks 8 | interesting. 9 | 10 | GapMinder data is licensed under the CC BY 4.0 license, which allows us to 11 | redistribute the original data here. Attribution: Free data from 12 | www.gapminder.org. 13 | 14 | ![Plot of co2_canada dataset](./co2_canada.png) 15 | -------------------------------------------------------------------------------- /datasets/gdp_japan/README.md: -------------------------------------------------------------------------------- 1 | # Historic GDP of Japan in the Local Currency Unit (LCU) 2 | 3 | Data obtained from the [World 4 | Bank](https://data.worldbank.org/indicator/NY.GDP.MKTP.CN?locations=JP).
5 | 6 | There is a known structural break in the growth rate of Japan, associated with the 7 | [lost decade](https://en.wikipedia.org/wiki/Lost_Decade_(Japan)). 8 | 9 | The dataset is licensed under the CC-BY 4.0 license. Data retrieved from the 10 | World Bank on 2019-03-27. No modifications to the original data file 11 | (``gdp.csv``) have been made. The file ``gdp_japan.csv`` is a subset of the 12 | ``gdp.csv`` file that contains only the data for Japan. The ``gdp_japan.json`` 13 | file is manually constructed from the ``gdp_japan.csv`` file. 14 | 15 | ![Plot of gdp_japan dataset](./gdp_japan.png) 16 | -------------------------------------------------------------------------------- /datasets/uk_coal_employ/README.md: -------------------------------------------------------------------------------- 1 | # Historic Employment in UK Coal Mines 2 | 3 | This is historic data obtained from [the UK 4 | government](https://www.gov.uk/government/statistical-data-sets/historical-coal-data-coal-production-availability-and-consumption). 5 | As the dataset is licensed under the [Open Government 6 | License](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/), 7 | we distribute it as part of this repository. 8 | 9 | We use the employment column, which gives the number of workers employed in British 10 | coal mines; it is extracted to ``employ_only.csv`` from ``Coal_since_1853.xls`` and 11 | converted to the ``uk_coal_employ.json`` file. Missing values in the data are 12 | indicated by a ``null`` value in the JSON file. 13 | 14 | ![Plot of uk_coal_employ dataset](./uk_coal_employ.png) 15 | -------------------------------------------------------------------------------- /datasets/jfk_passengers/README.md: -------------------------------------------------------------------------------- 1 | # JFK Airline Passengers 2 | 3 | This dataset gives the number of passengers arriving and departing at JFK. 4 | 5 | The data is obtained from New York State's official Kaggle page for this 6 | dataset: https://www.kaggle.com/new-york-state/nys-air-passenger-traffic,-port-authority-of-ny-nj#air-passenger-traffic-per-month-port-authority-of-ny-nj-beginning-1977.csv 7 | 8 | This page shows that the data is under a "CC0: Public Domain" license, so we 9 | redistribute it here as part of our repository. 10 | 11 | To create the ``jfk_passengers.json`` file from the raw csv file, simply run: 12 | 13 | ``` 14 | $ python convert.py ./air-passenger-traffic-per-month-port-authority-of-ny-nj-beginning-1977.csv ./jfk_passengers.json 15 | ``` 16 | 17 | ![Plot of jfk_passengers dataset](./jfk_passengers.png) 18 | -------------------------------------------------------------------------------- /datasets/lga_passengers/README.md: -------------------------------------------------------------------------------- 1 | # LGA Airline Passengers 2 | 3 | This dataset gives the number of passengers arriving and departing at LGA. 4 | 5 | The data is obtained from New York State's official Kaggle page for this 6 | dataset: https://www.kaggle.com/new-york-state/nys-air-passenger-traffic,-port-authority-of-ny-nj#air-passenger-traffic-per-month-port-authority-of-ny-nj-beginning-1977.csv 7 | 8 | This page shows that the data is under a "CC0: Public Domain" license, so we 9 | redistribute it here as part of our repository.
10 | 11 | To create the ``lga_passengers.json`` file from the raw csv file, simply run: 12 | 13 | ``` 14 | $ python convert.py ./air-passenger-traffic-per-month-port-authority-of-ny-nj-beginning-1977.csv ./lga_passengers.json 15 | ``` 16 | 17 | ![Plot of lga_passengers dataset](./lga_passengers.png) 18 | -------------------------------------------------------------------------------- /datasets/measles/README.md: -------------------------------------------------------------------------------- 1 | # Weekly Measles Case Reports England & Wales 2 | 3 | This data is available from [prof. Ben Bolker's 4 | webpage](https://ms.mcmaster.ca/~bolker/measdata.html). 5 | 6 | The original file is ``ewmeas.dat``. 7 | 8 | For the data format, see: 9 | [https://ms.mcmaster.ca/~bolker/measdata/formats.html](https://ms.mcmaster.ca/~bolker/measdata/formats.html). 10 | 11 | The time difference between observations is not exactly constant, but for the 12 | annotation we will consider it as such. Detection algorithms that can take a 13 | specific time axis should be provided with the true axis. 14 | 15 | While this data is provided "as is", it is not clearly licensed for 16 | redistribution. We therefore download it locally instead of distributing it 17 | with this repository. 18 | 19 | ![Plot of measles dataset](./measles.png) 20 | -------------------------------------------------------------------------------- /datasets/apple/README.md: -------------------------------------------------------------------------------- 1 | # Apple Stock 2 | 3 | This dataset concerns the daily close price and volume of Apple stock around 4 | the year 2000. The dataset is sampled every 3 observations to reduce the 5 | length of the time series. 6 | 7 | Data retrieved from [Yahoo 8 | Finance](https://finance.yahoo.com/quote/AAPL/history?period1=850348800&period2=1084579200&interval=1d&filter=history&frequency=1d). 9 | We use the Python package ``yfinance`` to download the data as it can not be 10 | redistributed as part of this repository. 11 | 12 | Since the original data has observations only on trading days, there are 13 | arguably gaps in this time series on non-trading days. However we consider 14 | these to be consecutive, and thus also consider the sampled time series to 15 | have consecutive observations. 16 | 17 | ![Plot of apple dataset](./apple.png) 18 | -------------------------------------------------------------------------------- /datasets/businv/README.md: -------------------------------------------------------------------------------- 1 | # Total Business Inventories 2 | 3 | Monthly total business inventories from the US Census. Data retrieved from 4 | [this direct 5 | url](https://www.census.gov/mtis/www/data/text/mtis-inventory.txt) on 6 | 2019-09-11. We use the unadjusted time series to maintain the seasonal 7 | component. 8 | 9 | According to [this 10 | page](https://web.archive.org/web/20191120160410/https://ask.census.gov/prweb/PRServletCustom/YACFBFye-rFIz_FoGtyvDRUGg1Uzu5Mn*/!STANDARD?pyActivity=pyMobileSnapStart&ArticleID=KCP-4726) 11 | on the US Census website, we are allowed to redistribute the data as part of 12 | this repository. 13 | 14 | Source: United States Census Bureau, URL: https://www.census.gov, Retrieved: 15 | 2019-09-11. 
16 | 17 | To create the ``businv.json`` file from the raw ``mtis-inventory.txt`` file, 18 | simply run: 19 | 20 | ``` 21 | $ python convert.py mtis-inventory.txt businv.json 22 | ``` 23 | 24 | ![Plot of businv dataset](./businv.png) 25 | -------------------------------------------------------------------------------- /datasets/us_population/README.md: -------------------------------------------------------------------------------- 1 | # US Population 2 | 3 | This time series contains the population numbers in the US. A potential change 4 | point occurs around index 459 (1990s). 5 | 6 | Data obtained from 7 | [Kaggle](https://www.kaggle.com/census/population-time-series-data#POP.csv). 8 | 9 | The original source of the data is the US Census Bureau. According to [this 10 | page](https://web.archive.org/web/20191120160410/https://ask.census.gov/prweb/PRServletCustom/YACFBFye-rFIz_FoGtyvDRUGg1Uzu5Mn*/!STANDARD?pyActivity=pyMobileSnapStart&ArticleID=KCP-4726) 11 | on the US Census website, we are allowed to redistribute the data as part of 12 | this repository. 13 | 14 | Source: United States Census Bureau, URL: https://www.census.gov, Retrieved: 15 | 2019-08-28. 16 | 17 | To obtain ``./us_population.json`` from ``POP.csv``, simply run: 18 | 19 | ``` 20 | $ python convert.py POP.csv us_population.json 21 | ``` 22 | 23 | ![Plot of us_population dataset](./us_population.png) 24 | -------------------------------------------------------------------------------- /datasets/bee_waggle_6/README.md: -------------------------------------------------------------------------------- 1 | # Bee Waggle Dataset sequence 6 2 | 3 | The movement of honey bees switches between a left turn, a right turn, and a 4 | waggle. This is a three-dimensional dataset of the position 5 | (x, y) and heading angle (theta) of a single bee. 6 | 7 | Source: [Parametric Segmental Switching Linear Dynamic Systems 8 | (PS-SLDS)](https://www.cc.gatech.edu/~borg/ijcv_psslds/). 9 | 10 | When using this time series, please cite the original authors: 11 | 12 | ```bibtex 13 | @article{oh2008learning, 14 | title={Learning and inferring motion patterns using parametric segmental switching linear dynamic systems}, 15 | author={Oh, S. M. and Rehg, J. M. and Balch, T.
and Dellaert, F.}, 16 | journal={International Journal of Computer Vision}, 17 | volume={77}, 18 | number={1-3}, 19 | pages={103--124}, 20 | year={2008}, 21 | publisher={Springer} 22 | } 23 | ``` 24 | 25 | ![Plot of bee_waggle_6 dataset](./bee_waggle_6.png) 26 | -------------------------------------------------------------------------------- /datasets/centralia/centralia.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "centralia", 3 | "longname": "Centralia Pennsylvania Population", 4 | "n_obs": 15, 5 | "n_dim": 1, 6 | "time": { 7 | "type": "string", 8 | "format": "%Y", 9 | "index": [ 10 | 0, 11 | 1, 12 | 2, 13 | 3, 14 | 4, 15 | 5, 16 | 6, 17 | 7, 18 | 8, 19 | 9, 20 | 10, 21 | 11, 22 | 12, 23 | 13, 24 | 14 25 | ], 26 | "raw": [ 27 | "1870", 28 | "1880", 29 | "1890", 30 | "1900", 31 | "1910", 32 | "1920", 33 | "1930", 34 | "1940", 35 | "1950", 36 | "1960", 37 | "1970", 38 | "1980", 39 | "1990", 40 | "2000", 41 | "2010" 42 | ] 43 | }, 44 | "series": [ 45 | { 46 | "label": "Population", 47 | "type": "int", 48 | "raw": [ 49 | 1342, 50 | 1886, 51 | 2761, 52 | 2048, 53 | 2429, 54 | 2336, 55 | 2446, 56 | 2449, 57 | 1986, 58 | 1435, 59 | 1165, 60 | 1017, 61 | 63, 62 | 21, 63 | 10 64 | ] 65 | } 66 | ] 67 | } -------------------------------------------------------------------------------- /datasets/ratner_stock/README.md: -------------------------------------------------------------------------------- 1 | # Ratner Group Stock Price 2 | 3 | The Ratner Group's stock price [is 4 | known](https://en.wikipedia.org/wiki/Gerald_Ratner#The_speech) for an event 5 | that can be considered a change point. 6 | 7 | Historical stock market data for SIG retrieved from [Yahoo finance (daily 8 | frequency)](https://finance.yahoo.com/quote/SIG/history?period1=584841600&period2=1567036800&interval=1d&filter=history&frequency=1d). 9 | We use the Python package ``yfinance`` to download the data as it can not be 10 | redistributed as part of this repository. 11 | 12 | The data has been sampled every 3 observations to reduce the length of the 13 | series. 14 | 15 | Since the original data has observations only on trading days, there are 16 | arguably gaps in this time series (on non-trading days). However we consider 17 | these to be consecutive, and thus also consider the sampled time series to 18 | have consecutive observations. 19 | 20 | ![Plot of ratner_stock dataset](./ratner_stock.png) 21 | -------------------------------------------------------------------------------- /examples/R/README.md: -------------------------------------------------------------------------------- 1 | # Loading a TCPD dataset into R 2 | 3 | The file ``load_dataset.R`` contains the function ``load.dataset`` that reads 4 | the JSON file into an R dataframe. The 5 | [RJSONIO](https://cran.r-project.org/web/packages/RJSONIO/index.html) package 6 | is required: 7 | 8 | ```R 9 | > install.packages('RJSONIO') 10 | ``` 11 | 12 | Simply run: 13 | 14 | ```R 15 | > source('./load_dataset.R') 16 | > df <- load.dataset('../../datasets/ozone/ozone.json') 17 | > df 18 | t Total Emissions 19 | 1 0 380000 20 | 2 1 400000 21 | 3 2 440000 22 | 4 3 480000 23 | 5 4 510000 24 | 6 5 540000 25 | 7 6 580000 26 | 8 7 630000 27 | ``` 28 | 29 | Notice that the time axis in TCPD is always 0-based. This needs to be taken 30 | into account when comparing detection results to the human annotations. (This 31 | is an unfortunate consequence of the differences between indexing in R and 32 | Python.) 
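33 | 34 | As a minimal illustration (the detector output below is hypothetical), a 35 | change point at time index 146 in the annotations corresponds to row 147 of 36 | the R data frame, so 1-based row numbers from R map back to the 0-based TCPD 37 | time axis by subtracting one: 38 | 39 | ```R 40 | detected.rows <- c(98, 147) # hypothetical detector output (1-based rows) 41 | tcpd.indices <- detected.rows - 1 # 0-based time indices used by TCPD 42 | ```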
33 | 34 | Missing observations in time series are represented with a ``NA`` value. 35 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:24.04 2 | 3 | # Install necessary packages and ensure python means python3 4 | RUN apt-get update && \ 5 | DEBIAN_FRONTEND=noninteractive apt-get remove -y python && \ 6 | apt-get install -y --no-install-recommends \ 7 | git \ 8 | build-essential \ 9 | libcurl4-openssl-dev \ 10 | libssl-dev \ 11 | python3 \ 12 | python3-dev \ 13 | python3-pip \ 14 | python3-venv \ 15 | python3-wheel && \ 16 | echo "alias python='python3'" >> /root/.bash_aliases && \ 17 | echo "alias pip='pip3'" >> /root/.bash_aliases && \ 18 | cd /usr/local/bin && ln -s /usr/bin/python3 python && \ 19 | cd /usr/local/bin && ln -s /usr/bin/pip3 pip 20 | 21 | # Make bash the default shell 22 | RUN mv /bin/sh /bin/sh.old && cp /bin/bash /bin/sh 23 | 24 | # Clone the dataset repo 25 | RUN git clone https://github.com/alan-turing-institute/TCPD 26 | 27 | # Change working dir 28 | WORKDIR TCPD 29 | 30 | # Create virtualenv 31 | RUN make venv 32 | 33 | # Build the dataset when container is run. 34 | CMD ["make", "export"] 35 | -------------------------------------------------------------------------------- /examples/R/load_dataset.R: -------------------------------------------------------------------------------- 1 | #' --- 2 | #' title: Example code to load a TCPD time series 3 | #' author: G.J.J. van den Burg 4 | #' date: 2020-01-06 5 | #' license: See the LICENSE file. 6 | #' copyright: 2019, The Alan Turing Institute 7 | #' --- 8 | 9 | library(RJSONIO) 10 | 11 | load.dataset <- function(filename) 12 | { 13 | data <- fromJSON(filename) 14 | 15 | # reformat the data into a data frame with a time index and the data values 16 | tidx <- data$time$index 17 | 18 | cols <- c() 19 | 20 | mat <- NULL 21 | for (j in 1:data$n_dim) { 22 | s <- data$series[[j]] 23 | v <- NULL 24 | for (i in 1:data$n_obs) { 25 | val <- s$raw[[i]] 26 | if (is.null(val)) { 27 | v <- c(v, NA) 28 | } else { 29 | v <- c(v, val) 30 | } 31 | } 32 | cols <- c(cols, s$label) 33 | mat <- cbind(mat, v) 34 | } 35 | 36 | mat <- cbind(tidx, mat) 37 | colnames(mat) <- c('t', cols) 38 | 39 | df <- as.data.frame(mat) 40 | return(df) 41 | } 42 | -------------------------------------------------------------------------------- /datasets/occupancy/README.md: -------------------------------------------------------------------------------- 1 | # Room occupancy data 2 | 3 | Dataset on detecting room occupancy based on several variables. For our 4 | dataset we use the Temperature, Humidity, Light, and CO2 variables from the 5 | training dataset. 6 | 7 | This dataset is obtained from the [UCI 8 | repository](https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+) 9 | on 2019-06-10. As it is unclear whether the data can be redistributed as part 10 | of this repository, we download it locally instead. 11 | 12 | The data is sampled at every 16 observations to reduce the length of the 13 | series. 14 | 15 | When using this particular time series, please cite: 16 | 17 | ```bib 18 | @article{candanedo2016accurate, 19 | title={Accurate occupancy detection of an office room from light, temperature, humidity and $\text{CO}_2$ measurements using statistical learning models}, 20 | author={Candanedo, L. M. 
and Feldheim, V.}, 21 | journal={Energy and Buildings}, 22 | volume={112}, 23 | pages={28--39}, 24 | year={2016}, 25 | publisher={Elsevier} 26 | } 27 | ``` 28 | 29 | ![Plot of occupancy dataset](./occupancy.png) 30 | -------------------------------------------------------------------------------- /datasets/nile/README.md: -------------------------------------------------------------------------------- 1 | # Volume of Nile River at Aswan 2 | 3 | This is a dataset on the volume of the Nile river at Aswan for the period 4 | 1871-1970. The data is obtained from the website of the book [Time Series 5 | Analysis by State Space Methods](http://www.ssfpack.com/DKbook.html) by Durbin 6 | and Koopman. The data is also available in the R ``datasets`` package, which 7 | is part of R and is therefore licensed under version 2 of the [GNU General Public 8 | License](https://www.r-project.org/COPYING). The data file produced from this 9 | data (``nile.json``) is therefore licensed under GPLv2 as well. 10 | 11 | A potential change point occurs in 1898 with the introduction of a dam. 12 | 13 | Note that this is not the Nile dataset used in other change point papers, 14 | which covers a period from 622 to 1284 AD (see e.g. Whitcher et al. (2002)). 15 | That dataset, [available 16 | here](https://web.archive.org/web/20000815223740/http://lib.stat.cmu.edu/S/beran), 17 | has been used in many papers to detect the introduction of a nilometer in the 18 | year 715 AD. 19 | 20 | ![Plot of nile dataset](./nile.png) 21 | -------------------------------------------------------------------------------- /datasets/brent_spot/README.md: -------------------------------------------------------------------------------- 1 | # Brent Spot Price 2 | 3 | This is the USD price for Brent Crude oil, measured daily. We include the time 4 | series from 2000 onwards. The data is sampled at every 10 original 5 | observations to reduce the length of the series. 6 | 7 | The data is obtained from the [U.S. Energy Information 8 | Administration](https://www.eia.gov/opendata/qb.php?sdid=PET.RBRTE.D). Since 9 | the data is in the public domain, we distribute it as part of this repository. 10 | Source: U.S. Energy Information Administration (Sep. 2019). 11 | 12 | Since the original data has observations only on trading days, there are 13 | arguably gaps in this time series (on non-trading days). However, we consider 14 | these to be consecutive, and thus also consider the sampled time series to 15 | have consecutive observations.
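
For reference, the subsampling step in ``convert.py`` (reproduced in full later in this repository) simply keeps every 10th row; a minimal self-contained sketch:

```python
SAMPLE = 10

# Stand-in for the parsed CSV rows of daily prices; the real script
# reads these from the EIA CSV file.
rows = list(range(100))

# Keep every SAMPLE-th observation to reduce the length of the series.
subsampled = [r for i, r in enumerate(rows) if i % SAMPLE == 0]
print(len(subsampled))  # 10
```
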
16 | 17 | To obtain the ``brent_spot.json`` file from the original 18 | ``Europe_Brent_Spot_Price_FOB_Daily.csv`` file, simply run: 19 | 20 | ``` 21 | $ python convert.py Europe_Brent_Spot_Price_FOB_Daily.csv brent_spot.json 22 | ``` 23 | 24 | ![Plot of brent_spot dataset](./brent_spot.png) 25 | -------------------------------------------------------------------------------- /.github/workflows/validate.yml: -------------------------------------------------------------------------------- 1 | name: Check TCPD 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | schedule: 11 | - cron: 16 17 */10 * * 12 | 13 | jobs: 14 | tcpd-ubuntu: 15 | name: check TCPD (direct) 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | - name: Install dependencies 20 | run: sudo apt-get update && sudo apt-get install build-essential 21 | shell: bash 22 | 23 | - name: Install Python 3.12 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: '3.12' 27 | 28 | - name: Checkout code 29 | uses: actions/checkout@v4 30 | 31 | - name: Ensure clean 32 | run: make clean 33 | shell: bash 34 | 35 | - name: Build and verify 36 | run: make test 37 | shell: bash 38 | 39 | tcpd-docker: 40 | name: check TCPD (docker) 41 | runs-on: ubuntu-latest 42 | 43 | steps: 44 | - name: Checkout 45 | uses: actions/checkout@v4 46 | 47 | - name: TCPD docker check 48 | uses: ./.github/workflows/ 49 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2020 The Alan Turing Institute 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /datasets/bank/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2019 G.J.J. 
van den Burg 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /datasets/well_log/README.md: -------------------------------------------------------------------------------- 1 | # Well-log dataset 2 | 3 | This is the well-known well-log dataset used in many changepoint papers. This 4 | version is obtained from [this Github 5 | repository](https://raw.githubusercontent.com/alan-turing-institute/bocpdms/a4042b45004592f5b9fa912b346dd22a212b7ea0/Data/well.txt), 6 | and licensed under the MIT license. 7 | 8 | The dataset is sampled at every 6 observations to reduce the length of the 9 | series. To obtain the json dataset from the original file, simply run: 10 | 11 | ``` 12 | $ python convert.py well_log.txt well_log.json 13 | ``` 14 | 15 | [Here](https://web.archive.org/web/20191128143944/https://raw.githubusercontent.com/alan-turing-institute/bocpdms/a4042b45004592f5b9fa912b346dd22a212b7ea0/Data/well.txt) 16 | is an archive.org url for the dataset. 17 | 18 | When using this series, please cite the original source: 19 | 20 | ```bib 21 | @book{oruanaidh1996numerical, 22 | title={Numerical {Bayesian} Methods Applied to Signal Processing}, 23 | author={{\'O Ruanaidh}, J. J. K. and Fitzgerald, W. J.}, 24 | year={1996}, 25 | publisher={Springer} 26 | } 27 | ``` 28 | 29 | ![Plot of well_log dataset](./well_log.png) 30 | -------------------------------------------------------------------------------- /datasets/construction/README.md: -------------------------------------------------------------------------------- 1 | # Total Private Construction Spending 2 | 3 | This dataset is retrieved from the US Census and concerns the total private 4 | construction spending. Potential change points occur at recessions. The data 5 | is obtained [from this Census 6 | page](https://www.census.gov/construction/c30/historical_data.html) using the 7 | "Private" series from the "Not Seasonally Adjusted" column in the "Monthly" 8 | table. Alternatively, use [this direct 9 | URL](https://www.census.gov/construction/c30/xls/privtime.xls). 10 | 11 | According to [this 12 | page](https://web.archive.org/web/20191120160410/https://ask.census.gov/prweb/PRServletCustom/YACFBFye-rFIz_FoGtyvDRUGg1Uzu5Mn*/!STANDARD?pyActivity=pyMobileSnapStart&ArticleID=KCP-4726) 13 | on the US Census website, we are allowed to redistribute the data as part of 14 | this repository. 
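
The conversion script itself is not reproduced in this section, but a rough, hypothetical sketch of inspecting the raw spreadsheet is shown below (it assumes ``pandas`` with the ``xlrd`` engine for legacy ``.xls`` files, and is not necessarily how ``convert.py`` parses it):

```python
# Hypothetical inspection snippet, not the actual convert.py logic.
import pandas as pd

# Legacy .xls files need the xlrd engine.
df = pd.read_excel("privtime.xls", engine="xlrd")
print(df.head())
```
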
15 | 16 | Source: United States Census Bureau, URL: https://www.census.gov, Retrieved: 17 | 2019-09-11. 18 | 19 | To create the ``construction.json`` file from the raw ``privtime.xls`` file, 20 | simply run: 21 | 22 | ``` 23 | $ python convert.py privtime.xls construction.json 24 | ``` 25 | 26 | ![Plot of construction dataset](./construction.png) 27 | -------------------------------------------------------------------------------- /datasets/run_log/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2019 Gerrit J.J. van den Burg 2 | 3 | This LICENSE file covers the stats.csv file only. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /datasets/well_log/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: G.J.J.
van den Burg 8 | 9 | """ 10 | 11 | import json 12 | import argparse 13 | 14 | SAMPLE = 6 15 | 16 | 17 | def parse_args(): 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("input_file", help="File to convert") 20 | parser.add_argument("output_file", help="File to write to") 21 | return parser.parse_args() 22 | 23 | 24 | def main(): 25 | args = parse_args() 26 | 27 | with open(args.input_file, "r") as fp: 28 | rows = [l.strip() for l in fp] 29 | 30 | rows = [r for i, r in enumerate(rows) if i % SAMPLE == 0] 31 | 32 | values = list(map(float, rows)) 33 | name = "well_log" 34 | longname = "Well Log" 35 | 36 | series = [{"label": "V1", "type": "float", "raw": values}] 37 | 38 | data = { 39 | "name": name, 40 | "longname": longname, 41 | "n_obs": len(values), 42 | "n_dim": len(series), 43 | "time": {"index": list(range(len(values)))}, 44 | "series": series, 45 | } 46 | 47 | with open(args.output_file, "w") as fp: 48 | json.dump(data, fp, indent="\t") 49 | 50 | 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /datasets/debt_ireland/debt_ireland.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "debt_ireland", 3 | "longname": "Debt Ireland", 4 | "n_obs": 21, 5 | "n_dim": 1, 6 | "time": { 7 | "type": "string", 8 | "format": "%Y", 9 | "index": [ 10 | 0, 11 | 1, 12 | 2, 13 | 3, 14 | 4, 15 | 5, 16 | 6, 17 | 7, 18 | 8, 19 | 9, 20 | 10, 21 | 11, 22 | 12, 23 | 13, 24 | 14, 25 | 15, 26 | 16, 27 | 17, 28 | 18, 29 | 19, 30 | 20 31 | ], 32 | "raw": [ 33 | "2000", 34 | "2001", 35 | "2002", 36 | "2003", 37 | "2004", 38 | "2005", 39 | "2006", 40 | "2007", 41 | "2008", 42 | "2009", 43 | "2010", 44 | "2011", 45 | "2012", 46 | "2013", 47 | "2014", 48 | "2015", 49 | "2016", 50 | "2017", 51 | "2018", 52 | "2019", 53 | "2020" 54 | ] 55 | }, 56 | "series": [ 57 | { 58 | "label": "V1", 59 | "type": "float", 60 | "raw": [ 61 | 36.0732199, 62 | 33.2394627, 63 | 30.5521068, 64 | 29.9296861, 65 | 28.2148891, 66 | 26.0766114, 67 | 23.618314, 68 | 23.9083721, 69 | 42.4036869, 70 | 61.5433048, 71 | 85.9938449, 72 | 110.861647, 73 | 119.8646655, 74 | 119.6837014, 75 | 104.1283774, 76 | 76.8191392, 77 | 73.4443135, 78 | 68.4403541, 79 | 63.8514846, 80 | 61.1353388, 81 | 56.0118623 82 | ] 83 | } 84 | ] 85 | } 86 | -------------------------------------------------------------------------------- /datasets/ozone/README.md: -------------------------------------------------------------------------------- 1 | # Ozone-depleting substance emissions 2 | 3 | This dataset contains "Global emissions of ozone-depleting substances, 4 | measured in tonnes of chlorofluorocarbon-11 equivalents 5 | (CFC11-equivalents) per year." It is obtained from [Our World in 6 | Data](https://ourworldindata.org/ozone-layer), who have scraped the data from: 7 | 8 | Hegglin, M. I., Fahey, D. W., McFarland, M., Montzka, S. A., & Nash, E. R. 9 | (2014). [Twenty questions and answers about the ozone layer: 2014 10 | update](https://www.wmo.int/pages/prog/arep/gaw/ozone_2014/documents/2014%20Twenty%20Questions_Final.pdf). 11 | World Meteorological Organization, UNEP, NOAA, NASA, and European Commission. 12 | 13 | A change is expected after the signing of the [Montreal 14 | Protocol](https://en.wikipedia.org/wiki/Montreal_Protocol). 
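
The conversion script (reproduced in full later in this repository) keeps only the rows labelled "Total emissions" from the CSV; the core of that filtering step amounts to the following, here with stand-in rows for illustration:

```python
# Stand-in rows illustrating the CSV layout (entity, code, year, value);
# the real script reads these from the Our World in Data CSV file.
rows = [
    ["Total emissions", "OWID_WRL", "1961", "380000"],
    ["Natural emissions", "OWID_WRL", "1961", "50000"],
]

# Keep only the "Total emissions" rows, then split out years and values.
total = [r for r in rows if r[0] == "Total emissions"]
time = [r[2] for r in total]
values = [int(r[-1]) for r in total]
print(time, values)  # ['1961'] [380000]
```
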
15 | 16 | The chart in the article by [Our World in 17 | Data](https://ourworldindata.org/ozone-layer) is licensed under [CC BY 18 | 4.0](https://creativecommons.org/licenses/by/4.0/deed.en_US). No changes to 19 | the data were made. The original data sourced from Hegglin et al., 2014 (cited 20 | above) is in the public domain. 21 | 22 | The ``ozone.json`` file can be obtained from the original 23 | ``./ozone-depleting-substance-emissions.csv`` by running: 24 | 25 | ``` 26 | $ python convert.py ./ozone-depleting-substance-emissions.csv ./ozone.json 27 | ``` 28 | 29 | ![Plot of ozone dataset](./ozone.png) 30 | -------------------------------------------------------------------------------- /datasets/shanghai_license/README.md: -------------------------------------------------------------------------------- 1 | # Shanghai License Plate Applicants 2 | 3 | Source: 4 | [Kaggle](https://www.kaggle.com/bogof666/shanghai-car-license-plate-auction-price). 5 | Data licensed under [CC0: Public 6 | Domain](https://creativecommons.org/publicdomain/zero/1.0/), so we can 7 | redistribute it as part of this repository. 8 | 9 | There seems to be a clear sudden growth in the number of applicants. 10 | 11 | Note: according to [this discussion on 12 | Kaggle](https://www.kaggle.com/bogof666/shanghai-car-license-plate-auction-price/discussion/73140), 13 | the record for 2008-02 is missing because the license plates for January and 14 | February were auctioned off simultaneously in January. As this represents an 15 | uneven measurement and a missing value, we choose to split the observation for 16 | January and February 2008 in two, dividing the amount equally between the 17 | months. An alternative would be to introduce a missing value in 2008-02, but 18 | since many of the algorithms we wish to evaluate are not able to handle 19 | missing values (and any imputation method would be incorrect), we believe this 20 | is a reasonable way to deal with this issue.
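
Concretely, the splitting step in ``convert.py`` (reproduced later in this repository) halves the January 2008 value and inserts the halved copy as February 2008; here with stand-in numbers:

```python
# Stand-in monthly labels and applicant counts, purely for illustration.
time = ["2007-12", "2008-01", "2008-03"]
values = [9000, 16000, 9500]

# Halve January 2008 and insert the other half as February 2008.
jan08idx = time.index("2008-01")
values[jan08idx] /= 2
time.insert(jan08idx + 1, "2008-02")
values.insert(jan08idx + 1, values[jan08idx])
print(time, values)
# ['2007-12', '2008-01', '2008-02', '2008-03'] [9000, 8000.0, 8000.0, 9500]
```
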
21 | 22 | To obtain the ``shanghai_license.json`` file from the 23 | ``Shanghai_license_plate_price_-_Sheet3.csv`` file, simply run: 24 | 25 | ``` 26 | $ python convert.py Shanghai_license_plate_price_-_Sheet3.csv shanghai_license.json 27 | ``` 28 | 29 | ![Plot of shanghai_license dataset](./shanghai_license.png) 30 | -------------------------------------------------------------------------------- /datasets/gdp_japan/gdp_japan.csv: -------------------------------------------------------------------------------- 1 | "Country Name","Country Code","Indicator Name","Indicator Code","1960","1961","1962","1963","1964","1965","1966","1967","1968","1969","1970","1971","1972","1973","1974","1975","1976","1977","1978","1979","1980","1981","1982","1983","1984","1985","1986","1987","1988","1989","1990","1991","1992","1993","1994","1995","1996","1997","1998","1999","2000","2001","2002","2003","2004","2005","2006","2007","2008","2009","2010","2011","2012","2013","2014","2015","2016","2017","2018" 2 | "Japan","JPN","GDP (current LCU)","NY.GDP.MKTP.CN","15950643462144","19263102386176","21860286726144","25019327447040","29429642297344","32742100172800","38026105323520","44561476878336","52776386166784","61993511813120","76539307651500","84215883490900","96418343539100","117397596102100","140090360740400","154787118329600","173827764691400","193706278803100","213306268936200","231195355873400","250636100000000","268830700000000","282582000000000","295303900000000","313145300000000","333686000000000","350344800000000","366339100000000","393641400000000","421469400000000","453608500000000","482845400000000","495055800000000","495291000000000","501537700000000","512541700000000","525806900000000","534142500000000","527876900000000","519651800000000","526706000000000","523005000000000","515986200000000","515400700000000","520965400000000","524132800000000","526879700000000","531688200000000","520715700000000","489501000000000","500353900000000","491408500000000","494957200000000","503175600000000","513876000000000","531985800000000","538445800000000","546488800000000","" 3 | -------------------------------------------------------------------------------- /datasets/gdp_croatia/gdp_croatia.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "gdp_croatia", 3 | "longname": "GDP Croatia", 4 | "n_obs": 24, 5 | "n_dim": 1, 6 | "time": { 7 | "type": "string", 8 | "format": "%Y", 9 | "index": [ 10 | 0, 11 | 1, 12 | 2, 13 | 3, 14 | 4, 15 | 5, 16 | 6, 17 | 7, 18 | 8, 19 | 9, 20 | 10, 21 | 11, 22 | 12, 23 | 13, 24 | 14, 25 | 15, 26 | 16, 27 | 17, 28 | 18, 29 | 19, 30 | 20, 31 | 21, 32 | 22, 33 | 23 34 | ], 35 | "raw": [ 36 | "1995", 37 | "1996", 38 | "1997", 39 | "1998", 40 | "1999", 41 | "2000", 42 | "2001", 43 | "2002", 44 | "2003", 45 | "2004", 46 | "2005", 47 | "2006", 48 | "2007", 49 | "2008", 50 | "2009", 51 | "2010", 52 | "2011", 53 | "2012", 54 | "2013", 55 | "2014", 56 | "2015", 57 | "2016", 58 | "2017", 59 | "2018" 60 | ] 61 | }, 62 | "series": [ 63 | { 64 | "label": "GDP (constant LCU)", 65 | "type": "int", 66 | "raw": [ 67 | 217509197000, 68 | 230285030900, 69 | 245588563100, 70 | 250161129600, 71 | 247820872300, 72 | 257170176300, 73 | 266048737900, 74 | 280025245200, 75 | 295653977300, 76 | 307227520000, 77 | 319853234600, 78 | 335423106200, 79 | 353145697500, 80 | 360337370500, 81 | 334063620100, 82 | 329143142600, 83 | 328023429300, 84 | 320476781700, 85 | 318900841000, 86 | 318621760200, 87 | 326270568300, 88 | 337807918300, 89 | 347676515100, 90 | 
356819802800 91 | ] 92 | } 93 | ] 94 | } -------------------------------------------------------------------------------- /datasets/global_co2/README.md: -------------------------------------------------------------------------------- 1 | # Global Monthly CO2 levels 2 | 3 | This dataset concerns monthly global hemispheric means of carbon dioxide in 4 | air. The data is part of the CMIP6 dataset, developed by Meinshausen et al. 5 | 6 | When using this data, please cite: 7 | 8 | ```bib 9 | @article{meinshausen2017historical, 10 | title={Historical greenhouse gas concentrations for climate modelling ({CMIP6})}, 11 | author={Meinshausen, M. and Vogel, E. and Nauels, A. and Lorbacher, K. and Meinshausen, N. and Etheridge, D. M. and Fraser, P. J. and Montzka, S. A. and Rayner, P. J. and Trudinger, C. M. and Krummel, P. B. and Beyerle, U. and Canadell, J. G. and Daniel, J. S. and Enting, I. G. and Law, R. M. and Lunder, C. R. and O'Doherty, S. and Prinn, R. G. and Reimann, S. and Rubino, M. and Velders, G. J. M. and Vollmer, M. K. and Wang, R. H. J. and Weiss, R.}, 12 | journal={Geoscientific Model Development}, 13 | volume={10}, 14 | pages={2057--2116}, 15 | year={2017}, 16 | publisher={Copernicus} 17 | } 18 | ``` 19 | 20 | It seems that the work of Meinshausen et al. is licensed under [CC BY 21 | 3.0](https://creativecommons.org/licenses/by/3.0/), judging from [the 22 | publication](https://www.geosci-model-dev.net/10/2057/2017/). This allows us 23 | to redistribute this time series as part of the dataset, provided that the 24 | above source is cited. We thus include the source csv in the repository. 25 | 26 | Note that the original data is sampled every 4 years and cropped to recent 27 | history to reduce the length of the series. 28 | 29 | ![Plot of global_co2 dataset](./global_co2.png) 30 | -------------------------------------------------------------------------------- /datasets/centralia/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: Gertjan van den Burg 8 | 9 | """ 10 | 11 | import json 12 | import argparse 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument( 18 | "-s", 19 | "--subsample", 20 | help="Number of observations to skip during subsampling", 21 | type=int, 22 | ) 23 | parser.add_argument("input_file", help="File to convert") 24 | parser.add_argument("output_file", help="File to write to") 25 | return parser.parse_args() 26 | 27 | 28 | def main(): 29 | args = parse_args() 30 | 31 | with open(args.input_file, "r") as fp: 32 | rows = [l.strip().split("\t") for l in fp] 33 | 34 | time = [] 35 | values = [] 36 | for year, pop in rows: 37 | time.append(year) 38 | values.append(int(pop)) 39 | 40 | name = "centralia" 41 | longname = "Centralia Pennsylvania Population" 42 | time_fmt = "%Y" 43 | series = [{"label": "Population", "type": "int", "raw": values}] 44 | 45 | data = { 46 | "name": name, 47 | "longname": longname, 48 | "n_obs": len(time), 49 | "n_dim": len(series), 50 | "time": { 51 | "type": "string", 52 | "format": time_fmt, 53 | "index": list(range(len(time))), 54 | "raw": time, 55 | }, 56 | "series": series, 57 | } 58 | 59 | with open(args.output_file, "w") as fp: 60 | json.dump(data, fp, indent="\t") 61 | 62 | 63 | if __name__ == "__main__": 64 | main() 65 | -------------------------------------------------------------------------------- /datasets/ozone/convert.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: G.J.J. van den Burg 8 | 9 | """ 10 | 11 | import argparse 12 | import clevercsv 13 | import json 14 | 15 | 16 | def parse_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("input_file", help="File to convert") 19 | parser.add_argument("output_file", help="File to write to") 20 | return parser.parse_args() 21 | 22 | 23 | def main(): 24 | args = parse_args() 25 | 26 | with open(args.input_file, "r", newline="", encoding="ascii") as fp: 27 | reader = clevercsv.reader( 28 | fp, delimiter=",", quotechar="", escapechar="" 29 | ) 30 | rows = list(reader) 31 | 32 | header = rows.pop(0) 33 | 34 | total = [r for r in rows if r[0] == "Total emissions"] 35 | time = [r[2] for r in total] 36 | values = [int(r[-1]) for r in total] 37 | 38 | name = "ozone" 39 | longname = "Ozone-Depleting Emissions" 40 | time_fmt = "%Y" 41 | 42 | series = [{"label": "Total Emissions", "type": "int", "raw": values}] 43 | 44 | data = { 45 | "name": name, 46 | "longname": longname, 47 | "n_obs": len(time), 48 | "n_dim": len(series), 49 | "time": { 50 | "type": "string", 51 | "format": time_fmt, 52 | "index": list(range(len(time))), 53 | "raw": time, 54 | }, 55 | "series": series, 56 | } 57 | 58 | with open(args.output_file, "w") as fp: 59 | json.dump(data, fp, indent="\t") 60 | 61 | 62 | if __name__ == "__main__": 63 | main() 64 | -------------------------------------------------------------------------------- /examples/python/README.md: -------------------------------------------------------------------------------- 1 | # Loading a TCPD time series in Python 2 | 3 | The ``load_dataset.py`` file contains example code to load a time series as a 4 | ``TimeSeries`` object. 5 | 6 | ```python 7 | >>> from load_dataset import TimeSeries 8 | >>> ts = TimeSeries.from_json('../../datasets/ozone/ozone.json') 9 | ``` 10 | 11 | To export the time series as a [pandas 12 | DataFrame](https://pandas.pydata.org/pandas-docs/stable/getting_started/dsintro.html#dataframe), 13 | simply use: 14 | 15 | ```python 16 | >>> ts.df 17 | t Total Emissions 18 | 0 0 380000.0 19 | 1 1 400000.0 20 | 2 2 440000.0 21 | 3 3 480000.0 22 | 4 4 510000.0 23 | 5 5 540000.0 24 | ... 25 | ``` 26 | 27 | The ``TimeSeries`` instance ``ts`` has an integer time axis at ``ts.t`` and 28 | the observations at ``ts.y``. The time axis is zero-based by default. If you 29 | prefer to use a one-based indexing, simply run: 30 | 31 | ```python 32 | >>> ts.make_one_based() 33 | >>> ts.df 34 | t Total Emissions 35 | 0 1 380000.0 36 | 1 2 400000.0 37 | 2 3 440000.0 38 | 3 4 480000.0 39 | 4 5 510000.0 40 | 5 6 540000.0 41 | ... 42 | ``` 43 | 44 | Many of the time series in TCPD have date or datetime labels for the time 45 | axis. This axis can be retrieved using: 46 | 47 | ```python 48 | >>> ts.datestr 49 | array(['1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', 50 | ... 
51 |        '2009', '2010', '2011', '2012', '2013', '2014'], dtype='&lt;U4') 52 | ``` 53 | 54 | The date format string (in Python ``strftime`` notation) is available as well: 55 | 56 | ```python 57 | >>> ts.datefmt 58 | '%Y' 59 | ``` 60 | -------------------------------------------------------------------------------- /datasets/us_population/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: Gertjan van den Burg 8 | 9 | """ 10 | 11 | import json 12 | import argparse 13 | import clevercsv 14 | 15 | 16 | def parse_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("input_file", help="File to convert") 19 | parser.add_argument("output_file", help="File to write to") 20 | return parser.parse_args() 21 | 22 | 23 | def main(): 24 | args = parse_args() 25 | 26 | with open(args.input_file, "r", newline="", encoding="ascii") as fp: 27 | reader = clevercsv.reader( 28 | fp, delimiter=",", quotechar="", escapechar="" 29 | ) 30 | rows = list(reader) 31 | 32 | rows.pop(0) 33 | 34 | # the time format is monthly, so we convert that here 35 | time = [r[2][:-3] for r in rows] 36 | time_fmt = "%Y-%m" 37 | 38 | # source is in thousands, so we correct that here 39 | values = [float(r[3]) * 1000 for r in rows] 40 | 41 | name = "us_population" 42 | longname = "US Population" 43 | series = [{"label": "Population", "type": "int", "raw": values}] 44 | 45 | data = { 46 | "name": name, 47 | "longname": longname, 48 | "n_obs": len(time), 49 | "n_dim": len(series), 50 | "time": { 51 | "type": "string", 52 | "format": time_fmt, 53 | "index": list(range(len(time))), 54 | "raw": time, 55 | }, 56 | "series": series, 57 | } 58 | 59 | with open(args.output_file, "w") as fp: 60 | json.dump(data, fp, indent="\t") 61 | 62 | 63 | if __name__ == "__main__": 64 | main() 65 | -------------------------------------------------------------------------------- /datasets/run_log/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: Gertjan van den Burg 8 | 9 | """ 10 | 11 | import argparse 12 | import clevercsv 13 | import json 14 | 15 | 16 | def parse_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("input_file", help="File to convert") 19 | parser.add_argument("output_file", help="File to write to") 20 | return parser.parse_args() 21 | 22 | 23 | def main(): 24 | args = parse_args() 25 | 26 | with open(args.input_file, "r", newline="", encoding="ascii") as fp: 27 | reader = clevercsv.reader( 28 | fp, delimiter=",", quotechar="", escapechar="" 29 | ) 30 | rows = list(reader) 31 | 32 | header = rows.pop(0) 33 | 34 | name = "run_log" 35 | longname = "Run Log" 36 | 37 | time = [r[0].rstrip("Z").replace("T", " ") for r in rows] 38 | time_fmt = "%Y-%m-%d %H:%M:%S" 39 | pace = [float(r[3]) for r in rows] 40 | distance = [float(r[4]) for r in rows] 41 | 42 | series = [ 43 | {"label": "Pace", "type": "float", "raw": pace}, 44 | {"label": "Distance", "type": "float", "raw": distance}, 45 | ] 46 | 47 | data = { 48 | "name": name, 49 | "longname": longname, 50 | "n_obs": len(time), 51 | "n_dim": len(series), 52 | "time": { 53 | "type": "string", 54 | "format": time_fmt, 55 | "index": list(range(len(time))), 56 | "raw": time, 57 | }, 58 | "series": series, 59 | } 60 | 61 | with open(args.output_file, "w") as fp: 62 | json.dump(data, fp, indent="\t") 63 | 64 | 65 | if __name__ == "__main__": 66 | main() 67 |
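#
# Example invocation (a sketch; stats.csv is the raw export that the
# accompanying LICENSE file refers to):
#
#     $ python convert.py stats.csv run_log.json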
-------------------------------------------------------------------------------- /datasets/uk_coal_employ/employ_only.csv: -------------------------------------------------------------------------------- 1 | Year ,Employment 2 | 1913,1107000 3 | 1914,1038000 4 | 1915,935000 5 | 1916,981000 6 | 1917,1002000 7 | 1918,990000 8 | 1919,1136000 9 | 1920,1191000 10 | 1921, 11 | 1922,1085000 12 | 1923,1151000 13 | 1924,1163000 14 | 1925,1078000 15 | 1926, 16 | 1927,991000 17 | 1928,915000 18 | 1929,925000 19 | 1930,910000 20 | 1931,843000 21 | 1932,796000 22 | 1933,767000 23 | 1934,768000 24 | 1935,753000 25 | 1936,750000 26 | 1937,773000 27 | 1938,776000 28 | 1939,761000 29 | 1940,744000 30 | 1941,692000 31 | 1942,704000 32 | 1943,701000 33 | 1944,704000 34 | 1945,702000 35 | 1946,693000 36 | 1947,707000 37 | 1948,720000 38 | 1949,716000 39 | 1950,693000 40 | 1951,695000 41 | 1952,712000 42 | 1953,713000 43 | 1954,707000 44 | 1955,704000 45 | 1956,703000 46 | 1957,710000 47 | 1958,699000 48 | 1959,665000 49 | 1960,607000 50 | 1961,575000 51 | 1962,556000 52 | 1963,528000 53 | 1964,502000 54 | 1965,454700 55 | 1966,422000 56 | 1967,389500 57 | 1968,330900 58 | 1969,305700 59 | 1970,290000 60 | 1971,286100 61 | 1972,273600 62 | 1973,251800 63 | 1974,252800 64 | 1975,252000 65 | 1976,249700 66 | 1977,247900 67 | 1978,240400 68 | 1979,241600 69 | 1980,236900 70 | 1981,172000 71 | 1982,164000 72 | 1983,148000 73 | 1984,139000 74 | 1985,114000 75 | 1986,91000 76 | 1987,75000 77 | 1988,69000 78 | 1989,56000 79 | 1990,49000 80 | 1991,38000 81 | 1992,28000 82 | 1993,10000 83 | 1994,7000 84 | 1995,11657 85 | 1996,10315 86 | 1997,13768 87 | 1998,11113 88 | 1999,11973 89 | 2000,10939 90 | 2001,11439 91 | 2002,9578 92 | 2003,8250 93 | 2004,7772 94 | 2005,6054 95 | 2006,5431 96 | 2007,5538 97 | 2008,6157 98 | 2009,5912 99 | 2010,6014 100 | 2011,5972 101 | 2012,5827 102 | 2013,3715 103 | 2014,3601 104 | 2015,1975 105 | 2016,831 106 | 2017,620 107 | -------------------------------------------------------------------------------- /datasets/usd_isk/README.md: -------------------------------------------------------------------------------- 1 | # USD - ISK exchange rate 2 | 3 | Due to the financial crisis the USD/ISK exchange rate shows potential change 4 | point behaviour in the years around 2008. Since it is difficult to find freely 5 | available (and permissively licensed) historical exchange rate data, we 6 | instead use the monthly average Euro/ECU exchange rates of both USD and ISK 7 | and compute the USD/ISK rate from there. 8 | 9 | The Euro/ECU exchange rate can be obtained from [this direct 10 | link](https://appsso.eurostat.ec.europa.eu/nui/show.do?query=BOOKMARK_DS-054904_QID_-3F48645A_UID_-3F171EB0&layout=TIME,C,X,0;CURRENCY,L,Y,0;UNIT,L,Z,0;STATINFO,L,Z,1;INDICATORS,C,Z,2;&zSelection=DS-054904UNIT,NAC;DS-054904INDICATORS,OBS_FLAG;DS-054904STATINFO,AVG;&rankName1=UNIT_1_2_-1_2&rankName2=INDICATORS_1_2_-1_2&rankName3=STATINFO_1_2_-1_2&rankName4=TIME_1_0_0_0&rankName5=CURRENCY_1_2_0_1&sortC=ASC_-1_FIRST&rStp=&cStp=&rDCh=&cDCh=&rDM=true&cDM=true&footnes=false&empty=false&wai=false&time_mode=NONE&time_most_recent=false&lang=EN&cfo=%23%23%23%2C%23%23%23.%23%23%23). 11 | The data is provided by Eurostat, and is Copyrighted to the European Union. 12 | Redistribution of the data in this repository is allowed according to [this 13 | copyright license](https://ec.europa.eu/eurostat/about/policies/copyright). No 14 | modification of the source data in ``ert_bil_eur_m_1_Data.csv`` has been made. 
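
Computing the cross rate from the two Euro-based series is then simple division; a minimal sketch of the arithmetic, with made-up rates for illustration (both series quote units of national currency per euro):

```python
# Made-up rates for a single month, purely for illustration.
usd_per_eur = 1.47   # USD per EUR
isk_per_eur = 127.0  # ISK per EUR

# ISK per EUR divided by USD per EUR gives ISK per USD, i.e. the USD/ISK rate.
usd_isk = isk_per_eur / usd_per_eur
print(round(usd_isk, 2))  # 86.39
```
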
15 | 16 | The conversion script expects the CSV file format, with the following 17 | configuration: 18 | 19 | - Full extraction (check) 20 | + single file 21 | - Flags and footnotes (check) 22 | - Cell formatting 23 | + 1 234.56 24 | 25 | This file is included in the repository as ``ert_bil_eur_m_1_Data.csv``. 26 | Additional metadata is stored in ``ert_bil_eur_m_Label.csv``. The file 27 | ``usd_isk.json`` can be obtained from the original data by running: 28 | 29 | ``` 30 | $ python convert.py ert_bil_eur_m_1_Data.csv usd_isk.json 31 | ``` 32 | 33 | ![Plot of usd_isk dataset](./usd_isk.png) 34 | -------------------------------------------------------------------------------- /datasets/brent_spot/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: G.J.J. van den Burg 8 | 9 | """ 10 | 11 | import argparse 12 | import clevercsv 13 | import json 14 | 15 | SAMPLE = 10 16 | 17 | def date_to_iso(datestr): 18 | mm, dd, yyyy = list(map(int, datestr.split("/"))) 19 | return f"{yyyy}-{mm:02d}-{dd:02d}" 20 | 21 | 22 | def parse_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("input_file", help="File to convert") 25 | parser.add_argument("output_file", help="File to write to") 26 | return parser.parse_args() 27 | 28 | 29 | def main(): 30 | args = parse_args() 31 | 32 | with open(args.input_file, "r", newline="", encoding="ascii") as fp: 33 | reader = clevercsv.reader( 34 | fp, delimiter=",", quotechar="", escapechar="" 35 | ) 36 | rows = list(reader) 37 | 38 | rows = rows[5:] 39 | rows = list(reversed(rows)) 40 | 41 | rows = [r for i, r in enumerate(rows) if i % SAMPLE == 0] 42 | 43 | idx2000 = next((i for i, x in enumerate(rows) if x[0].endswith("2000"))) 44 | rows = rows[idx2000:] 45 | 46 | name = "brent_spot" 47 | longname = "Brent Spot Price" 48 | time = [date_to_iso(r[0]) for r in rows] 49 | time_fmt = "%Y-%m-%d" 50 | values = [float(r[1]) for r in rows] 51 | 52 | series = [{"label": "Dollars/Barrel", "type": "float", "raw": values}] 53 | 54 | data = { 55 | "name": name, 56 | "longname": longname, 57 | "n_obs": len(time), 58 | "n_dim": len(series), 59 | "time": { 60 | "type": "string", 61 | "format": time_fmt, 62 | "index": list(range(len(time))), 63 | "raw": time, 64 | }, 65 | "series": series, 66 | } 67 | 68 | with open(args.output_file, "w") as fp: 69 | json.dump(data, fp, indent="\t") 70 | 71 | 72 | if __name__ == "__main__": 73 | main() 74 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for the Turing Change Point Dataset 2 | # 3 | # Author: G.J.J. van den Burg 4 | # Copyright (c) 2019, The Alan Turing Institute 5 | # License: See LICENSE file. 
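#
# Typical entry points (see the targets below): `make collect` downloads and
# builds the datasets, `make test` runs checksum verification and schema
# validation, and `make export` copies the dataset JSON files to ./export.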
6 | # 7 | 8 | SHELL := bash 9 | .SHELLFLAGS := -eu -o pipefail -c 10 | MAKEFLAGS += --warn-undefined-variables 11 | MAKEFLAGS += --no-builtin-rules 12 | 13 | DATA_DIR=./datasets 14 | UTIL_DIR=./utils 15 | VENV_DIR=./venv 16 | EXPORT_DIR=./export 17 | 18 | .PHONY: all clean collect verify validate test export 19 | 20 | all: test 21 | 22 | ################ 23 | # Main targets # 24 | ################ 25 | 26 | collect: venv 27 | source $(VENV_DIR)/bin/activate && python build_tcpd.py -v collect 28 | 29 | ############## 30 | # Validation # 31 | ############## 32 | 33 | test: verify validate 34 | 35 | verify: venv collect $(UTIL_DIR)/check_checksums.py ./checksums.json 36 | @echo "Verifying datasets ..." 37 | source $(VENV_DIR)/bin/activate && \ 38 | python $(UTIL_DIR)/check_checksums.py -v -c ./checksums.json -d $(DATA_DIR) 39 | 40 | validate: venv collect $(UTIL_DIR)/validate_dataset.py ./schema.json 41 | @echo "Validating datasets" 42 | source $(VENV_DIR)/bin/activate && \ 43 | python $(UTIL_DIR)/validate_dataset.py -v -s ./schema.json -d $(DATA_DIR) 44 | 45 | #################### 46 | # Utility commands # 47 | #################### 48 | 49 | export: test 50 | mkdir -p $(EXPORT_DIR) 51 | cp -v $(DATA_DIR)/*/*.json $(EXPORT_DIR) 52 | 53 | ########### 54 | # Cleanup # 55 | ########### 56 | 57 | clean: 58 | if [ -d $(VENV_DIR) ] ; then \ 59 | source $(VENV_DIR)/bin/activate && python build_tcpd.py -v clean ; \ 60 | fi 61 | rm -rf $(VENV_DIR) 62 | rm -rf $(EXPORT_DIR) 63 | 64 | ############## 65 | # Virtualenv # 66 | ############## 67 | 68 | venv: $(VENV_DIR)/bin/activate 69 | 70 | $(VENV_DIR)/bin/activate: 71 | test -d $(VENV_DIR) || python -m venv $(VENV_DIR) 72 | source $(VENV_DIR)/bin/activate && \ 73 | pip install wheel && \ 74 | pip install -r ./requirements.txt 75 | touch $(VENV_DIR)/bin/activate 76 | -------------------------------------------------------------------------------- /datasets/businv/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: G.J.J. 
van den Burg 8 | 9 | """ 10 | 11 | import argparse 12 | import json 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("input_file", help="File to convert") 18 | parser.add_argument("output_file", help="File to write to") 19 | return parser.parse_args() 20 | 21 | 22 | def main(): 23 | args = parse_args() 24 | 25 | with open(args.input_file, "r") as fp: 26 | lines = [l.strip() for l in fp] 27 | 28 | # header data should be first three lines 29 | # we use some asserts to ensure things are what we expect them to be 30 | header = lines[:3] 31 | assert header[-1] == "Total Business" 32 | 33 | lines = lines[4:] 34 | assert lines[0].startswith("1992") 35 | 36 | by_month = {} 37 | for line in lines: 38 | # stop on first empty line 39 | if not line.strip(): 40 | break 41 | parts = [x for x in line.split(" ") if x.strip()] 42 | assert len(parts) == 13 # year + 12 months 43 | year = parts.pop(0) 44 | for midx, v in enumerate(parts, start=1): 45 | if v == ".": 46 | break 47 | by_month[f"{year}-{midx:02}"] = int(v) 48 | 49 | name = "businv" 50 | longname = "Business Inventory" 51 | time = sorted(by_month.keys()) 52 | time_fmt = "%Y-%m" 53 | values = [by_month[t] for t in time] 54 | 55 | series = [{"label": "Business Inventory", "type": "int", "raw": values}] 56 | 57 | data = { 58 | "name": name, 59 | "longname": longname, 60 | "n_obs": len(time), 61 | "n_dim": len(series), 62 | "time": { 63 | "type": "string", 64 | "format": time_fmt, 65 | "index": list(range(len(time))), 66 | "raw": time, 67 | }, 68 | "series": series, 69 | } 70 | 71 | with open(args.output_file, "w") as fp: 72 | json.dump(data, fp, indent="\t") 73 | 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /datasets/gdp_croatia/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: Gertjan van den Burg 8 | 9 | """ 10 | 11 | import json 12 | import argparse 13 | import clevercsv 14 | 15 | 16 | def parse_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("input_file", help="File to convert") 19 | parser.add_argument("output_file", help="File to write to") 20 | return parser.parse_args() 21 | 22 | 23 | def main(): 24 | args = parse_args() 25 | 26 | with open(args.input_file, "r", newline="", encoding="UTF-8-SIG") as fp: 27 | reader = clevercsv.reader( 28 | fp, delimiter=",", quotechar='"', escapechar="" 29 | ) 30 | rows = list(reader) 31 | rows = rows[4:] 32 | header = rows.pop(0) 33 | 34 | as_dicts = [] 35 | for row in rows: 36 | as_dicts.append({h: v for h, v in zip(header, row)}) 37 | 38 | croatia = next( 39 | (d for d in as_dicts if d["Country Name"] == "Croatia"), None 40 | ) 41 | 42 | tuples = [] 43 | for key in croatia: 44 | try: 45 | ikey = int(key) 46 | except ValueError: 47 | continue 48 | if not croatia[key]: 49 | continue 50 | tuples.append((ikey, int(croatia[key]))) 51 | 52 | name = "gdp_croatia" 53 | longname = "GDP Croatia" 54 | time = [str(t[0]) for t in tuples] 55 | time_fmt = "%Y" 56 | series = [ 57 | { 58 | "label": "GDP (constant LCU)", 59 | "type": "int", 60 | "raw": [t[1] for t in tuples], 61 | } 62 | ] 63 | 64 | data = { 65 | "name": name, 66 | "longname": longname, 67 | "n_obs": len(time), 68 | "n_dim": len(series), 69 | "time": { 70 | "type": "string", 71 | "format": time_fmt, 72 | "index": list(range(len(time))), 73 | 
"raw": time, 74 | }, 75 | "series": series, 76 | } 77 | 78 | with open(args.output_file, "w") as fp: 79 | json.dump(data, fp, indent="\t") 80 | 81 | 82 | if __name__ == "__main__": 83 | main() 84 | -------------------------------------------------------------------------------- /datasets/gdp_iran/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: Gertjan van den Burg 8 | 9 | """ 10 | 11 | import json 12 | import argparse 13 | import clevercsv 14 | 15 | 16 | def parse_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("input_file", help="File to convert") 19 | parser.add_argument("output_file", help="File to write to") 20 | return parser.parse_args() 21 | 22 | 23 | def main(): 24 | args = parse_args() 25 | 26 | with open(args.input_file, "r", newline="", encoding="UTF-8-SIG") as fp: 27 | reader = clevercsv.reader( 28 | fp, delimiter=",", quotechar='"', escapechar="" 29 | ) 30 | rows = list(reader) 31 | rows = rows[4:] 32 | header = rows.pop(0) 33 | 34 | as_dicts = [] 35 | for row in rows: 36 | as_dicts.append({h: v for h, v in zip(header, row)}) 37 | 38 | iran = next( 39 | (d for d in as_dicts if d["Country Name"] == "Iran, Islamic Rep."), 40 | None, 41 | ) 42 | 43 | tuples = [] 44 | for key in iran: 45 | try: 46 | ikey = int(key) 47 | except ValueError: 48 | continue 49 | if not iran[key]: 50 | continue 51 | tuples.append((ikey, float(iran[key]))) 52 | 53 | name = "gdp_iran" 54 | longname = "GDP Iran" 55 | time = [str(t[0]) for t in tuples] 56 | time_fmt = "%Y" 57 | series = [ 58 | { 59 | "label": "GDP (constant LCU)", 60 | "type": "float", 61 | "raw": [t[1] for t in tuples], 62 | } 63 | ] 64 | 65 | data = { 66 | "name": name, 67 | "longname": longname, 68 | "n_obs": len(time), 69 | "n_dim": len(series), 70 | "time": { 71 | "type": "string", 72 | "format": time_fmt, 73 | "index": list(range(len(time))), 74 | "raw": time, 75 | }, 76 | "series": series, 77 | } 78 | 79 | with open(args.output_file, "w") as fp: 80 | json.dump(data, fp, indent="\t") 81 | 82 | 83 | if __name__ == "__main__": 84 | main() 85 | -------------------------------------------------------------------------------- /datasets/gdp_argentina/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: Gertjan van den Burg 8 | 9 | """ 10 | 11 | import json 12 | import argparse 13 | import clevercsv 14 | 15 | 16 | def parse_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("input_file", help="File to convert") 19 | parser.add_argument("output_file", help="File to write to") 20 | return parser.parse_args() 21 | 22 | 23 | def main(): 24 | args = parse_args() 25 | 26 | with open(args.input_file, "r", newline="", encoding="UTF-8-SIG") as fp: 27 | reader = clevercsv.reader( 28 | fp, delimiter=",", quotechar='"', escapechar="" 29 | ) 30 | rows = list(reader) 31 | rows = rows[4:] 32 | header = rows.pop(0) 33 | 34 | as_dicts = [] 35 | for row in rows: 36 | as_dicts.append({h: v for h, v in zip(header, row)}) 37 | 38 | argentina = next( 39 | (d for d in as_dicts if d["Country Name"] == "Argentina"), None 40 | ) 41 | 42 | tuples = [] 43 | for key in argentina: 44 | try: 45 | ikey = int(key) 46 | except ValueError: 47 | continue 48 | if not argentina[key]: 49 | continue 50 | tuples.append((ikey, 
float(argentina[key]))) 51 | 52 | name = "gdp_argentina" 53 | longname = "GDP Argentina" 54 | time = [str(t[0]) for t in tuples] 55 | time_fmt = "%Y" 56 | series = [ 57 | { 58 | "label": "GDP (constant LCU)", 59 | "type": "float", 60 | "raw": [t[1] for t in tuples], 61 | } 62 | ] 63 | 64 | data = { 65 | "name": name, 66 | "longname": longname, 67 | "n_obs": len(time), 68 | "n_dim": len(series), 69 | "time": { 70 | "type": "string", 71 | "format": time_fmt, 72 | "index": list(range(len(time))), 73 | "raw": time, 74 | }, 75 | "series": series, 76 | } 77 | 78 | with open(args.output_file, "w") as fp: 79 | json.dump(data, fp, indent="\t") 80 | 81 | 82 | if __name__ == "__main__": 83 | main() 84 | -------------------------------------------------------------------------------- /datasets/jfk_passengers/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: G.J.J. van den Burg 8 | 9 | """ 10 | 11 | import json 12 | import argparse 13 | import clevercsv 14 | 15 | 16 | def month2index(month): 17 | return { 18 | "Jan": "01", 19 | "Feb": "02", 20 | "Mar": "03", 21 | "Apr": "04", 22 | "May": "05", 23 | "Jun": "06", 24 | "Jul": "07", 25 | "Aug": "08", 26 | "Sep": "09", 27 | "Oct": "10", 28 | "Nov": "11", 29 | "Dec": "12", 30 | }[month] 31 | 32 | 33 | def parse_args(): 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument("input_file", help="File to convert") 36 | parser.add_argument("output_file", help="File to write to") 37 | return parser.parse_args() 38 | 39 | 40 | def main(): 41 | args = parse_args() 42 | 43 | with open(args.input_file, "r", newline="", encoding="ascii") as fp: 44 | reader = clevercsv.DictReader( 45 | fp, delimiter=",", quotechar="", escapechar="" 46 | ) 47 | items = list(reader) 48 | 49 | for it in items: 50 | it["time"] = f"{it['Year']}-{month2index(it['Month'])}" 51 | it["value"] = int(it["Total Passengers"]) 52 | 53 | 54 | jfks = [it for it in items if it["Airport Code"] == "JFK"] 55 | pairs = [(it["time"], it["value"]) for it in jfks] 56 | # with this date format string sort is date sort 57 | pairs.sort() 58 | 59 | name = "jfk_passengers" 60 | longname = "JFK Passengers" 61 | time_fmt = "%Y-%m" 62 | time = [p[0] for p in pairs] 63 | values = [p[1] for p in pairs] 64 | 65 | series = [{"label": "Number of Passengers", "type": "int", "raw": values}] 66 | 67 | data = { 68 | "name": name, 69 | "longname": longname, 70 | "n_obs": len(time), 71 | "n_dim": len(series), 72 | "time": { 73 | "type": "string", 74 | "format": time_fmt, 75 | "index": list(range(len(time))), 76 | "raw": time, 77 | }, 78 | "series": series, 79 | } 80 | 81 | with open(args.output_file, "w") as fp: 82 | json.dump(data, fp, indent="\t") 83 | 84 | 85 | if __name__ == "__main__": 86 | main() 87 | -------------------------------------------------------------------------------- /datasets/lga_passengers/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: G.J.J. 
van den Burg 8 | 9 | """ 10 | 11 | import json 12 | import argparse 13 | import clevercsv 14 | 15 | 16 | def month2index(month): 17 | return { 18 | "Jan": "01", 19 | "Feb": "02", 20 | "Mar": "03", 21 | "Apr": "04", 22 | "May": "05", 23 | "Jun": "06", 24 | "Jul": "07", 25 | "Aug": "08", 26 | "Sep": "09", 27 | "Oct": "10", 28 | "Nov": "11", 29 | "Dec": "12", 30 | }[month] 31 | 32 | 33 | def parse_args(): 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument("input_file", help="File to convert") 36 | parser.add_argument("output_file", help="File to write to") 37 | return parser.parse_args() 38 | 39 | 40 | def main(): 41 | args = parse_args() 42 | 43 | with open(args.input_file, "r", newline="", encoding="ascii") as fp: 44 | reader = clevercsv.DictReader( 45 | fp, delimiter=",", quotechar="", escapechar="" 46 | ) 47 | items = list(reader) 48 | 49 | for it in items: 50 | it["time"] = f"{it['Year']}-{month2index(it['Month'])}" 51 | it["value"] = int(it["Total Passengers"]) 52 | 53 | lgas = [it for it in items if it["Airport Code"] == "LGA"] 54 | pairs = [(it["time"], it["value"]) for it in lgas] 55 | # with this date format string sort is date sort 56 | pairs.sort() 57 | 58 | name = "lga_passengers" 59 | longname = "LaGuardia Passengers" 60 | time_fmt = "%Y-%m" 61 | time = [p[0] for p in pairs] 62 | values = [p[1] for p in pairs] 63 | 64 | series = [{"label": "Number of Passengers", "type": "int", "raw": values}] 65 | 66 | data = { 67 | "name": name, 68 | "longname": longname, 69 | "n_obs": len(time), 70 | "n_dim": len(series), 71 | "time": { 72 | "type": "string", 73 | "format": time_fmt, 74 | "index": list(range(len(time))), 75 | "raw": time, 76 | }, 77 | "series": series, 78 | } 79 | 80 | with open(args.output_file, "w") as fp: 81 | json.dump(data, fp, indent="\t") 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /datasets/unemployment_nl/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | 6 | Author: Gertjan van den Burg 7 | 8 | """ 9 | 10 | import argparse 11 | import clevercsv 12 | import json 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("input_file", help="File to convert") 18 | parser.add_argument("output_file", help="File to write to") 19 | return parser.parse_args() 20 | 21 | 22 | def main(): 23 | args = parse_args() 24 | with open(args.input_file, "r", newline="", encoding="UTF-8-SIG") as fp: 25 | reader = clevercsv.reader( 26 | fp, delimiter=";", quotechar='"', escapechar="" 27 | ) 28 | rows = list(reader) 29 | 30 | # remove rows we don't need 31 | title = rows.pop(0) 32 | meta = rows.pop(0) 33 | meta = rows.pop(0) 34 | 35 | # filter out rows we want 36 | header = rows.pop(0) 37 | eligible_population = rows.pop(0) 38 | working_population = rows.pop(0) 39 | unemployed_population = rows.pop(0) 40 | 41 | years = header[3:] 42 | eligible = list(map(int, eligible_population[3:])) 43 | unemployed = list(map(int, unemployed_population[3:])) 44 | 45 | # compute the percentage unemployed 46 | by_year = { 47 | y: (u / e * 100) for y, e, u in zip(years, eligible, unemployed) 48 | } 49 | 50 | # remove value of 2001 before revision 51 | del by_year["2001 voor revisie"] 52 | # rename value of 2001 after revision as simply '2001' 53 | by_year["2001"] = by_year["2001 na revisie"] 54 | del by_year["2001 na revisie"] 55 | 56 | time = 
sorted(by_year.keys()) 57 | values = [by_year[t] for t in time] 58 | series = [{"label": "V1", "type": "float", "raw": values}] 59 | 60 | data = { 61 | "name": "unemployment_nl", 62 | "longname": "Unemployment rate (NL)", 63 | "n_obs": len(time), 64 | "n_dim": len(series), 65 | "time": { 66 | "type": "string", 67 | "format": "%Y", 68 | "index": list(range(len(time))), 69 | "raw": time, 70 | }, 71 | "series": series, 72 | } 73 | 74 | with open(args.output_file, "w") as fp: 75 | json.dump(data, fp, indent="\t") 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /datasets/shanghai_license/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: Gertjan van den Burg 8 | 9 | """ 10 | 11 | import json 12 | import argparse 13 | import clevercsv 14 | 15 | 16 | def reformat_time(mmmyy): 17 | """ From MMM-YY to %Y-%m """ 18 | MONTHS = { 19 | "Jan": 1, 20 | "Feb": 2, 21 | "Mar": 3, 22 | "Apr": 4, 23 | "May": 5, 24 | "Jun": 6, 25 | "Jul": 7, 26 | "Aug": 8, 27 | "Sep": 9, 28 | "Oct": 10, 29 | "Nov": 11, 30 | "Dec": 12, 31 | } 32 | mmm, yy = mmmyy.split("-") 33 | Y = int(yy) + 2000 34 | m = MONTHS.get(mmm) 35 | return "%i-%02i" % (Y, m) 36 | 37 | 38 | def parse_args(): 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument("input_file", help="File to convert") 41 | parser.add_argument("output_file", help="File to write to") 42 | return parser.parse_args() 43 | 44 | 45 | def main(): 46 | args = parse_args() 47 | 48 | with open(args.input_file, "r", newline="", encoding="ascii") as fp: 49 | reader = clevercsv.reader( 50 | fp, delimiter=",", quotechar="", escapechar="" 51 | ) 52 | rows = list(reader) 53 | 54 | rows.pop(0) 55 | 56 | time = [reformat_time(r[0]) for r in rows] 57 | values = [int(r[-1]) for r in rows] 58 | 59 | # Manually split Jan-08 into two, see readme for details. 60 | jan08idx = time.index("2008-01") 61 | values[jan08idx] /= 2 62 | time.insert(jan08idx + 1, "2008-02") 63 | values.insert(jan08idx + 1, values[jan08idx]) 64 | 65 | name = "shanghai_license" 66 | longname = "Shanghai License" 67 | time_fmt = "%Y-%m" 68 | series = [{"label": "No. 
of Applicants", "type": "int", "raw": values}] 69 | 70 | data = { 71 | "name": name, 72 | "longname": longname, 73 | "n_obs": len(time), 74 | "n_dim": len(series), 75 | "time": { 76 | "type": "string", 77 | "format": time_fmt, 78 | "index": list(range(len(time))), 79 | "raw": time, 80 | }, 81 | "series": series, 82 | } 83 | 84 | with open(args.output_file, "w") as fp: 85 | json.dump(data, fp, indent="\t") 86 | 87 | 88 | if __name__ == "__main__": 89 | main() 90 | -------------------------------------------------------------------------------- /datasets/rail_lines/rail_lines.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "rail_lines", 3 | "longname": "Rail Lines", 4 | "n_obs": 37, 5 | "n_dim": 1, 6 | "time": { 7 | "format": "%Y", 8 | "index": [ 9 | 0, 10 | 1, 11 | 2, 12 | 3, 13 | 4, 14 | 5, 15 | 6, 16 | 7, 17 | 8, 18 | 9, 19 | 10, 20 | 11, 21 | 12, 22 | 13, 23 | 14, 24 | 15, 25 | 16, 26 | 17, 27 | 18, 28 | 19, 29 | 20, 30 | 21, 31 | 22, 32 | 23, 33 | 24, 34 | 25, 35 | 26, 36 | 27, 37 | 28, 38 | 29, 39 | 30, 40 | 31, 41 | 32, 42 | 33, 43 | 34, 44 | 35, 45 | 36 46 | ], 47 | "raw": [ 48 | "1980", 49 | "1981", 50 | "1982", 51 | "1983", 52 | "1984", 53 | "1985", 54 | "1986", 55 | "1987", 56 | "1988", 57 | "1989", 58 | "1990", 59 | "1991", 60 | "1992", 61 | "1993", 62 | "1994", 63 | "1995", 64 | "1996", 65 | "1997", 66 | "1998", 67 | "1999", 68 | "2000", 69 | "2001", 70 | "2002", 71 | "2003", 72 | "2004", 73 | "2005", 74 | "2006", 75 | "2007", 76 | "2008", 77 | "2009", 78 | "2010", 79 | "2011", 80 | "2012", 81 | "2013", 82 | "2014", 83 | "2015", 84 | "2016" 85 | ] 86 | }, 87 | "series": [ 88 | { 89 | "label": "V1", 90 | "type": "float", 91 | "raw": [ 92 | 1000507.33548387, 93 | 996153.287096774, 94 | 994910, 95 | 992092.841935484, 96 | 983302.464516129, 97 | 975342.625806452, 98 | 963878.138709677, 99 | 949388.514516129, 100 | 941808.890967742, 101 | 938664.31516129, 102 | 977074.383, 103 | 973210.707096774, 104 | 964581.342580645, 105 | 973468.754193548, 106 | 970988.050967742, 107 | 968160.260322581, 108 | 971143.774885996, 109 | 970154.472375981, 110 | 964146.915693211, 111 | 963074.49690094, 112 | 968935.935483871, 113 | 956609.233390473, 114 | 962635.639187575, 115 | 959885.943535401, 116 | 958123.943535401, 117 | 989329.943535401, 118 | 1062032.93548387, 119 | 1060970.93548387, 120 | 1060664.93548387, 121 | 1056107.93548387, 122 | 1076589.93548387, 123 | 1057710.93548387, 124 | 1051859.93548387, 125 | 1051798.67548387, 126 | 1055263.93548387, 127 | 1051968.08548387, 128 | 1051767.60548387 129 | ] 130 | } 131 | ] 132 | } 133 | -------------------------------------------------------------------------------- /datasets/usd_isk/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | 6 | Author: Gertjan van den Burg 7 | 8 | """ 9 | 10 | import clevercsv 11 | import json 12 | import sys 13 | 14 | 15 | def format_month(ymm): 16 | year, month = ymm.split("M") 17 | return f"{year}-{month}" 18 | 19 | 20 | def main(input_filename, output_filename): 21 | with open(input_filename, "r", newline="", encoding="ascii") as fp: 22 | reader = clevercsv.DictReader( 23 | fp, delimiter=",", quotechar='"', escapechar="" 24 | ) 25 | rows = list(reader) 26 | 27 | by_currency = {} 28 | for row in rows: 29 | cur = row["CURRENCY"] 30 | if not cur in by_currency: 31 | by_currency[cur] = [] 32 | by_currency[cur].append(row) 33 | 34 | by_month = {} 35 | for cur 
in by_currency: 36 | for item in by_currency[cur]: 37 | if item["Value"] == ":": 38 | continue 39 | month = item["TIME"] 40 | if not month in by_month: 41 | by_month[month] = {} 42 | by_month[month][cur] = item 43 | 44 | to_delete = [] 45 | for month in by_month: 46 | if not len(by_month[month]) == 2: 47 | to_delete.append(month) 48 | for month in to_delete: 49 | del by_month[month] 50 | 51 | ratio = {} 52 | for month in sorted(by_month.keys()): 53 | usd = by_month[month]["US dollar"] 54 | isk = by_month[month]["Icelandic krona"] 55 | ratio[format_month(month)] = float(usd["Value"]) / float(isk["Value"]) 56 | 57 | tuples = [(m, ratio[m]) for m in ratio] 58 | 59 | name = "usd_isk" 60 | longname = "USD-ISK exhange rate" 61 | 62 | data = { 63 | "name": name, 64 | "longname": longname, 65 | "n_obs": len(tuples), 66 | "n_dim": 1, 67 | "time": { 68 | "format": "%Y-%m", 69 | "index": list(range(len(tuples))), 70 | "raw": [t[0] for t in tuples], 71 | }, 72 | "series": [ 73 | { 74 | "label": "Exchange rate", 75 | "type": "float", 76 | "raw": [t[1] for t in tuples], 77 | } 78 | ], 79 | } 80 | 81 | with open(output_filename, "w") as fp: 82 | json.dump(data, fp, indent="\t") 83 | 84 | 85 | if __name__ == "__main__": 86 | main(sys.argv[1], sys.argv[2]) 87 | -------------------------------------------------------------------------------- /checksums.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "md5", 3 | "checksums": { 4 | "apple.json": "22edb48471bd3711f7a6e15de6413643", 5 | "bank.json": "5207135ea53fc6fa2a8119908da73abf", 6 | "bee_waggle_6.json": [ 7 | "4f03feafecb3be0b069b3cb0d6b17d4f", 8 | "71311783488ee5f1122545d24c15429b" 9 | ], 10 | "bitcoin.json": "f90ff14ed1fc0c3d47d4394d25cbce93", 11 | "brent_spot.json": "79892116ef8a0aa16e2450123655b31d", 12 | "businv.json": "d2ab178da17b2e659a10a102a4b9f332", 13 | "centralia.json": "addb9b70ac1294eba6da958f3ab26595", 14 | "children_per_woman.json": "826e0a2328b8a8a085050115768eef98", 15 | "co2_canada.json": "de8d2cac911d2a8e3ce5addacbaed8e0", 16 | "construction.json": "fb0347dc9fd353b11b35e99f7d531f13", 17 | "debt_ireland.json": "a5a9e752c338d2ffcceb614bb2064cc9", 18 | "gdp_argentina.json": "694212b5682ebd808d740ffd83d4bc16", 19 | "gdp_croatia.json": "4f902ba68bf710fa245e5eb0ab35fea7", 20 | "gdp_iran.json": "889e9fc6292125189fd3188396167431", 21 | "gdp_japan.json": "17026e80ab363d9f668d69900824d9ae", 22 | "global_co2.json": "7c8edd8887f51a6f841cc9d806ab4e56", 23 | "homeruns.json": "987bbab63e2c72acba1c07325303720c", 24 | "iceland_tourism.json": "8bbac4ca95319a865f2d58ff564f063d", 25 | "jfk_passengers.json": "9655295214078f2a45a4c18e6c4e6d0d", 26 | "lga_passengers.json": "3e7bf55fac17f59b400f8a558d3f0337", 27 | "measles.json": "e42afd03be893fc7deb98514c94fa4c7", 28 | "nile.json": "5b08800e3ec692bfa5385b978658199b", 29 | "occupancy.json": "bc6cd9adaf496fe30bf0e417d2c3b0c6", 30 | "ozone.json": "348b1f85c3ec3da3b8989afe04c33b80", 31 | "quality_control_1.json": "fcfd5b0323a0dbd499c22b32c77f6a43", 32 | "quality_control_2.json": "919a55440bd00d635db80fe83e921c7d", 33 | "quality_control_3.json": "94f55ddedd03197bc3e660f6e1d840ee", 34 | "quality_control_4.json": "1efedb9a52cd0b9a9250cf9781c5f7ef", 35 | "quality_control_5.json": "2ebb10acafae18ebabf0217218717970", 36 | "rail_lines.json": "fa7d19c61264f0d6b9d74cd145a50012", 37 | "ratner_stock.json": "f7086ff916f35b88463bf8fd1857815e", 38 | "robocalls.json": "f67ec0ccb50f2a835912e5c51932c083", 39 | "run_log.json": "2c78a8fa0b4a2f8e2d22ba3ad4dfd49f", 40 | 
"scanline_126007.json": "057d5741b623308af00c42e2c8e525c3", 41 | "scanline_42049.json": "39921dfa959576bd0b3d6c95558f17f4", 42 | "seatbelts.json": "976ef4318e7b6381ff37dd4ac8029718", 43 | "shanghai_license.json": "b4ac173eb6c0a1a4d10268abc109eda1", 44 | "uk_coal_employ.json": "a7c72746e46d6e09f516bd87e0e68bef", 45 | "unemployment_nl.json": "26d8c0359de7f733a6fb51d4d60b5af6", 46 | "us_population.json": "77037fc5ff0338516a56ae686aa4dcba", 47 | "usd_isk.json": "5cac2807a0e280c8ffd7321662e339ac", 48 | "well_log.json": "7c80d2cbd5864b923e6a653aad115de6" 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /utils/check_checksums.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Validate the datasets by checksum 6 | 7 | Author: G.J.J. van den Burg 8 | License: This file is part of TCPD, see the top-level LICENSE file. 9 | Copyright: 2019, The Alan Turing Institute 10 | 11 | """ 12 | 13 | import argparse 14 | import hashlib 15 | import os 16 | import json 17 | 18 | 19 | def parse_args(): 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument( 22 | "-c", "--checksum-file", help="Checksum file (json)", required=True 23 | ) 24 | parser.add_argument( 25 | "-d", "--dataset-dir", help="Dataset directory", required=True 26 | ) 27 | parser.add_argument( 28 | "-v", "--verbose", help="Enable verbose mode", action="store_true" 29 | ) 30 | return parser.parse_args() 31 | 32 | 33 | def md5sum(filename): 34 | with open(filename, "rb") as fp: 35 | data = fp.read() 36 | return hashlib.md5(data).hexdigest() 37 | 38 | 39 | def load_checksums(checksum_file): 40 | with open(checksum_file, "r") as fp: 41 | checksums = json.load(fp) 42 | assert checksums["kind"] == "md5" 43 | return checksums["checksums"] 44 | 45 | 46 | def find_datafiles(dataset_dir): 47 | data_files = {} 48 | 49 | datadirs = os.listdir(dataset_dir) 50 | for ddir in datadirs: 51 | pth = os.path.join(dataset_dir, ddir) 52 | files = os.listdir(pth) 53 | json_files = [f for f in files if f.endswith(".json")] 54 | for jf in json_files: 55 | jfpath = os.path.join(pth, jf) 56 | if jf in data_files: 57 | raise KeyError("Duplicate data file '%s'?" 
% jfpath) 58 | data_files[jf] = jfpath 59 | 60 | return data_files 61 | 62 | 63 | def main(): 64 | args = parse_args() 65 | 66 | log = lambda *a, **kw: print(*a, **kw) if args.verbose else None 67 | 68 | checksums = load_checksums(args.checksum_file) 69 | data_files = find_datafiles(args.dataset_dir) 70 | 71 | for fname in checksums: 72 | log("Checking %s" % fname) 73 | if not fname in data_files: 74 | raise FileNotFoundError("Missing data file: %s" % fname) 75 | md5 = md5sum(data_files[fname]) 76 | if isinstance(checksums[fname], list): 77 | if not md5 in checksums[fname]: 78 | raise ValueError( 79 | "Checksums don't match for file: %s" % (data_files[fname]) 80 | ) 81 | else: 82 | if not md5 == checksums[fname]: 83 | raise ValueError( 84 | "Checksums don't match for file: %s" % (data_files[fname]) 85 | ) 86 | 87 | log("All ok.") 88 | 89 | 90 | if __name__ == "__main__": 91 | main() 92 | -------------------------------------------------------------------------------- /datasets/ozone/ozone.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ozone", 3 | "longname": "Ozone-Depleting Emissions", 4 | "n_obs": 54, 5 | "n_dim": 1, 6 | "time": { 7 | "type": "string", 8 | "format": "%Y", 9 | "index": [ 10 | 0, 11 | 1, 12 | 2, 13 | 3, 14 | 4, 15 | 5, 16 | 6, 17 | 7, 18 | 8, 19 | 9, 20 | 10, 21 | 11, 22 | 12, 23 | 13, 24 | 14, 25 | 15, 26 | 16, 27 | 17, 28 | 18, 29 | 19, 30 | 20, 31 | 21, 32 | 22, 33 | 23, 34 | 24, 35 | 25, 36 | 26, 37 | 27, 38 | 28, 39 | 29, 40 | 30, 41 | 31, 42 | 32, 43 | 33, 44 | 34, 45 | 35, 46 | 36, 47 | 37, 48 | 38, 49 | 39, 50 | 40, 51 | 41, 52 | 42, 53 | 43, 54 | 44, 55 | 45, 56 | 46, 57 | 47, 58 | 48, 59 | 49, 60 | 50, 61 | 51, 62 | 52, 63 | 53 64 | ], 65 | "raw": [ 66 | "1961", 67 | "1962", 68 | "1963", 69 | "1964", 70 | "1965", 71 | "1966", 72 | "1967", 73 | "1968", 74 | "1969", 75 | "1970", 76 | "1971", 77 | "1972", 78 | "1973", 79 | "1974", 80 | "1975", 81 | "1976", 82 | "1977", 83 | "1978", 84 | "1979", 85 | "1980", 86 | "1981", 87 | "1982", 88 | "1983", 89 | "1984", 90 | "1985", 91 | "1986", 92 | "1987", 93 | "1988", 94 | "1989", 95 | "1990", 96 | "1991", 97 | "1992", 98 | "1993", 99 | "1994", 100 | "1995", 101 | "1996", 102 | "1997", 103 | "1998", 104 | "1999", 105 | "2000", 106 | "2001", 107 | "2002", 108 | "2003", 109 | "2004", 110 | "2005", 111 | "2006", 112 | "2007", 113 | "2008", 114 | "2009", 115 | "2010", 116 | "2011", 117 | "2012", 118 | "2013", 119 | "2014" 120 | ] 121 | }, 122 | "series": [ 123 | { 124 | "label": "Total Emissions", 125 | "type": "int", 126 | "raw": [ 127 | 380000, 128 | 400000, 129 | 440000, 130 | 480000, 131 | 510000, 132 | 540000, 133 | 580000, 134 | 630000, 135 | 660000, 136 | 720000, 137 | 770000, 138 | 840000, 139 | 910000, 140 | 980000, 141 | 1040000, 142 | 1050000, 143 | 1070000, 144 | 1070000, 145 | 1110000, 146 | 1080000, 147 | 1040000, 148 | 1100000, 149 | 1090000, 150 | 1150000, 151 | 1180000, 152 | 1280000, 153 | 1360000, 154 | 1460000, 155 | 1410000, 156 | 1320000, 157 | 1190000, 158 | 1080000, 159 | 960000, 160 | 820000, 161 | 760000, 162 | 700000, 163 | 640000, 164 | 600000, 165 | 590000, 166 | 560000, 167 | 530000, 168 | 490000, 169 | 480000, 170 | 470000, 171 | 450000, 172 | 430000, 173 | 410000, 174 | 400000, 175 | 390000, 176 | 380000, 177 | 370000, 178 | 350000, 179 | 340000, 180 | 320000 181 | ] 182 | } 183 | ] 184 | } -------------------------------------------------------------------------------- /datasets/construction/convert.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Dataset conversion script 6 | 7 | Author: G.J.J. van den Burg 8 | 9 | """ 10 | 11 | import argparse 12 | import json 13 | import xlrd 14 | 15 | MONTHS = { 16 | "Jan": 1, 17 | "Feb": 2, 18 | "Mar": 3, 19 | "Apr": 4, 20 | "May": 5, 21 | "Jun": 6, 22 | "Jul": 7, 23 | "Aug": 8, 24 | "Sep": 9, 25 | "Oct": 10, 26 | "Nov": 11, 27 | "Dec": 12, 28 | } 29 | 30 | 31 | def format_date(datestr): 32 | """ expects: mmm-yyx with x an extraneous character or empty """ 33 | mmm, yyx = datestr.split("-") 34 | midx = MONTHS[mmm] 35 | if len(yyx) == 3: 36 | yy = yyx[:2] 37 | elif len(yyx) == 2: 38 | yy = yyx 39 | else: 40 | raise ValueError 41 | 42 | # this will break in 71 years 43 | if yy.startswith("9"): 44 | yyyy = 1900 + int(yy) 45 | else: 46 | yyyy = 2000 + int(yy) 47 | return f"{yyyy}-{midx:02}" 48 | 49 | 50 | def parse_args(): 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument("input_file", help="File to convert") 53 | parser.add_argument("output_file", help="File to write to") 54 | return parser.parse_args() 55 | 56 | 57 | def main(): 58 | args = parse_args() 59 | 60 | wb = xlrd.open_workbook(args.input_file) 61 | ws = wb.sheet_by_index(0) 62 | header = ws.row(3) 63 | assert header[0].value == "Date" 64 | 65 | by_month = {} 66 | ridx = 4 67 | while True: 68 | # stop if date cell is empty 69 | if ws.row(ridx)[0].ctype == xlrd.XL_CELL_EMPTY: 70 | break 71 | 72 | date_value = ws.row(ridx)[0].value 73 | construct_value = ws.row(ridx)[1].value 74 | 75 | date = format_date(date_value) 76 | construct = int(construct_value) 77 | 78 | by_month[date] = construct 79 | ridx += 1 80 | 81 | name = "construction" 82 | longname = "US Construction Spending" 83 | time = sorted(by_month.keys()) 84 | time_fmt = "%Y-%m" 85 | values = [by_month[t] for t in time] 86 | 87 | series = [ 88 | { 89 | "label": "Total Private Construction Spending", 90 | "type": "int", 91 | "raw": values, 92 | } 93 | ] 94 | 95 | data = { 96 | "name": name, 97 | "longname": longname, 98 | "n_obs": len(time), 99 | "n_dim": len(series), 100 | "time": { 101 | "type": "string", 102 | "format": time_fmt, 103 | "index": list(range(len(time))), 104 | "raw": time, 105 | }, 106 | "series": series, 107 | } 108 | 109 | with open(args.output_file, "w") as fp: 110 | json.dump(data, fp, indent="\t") 111 | 112 | 113 | if __name__ == "__main__": 114 | main() 115 | -------------------------------------------------------------------------------- /examples/python/load_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Example code for loading a dataset to a TimeSeries object. 6 | 7 | Note that this code requires Pandas to be available. 8 | 9 | Author: Gertjan van den Burg 10 | Copyright: The Alan Turing Institute, 2019 11 | License: See LICENSE file. 
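A minimal usage sketch (assuming the dataset JSON files have already been
generated, e.g. datasets/nile/nile.json):

    ts = TimeSeries.from_json("datasets/nile/nile.json")
    print(ts.shape)  # (n_obs, n_dim)
    df = ts.df  # DataFrame with a "t" column plus one column per series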
12 | 13 | """ 14 | 15 | import json 16 | import numpy as np 17 | import pandas as pd 18 | 19 | 20 | class TimeSeries: 21 | def __init__( 22 | self, 23 | t, 24 | y, 25 | name=None, 26 | longname=None, 27 | datestr=None, 28 | datefmt=None, 29 | columns=None, 30 | ): 31 | self.t = t 32 | self.y = y 33 | 34 | self.name = name 35 | self.longname = longname 36 | self.datestr = datestr 37 | self.datefmt = datefmt 38 | self.columns = columns 39 | 40 | # whether the series is stored as zero-based or one-based 41 | self.zero_based = True 42 | 43 | @property 44 | def n_obs(self): 45 | return len(self.t) 46 | 47 | @property 48 | def n_dim(self): 49 | return self.y.shape[1] 50 | 51 | @property 52 | def shape(self): 53 | return (self.n_obs, self.n_dim) 54 | 55 | @classmethod 56 | def from_json(cls, filename): 57 | with open(filename, "rb") as fp: 58 | data = json.load(fp) 59 | 60 | tidx = np.array(data["time"]["index"]) 61 | tidx = np.squeeze(tidx) 62 | 63 | if "format" in data["time"]: 64 | datefmt = data["time"]["format"] 65 | datestr = np.array(data["time"]["raw"]) 66 | else: 67 | datefmt = None 68 | datestr = None 69 | 70 | y = np.zeros((data["n_obs"], data["n_dim"])) 71 | columns = [] 72 | 73 | for idx, series in enumerate(data["series"]): 74 | columns.append(series.get("label", "V%i" % (idx + 1))) 75 | thetype = np.int if series["type"] == "integer" else np.float64 76 | vec = np.array(series["raw"], dtype=thetype) 77 | y[:, idx] = vec 78 | 79 | ts = cls( 80 | tidx, 81 | y, 82 | name=data["name"], 83 | longname=data["longname"], 84 | datefmt=datefmt, 85 | datestr=datestr, 86 | columns=columns, 87 | ) 88 | return ts 89 | 90 | @property 91 | def df(self): 92 | d = {"t": self.t} 93 | for i in range(len(self.columns)): 94 | col = self.columns[i] 95 | val = self.y[:, i] 96 | d[col] = val 97 | return pd.DataFrame(d) 98 | 99 | def make_one_based(self): 100 | """ Convert the time index to a one-based time index. 
""" 101 | if self.zero_based: 102 | self.t = [t + 1 for t in self.t] 103 | self.zero_based = False 104 | 105 | def __repr__(self): 106 | return "TimeSeries(name=%s, n_obs=%s, n_dim=%s)" % ( 107 | self.name, 108 | self.n_obs, 109 | self.n_dim, 110 | ) 111 | 112 | def __str__(self): 113 | return repr(self) 114 | -------------------------------------------------------------------------------- /datasets/co2_canada/co2_canada.csv: -------------------------------------------------------------------------------- 1 | country,Canada 2 | 1800,0.00568 3 | 1801,0.00561 4 | 1802,0.00555 5 | 1803,0.00548 6 | 1804,0.00542 7 | 1805,0.00536 8 | 1806,0.00529 9 | 1807,0.00523 10 | 1808,0.00517 11 | 1809,0.00511 12 | 1810,0.00504 13 | 1811,0.00497 14 | 1812,0.0049 15 | 1813,0.00483 16 | 1814,0.00475 17 | 1815,0.00466 18 | 1816,0.00457 19 | 1817,0.00447 20 | 1818,0.00438 21 | 1819,0.00427 22 | 1820,0.00417 23 | 1821,0.00406 24 | 1822,0.00395 25 | 1823,0.00384 26 | 1824,0.00373 27 | 1825,0.00362 28 | 1826,0.0035 29 | 1827,0.00339 30 | 1828,0.00327 31 | 1829,0.00316 32 | 1830,0.00305 33 | 1831,0.00294 34 | 1832,0.00283 35 | 1833,0.00273 36 | 1834,0.00263 37 | 1835,0.00253 38 | 1836,0.00244 39 | 1837,0.00235 40 | 1838,0.00227 41 | 1839,0.00218 42 | 1840,0.0021 43 | 1841,0.00202 44 | 1842,0.00195 45 | 1843,0.00188 46 | 1844,0.00181 47 | 1845,0.00175 48 | 1846,0.0118 49 | 1847,0.0147 50 | 1848,0.0174 51 | 1849,0.0198 52 | 1850,0.0236 53 | 1851,0.0271 54 | 1852,0.0318 55 | 1853,0.0376 56 | 1854,0.0445 57 | 1855,0.0522 58 | 1856,0.0608 59 | 1857,0.0715 60 | 1858,0.0853 61 | 1859,0.0996 62 | 1860,0.118 63 | 1861,0.139 64 | 1862,0.167 65 | 1863,0.206 66 | 1864,0.242 67 | 1865,0.288 68 | 1866,0.346 69 | 1867,0.436 70 | 1868,0.255 71 | 1869,0.182 72 | 1870,0.321 73 | 1871,0.461 74 | 1872,0.471 75 | 1873,0.406 76 | 1874,0.404 77 | 1875,0.461 78 | 1876,0.45 79 | 1877,0.484 80 | 1878,0.456 81 | 1879,0.508 82 | 1880,1.2 83 | 1881,1.27 84 | 1882,1.48 85 | 1883,1.64 86 | 1884,1.87 87 | 1885,1.78 88 | 1886,1.87 89 | 1887,2.14 90 | 1888,2.77 91 | 1889,2.33 92 | 1890,2.53 93 | 1891,2.81 94 | 1892,2.78 95 | 1893,2.94 96 | 1894,2.78 97 | 1895,2.62 98 | 1896,2.81 99 | 1897,2.8 100 | 1898,2.94 101 | 1899,3.53 102 | 1900,3.73 103 | 1901,4.24 104 | 1902,4.49 105 | 1903,4.78 106 | 1904,5.5 107 | 1905,5.72 108 | 1906,5.87 109 | 1907,7.18 110 | 1908,7.02 111 | 1909,6.51 112 | 1910,7.2 113 | 1911,8.18 114 | 1912,8.86 115 | 1913,10.1 116 | 1914,8.53 117 | 1915,7.51 118 | 1916,9.33 119 | 1917,10.3 120 | 1918,10.7 121 | 1919,8.91 122 | 1920,9.64 123 | 1921,8.98 124 | 1922,7.6 125 | 1923,10.1 126 | 1924,8.27 127 | 1925,7.91 128 | 1926,8.75 129 | 1927,9.32 130 | 1928,9.32 131 | 1929,9.66 132 | 1930,9.04 133 | 1931,7.15 134 | 1932,6.53 135 | 1933,6.39 136 | 1934,7.28 137 | 1935,7.11 138 | 1936,7.7 139 | 1937,8.27 140 | 1938,7.5 141 | 1939,8.25 142 | 1940,9.33 143 | 1941,10.2 144 | 1942,11 145 | 1943,11.4 146 | 1944,11.5 147 | 1945,10.5 148 | 1946,11 149 | 1947,11.2 150 | 1948,12.1 151 | 1949,10.8 152 | 1950,11.2 153 | 1951,11.5 154 | 1952,11 155 | 1953,10.8 156 | 1954,10.7 157 | 1955,10.8 158 | 1956,11.7 159 | 1957,11 160 | 1958,10.7 161 | 1959,10.5 162 | 1960,10.8 163 | 1961,10.6 164 | 1962,11.1 165 | 1963,11.1 166 | 1964,12.3 167 | 1965,12.8 168 | 1966,12.9 169 | 1967,13.8 170 | 1968,14.6 171 | 1969,14.6 172 | 1970,15.9 173 | 1971,16.2 174 | 1972,17.2 175 | 1973,17 176 | 1974,17.1 177 | 1975,17.2 178 | 1976,17 179 | 1977,17.2 180 | 1978,17.3 181 | 1979,18.2 182 | 1980,18.1 183 | 1981,17.3 184 | 1982,16.6 185 | 1983,16.2 186 | 1984,16.6 187 | 
1985,16.3 188 | 1986,15.5 189 | 1987,16.2 190 | 1988,16.9 191 | 1989,17 192 | 1990,15.7 193 | 1991,15.2 194 | 1992,15.5 195 | 1993,15.5 196 | 1994,15.7 197 | 1995,15.9 198 | 1996,16.2 199 | 1997,16.6 200 | 1998,16.8 201 | 1999,16.9 202 | 2000,17.4 203 | 2001,17 204 | 2002,16.6 205 | 2003,17.5 206 | 2004,17.3 207 | 2005,17.3 208 | 2006,16.7 209 | 2007,16.8 210 | 2008,16.8 211 | 2009,15.9 212 | 2010,15.6 213 | 2011,15.6 214 | 2012,14.8 215 | 2013,14.7 216 | 2014,15.1 217 | -------------------------------------------------------------------------------- /datasets/gdp_japan/gdp_japan.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "gdp_japan", 3 | "longname": "GDP Japan", 4 | "n_obs": 58, 5 | "n_dim": 1, 6 | "time": { 7 | "type": "string", 8 | "format": "%Y", 9 | "index": [ 10 | 0, 11 | 1, 12 | 2, 13 | 3, 14 | 4, 15 | 5, 16 | 6, 17 | 7, 18 | 8, 19 | 9, 20 | 10, 21 | 11, 22 | 12, 23 | 13, 24 | 14, 25 | 15, 26 | 16, 27 | 17, 28 | 18, 29 | 19, 30 | 20, 31 | 21, 32 | 22, 33 | 23, 34 | 24, 35 | 25, 36 | 26, 37 | 27, 38 | 28, 39 | 29, 40 | 30, 41 | 31, 42 | 32, 43 | 33, 44 | 34, 45 | 35, 46 | 36, 47 | 37, 48 | 38, 49 | 39, 50 | 40, 51 | 41, 52 | 42, 53 | 43, 54 | 44, 55 | 45, 56 | 46, 57 | 47, 58 | 48, 59 | 49, 60 | 50, 61 | 51, 62 | 52, 63 | 53, 64 | 54, 65 | 55, 66 | 56, 67 | 57 68 | ], 69 | "raw": [ 70 | "1960", 71 | "1961", 72 | "1962", 73 | "1963", 74 | "1964", 75 | "1965", 76 | "1966", 77 | "1967", 78 | "1968", 79 | "1969", 80 | "1970", 81 | "1971", 82 | "1972", 83 | "1973", 84 | "1974", 85 | "1975", 86 | "1976", 87 | "1977", 88 | "1978", 89 | "1979", 90 | "1980", 91 | "1981", 92 | "1982", 93 | "1983", 94 | "1984", 95 | "1985", 96 | "1986", 97 | "1987", 98 | "1988", 99 | "1989", 100 | "1990", 101 | "1991", 102 | "1992", 103 | "1993", 104 | "1994", 105 | "1995", 106 | "1996", 107 | "1997", 108 | "1998", 109 | "1999", 110 | "2000", 111 | "2001", 112 | "2002", 113 | "2003", 114 | "2004", 115 | "2005", 116 | "2006", 117 | "2007", 118 | "2008", 119 | "2009", 120 | "2010", 121 | "2011", 122 | "2012", 123 | "2013", 124 | "2014", 125 | "2015", 126 | "2016", 127 | "2017" 128 | ] 129 | }, 130 | "series": [ 131 | { 132 | "label": "V1", 133 | "type": "float", 134 | "raw": [ 135 | 15950643462144, 136 | 19263102386176, 137 | 21860286726144, 138 | 25019327447040, 139 | 29429642297344, 140 | 32742100172800, 141 | 38026105323520, 142 | 44561476878336, 143 | 52776386166784, 144 | 61993511813120, 145 | 76539307651500, 146 | 84215883490900, 147 | 96418343539100, 148 | 117397596102100, 149 | 140090360740400, 150 | 154787118329600, 151 | 173827764691400, 152 | 193706278803100, 153 | 213306268936200, 154 | 231195355873400, 155 | 250636100000000, 156 | 268830700000000, 157 | 282582000000000, 158 | 295303900000000, 159 | 313145300000000, 160 | 333686000000000, 161 | 350344800000000, 162 | 366339100000000, 163 | 393641400000000, 164 | 421469400000000, 165 | 453608500000000, 166 | 482845400000000, 167 | 495055800000000, 168 | 495291000000000, 169 | 501537700000000, 170 | 512541700000000, 171 | 525806900000000, 172 | 534142500000000, 173 | 527876900000000, 174 | 519651800000000, 175 | 526706000000000, 176 | 523005000000000, 177 | 515986200000000, 178 | 515400700000000, 179 | 520965400000000, 180 | 524132800000000, 181 | 526879700000000, 182 | 531688200000000, 183 | 520715700000000, 184 | 489501000000000, 185 | 500353900000000, 186 | 491408500000000, 187 | 494957200000000, 188 | 503175600000000, 189 | 513876000000000, 190 | 531985800000000, 191 | 538445800000000, 
192 | 546488800000000 193 | ] 194 | } 195 | ] 196 | } 197 | -------------------------------------------------------------------------------- /datasets/gdp_argentina/gdp_argentina.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "gdp_argentina", 3 | "longname": "GDP Argentina", 4 | "n_obs": 59, 5 | "n_dim": 1, 6 | "time": { 7 | "type": "string", 8 | "format": "%Y", 9 | "index": [ 10 | 0, 11 | 1, 12 | 2, 13 | 3, 14 | 4, 15 | 5, 16 | 6, 17 | 7, 18 | 8, 19 | 9, 20 | 10, 21 | 11, 22 | 12, 23 | 13, 24 | 14, 25 | 15, 26 | 16, 27 | 17, 28 | 18, 29 | 19, 30 | 20, 31 | 21, 32 | 22, 33 | 23, 34 | 24, 35 | 25, 36 | 26, 37 | 27, 38 | 28, 39 | 29, 40 | 30, 41 | 31, 42 | 32, 43 | 33, 44 | 34, 45 | 35, 46 | 36, 47 | 37, 48 | 38, 49 | 39, 50 | 40, 51 | 41, 52 | 42, 53 | 43, 54 | 44, 55 | 45, 56 | 46, 57 | 47, 58 | 48, 59 | 49, 60 | 50, 61 | 51, 62 | 52, 63 | 53, 64 | 54, 65 | 55, 66 | 56, 67 | 57, 68 | 58 69 | ], 70 | "raw": [ 71 | "1960", 72 | "1961", 73 | "1962", 74 | "1963", 75 | "1964", 76 | "1965", 77 | "1966", 78 | "1967", 79 | "1968", 80 | "1969", 81 | "1970", 82 | "1971", 83 | "1972", 84 | "1973", 85 | "1974", 86 | "1975", 87 | "1976", 88 | "1977", 89 | "1978", 90 | "1979", 91 | "1980", 92 | "1981", 93 | "1982", 94 | "1983", 95 | "1984", 96 | "1985", 97 | "1986", 98 | "1987", 99 | "1988", 100 | "1989", 101 | "1990", 102 | "1991", 103 | "1992", 104 | "1993", 105 | "1994", 106 | "1995", 107 | "1996", 108 | "1997", 109 | "1998", 110 | "1999", 111 | "2000", 112 | "2001", 113 | "2002", 114 | "2003", 115 | "2004", 116 | "2005", 117 | "2006", 118 | "2007", 119 | "2008", 120 | "2009", 121 | "2010", 122 | "2011", 123 | "2012", 124 | "2013", 125 | "2014", 126 | "2015", 127 | "2016", 128 | "2017", 129 | "2018" 130 | ] 131 | }, 132 | "series": [ 133 | { 134 | "label": "GDP (constant LCU)", 135 | "type": "float", 136 | "raw": [ 137 | 182932009386.44, 138 | 192861271432.271, 139 | 191218051889.207, 140 | 181067821327.096, 141 | 199410530596.604, 142 | 220487093701.636, 143 | 219032482639.5, 144 | 226023992089.437, 145 | 236924000874.523, 146 | 259857121158.53, 147 | 267771442208.049, 148 | 282922301411.503, 149 | 287529454895.857, 150 | 295614075962.835, 151 | 311972780690.209, 152 | 311884142663.154, 153 | 305589536223.584, 154 | 326779566190.307, 155 | 312054471108.779, 156 | 343955061571.789, 157 | 349178995840.106, 158 | 331057342234.647, 159 | 328621888566.258, 160 | 342913961037.103, 161 | 348300243338.463, 162 | 330226858893.014, 163 | 350546962683.162, 164 | 360028798197.3, 165 | 356104724285.913, 166 | 330618488447.736, 167 | 322461423553.512, 168 | 351912181900.0, 169 | 379844477800.0, 170 | 411018234600.0, 171 | 435006083700.0, 172 | 422629248800.0, 173 | 445986656500.0, 174 | 482160842800.0, 175 | 500724897600.0, 176 | 483773071300.0, 177 | 479956106900.0, 178 | 458795611600.0, 179 | 408812193300.0, 180 | 444939093600.0, 181 | 485115195200.0, 182 | 528055942500.0, 183 | 570549404200.0, 184 | 621942502600.0, 185 | 647176159700.0, 186 | 608872876400.0, 187 | 670523679400.0, 188 | 710781597200.0, 189 | 703485989500.0, 190 | 720407105300.0, 191 | 702306046000.0, 192 | 721487146600.0, 193 | 706477848600.0, 194 | 725330848500.0, 195 | 707091754400.0 196 | ] 197 | } 198 | ] 199 | } -------------------------------------------------------------------------------- /datasets/gdp_iran/gdp_iran.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "gdp_iran", 3 | "longname": "GDP Iran", 4 | "n_obs": 58, 5 | 
"n_dim": 1, 6 | "time": { 7 | "type": "string", 8 | "format": "%Y", 9 | "index": [ 10 | 0, 11 | 1, 12 | 2, 13 | 3, 14 | 4, 15 | 5, 16 | 6, 17 | 7, 18 | 8, 19 | 9, 20 | 10, 21 | 11, 22 | 12, 23 | 13, 24 | 14, 25 | 15, 26 | 16, 27 | 17, 28 | 18, 29 | 19, 30 | 20, 31 | 21, 32 | 22, 33 | 23, 34 | 24, 35 | 25, 36 | 26, 37 | 27, 38 | 28, 39 | 29, 40 | 30, 41 | 31, 42 | 32, 43 | 33, 44 | 34, 45 | 35, 46 | 36, 47 | 37, 48 | 38, 49 | 39, 50 | 40, 51 | 41, 52 | 42, 53 | 43, 54 | 44, 55 | 45, 56 | 46, 57 | 47, 58 | 48, 59 | 49, 60 | 50, 61 | 51, 62 | 52, 63 | 53, 64 | 54, 65 | 55, 66 | 56, 67 | 57 68 | ], 69 | "raw": [ 70 | "1960", 71 | "1961", 72 | "1962", 73 | "1963", 74 | "1964", 75 | "1965", 76 | "1966", 77 | "1967", 78 | "1968", 79 | "1969", 80 | "1970", 81 | "1971", 82 | "1972", 83 | "1973", 84 | "1974", 85 | "1975", 86 | "1976", 87 | "1977", 88 | "1978", 89 | "1979", 90 | "1980", 91 | "1981", 92 | "1982", 93 | "1983", 94 | "1984", 95 | "1985", 96 | "1986", 97 | "1987", 98 | "1988", 99 | "1989", 100 | "1990", 101 | "1991", 102 | "1992", 103 | "1993", 104 | "1994", 105 | "1995", 106 | "1996", 107 | "1997", 108 | "1998", 109 | "1999", 110 | "2000", 111 | "2001", 112 | "2002", 113 | "2003", 114 | "2004", 115 | "2005", 116 | "2006", 117 | "2007", 118 | "2008", 119 | "2009", 120 | "2010", 121 | "2011", 122 | "2012", 123 | "2013", 124 | "2014", 125 | "2015", 126 | "2016", 127 | "2017" 128 | ] 129 | }, 130 | "series": [ 131 | { 132 | "label": "GDP (constant LCU)", 133 | "type": "float", 134 | "raw": [ 135 | 835372833322822.0, 136 | 931253594724320.0, 137 | 1011252310515580.0, 138 | 1084815415588060.0, 139 | 1184552678329550.0, 140 | 1388505087423820.0, 141 | 1544572336125510.0, 142 | 1724250326139720.0, 143 | 1979578637059070.0, 144 | 2302920188960200.0, 145 | 2559064529414210.0, 146 | 2928086546857670.0, 147 | 3350180391061100.0, 148 | 3637066434078650.0, 149 | 3873361335537510.0, 150 | 3779651859177620.0, 151 | 4431023105415290.0, 152 | 4250906946640460.0, 153 | 3653562255307690.0, 154 | 3269427305222360.0, 155 | 2369469274184260.0, 156 | 2246824634803560.0, 157 | 2858194886307970.0, 158 | 3102928534616680.0, 159 | 2830330279459450.0, 160 | 2891245015552970.0, 161 | 2596157639741250.0, 162 | 2612994168295680.0, 163 | 2471991485967540.0, 164 | 2620826986239520.0, 165 | 2983386426044710.0, 166 | 3352830570382000.0, 167 | 3447874188131500.0, 168 | 3483732006730800.0, 169 | 3431072340489400.0, 170 | 3509392093199400.0, 171 | 3690942894824200.0, 172 | 3708734969540800.0, 173 | 3789504657647000.0, 174 | 3821922301414700.0, 175 | 4045813055230300.0, 176 | 4077315322748900.0, 177 | 4373554752266400.0, 178 | 4755566584910500.0, 179 | 4963599286320800.0, 180 | 5121928379232500.0, 181 | 5378014311650700.0, 182 | 5816632978975000.0, 183 | 5831224383995400.0, 184 | 5889967290457600.0, 185 | 6231463959948300.0, 186 | 6396330918494400.0, 187 | 5920152415454900.0, 188 | 5908662970172800.0, 189 | 6180663476890700.0, 190 | 6099038846316000.0, 191 | 6916081000000000.0, 192 | 7175792800000000.0 193 | ] 194 | } 195 | ] 196 | } -------------------------------------------------------------------------------- /schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "definitions": {}, 3 | "$schema": "http://json-schema.org/draft-07/schema#", 4 | "$id": "http://example.com/root.json", 5 | "type": "object", 6 | "title": "Dataset Schema", 7 | "default": null, 8 | "required": [ 9 | "name", 10 | "n_obs", 11 | "n_dim", 12 | "time", 13 | "series" 14 | ], 15 | "properties": { 16 | 
"name": { 17 | "$id": "#/properties/name", 18 | "type": "string", 19 | "title": "The Name Schema", 20 | "default": "", 21 | "pattern": "^[a-z0-9\\_]+$" 22 | }, 23 | "longname": { 24 | "$id": "#/properties/longname", 25 | "type": "string", 26 | "title": "The Longname Schema", 27 | "default": "", 28 | "pattern": "^(.+)$" 29 | }, 30 | "n_obs": { 31 | "$id": "#/properties/n_obs", 32 | "type": "integer", 33 | "title": "The N_obs Schema", 34 | "default": 0 35 | }, 36 | "n_dim": { 37 | "$id": "#/properties/n_dim", 38 | "type": "integer", 39 | "title": "The N_dim Schema", 40 | "default": 0 41 | }, 42 | "demo": { 43 | "$id": "#/properties/demo", 44 | "type": "object", 45 | "title": "The Demo Schema", 46 | "properties": { 47 | "true_CPs": { 48 | "$id": "#/properties/demo/properties/true_CPs", 49 | "type": "array", 50 | "items": { 51 | "$id": "#/properties/demo/properties/true_CPs/items", 52 | "type": "integer", 53 | "title": "The Items Schema", 54 | "default": null 55 | } 56 | } 57 | } 58 | }, 59 | "time": { 60 | "$id": "#/properties/time", 61 | "type": "object", 62 | "title": "The Time Schema", 63 | "default": null, 64 | "required": [ 65 | "index" 66 | ], 67 | "properties": { 68 | "format": { 69 | "$id": "#/properties/time/properties/format", 70 | "type": "string", 71 | "title": "The Format Schema", 72 | "default": "", 73 | "pattern": "^(.*)$" 74 | }, 75 | "index": { 76 | "$id": "#/properties/time/properties/index", 77 | "type": "array", 78 | "title": "Integer index of the series, starting from 0.", 79 | "items": { 80 | "$id": "#/properties/time/properties/index/items", 81 | "type": "integer", 82 | "title": "The index items schema", 83 | "default": null 84 | } 85 | }, 86 | "raw": { 87 | "$id": "#/properties/time/properties/raw", 88 | "type": "array", 89 | "title": "The Raw Schema", 90 | "items": { 91 | "$id": "#/properties/time/properties/raw/items", 92 | "type": "string", 93 | "title": "The Items Schema", 94 | "default": "" 95 | } 96 | } 97 | } 98 | }, 99 | "series": { 100 | "$id": "#/properties/series", 101 | "type": "array", 102 | "title": "The Series Schema", 103 | "items": { 104 | "$id": "#/properties/series/items", 105 | "type": "object", 106 | "title": "The Variable Schema", 107 | "default": null, 108 | "properties": { 109 | "label": { 110 | "$id": "#/properties/series/items/properties/label", 111 | "type": "string", 112 | "title": "The Label Schema", 113 | "default": "", 114 | "pattern": "^(.+)$" 115 | }, 116 | "type": { 117 | "$id": "#/properties/series/items/properties/type", 118 | "type": "string", 119 | "title": "The Type Schema", 120 | "default": "", 121 | "pattern": "^(.+)$" 122 | }, 123 | "raw": { 124 | "$id": "#/properties/series/items/properties/raw", 125 | "type": "array", 126 | "title": "The Raw Schema", 127 | "items": { 128 | "$id": "#/properties/series/items/properties/raw/items", 129 | "title": "The Items Schema", 130 | "default": 0 131 | } 132 | } 133 | }, 134 | "required": [ 135 | "type", 136 | "raw" 137 | ] 138 | } 139 | } 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /datasets/ozone/ozone-depleting-substance-emissions.csv: -------------------------------------------------------------------------------- 1 | Entity,Code,Year,Ozone-depleting substance emissions (Scientific Assessment 2014) (tonnes CFC11-equivalents) 2 | Natural emissions,,1961,165000 3 | Natural emissions,,1962,165000 4 | Natural emissions,,1963,165000 5 | Natural emissions,,1964,165000 6 | Natural emissions,,1965,165000 7 | Natural 
emissions,,1966,165000 8 | Natural emissions,,1967,165000 9 | Natural emissions,,1968,165000 10 | Natural emissions,,1969,165000 11 | Natural emissions,,1970,165000 12 | Natural emissions,,1971,165000 13 | Natural emissions,,1972,165000 14 | Natural emissions,,1973,165000 15 | Natural emissions,,1974,165000 16 | Natural emissions,,1975,165000 17 | Natural emissions,,1976,165000 18 | Natural emissions,,1977,165000 19 | Natural emissions,,1978,165000 20 | Natural emissions,,1979,165000 21 | Natural emissions,,1980,165000 22 | Natural emissions,,1981,165000 23 | Natural emissions,,1982,165000 24 | Natural emissions,,1983,165000 25 | Natural emissions,,1984,165000 26 | Natural emissions,,1985,165000 27 | Natural emissions,,1986,165000 28 | Natural emissions,,1987,165000 29 | Natural emissions,,1988,165000 30 | Natural emissions,,1989,165000 31 | Natural emissions,,1990,165000 32 | Natural emissions,,1991,165000 33 | Natural emissions,,1992,165000 34 | Natural emissions,,1993,165000 35 | Natural emissions,,1994,165000 36 | Natural emissions,,1995,165000 37 | Natural emissions,,1996,165000 38 | Natural emissions,,1997,165000 39 | Natural emissions,,1998,165000 40 | Natural emissions,,1999,165000 41 | Natural emissions,,2000,165000 42 | Natural emissions,,2001,165000 43 | Natural emissions,,2002,165000 44 | Natural emissions,,2003,165000 45 | Natural emissions,,2004,165000 46 | Natural emissions,,2005,165000 47 | Natural emissions,,2006,165000 48 | Natural emissions,,2007,165000 49 | Natural emissions,,2008,165000 50 | Natural emissions,,2009,165000 51 | Natural emissions,,2010,165000 52 | Natural emissions,,2011,165000 53 | Natural emissions,,2012,165000 54 | Natural emissions,,2013,165000 55 | Natural emissions,,2014,165000 56 | Total emissions,,1961,380000 57 | Total emissions,,1962,400000 58 | Total emissions,,1963,440000 59 | Total emissions,,1964,480000 60 | Total emissions,,1965,510000 61 | Total emissions,,1966,540000 62 | Total emissions,,1967,580000 63 | Total emissions,,1968,630000 64 | Total emissions,,1969,660000 65 | Total emissions,,1970,720000 66 | Total emissions,,1971,770000 67 | Total emissions,,1972,840000 68 | Total emissions,,1973,910000 69 | Total emissions,,1974,980000 70 | Total emissions,,1975,1040000 71 | Total emissions,,1976,1050000 72 | Total emissions,,1977,1070000 73 | Total emissions,,1978,1070000 74 | Total emissions,,1979,1110000 75 | Total emissions,,1980,1080000 76 | Total emissions,,1981,1040000 77 | Total emissions,,1982,1100000 78 | Total emissions,,1983,1090000 79 | Total emissions,,1984,1150000 80 | Total emissions,,1985,1180000 81 | Total emissions,,1986,1280000 82 | Total emissions,,1987,1360000 83 | Total emissions,,1988,1460000 84 | Total emissions,,1989,1410000 85 | Total emissions,,1990,1320000 86 | Total emissions,,1991,1190000 87 | Total emissions,,1992,1080000 88 | Total emissions,,1993,960000 89 | Total emissions,,1994,820000 90 | Total emissions,,1995,760000 91 | Total emissions,,1996,700000 92 | Total emissions,,1997,640000 93 | Total emissions,,1998,600000 94 | Total emissions,,1999,590000 95 | Total emissions,,2000,560000 96 | Total emissions,,2001,530000 97 | Total emissions,,2002,490000 98 | Total emissions,,2003,480000 99 | Total emissions,,2004,470000 100 | Total emissions,,2005,450000 101 | Total emissions,,2006,430000 102 | Total emissions,,2007,410000 103 | Total emissions,,2008,400000 104 | Total emissions,,2009,390000 105 | Total emissions,,2010,380000 106 | Total emissions,,2011,370000 107 | Total emissions,,2012,350000 108 | Total 
emissions,,2013,340000 109 | Total emissions,,2014,320000 -------------------------------------------------------------------------------- /utils/plot_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Utility script to plot datasets and annotations. 6 | 7 | Author: G.J.J. van den Burg 8 | Copyright (c) 2020 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | import argparse 14 | import datetime 15 | import json 16 | import matplotlib.pyplot as plt 17 | import pandas as pd 18 | 19 | 20 | def parse_args(): 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument( 23 | "-r", 24 | "--result-file", 25 | help="JSON file with results from a change point detection method", 26 | ) 27 | parser.add_argument( 28 | "-o", "--output-file", help="Output file to save the figure to" 29 | ) 30 | parser.add_argument("input", help="Input dataset file (in JSON format)") 31 | return parser.parse_args() 32 | 33 | 34 | def frac_to_dt(number): 35 | number = float(number) 36 | year = int(float(number)) 37 | remainder = number - year 38 | begin = datetime.datetime(year, 1, 1) 39 | end = datetime.datetime(year + 1, 1, 1) 40 | seconds = remainder * (end - begin).total_seconds() 41 | return begin + datetime.timedelta(seconds=seconds) 42 | 43 | 44 | def load_data(filename): 45 | with open(filename, "rb") as fid: 46 | data = json.load(fid) 47 | title = data["name"] 48 | y = data["series"][0]["raw"] 49 | if "time" in data and "format" in data["time"]: 50 | fmt = data["time"]["format"] 51 | if fmt == "%Y.%F": 52 | x = list(map(frac_to_dt, data["time"]["raw"])) 53 | else: 54 | try: 55 | x = pd.to_datetime( 56 | data["time"]["raw"], format=data["time"]["format"] 57 | ) 58 | except ValueError: 59 | x = list(range(1, len(y) + 1)) 60 | else: 61 | x = list(range(1, len(y) + 1)) 62 | as_dict = {"x": x} 63 | for idx, series in enumerate(data["series"]): 64 | as_dict["y" + str(idx)] = series["raw"] 65 | 66 | df = pd.DataFrame(as_dict) 67 | return df, title 68 | 69 | 70 | def load_result(filename): 71 | with open(filename, "r") as fp: 72 | data = json.load(fp) 73 | if not data["status"] == "SUCCESS": 74 | print("Detection wasn't successful.") 75 | return None 76 | return data["result"]["cplocations"] 77 | 78 | 79 | def main(): 80 | args = parse_args() 81 | df, title = load_data(args.input) 82 | 83 | results = None 84 | if args.result_file: 85 | results = load_result(args.result_file) 86 | 87 | has_date = False 88 | try: 89 | _ = df["x"].dt 90 | has_date = True 91 | except AttributeError: 92 | pass 93 | 94 | fig, axes = plt.subplots(df.shape[1] - 1, 1, squeeze=False) 95 | for idx, col in enumerate(df.columns[1:]): 96 | if has_date: 97 | axes[idx, 0].plot_date(df["x"], df[col], ".", color="tab:blue") 98 | axes[idx, 0].plot_date(df["x"], df[col], "-", color="tab:blue") 99 | if results: 100 | for loc in results: 101 | if loc == 0: 102 | continue 103 | if loc == df.shape[0]: 104 | continue 105 | pos = df["x"].values[loc] 106 | axes[idx, 0].axvline(x=pos, linestyle="--", color="red") 107 | else: 108 | axes[idx, 0].scatter(df["x"], df[col], color="tab:blue") 109 | axes[idx, 0].plot(df["x"], df[col], color="tab:blue") 110 | if results: 111 | for loc in results: 112 | if loc == 0: 113 | continue 114 | if loc == df.shape[0]: 115 | continue 116 | pos = df["x"].values[loc] 117 | axes[idx, 0].axvline(x=pos, linestyle="--", color="red") 118 | fig.suptitle(title) 119 | if args.output_file: 120 | 
plt.savefig(args.output_file, transparent=True) 121 | else: 122 | plt.show() 123 | 124 | 125 | if __name__ == "__main__": 126 | main() 127 | -------------------------------------------------------------------------------- /datasets/nile/nile.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "nile", 3 | "longname": "Nile Volume at Aswan", 4 | "n_obs": 100, 5 | "n_dim": 1, 6 | "time": { 7 | "type": "string", 8 | "format": "%Y", 9 | "index": [ 10 | 0, 11 | 1, 12 | 2, 13 | 3, 14 | 4, 15 | 5, 16 | 6, 17 | 7, 18 | 8, 19 | 9, 20 | 10, 21 | 11, 22 | 12, 23 | 13, 24 | 14, 25 | 15, 26 | 16, 27 | 17, 28 | 18, 29 | 19, 30 | 20, 31 | 21, 32 | 22, 33 | 23, 34 | 24, 35 | 25, 36 | 26, 37 | 27, 38 | 28, 39 | 29, 40 | 30, 41 | 31, 42 | 32, 43 | 33, 44 | 34, 45 | 35, 46 | 36, 47 | 37, 48 | 38, 49 | 39, 50 | 40, 51 | 41, 52 | 42, 53 | 43, 54 | 44, 55 | 45, 56 | 46, 57 | 47, 58 | 48, 59 | 49, 60 | 50, 61 | 51, 62 | 52, 63 | 53, 64 | 54, 65 | 55, 66 | 56, 67 | 57, 68 | 58, 69 | 59, 70 | 60, 71 | 61, 72 | 62, 73 | 63, 74 | 64, 75 | 65, 76 | 66, 77 | 67, 78 | 68, 79 | 69, 80 | 70, 81 | 71, 82 | 72, 83 | 73, 84 | 74, 85 | 75, 86 | 76, 87 | 77, 88 | 78, 89 | 79, 90 | 80, 91 | 81, 92 | 82, 93 | 83, 94 | 84, 95 | 85, 96 | 86, 97 | 87, 98 | 88, 99 | 89, 100 | 90, 101 | 91, 102 | 92, 103 | 93, 104 | 94, 105 | 95, 106 | 96, 107 | 97, 108 | 98, 109 | 99 110 | ], 111 | "raw": [ 112 | "1871", 113 | "1872", 114 | "1873", 115 | "1874", 116 | "1875", 117 | "1876", 118 | "1877", 119 | "1878", 120 | "1879", 121 | "1880", 122 | "1881", 123 | "1882", 124 | "1883", 125 | "1884", 126 | "1885", 127 | "1886", 128 | "1887", 129 | "1888", 130 | "1889", 131 | "1890", 132 | "1891", 133 | "1892", 134 | "1893", 135 | "1894", 136 | "1895", 137 | "1896", 138 | "1897", 139 | "1898", 140 | "1899", 141 | "1900", 142 | "1901", 143 | "1902", 144 | "1903", 145 | "1904", 146 | "1905", 147 | "1906", 148 | "1907", 149 | "1908", 150 | "1909", 151 | "1910", 152 | "1911", 153 | "1912", 154 | "1913", 155 | "1914", 156 | "1915", 157 | "1916", 158 | "1917", 159 | "1918", 160 | "1919", 161 | "1920", 162 | "1921", 163 | "1922", 164 | "1923", 165 | "1924", 166 | "1925", 167 | "1926", 168 | "1927", 169 | "1928", 170 | "1929", 171 | "1930", 172 | "1931", 173 | "1932", 174 | "1933", 175 | "1934", 176 | "1935", 177 | "1936", 178 | "1937", 179 | "1938", 180 | "1939", 181 | "1940", 182 | "1941", 183 | "1942", 184 | "1943", 185 | "1944", 186 | "1945", 187 | "1946", 188 | "1947", 189 | "1948", 190 | "1949", 191 | "1950", 192 | "1951", 193 | "1952", 194 | "1953", 195 | "1954", 196 | "1955", 197 | "1956", 198 | "1957", 199 | "1958", 200 | "1959", 201 | "1960", 202 | "1961", 203 | "1962", 204 | "1963", 205 | "1964", 206 | "1965", 207 | "1966", 208 | "1967", 209 | "1968", 210 | "1969", 211 | "1970" 212 | ] 213 | }, 214 | "series": [ 215 | { 216 | "label": "Volume at Aswan", 217 | "type": "int", 218 | "raw": [ 219 | 1120, 220 | 1160, 221 | 963, 222 | 1210, 223 | 1160, 224 | 1160, 225 | 813, 226 | 1230, 227 | 1370, 228 | 1140, 229 | 995, 230 | 935, 231 | 1110, 232 | 994, 233 | 1020, 234 | 960, 235 | 1180, 236 | 799, 237 | 958, 238 | 1140, 239 | 1100, 240 | 1210, 241 | 1150, 242 | 1250, 243 | 1260, 244 | 1220, 245 | 1030, 246 | 1100, 247 | 774, 248 | 840, 249 | 874, 250 | 694, 251 | 940, 252 | 833, 253 | 701, 254 | 916, 255 | 692, 256 | 1020, 257 | 1050, 258 | 969, 259 | 831, 260 | 726, 261 | 456, 262 | 824, 263 | 702, 264 | 1120, 265 | 1100, 266 | 832, 267 | 764, 268 | 821, 269 | 768, 270 | 845, 271 | 864, 272 | 862, 
273 | 698, 274 | 845, 275 | 744, 276 | 796, 277 | 1040, 278 | 759, 279 | 781, 280 | 865, 281 | 845, 282 | 944, 283 | 984, 284 | 897, 285 | 822, 286 | 1010, 287 | 771, 288 | 676, 289 | 649, 290 | 846, 291 | 812, 292 | 742, 293 | 801, 294 | 1040, 295 | 860, 296 | 874, 297 | 848, 298 | 890, 299 | 744, 300 | 749, 301 | 838, 302 | 1050, 303 | 918, 304 | 986, 305 | 797, 306 | 923, 307 | 975, 308 | 815, 309 | 1020, 310 | 906, 311 | 901, 312 | 1170, 313 | 912, 314 | 746, 315 | 919, 316 | 718, 317 | 714, 318 | 740 319 | ] 320 | } 321 | ] 322 | } 323 | -------------------------------------------------------------------------------- /datasets/scanline_42049/get_scanline_42049.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Collect the scanline_42049 dataset. 6 | 7 | See the README file for more information. 8 | 9 | Author: Gertjan van den Burg 10 | License: This file is part of TCPD, see the top-level LICENSE file. 11 | Copyright: 2019, The Alan Turing Institute 12 | 13 | """ 14 | 15 | import argparse 16 | import hashlib 17 | import os 18 | import numpy as np 19 | import json 20 | import sys 21 | import time 22 | 23 | from PIL import Image 24 | from functools import wraps 25 | from urllib.request import urlretrieve 26 | from urllib.error import URLError 27 | 28 | IMG_URL = "https://web.archive.org/web/20070611230044im_/http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/segbench/BSDS300/html/images/plain/normal/gray/42049.jpg" 29 | 30 | MD5_IMG = "75a3d395b4f3f506abb9edadacaa4d55" 31 | MD5_JSON = "39921dfa959576bd0b3d6c95558f17f4" 32 | 33 | NAME_IMG = "42049.jpg" 34 | NAME_JSON = "scanline_42049.json" 35 | 36 | 37 | class ValidationError(Exception): 38 | def __init__(self, filename): 39 | super().__init__( 40 | "Validating the file '%s' failed. \n" 41 | "Please raise an issue on the GitHub page for this project \n" 42 | "if the error persists." % filename 43 | ) 44 | 45 | 46 | def check_md5sum(filename, checksum): 47 | with open(filename, "rb") as fp: 48 | data = fp.read() 49 | h = hashlib.md5(data).hexdigest() 50 | return h == checksum 51 | 52 | 53 | def validate(checksum): 54 | """Decorator that validates the target file.""" 55 | 56 | def validate_decorator(func): 57 | @wraps(func) 58 | def wrapper(*args, **kwargs): 59 | target = kwargs.get("target_path", None) 60 | if os.path.exists(target) and check_md5sum(target, checksum): 61 | return 62 | out = func(*args, **kwargs) 63 | if not os.path.exists(target): 64 | raise FileNotFoundError("Target file expected at: %s" % target) 65 | if not check_md5sum(target, checksum): 66 | raise ValidationError(target) 67 | return out 68 | 69 | return wrapper 70 | 71 | return validate_decorator 72 | 73 | 74 | @validate(MD5_IMG) 75 | def download_img(target_path=None): 76 | count = 0 77 | while count < 5: 78 | count += 1 79 | try: 80 | urlretrieve(IMG_URL, target_path) 81 | return 82 | except URLError as err: 83 | print( 84 | "Error occurred (%r) when trying to download img. 
Retrying in 5 seconds" 85 | % err, 86 | sys.stderr, 87 | ) 88 | time.sleep(5) 89 | 90 | 91 | @validate(MD5_JSON) 92 | def write_json(img_path, target_path=None): 93 | name = "scanline_42049" 94 | longname = "Scanline 42049" 95 | index = 170 96 | 97 | im = Image.open(img_path) 98 | arr = np.array(im) 99 | line = list(map(int, list(arr[index, :]))) 100 | 101 | series = [{"label": "Line %s" % index, "type": "int", "raw": line}] 102 | 103 | data = { 104 | "name": name, 105 | "longname": longname, 106 | "n_obs": len(line), 107 | "n_dim": len(series), 108 | "time": {"index": list(range(len(line)))}, 109 | "series": series, 110 | } 111 | 112 | with open(target_path, "w") as fp: 113 | json.dump(data, fp, indent="\t") 114 | 115 | 116 | def collect(output_dir="."): 117 | img_path = os.path.join(output_dir, NAME_IMG) 118 | json_path = os.path.join(output_dir, NAME_JSON) 119 | 120 | download_img(target_path=img_path) 121 | write_json(img_path, target_path=json_path) 122 | 123 | 124 | def clean(output_dir="."): 125 | img_path = os.path.join(output_dir, NAME_IMG) 126 | json_path = os.path.join(output_dir, NAME_JSON) 127 | 128 | if os.path.exists(img_path): 129 | os.unlink(img_path) 130 | if os.path.exists(json_path): 131 | os.unlink(json_path) 132 | 133 | 134 | def parse_args(): 135 | parser = argparse.ArgumentParser() 136 | parser.add_argument( 137 | "-o", "--output-dir", help="output directory to use", default="." 138 | ) 139 | parser.add_argument( 140 | "action", 141 | choices=["collect", "clean"], 142 | help="Action to perform", 143 | default="collect", 144 | nargs="?", 145 | ) 146 | return parser.parse_args() 147 | 148 | 149 | def main(output_dir="."): 150 | args = parse_args() 151 | if args.action == "collect": 152 | collect(output_dir=args.output_dir) 153 | elif args.action == "clean": 154 | clean(output_dir=args.output_dir) 155 | 156 | 157 | if __name__ == "__main__": 158 | main() 159 | -------------------------------------------------------------------------------- /datasets/scanline_126007/get_scanline_126007.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Collect the scanline_126007 dataset. 6 | 7 | See the README file for more information. 8 | 9 | Author: Gertjan van den Burg 10 | License: This file is part of TCPD, see the top-level LICENSE file. 11 | Copyright: 2019, The Alan Turing Institute 12 | 13 | """ 14 | 15 | import argparse 16 | import hashlib 17 | import os 18 | import numpy as np 19 | import json 20 | import sys 21 | import time 22 | 23 | from PIL import Image 24 | from functools import wraps 25 | from urllib.request import urlretrieve 26 | from urllib.error import URLError 27 | 28 | IMG_URL = "https://web.archive.org/web/20070611200633im_/http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/segbench/BSDS300/html/images/plain/normal/gray/126007.jpg" 29 | 30 | MD5_IMG = "0ca6db4848b6d319d94a37e697930fb4" 31 | MD5_JSON = "057d5741b623308af00c42e2c8e525c3" 32 | 33 | NAME_IMG = "126007.jpg" 34 | NAME_JSON = "scanline_126007.json" 35 | 36 | 37 | class ValidationError(Exception): 38 | def __init__(self, filename): 39 | self.message = ( 40 | "Validating the file '%s' failed. \n" 41 | "Please raise an issue on the GitHub page for this project \n" 42 | "if the error persists." 
% filename 43 | ) 44 | 45 | 46 | def check_md5sum(filename, checksum): 47 | with open(filename, "rb") as fp: 48 | data = fp.read() 49 | h = hashlib.md5(data).hexdigest() 50 | return h == checksum 51 | 52 | 53 | def validate(checksum): 54 | """Decorator that validates the target file.""" 55 | 56 | def validate_decorator(func): 57 | @wraps(func) 58 | def wrapper(*args, **kwargs): 59 | target = kwargs.get("target_path", None) 60 | if os.path.exists(target) and check_md5sum(target, checksum): 61 | return 62 | out = func(*args, **kwargs) 63 | if not os.path.exists(target): 64 | raise FileNotFoundError("Target file expected at: %s" % target) 65 | if not check_md5sum(target, checksum): 66 | raise ValidationError(target) 67 | return out 68 | 69 | return wrapper 70 | 71 | return validate_decorator 72 | 73 | 74 | @validate(MD5_IMG) 75 | def download_img(target_path=None): 76 | count = 0 77 | while count < 5: 78 | count += 1 79 | try: 80 | urlretrieve(IMG_URL, target_path) 81 | return 82 | except URLError as err: 83 | print( 84 | "Error occurred (%r) when trying to download img. Retrying in 5 seconds" 85 | % err, 86 | file=sys.stderr, 87 | ) 88 | time.sleep(5) 89 | 90 | 91 | 92 | @validate(MD5_JSON) 93 | def write_json(img_path, target_path=None): 94 | name = "scanline_126007" 95 | longname = "Scanline 126007" 96 | index = 200 97 | 98 | im = Image.open(img_path) 99 | arr = np.array(im) 100 | line = list(map(int, list(arr[index, :]))) 101 | 102 | series = [{"label": "Line %s" % index, "type": "int", "raw": line}] 103 | 104 | data = { 105 | "name": name, 106 | "longname": longname, 107 | "n_obs": len(line), 108 | "n_dim": len(series), 109 | "time": {"index": list(range(len(line)))}, 110 | "series": series, 111 | } 112 | 113 | with open(target_path, "w") as fp: 114 | json.dump(data, fp, indent="\t") 115 | 116 | 117 | def collect(output_dir="."): 118 | img_path = os.path.join(output_dir, NAME_IMG) 119 | json_path = os.path.join(output_dir, NAME_JSON) 120 | 121 | download_img(target_path=img_path) 122 | write_json(img_path, target_path=json_path) 123 | 124 | 125 | def clean(output_dir="."): 126 | img_path = os.path.join(output_dir, NAME_IMG) 127 | json_path = os.path.join(output_dir, NAME_JSON) 128 | 129 | if os.path.exists(img_path): 130 | os.unlink(img_path) 131 | if os.path.exists(json_path): 132 | os.unlink(json_path) 133 | 134 | 135 | def parse_args(): 136 | parser = argparse.ArgumentParser() 137 | parser.add_argument( 138 | "-o", "--output-dir", help="output directory to use", default="." 139 | ) 140 | parser.add_argument( 141 | "action", 142 | choices=["collect", "clean"], 143 | help="Action to perform", 144 | default="collect", 145 | nargs="?", 146 | ) 147 | return parser.parse_args() 148 | 149 | 150 | def main(output_dir="."): 151 | args = parse_args() 152 | if args.action == "collect": 153 | collect(output_dir=args.output_dir) 154 | elif args.action == "clean": 155 | clean(output_dir=args.output_dir) 156 | 157 | 158 | if __name__ == "__main__": 159 | main() 160 | -------------------------------------------------------------------------------- /datasets/measles/get_measles.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Collect the measles dataset. 6 | 7 | See the README file for more information. 8 | 9 | Author: G.J.J. van den Burg 10 | License: This file is part of TCPD, see the top-level LICENSE file. 
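Note: the time values from the source file are written out as-is, with time format "%Y-%F" (year with fractional part); see write_json below.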
11 | Copyright: 2019, The Alan Turing Institute 12 | 13 | """ 14 | 15 | import argparse 16 | import clevercsv 17 | import hashlib 18 | import json 19 | import os 20 | import sys 21 | import time 22 | 23 | from functools import wraps 24 | from urllib.request import urlretrieve 25 | from urllib.error import URLError 26 | 27 | DAT_URL = "https://web.archive.org/web/20191128124615if_/https://ms.mcmaster.ca/~bolker/measdata/ewmeas.dat" 28 | 29 | MD5_DAT = "143d1dacd791df963674468c8b005bf9" 30 | MD5_JSON = "e42afd03be893fc7deb98514c94fa4c7" 31 | 32 | NAME_DAT = "ewmeas.dat" 33 | NAME_JSON = "measles.json" 34 | 35 | 36 | class ValidationError(Exception): 37 | def __init__(self, filename): 38 | message = ( 39 | "Validating the file '%s' failed. \n" 40 | "Please raise an issue on the GitHub page for this project " 41 | "if the error persists." % filename 42 | ) 43 | super().__init__(message) 44 | 45 | 46 | def check_md5sum(filename, checksum): 47 | with open(filename, "rb") as fp: 48 | data = fp.read() 49 | h = hashlib.md5(data).hexdigest() 50 | return h == checksum 51 | 52 | 53 | def validate(checksum): 54 | """Decorator that validates the target file.""" 55 | 56 | def validate_decorator(func): 57 | @wraps(func) 58 | def wrapper(*args, **kwargs): 59 | target = kwargs.get("target_path", None) 60 | if os.path.exists(target) and check_md5sum(target, checksum): 61 | return 62 | out = func(*args, **kwargs) 63 | if not os.path.exists(target): 64 | raise FileNotFoundError("Target file expected at: %s" % target) 65 | if not check_md5sum(target, checksum): 66 | raise ValidationError(target) 67 | return out 68 | 69 | return wrapper 70 | 71 | return validate_decorator 72 | 73 | 74 | @validate(MD5_DAT) 75 | def download_zip(target_path=None): 76 | count = 0 77 | while count < 5: 78 | count += 1 79 | try: 80 | urlretrieve(DAT_URL, target_path) 81 | return 82 | except URLError as err: 83 | print( 84 | "Error occurred (%r) when trying to download zip. 
Retrying in 5 seconds" 85 | % err, 86 | file=sys.stderr, 87 | ) 88 | time.sleep(5) 89 | 90 | 91 | 92 | @validate(MD5_JSON) 93 | def write_json(dat_path, target_path=None): 94 | with open(dat_path, "r", newline="", encoding="ascii") as fp: 95 | reader = clevercsv.reader( 96 | fp, delimiter=" ", quotechar="", escapechar="" 97 | ) 98 | rows = list(reader) 99 | 100 | as_dicts = {t: int(x) for t, x in rows}  # rows are (time, count) pairs 101 | 102 | time = sorted(as_dicts.keys()) 103 | values = [as_dicts[t] for t in time] 104 | series = [{"label": "V1", "type": "int", "raw": values}] 105 | 106 | data = { 107 | "name": "measles", 108 | "longname": "Measles cases (England & Wales)", 109 | "n_obs": len(time), 110 | "n_dim": len(series), 111 | "time": { 112 | "type": "string", 113 | "format": "%Y-%F", 114 | "index": list(range(len(time))), 115 | "raw": time, 116 | }, 117 | "series": series, 118 | } 119 | 120 | with open(target_path, "w") as fp: 121 | json.dump(data, fp, indent="\t") 122 | 123 | 124 | def collect(output_dir="."): 125 | dat_path = os.path.join(output_dir, NAME_DAT) 126 | json_path = os.path.join(output_dir, NAME_JSON) 127 | 128 | download_zip(target_path=dat_path) 129 | write_json(dat_path, target_path=json_path) 130 | 131 | 132 | def clean(output_dir="."): 133 | dat_path = os.path.join(output_dir, NAME_DAT) 134 | json_path = os.path.join(output_dir, NAME_JSON) 135 | 136 | if os.path.exists(dat_path): 137 | os.unlink(dat_path) 138 | if os.path.exists(json_path): 139 | os.unlink(json_path) 140 | 141 | 142 | def parse_args(): 143 | parser = argparse.ArgumentParser() 144 | parser.add_argument( 145 | "-o", "--output-dir", help="output directory to use", default="." 146 | ) 147 | parser.add_argument( 148 | "action", 149 | choices=["collect", "clean"], 150 | help="Action to perform", 151 | default="collect", 152 | nargs="?", 153 | ) 154 | return parser.parse_args() 155 | 156 | 157 | def main(output_dir="."): 158 | args = parse_args() 159 | if args.action == "collect": 160 | collect(output_dir=args.output_dir) 161 | elif args.action == "clean": 162 | clean(output_dir=args.output_dir) 163 | 164 | 165 | if __name__ == "__main__": 166 | main() 167 |
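Each of these get_*.py collection scripts is also importable, which is useful because main() takes an output_dir argument that it never uses (it re-parses sys.argv); programmatic callers should call collect() and clean() directly. A minimal sketch, assuming the working directory is datasets/measles/ so that get_measles.py is importable:

# Sketch only: drive the collection pipeline from Python instead of the CLI.
import get_measles

get_measles.collect(output_dir=".")  # no-op when both MD5 checksums already match
get_measles.clean(output_dir=".")    # removes ewmeas.dat and measles.json again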
-------------------------------------------------------------------------------- /datasets/bitcoin/get_bitcoin.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Retrieve the bitcoin dataset. 6 | 7 | See the README file for more information. 8 | 9 | Author: G.J.J. van den Burg 10 | License: This file is part of TCPD, see the top-level LICENSE file. 11 | Copyright: 2019, The Alan Turing Institute 12 | 13 | """ 14 | 15 | import argparse 16 | import clevercsv 17 | import hashlib 18 | import json 19 | import os 20 | import sys 21 | import time 22 | 23 | from functools import wraps 24 | from urllib.request import urlretrieve 25 | from urllib.error import URLError 26 | 27 | CSV_URL = "https://web.archive.org/web/20191114131838if_/https://api.blockchain.info/charts/market-price?timespan=all&format=csv" 28 | 29 | MD5_CSV = "9bd4f7b06d78347415f6aafe1d9eb680" 30 | MD5_JSON = "f90ff14ed1fc0c3d47d4394d25cbce93" 31 | 32 | NAME_CSV = "market-price.csv" 33 | NAME_JSON = "bitcoin.json" 34 | 35 | 36 | class ValidationError(Exception): 37 | def __init__(self, filename): 38 | message = ( 39 | "Validating the file '%s' failed. \n" 40 | "Please raise an issue on the GitHub page for this project " 41 | "if the error persists." % filename 42 | ) 43 | super().__init__(message) 44 | 45 | 46 | def check_md5sum(filename, checksum): 47 | with open(filename, "rb") as fp: 48 | data = fp.read() 49 | h = hashlib.md5(data).hexdigest() 50 | return h == checksum 51 | 52 | 53 | def validate(checksum): 54 | """Decorator that validates the target file.""" 55 | 56 | def validate_decorator(func): 57 | @wraps(func) 58 | def wrapper(*args, **kwargs): 59 | target = kwargs.get("target_path", None) 60 | if os.path.exists(target) and check_md5sum(target, checksum): 61 | return 62 | out = func(*args, **kwargs) 63 | if not os.path.exists(target): 64 | raise FileNotFoundError("Target file expected at: %s" % target) 65 | if not check_md5sum(target, checksum): 66 | raise ValidationError(target) 67 | return out 68 | 69 | return wrapper 70 | 71 | return validate_decorator 72 | 73 | 74 | @validate(MD5_CSV) 75 | def get_market_price(target_path=None): 76 | count = 0 77 | while count < 5: 78 | count += 1 79 | try: 80 | urlretrieve(CSV_URL, target_path) 81 | return 82 | except URLError as err: 83 | print( 84 | "Error occurred (%r) when trying to download csv. Retrying in 5 seconds" 85 | % err, 86 | file=sys.stderr, 87 | ) 88 | time.sleep(5) 89 | 90 | 91 | @validate(MD5_JSON) 92 | def write_json(csv_path, target_path=None): 93 | rows = clevercsv.read_table(csv_path) 94 | 95 | rows = rows[500:] 96 | last_idx = next( 97 | (i for i, r in enumerate(rows) if r[0] == "2019-06-19 00:00:00"), None 98 | ) 99 | rows = rows[: (last_idx + 1)]  # assumes the archived CSV contains the 2019-06-19 row 100 | 101 | name = "bitcoin" 102 | longname = "Bitcoin Price" 103 | values = [float(r[1]) for r in rows] 104 | time = [r[0].split(" ")[0] for r in rows] 105 | time_fmt = "%Y-%m-%d" 106 | series = [{"label": "USD/Bitcoin", "type": "float", "raw": values}] 107 | 108 | data = { 109 | "name": name, 110 | "longname": longname, 111 | "n_obs": len(time), 112 | "n_dim": len(series), 113 | "time": { 114 | "type": "string", 115 | "format": time_fmt, 116 | "index": list(range(0, len(time))), 117 | "raw": time, 118 | }, 119 | "series": series, 120 | } 121 | 122 | with open(target_path, "w") as fp: 123 | json.dump(data, fp, indent="\t") 124 | 125 | 126 | def collect(output_dir="."): 127 | csv_path = os.path.join(output_dir, NAME_CSV) 128 | json_path = os.path.join(output_dir, NAME_JSON) 129 | 130 | get_market_price(target_path=csv_path) 131 | write_json(csv_path, target_path=json_path) 132 | 133 | 134 | def clean(output_dir="."): 135 | csv_path = os.path.join(output_dir, NAME_CSV) 136 | json_path = os.path.join(output_dir, NAME_JSON) 137 | 138 | if os.path.exists(csv_path): 139 | os.unlink(csv_path) 140 | if os.path.exists(json_path): 141 | os.unlink(json_path) 142 | 143 | 144 | def parse_args(): 145 | parser = argparse.ArgumentParser() 146 | parser.add_argument( 147 | "-o", "--output-dir", help="output directory to use", default="." 
148 | ) 149 | parser.add_argument( 150 | "action", 151 | choices=["collect", "clean"], 152 | help="Action to perform", 153 | default="collect", 154 | nargs="?", 155 | ) 156 | return parser.parse_args() 157 | 158 | 159 | def main(output_dir="."): 160 | args = parse_args() 161 | if args.action == "collect": 162 | collect(output_dir=args.output_dir) 163 | elif args.action == "clean": 164 | clean(output_dir=args.output_dir) 165 | 166 | 167 | if __name__ == "__main__": 168 | main() 169 | -------------------------------------------------------------------------------- /datasets/uk_coal_employ/uk_coal_employ.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "uk_coal_employ", 3 | "longname": "Coal Mining Employees (UK)", 4 | "n_obs": 105, 5 | "n_dim": 1, 6 | "time": { 7 | "format": "%Y", 8 | "index": [ 9 | 0, 10 | 1, 11 | 2, 12 | 3, 13 | 4, 14 | 5, 15 | 6, 16 | 7, 17 | 8, 18 | 9, 19 | 10, 20 | 11, 21 | 12, 22 | 13, 23 | 14, 24 | 15, 25 | 16, 26 | 17, 27 | 18, 28 | 19, 29 | 20, 30 | 21, 31 | 22, 32 | 23, 33 | 24, 34 | 25, 35 | 26, 36 | 27, 37 | 28, 38 | 29, 39 | 30, 40 | 31, 41 | 32, 42 | 33, 43 | 34, 44 | 35, 45 | 36, 46 | 37, 47 | 38, 48 | 39, 49 | 40, 50 | 41, 51 | 42, 52 | 43, 53 | 44, 54 | 45, 55 | 46, 56 | 47, 57 | 48, 58 | 49, 59 | 50, 60 | 51, 61 | 52, 62 | 53, 63 | 54, 64 | 55, 65 | 56, 66 | 57, 67 | 58, 68 | 59, 69 | 60, 70 | 61, 71 | 62, 72 | 63, 73 | 64, 74 | 65, 75 | 66, 76 | 67, 77 | 68, 78 | 69, 79 | 70, 80 | 71, 81 | 72, 82 | 73, 83 | 74, 84 | 75, 85 | 76, 86 | 77, 87 | 78, 88 | 79, 89 | 80, 90 | 81, 91 | 82, 92 | 83, 93 | 84, 94 | 85, 95 | 86, 96 | 87, 97 | 88, 98 | 89, 99 | 90, 100 | 91, 101 | 92, 102 | 93, 103 | 94, 104 | 95, 105 | 96, 106 | 97, 107 | 98, 108 | 99, 109 | 100, 110 | 101, 111 | 102, 112 | 103, 113 | 104 114 | ], 115 | "raw": [ 116 | "1913", 117 | "1914", 118 | "1915", 119 | "1916", 120 | "1917", 121 | "1918", 122 | "1919", 123 | "1920", 124 | "1921", 125 | "1922", 126 | "1923", 127 | "1924", 128 | "1925", 129 | "1926", 130 | "1927", 131 | "1928", 132 | "1929", 133 | "1930", 134 | "1931", 135 | "1932", 136 | "1933", 137 | "1934", 138 | "1935", 139 | "1936", 140 | "1937", 141 | "1938", 142 | "1939", 143 | "1940", 144 | "1941", 145 | "1942", 146 | "1943", 147 | "1944", 148 | "1945", 149 | "1946", 150 | "1947", 151 | "1948", 152 | "1949", 153 | "1950", 154 | "1951", 155 | "1952", 156 | "1953", 157 | "1954", 158 | "1955", 159 | "1956", 160 | "1957", 161 | "1958", 162 | "1959", 163 | "1960", 164 | "1961", 165 | "1962", 166 | "1963", 167 | "1964", 168 | "1965", 169 | "1966", 170 | "1967", 171 | "1968", 172 | "1969", 173 | "1970", 174 | "1971", 175 | "1972", 176 | "1973", 177 | "1974", 178 | "1975", 179 | "1976", 180 | "1977", 181 | "1978", 182 | "1979", 183 | "1980", 184 | "1981", 185 | "1982", 186 | "1983", 187 | "1984", 188 | "1985", 189 | "1986", 190 | "1987", 191 | "1988", 192 | "1989", 193 | "1990", 194 | "1991", 195 | "1992", 196 | "1993", 197 | "1994", 198 | "1995", 199 | "1996", 200 | "1997", 201 | "1998", 202 | "1999", 203 | "2000", 204 | "2001", 205 | "2002", 206 | "2003", 207 | "2004", 208 | "2005", 209 | "2006", 210 | "2007", 211 | "2008", 212 | "2009", 213 | "2010", 214 | "2011", 215 | "2012", 216 | "2013", 217 | "2014", 218 | "2015", 219 | "2016", 220 | "2017" 221 | ] 222 | }, 223 | "series": [ 224 | { 225 | "label": "V1", 226 | "type": "int", 227 | "raw": [ 228 | 1107000, 229 | 1038000, 230 | 935000, 231 | 981000, 232 | 1002000, 233 | 990000, 234 | 1136000, 235 | 1191000, 236 | null, 237 | 1085000, 238 
| 1151000, 239 | 1163000, 240 | 1078000, 241 | null, 242 | 991000, 243 | 915000, 244 | 925000, 245 | 910000, 246 | 843000, 247 | 796000, 248 | 767000, 249 | 768000, 250 | 753000, 251 | 750000, 252 | 773000, 253 | 776000, 254 | 761000, 255 | 744000, 256 | 692000, 257 | 704000, 258 | 701000, 259 | 704000, 260 | 702000, 261 | 693000, 262 | 707000, 263 | 720000, 264 | 716000, 265 | 693000, 266 | 695000, 267 | 712000, 268 | 713000, 269 | 707000, 270 | 704000, 271 | 703000, 272 | 710000, 273 | 699000, 274 | 665000, 275 | 607000, 276 | 575000, 277 | 556000, 278 | 528000, 279 | 502000, 280 | 454700, 281 | 422000, 282 | 389500, 283 | 330900, 284 | 305700, 285 | 290000, 286 | 286100, 287 | 273600, 288 | 251800, 289 | 252800, 290 | 252000, 291 | 249700, 292 | 247900, 293 | 240400, 294 | 241600, 295 | 236900, 296 | 172000, 297 | 164000, 298 | 148000, 299 | 139000, 300 | 114000, 301 | 91000, 302 | 75000, 303 | 69000, 304 | 56000, 305 | 49000, 306 | 38000, 307 | 28000, 308 | 10000, 309 | 7000, 310 | 11657, 311 | 10315, 312 | 13768, 313 | 11113, 314 | 11973, 315 | 10939, 316 | 11439, 317 | 9578, 318 | 8250, 319 | 7772, 320 | 6054, 321 | 5431, 322 | 5538, 323 | 6157, 324 | 5912, 325 | 6014, 326 | 5972, 327 | 5827, 328 | 3715, 329 | 3601, 330 | 1975, 331 | 831, 332 | 620 333 | ] 334 | } 335 | ] 336 | } 337 | -------------------------------------------------------------------------------- /utils/validate_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Validate the dataset schema of a given file. 6 | 7 | Note that this script requires the ``jsonschema`` package. 8 | 9 | Author: G.J.J. van den Burg 10 | License: This file is part of TCPD. See the LICENSE file. 11 | Copyright: 2019, The Alan Turing Institute 12 | 13 | """ 14 | 15 | import argparse 16 | import json 17 | import jsonschema 18 | import os 19 | import sys 20 | 21 | 22 | def parse_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument( 25 | "-s", 26 | "--schema-file", 27 | help="Schema file to use", 28 | default="./schema.json", 29 | ) 30 | parser.add_argument("-d", "--dataset-dir", help="Dataset directory") 31 | parser.add_argument( 32 | "datafile", help="JSON file with a TCPD time series", nargs="?" 33 | ) 34 | parser.add_argument( 35 | "-v", "--verbose", help="Enable verbose mode", action="store_true" 36 | ) 37 | return parser.parse_args() 38 | 39 | 40 | def load_schema(schema_file): 41 | if not os.path.exists(schema_file): 42 | raise FileNotFoundError(schema_file) 43 | with open(schema_file, "rb") as fp: 44 | schema = json.load(fp) 45 | return schema 46 | 47 | 48 | def find_datafiles(dataset_dir): 49 | data_files = {} 50 | 51 | datadirs = os.listdir(dataset_dir) 52 | for ddir in datadirs: 53 | pth = os.path.join(dataset_dir, ddir) 54 | files = os.listdir(pth) 55 | json_files = [f for f in files if f.endswith(".json")] 56 | for jf in json_files: 57 | jfpath = os.path.join(pth, jf) 58 | if jf in data_files: 59 | raise KeyError("Duplicate data file '%s'?" % jfpath) 60 | data_files[jf] = jfpath 61 | 62 | return data_files 63 | 64 | 65 | def validate_dataset(filename, schema_file=None): 66 | """Validate a dataset file against the schema and other requirements 67 | """ 68 | if not os.path.exists(filename): 69 | return "File not found." 
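# The checks below run in a fixed order: parse the JSON, validate it against
# schema.json, then enforce invariants the schema cannot express on its own:
# the series count must equal n_dim, any time index must be zero-based and
# aligned with n_obs, missing time points are encoded as 'NaN' (never null),
# and missing series values as null (never NaN).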
70 | 71 | with open(filename, "rb") as fp: 72 | try: 73 | data = json.load(fp) 74 | except json.JSONDecodeError as err: 75 | return "JSON decoding error: %s" % err.msg 76 | 77 | try: 78 | schema = load_schema(schema_file) 79 | except FileNotFoundError: 80 | return "Schema file not found." 81 | 82 | try: 83 | jsonschema.validate(instance=data, schema=schema) 84 | except jsonschema.ValidationError as err: 85 | return "JSONSchema validation error: %s" % err.message 86 | 87 | if len(data["series"]) != data["n_dim"]: 88 | return "Number of dimensions and number of series don't match" 89 | 90 | if "time" in data.keys(): 91 | if "format" not in data["time"] and "raw" in data["time"]: 92 | return "'raw' must be accompanied by format" 93 | if "format" in data["time"] and "raw" not in data["time"]: 94 | return "Format must be accompanied by 'raw'" 95 | if "index" in data["time"]: 96 | if data["time"]["index"][0] != 0: 97 | return "Index should start at zero." 98 | if len(data["time"]["index"]) != data["n_obs"]: 99 | return "Number of indices must match number of observations" 100 | if "raw" in data["time"]: 101 | if len(data["time"]["raw"]) != data["n_obs"]: 102 | return "Number of time points doesn't match number of observations" 103 | if None in data["time"]["raw"]: 104 | return "Null is not supported in time axis. Use 'NaN' instead." 105 | 106 | has_missing = False 107 | for var in data["series"]: 108 | if len(var["raw"]) != data["n_obs"]: 109 | return "Number of observations doesn't match for %s" % var["label"] 110 | if any(x != x for x in var["raw"]):  # NaN is the only value that differs from itself 111 | return "NaN is not supported in series. Use null instead." 112 | has_missing = has_missing or any(x is None for x in var["raw"]) 113 | 114 | # this doesn't exist yet, so let's not implement it until we need it. 115 | if data["n_dim"] > 1 and has_missing: 116 | return "Missing values are not yet supported for multidimensional data" 117 | 118 | return None 119 | 120 | 121 | def main(): 122 | args = parse_args() 123 | 124 | log = lambda *a, **kw: print(*a, **kw) if args.verbose else None 125 | 126 | if args.dataset_dir: 127 | datafiles = find_datafiles(args.dataset_dir) 128 | for dset in datafiles: 129 | log("Validating %s" % dset) 130 | result = validate_dataset( 131 | datafiles[dset], schema_file=args.schema_file 132 | ) 133 | if result is not None: 134 | print( 135 | "Dataset: %s. 
Error: %s" % (dset, result), file=sys.stderr 136 | ) 137 | raise SystemExit(1) 138 | else: 139 | result = validate_dataset(args.datafile, schema_file=args.schema_file) 140 | if not result is None: 141 | print("Error: %s" % result, file=sys.stderr) 142 | raise SystemExit(1) 143 | log("Validation passed.") 144 | 145 | 146 | if __name__ == "__main__": 147 | main() 148 | -------------------------------------------------------------------------------- /datasets/usd_isk/ert_bil_eur_m_Label.csv: -------------------------------------------------------------------------------- 1 | "DATASET: Euro/ECU exchange rates - monthly data [ert_bil_eur_m]" 2 | 3 | "LAST UPDATE: 09.08.19 02:12:13" 4 | 5 | "EXTRACTION DATE: 22.08.19 15:35:13" 6 | 7 | "SOURCE OF DATA: Eurostat" 8 | 9 | "TIME" 10 | "1995M01" 11 | "1995M02" 12 | "1995M03" 13 | "1995M04" 14 | "1995M05" 15 | "1995M06" 16 | "1995M07" 17 | "1995M08" 18 | "1995M09" 19 | "1995M10" 20 | "1995M11" 21 | "1995M12" 22 | "1996M01" 23 | "1996M02" 24 | "1996M03" 25 | "1996M04" 26 | "1996M05" 27 | "1996M06" 28 | "1996M07" 29 | "1996M08" 30 | "1996M09" 31 | "1996M10" 32 | "1996M11" 33 | "1996M12" 34 | "1997M01" 35 | "1997M02" 36 | "1997M03" 37 | "1997M04" 38 | "1997M05" 39 | "1997M06" 40 | "1997M07" 41 | "1997M08" 42 | "1997M09" 43 | "1997M10" 44 | "1997M11" 45 | "1997M12" 46 | "1998M01" 47 | "1998M02" 48 | "1998M03" 49 | "1998M04" 50 | "1998M05" 51 | "1998M06" 52 | "1998M07" 53 | "1998M08" 54 | "1998M09" 55 | "1998M10" 56 | "1998M11" 57 | "1998M12" 58 | "1999M01" 59 | "1999M02" 60 | "1999M03" 61 | "1999M04" 62 | "1999M05" 63 | "1999M06" 64 | "1999M07" 65 | "1999M08" 66 | "1999M09" 67 | "1999M10" 68 | "1999M11" 69 | "1999M12" 70 | "2000M01" 71 | "2000M02" 72 | "2000M03" 73 | "2000M04" 74 | "2000M05" 75 | "2000M06" 76 | "2000M07" 77 | "2000M08" 78 | "2000M09" 79 | "2000M10" 80 | "2000M11" 81 | "2000M12" 82 | "2001M01" 83 | "2001M02" 84 | "2001M03" 85 | "2001M04" 86 | "2001M05" 87 | "2001M06" 88 | "2001M07" 89 | "2001M08" 90 | "2001M09" 91 | "2001M10" 92 | "2001M11" 93 | "2001M12" 94 | "2002M01" 95 | "2002M02" 96 | "2002M03" 97 | "2002M04" 98 | "2002M05" 99 | "2002M06" 100 | "2002M07" 101 | "2002M08" 102 | "2002M09" 103 | "2002M10" 104 | "2002M11" 105 | "2002M12" 106 | "2003M01" 107 | "2003M02" 108 | "2003M03" 109 | "2003M04" 110 | "2003M05" 111 | "2003M06" 112 | "2003M07" 113 | "2003M08" 114 | "2003M09" 115 | "2003M10" 116 | "2003M11" 117 | "2003M12" 118 | "2004M01" 119 | "2004M02" 120 | "2004M03" 121 | "2004M04" 122 | "2004M05" 123 | "2004M06" 124 | "2004M07" 125 | "2004M08" 126 | "2004M09" 127 | "2004M10" 128 | "2004M11" 129 | "2004M12" 130 | "2005M01" 131 | "2005M02" 132 | "2005M03" 133 | "2005M04" 134 | "2005M05" 135 | "2005M06" 136 | "2005M07" 137 | "2005M08" 138 | "2005M09" 139 | "2005M10" 140 | "2005M11" 141 | "2005M12" 142 | "2006M01" 143 | "2006M02" 144 | "2006M03" 145 | "2006M04" 146 | "2006M05" 147 | "2006M06" 148 | "2006M07" 149 | "2006M08" 150 | "2006M09" 151 | "2006M10" 152 | "2006M11" 153 | "2006M12" 154 | "2007M01" 155 | "2007M02" 156 | "2007M03" 157 | "2007M04" 158 | "2007M05" 159 | "2007M06" 160 | "2007M07" 161 | "2007M08" 162 | "2007M09" 163 | "2007M10" 164 | "2007M11" 165 | "2007M12" 166 | "2008M01" 167 | "2008M02" 168 | "2008M03" 169 | "2008M04" 170 | "2008M05" 171 | "2008M06" 172 | "2008M07" 173 | "2008M08" 174 | "2008M09" 175 | "2008M10" 176 | "2008M11" 177 | "2008M12" 178 | "2009M01" 179 | "2009M02" 180 | "2009M03" 181 | "2009M04" 182 | "2009M05" 183 | "2009M06" 184 | "2009M07" 185 | "2009M08" 186 | "2009M09" 187 | "2009M10" 188 | "2009M11" 
189 | "2009M12" 190 | "2010M01" 191 | "2010M02" 192 | "2010M03" 193 | "2010M04" 194 | "2010M05" 195 | "2010M06" 196 | "2010M07" 197 | "2010M08" 198 | "2010M09" 199 | "2010M10" 200 | "2010M11" 201 | "2010M12" 202 | "2011M01" 203 | "2011M02" 204 | "2011M03" 205 | "2011M04" 206 | "2011M05" 207 | "2011M06" 208 | "2011M07" 209 | "2011M08" 210 | "2011M09" 211 | "2011M10" 212 | "2011M11" 213 | "2011M12" 214 | "2012M01" 215 | "2012M02" 216 | "2012M03" 217 | "2012M04" 218 | "2012M05" 219 | "2012M06" 220 | "2012M07" 221 | "2012M08" 222 | "2012M09" 223 | "2012M10" 224 | "2012M11" 225 | "2012M12" 226 | "2013M01" 227 | "2013M02" 228 | "2013M03" 229 | "2013M04" 230 | "2013M05" 231 | "2013M06" 232 | "2013M07" 233 | "2013M08" 234 | "2013M09" 235 | "2013M10" 236 | "2013M11" 237 | "2013M12" 238 | "2014M01" 239 | "2014M02" 240 | "2014M03" 241 | "2014M04" 242 | "2014M05" 243 | "2014M06" 244 | "2014M07" 245 | "2014M08" 246 | "2014M09" 247 | "2014M10" 248 | "2014M11" 249 | "2014M12" 250 | "2015M01" 251 | "2015M02" 252 | "2015M03" 253 | "2015M04" 254 | "2015M05" 255 | "2015M06" 256 | "2015M07" 257 | "2015M08" 258 | "2015M09" 259 | "2015M10" 260 | "2015M11" 261 | "2015M12" 262 | "2016M01" 263 | "2016M02" 264 | "2016M03" 265 | "2016M04" 266 | "2016M05" 267 | "2016M06" 268 | "2016M07" 269 | "2016M08" 270 | "2016M09" 271 | "2016M10" 272 | "2016M11" 273 | "2016M12" 274 | "2017M01" 275 | "2017M02" 276 | "2017M03" 277 | "2017M04" 278 | "2017M05" 279 | "2017M06" 280 | "2017M07" 281 | "2017M08" 282 | "2017M09" 283 | "2017M10" 284 | "2017M11" 285 | "2017M12" 286 | "2018M01" 287 | "2018M02" 288 | "2018M03" 289 | "2018M04" 290 | "2018M05" 291 | "2018M06" 292 | "2018M07" 293 | "2018M08" 294 | "2018M09" 295 | "2018M10" 296 | "2018M11" 297 | "2018M12" 298 | "2019M01" 299 | "2019M02" 300 | "2019M03" 301 | "2019M04" 302 | "2019M05" 303 | "2019M06" 304 | "2019M07" 305 | 306 | "CURRENCY" 307 | "Icelandic krona" 308 | "US dollar" 309 | 310 | "UNIT" 311 | "National currency" 312 | 313 | "STATINFO" 314 | "Average" 315 | 316 | "No footnotes available" 317 | 318 | "Available flags:" 319 | "b","break in time series" 320 | "c","confidential" 321 | "d","definition differs, see metadata" 322 | "e","estimated" 323 | "f","forecast" 324 | "n","not significant" 325 | "p","provisional" 326 | "r","revised" 327 | "s","Eurostat estimate" 328 | "u","low reliability" 329 | "z","not applicable" 330 | 331 | "Special value:" 332 | ":","not available" 333 | 334 | -------------------------------------------------------------------------------- /datasets/occupancy/get_occupancy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Collect the occupancy dataset. 6 | 7 | See the README file for more information. 8 | 9 | Author: G.J.J. van den Burg 10 | License: This file is part of TCPD, see the top-level LICENSE file. 
11 | Copyright: 2019, The Alan Turing Institute 12 | 13 | """ 14 | 15 | import argparse 16 | import clevercsv 17 | import hashlib 18 | import json 19 | import os 20 | import sys 21 | import time 22 | 23 | from functools import wraps 24 | from urllib.request import urlretrieve 25 | from urllib.error import URLError 26 | 27 | SAMPLE = 16 28 | 29 | TXT_URL = "https://web.archive.org/web/20191128145102if_/https://raw.githubusercontent.com/LuisM78/Occupancy-detection-data/master/datatraining.txt" 30 | 31 | MD5_TXT = "e656cd731300cb444bd10fcd28071e37" 32 | MD5_JSON = "bc6cd9adaf496fe30bf0e417d2c3b0c6" 33 | 34 | NAME_TXT = "datatraining.txt" 35 | NAME_JSON = "occupancy.json" 36 | 37 | 38 | class ValidationError(Exception): 39 | def __init__(self, filename): 40 | message = ( 41 | "Validating the file '%s' failed. \n" 42 | "Please raise an issue on the GitHub page for this project " 43 | "if the error persists." % filename 44 | ) 45 | super().__init__(message) 46 | 47 | 48 | def check_md5sum(filename, checksum): 49 | with open(filename, "rb") as fp: 50 | data = fp.read() 51 | h = hashlib.md5(data).hexdigest() 52 | return h == checksum 53 | 54 | 55 | def validate(checksum): 56 | """Decorator that validates the target file.""" 57 | 58 | def validate_decorator(func): 59 | @wraps(func) 60 | def wrapper(*args, **kwargs): 61 | target = kwargs.get("target_path", None) 62 | if os.path.exists(target) and check_md5sum(target, checksum): 63 | return 64 | out = func(*args, **kwargs) 65 | if not os.path.exists(target): 66 | raise FileNotFoundError("Target file expected at: %s" % target) 67 | if not check_md5sum(target, checksum): 68 | raise ValidationError(target) 69 | return out 70 | 71 | return wrapper 72 | 73 | return validate_decorator 74 | 75 | 76 | @validate(MD5_TXT) 77 | def download_txt(target_path=None): 78 | count = 0 79 | while count < 5: 80 | count += 1 81 | try: 82 | urlretrieve(TXT_URL, target_path) 83 | return 84 | except URLError as err: 85 | print( 86 | "Error occurred (%r) when trying to download txt. 
Retrying in 5 seconds" 87 | % err, 88 | file=sys.stderr, 89 | ) 90 | time.sleep(5) 91 | 92 | 93 | @validate(MD5_JSON) 94 | def write_json(txt_path, target_path=None): 95 | with open(txt_path, "r", newline="", encoding="ascii") as fp: 96 | reader = clevercsv.reader( 97 | fp, delimiter=",", quotechar='"', escapechar="" 98 | ) 99 | rows = list(reader) 100 | 101 | header = rows.pop(0) 102 | header.insert(0, "id") 103 | as_dicts = [dict(zip(header, r)) for r in rows] 104 | 105 | var_include = ["Temperature", "Humidity", "Light", "CO2"] 106 | 107 | time = [x["date"] for x in as_dicts] 108 | time = [time[i] for i in range(0, len(time), SAMPLE)]  # keep every 16th reading 109 | 110 | data = { 111 | "name": "occupancy", 112 | "longname": "Occupancy", 113 | "n_obs": len(time), 114 | "n_dim": len(var_include), 115 | "time": { 116 | "type": "string", 117 | "format": "%Y-%m-%d %H:%M:%S", 118 | "index": list(range(len(time))), 119 | "raw": time, 120 | }, 121 | "series": [], 122 | } 123 | for idx, var in enumerate(var_include, start=1): 124 | lbl = "V%i" % idx 125 | obs = [float(x[var]) for x in as_dicts] 126 | obs = [obs[i] for i in range(0, len(obs), SAMPLE)] 127 | data["series"].append({"label": lbl, "type": "float", "raw": obs}) 128 | 129 | with open(target_path, "w") as fp: 130 | json.dump(data, fp, indent="\t") 131 | 132 | 133 | def collect(output_dir="."): 134 | txt_path = os.path.join(output_dir, NAME_TXT) 135 | json_path = os.path.join(output_dir, NAME_JSON) 136 | 137 | download_txt(target_path=txt_path) 138 | write_json(txt_path, target_path=json_path) 139 | 140 | 141 | def clean(output_dir="."): 142 | txt_path = os.path.join(output_dir, NAME_TXT) 143 | json_path = os.path.join(output_dir, NAME_JSON) 144 | 145 | if os.path.exists(txt_path): 146 | os.unlink(txt_path) 147 | if os.path.exists(json_path): 148 | os.unlink(json_path) 149 | 150 | 151 | def parse_args(): 152 | parser = argparse.ArgumentParser() 153 | parser.add_argument( 154 | "-o", "--output-dir", help="output directory to use", default="." 
155 | ) 156 | parser.add_argument( 157 | "action", 158 | choices=["collect", "clean"], 159 | help="Action to perform", 160 | default="collect", 161 | nargs="?", 162 | ) 163 | return parser.parse_args() 164 | 165 | 166 | def main(output_dir="."): 167 | args = parse_args() 168 | if args.action == "collect": 169 | collect(output_dir=args.output_dir) 170 | elif args.action == "clean": 171 | clean(output_dir=args.output_dir) 172 | 173 | 174 | if __name__ == "__main__": 175 | main() 176 | -------------------------------------------------------------------------------- /datasets/homeruns/homeruns.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "homeruns", 3 | "longname": "Homeruns", 4 | "n_obs": 118, 5 | "n_dim": 1, 6 | "time": { 7 | "type": "string", 8 | "format": "%Y", 9 | "index": [ 10 | 0, 11 | 1, 12 | 2, 13 | 3, 14 | 4, 15 | 5, 16 | 6, 17 | 7, 18 | 8, 19 | 9, 20 | 10, 21 | 11, 22 | 12, 23 | 13, 24 | 14, 25 | 15, 26 | 16, 27 | 17, 28 | 18, 29 | 19, 30 | 20, 31 | 21, 32 | 22, 33 | 23, 34 | 24, 35 | 25, 36 | 26, 37 | 27, 38 | 28, 39 | 29, 40 | 30, 41 | 31, 42 | 32, 43 | 33, 44 | 34, 45 | 35, 46 | 36, 47 | 37, 48 | 38, 49 | 39, 50 | 40, 51 | 41, 52 | 42, 53 | 43, 54 | 44, 55 | 45, 56 | 46, 57 | 47, 58 | 48, 59 | 49, 60 | 50, 61 | 51, 62 | 52, 63 | 53, 64 | 54, 65 | 55, 66 | 56, 67 | 57, 68 | 58, 69 | 59, 70 | 60, 71 | 61, 72 | 62, 73 | 63, 74 | 64, 75 | 65, 76 | 66, 77 | 67, 78 | 68, 79 | 69, 80 | 70, 81 | 71, 82 | 72, 83 | 73, 84 | 74, 85 | 75, 86 | 76, 87 | 77, 88 | 78, 89 | 79, 90 | 80, 91 | 81, 92 | 82, 93 | 83, 94 | 84, 95 | 85, 96 | 86, 97 | 87, 98 | 88, 99 | 89, 100 | 90, 101 | 91, 102 | 92, 103 | 93, 104 | 94, 105 | 95, 106 | 96, 107 | 97, 108 | 98, 109 | 99, 110 | 100, 111 | 101, 112 | 102, 113 | 103, 114 | 104, 115 | 105, 116 | 106, 117 | 107, 118 | 108, 119 | 109, 120 | 110, 121 | 111, 122 | 112, 123 | 113, 124 | 114, 125 | 115, 126 | 116, 127 | 117 128 | ], 129 | "raw": [ 130 | "1901", 131 | "1902", 132 | "1903", 133 | "1904", 134 | "1905", 135 | "1906", 136 | "1907", 137 | "1908", 138 | "1909", 139 | "1910", 140 | "1911", 141 | "1912", 142 | "1913", 143 | "1914", 144 | "1915", 145 | "1916", 146 | "1917", 147 | "1918", 148 | "1919", 149 | "1920", 150 | "1921", 151 | "1922", 152 | "1923", 153 | "1924", 154 | "1925", 155 | "1926", 156 | "1927", 157 | "1928", 158 | "1929", 159 | "1930", 160 | "1931", 161 | "1932", 162 | "1933", 163 | "1934", 164 | "1935", 165 | "1936", 166 | "1937", 167 | "1938", 168 | "1939", 169 | "1940", 170 | "1941", 171 | "1942", 172 | "1943", 173 | "1944", 174 | "1945", 175 | "1946", 176 | "1947", 177 | "1948", 178 | "1949", 179 | "1950", 180 | "1951", 181 | "1952", 182 | "1953", 183 | "1954", 184 | "1955", 185 | "1956", 186 | "1957", 187 | "1958", 188 | "1959", 189 | "1960", 190 | "1961", 191 | "1962", 192 | "1963", 193 | "1964", 194 | "1965", 195 | "1966", 196 | "1967", 197 | "1968", 198 | "1969", 199 | "1970", 200 | "1971", 201 | "1972", 202 | "1973", 203 | "1974", 204 | "1975", 205 | "1976", 206 | "1977", 207 | "1978", 208 | "1979", 209 | "1980", 210 | "1981", 211 | "1982", 212 | "1983", 213 | "1984", 214 | "1985", 215 | "1986", 216 | "1987", 217 | "1988", 218 | "1989", 219 | "1990", 220 | "1991", 221 | "1992", 222 | "1993", 223 | "1994", 224 | "1995", 225 | "1996", 226 | "1997", 227 | "1998", 228 | "1999", 229 | "2000", 230 | "2001", 231 | "2002", 232 | "2003", 233 | "2004", 234 | "2005", 235 | "2006", 236 | "2007", 237 | "2008", 238 | "2009", 239 | "2010", 240 | "2011", 241 | "2012", 242 | "2013", 243 
| "2014", 244 | "2015", 245 | "2016", 246 | "2017", 247 | "2018" 248 | ] 249 | }, 250 | "series": [ 251 | { 252 | "label": "American League Home Runs", 253 | "type": "int", 254 | "raw": [ 255 | 228, 256 | 258, 257 | 184, 258 | 156, 259 | 156, 260 | 137, 261 | 104, 262 | 116, 263 | 109, 264 | 147, 265 | 198, 266 | 156, 267 | 159, 268 | 148, 269 | 160, 270 | 144, 271 | 133, 272 | 96, 273 | 240, 274 | 369, 275 | 477, 276 | 525, 277 | 442, 278 | 397, 279 | 533, 280 | 424, 281 | 439, 282 | 483, 283 | 595, 284 | 673, 285 | 576, 286 | 707, 287 | 607, 288 | 688, 289 | 663, 290 | 758, 291 | 806, 292 | 864, 293 | 796, 294 | 883, 295 | 734, 296 | 533, 297 | 473, 298 | 459, 299 | 430, 300 | 653, 301 | 679, 302 | 710, 303 | 769, 304 | 973, 305 | 839, 306 | 794, 307 | 879, 308 | 823, 309 | 961, 310 | 1075, 311 | 1024, 312 | 1057, 313 | 1091, 314 | 1086, 315 | 1534, 316 | 1552, 317 | 1489, 318 | 1551, 319 | 1370, 320 | 1365, 321 | 1197, 322 | 1104, 323 | 1649, 324 | 1746, 325 | 1484, 326 | 1175, 327 | 1552, 328 | 1369, 329 | 1465, 330 | 1122, 331 | 2013, 332 | 1680, 333 | 2006, 334 | 1844, 335 | 1062, 336 | 2080, 337 | 1903, 338 | 1980, 339 | 2178, 340 | 2290, 341 | 2634, 342 | 1901, 343 | 1718, 344 | 1796, 345 | 1953, 346 | 1776, 347 | 2074, 348 | 1774, 349 | 2164, 350 | 2742, 351 | 2477, 352 | 2499, 353 | 2635, 354 | 2688, 355 | 2506, 356 | 2464, 357 | 2499, 358 | 2605, 359 | 2437, 360 | 2546, 361 | 2252, 362 | 2270, 363 | 2560, 364 | 2209, 365 | 2271, 366 | 2500, 367 | 2504, 368 | 2161, 369 | 2634, 370 | 2953, 371 | 3170, 372 | 2900 373 | ] 374 | } 375 | ] 376 | } -------------------------------------------------------------------------------- /datasets/homeruns/get_homeruns.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Collect the homeruns dataset 6 | 7 | See the README file for more information. 8 | 9 | Author: G.J.J. van den Burg 10 | License: This file is part of TCPD, see the top-level LICENSE file. 11 | Copyright: 2019, The Alan Turing Institute 12 | 13 | """ 14 | 15 | import argparse 16 | import clevercsv 17 | import hashlib 18 | import json 19 | import os 20 | import sys 21 | import time 22 | 23 | from functools import wraps 24 | from urllib.request import urlretrieve 25 | from urllib.error import URLError 26 | 27 | # Original source of the batting csv file 28 | CSV_URL = "https://web.archive.org/web/20191128150525if_/https://raw.githubusercontent.com/chadwickbureau/baseballdatabank/242285f8f5e8981327cf50c07355fb034833ce4a/core/Batting.csv" 29 | 30 | MD5_CSV = "43d8f8135e76dcd8b77d0709e33d2221" 31 | MD5_JSON = "987bbab63e2c72acba1c07325303720c" 32 | 33 | NAME_CSV = "Batting.csv" 34 | NAME_JSON = "homeruns.json" 35 | 36 | 37 | class ValidationError(Exception): 38 | def __init__(self, filename): 39 | self.message = ( 40 | "Validating the file '%s' failed. \n" 41 | "Please raise an issue on the GitHub page for this project \n" 42 | "if the error persists." 
% filename 43 | ) 44 | super().__init__(self.message) 45 | 46 | def check_md5sum(filename, checksum): 47 | with open(filename, "rb") as fp: 48 | data = fp.read() 49 | h = hashlib.md5(data).hexdigest() 50 | return h == checksum 51 | 52 | 53 | def validate(checksum): 54 | """Decorator that validates the target file.""" 55 | 56 | def validate_decorator(func): 57 | @wraps(func) 58 | def wrapper(*args, **kwargs): 59 | target = kwargs.get("target_path", None) 60 | if os.path.exists(target) and check_md5sum(target, checksum): 61 | return 62 | out = func(*args, **kwargs) 63 | if not os.path.exists(target): 64 | raise FileNotFoundError("Target file expected at: %s" % target) 65 | if not check_md5sum(target, checksum): 66 | raise ValidationError(target) 67 | return out 68 | 69 | return wrapper 70 | 71 | return validate_decorator 72 | 73 | 74 | @validate(MD5_CSV) 75 | def download_csv(target_path=None): 76 | count = 0 77 | while count < 5: 78 | count += 1 79 | try: 80 | urlretrieve(CSV_URL, target_path) 81 | return 82 | except URLError as err: 83 | print( 84 | "Error occurred (%r) when trying to download csv. Retrying in 5 seconds" 85 | % err, 86 | file=sys.stderr, 87 | ) 88 | time.sleep(5) 89 | 90 | 91 | def read_csv(csv_file): 92 | with open(csv_file, "r", newline="", encoding="ascii") as fp: 93 | reader = clevercsv.reader( 94 | fp, delimiter=",", quotechar="", escapechar="" 95 | ) 96 | rows = list(reader) 97 | 98 | header = rows.pop(0) 99 | dicts = [dict(zip(header, row)) for row in rows] 100 | 101 | AL = [d for d in dicts if d["lgID"] == "AL"]  # American League rows only 102 | years = sorted(set((d["yearID"] for d in AL))) 103 | by_year = { 104 | int(y): sum(int(d["HR"]) for d in [x for x in AL if x["yearID"] == y]) 105 | for y in years 106 | } 107 | return by_year 108 | 109 | 110 | @validate(MD5_JSON) 111 | def write_json(csv_path, target_path=None): 112 | by_year = read_csv(csv_path) 113 | 114 | name = "homeruns" 115 | longname = "Homeruns" 116 | time_fmt = "%Y" 117 | 118 | time = sorted(by_year.keys()) 119 | values = [by_year[t] for t in time] 120 | 121 | series = [ 122 | {"label": "American League Home Runs", "type": "int", "raw": values}, 123 | ] 124 | 125 | data = { 126 | "name": name, 127 | "longname": longname, 128 | "n_obs": len(time), 129 | "n_dim": len(series), 130 | "time": { 131 | "type": "string", 132 | "format": time_fmt, 133 | "index": list(range(0, len(time))), 134 | "raw": list(map(str, time)), 135 | }, 136 | "series": series, 137 | } 138 | 139 | with open(target_path, "w") as fp: 140 | json.dump(data, fp, indent="\t") 141 | 142 | 143 | def collect(output_dir="."): 144 | csv_path = os.path.join(output_dir, NAME_CSV) 145 | json_path = os.path.join(output_dir, NAME_JSON) 146 | 147 | download_csv(target_path=csv_path) 148 | write_json(csv_path, target_path=json_path) 149 | 150 | 151 | def clean(output_dir="."): 152 | csv_path = os.path.join(output_dir, NAME_CSV) 153 | json_path = os.path.join(output_dir, NAME_JSON) 154 | 155 | if os.path.exists(csv_path): 156 | os.unlink(csv_path) 157 | if os.path.exists(json_path): 158 | os.unlink(json_path) 159 | 160 | 161 | def parse_args(): 162 | parser = argparse.ArgumentParser() 163 | parser.add_argument( 164 | "-o", "--output-dir", help="output directory to use", default="." 
165 | ) 166 | parser.add_argument( 167 | "action", 168 | choices=["collect", "clean"], 169 | help="Action to perform", 170 | default="collect", 171 | nargs="?", 172 | ) 173 | return parser.parse_args() 174 | 175 | 176 | def main(output_dir="."): 177 | args = parse_args() 178 | if args.action == "collect": 179 | collect(output_dir=args.output_dir) 180 | elif args.action == "clean": 181 | clean(output_dir=args.output_dir) 182 | 183 | 184 | if __name__ == "__main__": 185 | main() 186 | -------------------------------------------------------------------------------- /datasets/global_co2/get_global_co2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Collect the global_co2 dataset 6 | 7 | See the README file for more information. 8 | 9 | Author: G.J.J. van den Burg 10 | License: This file is part of TCPD, see the top-level LICENSE file. 11 | Copyright: 2019, The Alan Turing Institute 12 | 13 | """ 14 | 15 | 16 | import argparse 17 | import clevercsv 18 | import hashlib 19 | import json 20 | import os 21 | 22 | from functools import wraps 23 | from urllib.request import urlretrieve 24 | 25 | 26 | CSV_URL = "ftp://data.iac.ethz.ch/CMIP6/input4MIPs/UoM/GHGConc/CMIP/mon/atmos/UoM-CMIP-1-1-0/GHGConc/gr3-GMNHSH/v20160701/mole_fraction_of_carbon_dioxide_in_air_input4MIPs_GHGConcentrations_CMIP_UoM-CMIP-1-1-0_gr3-GMNHSH_000001-201412.csv" 27 | 28 | MD5_CSV = "a3d42f5e339f4c652b8ae80e830b6941" 29 | MD5_JSON = "7c8edd8887f51a6f841cc9d806ab4e56" 30 | 31 | NAME_CSV = "mole_fraction_of_carbon_dioxide_in_air_input4MIPs_GHGConcentrations_CMIP_UoM-CMIP-1-1-0_gr3-GMNHSH_000001-201412.csv" 32 | NAME_JSON = "global_co2.json" 33 | 34 | SAMPLE = 48 35 | 36 | 37 | class ValidationError(Exception): 38 | def __init__(self, filename): 39 | message = ( 40 | "Validating the file '%s' failed. \n" 41 | "Please raise an issue on the GitHub page for this project " 42 | "if the error persists." 
% filename 43 | ) 44 | super().__init__(message) 45 | 46 | 47 | def check_md5sum(filename, checksum): 48 | with open(filename, "rb") as fp: 49 | data = fp.read() 50 | h = hashlib.md5(data).hexdigest() 51 | return h == checksum 52 | 53 | 54 | def validate(checksum): 55 | """Decorator that validates the target file.""" 56 | 57 | def validate_decorator(func): 58 | @wraps(func) 59 | def wrapper(*args, **kwargs): 60 | target = kwargs.get("target_path", None) 61 | if os.path.exists(target) and check_md5sum(target, checksum): 62 | return 63 | out = func(*args, **kwargs) 64 | if not os.path.exists(target): 65 | raise FileNotFoundError("Target file expected at: %s" % target) 66 | if not check_md5sum(target, checksum): 67 | raise ValidationError(target) 68 | return out 69 | 70 | return wrapper 71 | 72 | return validate_decorator 73 | 74 | 75 | @validate(MD5_CSV) 76 | def get_csv(target_path=None): 77 | urlretrieve(CSV_URL, target_path) 78 | 79 | 80 | def reformat_time(datestr): 81 | """Convert 'DD-MMM-YYYY ...' date strings to '%Y-%m-%d'.""" 82 | MONTHS = { 83 | "Jan": 1, 84 | "Feb": 2, 85 | "Mar": 3, 86 | "Apr": 4, 87 | "May": 5, 88 | "Jun": 6, 89 | "Jul": 7, 90 | "Aug": 8, 91 | "Sep": 9, 92 | "Oct": 10, 93 | "Nov": 11, 94 | "Dec": 12, 95 | } 96 | dd, mmm, rest = datestr.split("-") 97 | yyyy = rest.split(" ")[0] 98 | m = MONTHS.get(mmm) 99 | return "%s-%02d-%s" % (yyyy, m, dd) 100 | 101 | 102 | @validate(MD5_JSON) 103 | def write_json(csv_path, target_path=None): 104 | with open(csv_path, "r", newline="", encoding="ascii") as fp: 105 | reader = clevercsv.reader( 106 | fp, delimiter=",", quotechar="", escapechar="" 107 | ) 108 | rows = list(reader) 109 | 110 | header = rows.pop(0) 111 | rows = [r for i, r in enumerate(rows) if i % SAMPLE == 0]  # every 48th monthly record 112 | 113 | as_dicts = [{h: v for h, v in zip(header, row)} for row in rows] 114 | by_date = { 115 | reformat_time(d["datetime"]): float(d["data_mean_global"]) 116 | for d in as_dicts 117 | } 118 | 119 | # trim off anything before 1600 120 | by_date = {k: v for k, v in by_date.items() if k.split("-")[0] >= "1600"} 121 | 122 | time = sorted(by_date.keys()) 123 | values = [by_date[t] for t in time] 124 | 125 | name = "global_co2" 126 | longname = "Global CO2" 127 | time_fmt = "%Y-%m-%d" 128 | series = [{"label": "Mean", "type": "float", "raw": values}] 129 | 130 | data = { 131 | "name": name, 132 | "longname": longname, 133 | "n_obs": len(values), 134 | "n_dim": len(series), 135 | "time": { 136 | "type": "string", 137 | "format": time_fmt, 138 | "index": list(range(len(time))), 139 | "raw": time, 140 | }, 141 | "series": series, 142 | } 143 | if time is None:  # defensive; time is always a list at this point 144 | del data["time"] 145 | 146 | with open(target_path, "w") as fp: 147 | json.dump(data, fp, indent="\t") 148 | 149 | 150 | def collect(output_dir="."): 151 | csv_path = os.path.join(output_dir, NAME_CSV,) 152 | json_path = os.path.join(output_dir, NAME_JSON) 153 | 154 | get_csv(target_path=csv_path) 155 | write_json(csv_path, target_path=json_path) 156 | 157 | 158 | def clean(output_dir="."): 159 | csv_path = os.path.join(output_dir, NAME_CSV,) 160 | json_path = os.path.join(output_dir, NAME_JSON) 161 | 162 | if os.path.exists(csv_path): 163 | os.unlink(csv_path) 164 | if os.path.exists(json_path): 165 | os.unlink(json_path) 166 | 167 | 168 | def parse_args(): 169 | parser = argparse.ArgumentParser() 170 | parser.add_argument( 171 | "-o", "--output-dir", help="output directory to use", default="." 172 | ) 173 | parser.add_argument( 174 | "action", 175 | choices=["collect", "clean"], 176 | help="Action to perform", 177 | default="collect", 178 | nargs="?", 179 | ) 180 | return parser.parse_args() 181 | 182 | 183 | def main(output_dir="."): 184 | args = parse_args() 185 | if args.action == "collect": 186 | collect(output_dir=args.output_dir) 187 | elif args.action == "clean": 188 | clean(output_dir=args.output_dir) 189 | 190 | 191 | if __name__ == "__main__": 192 | main() 193 |
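The date handling in get_global_co2.py is easy to misread, so here is a worked example of reformat_time (the timestamp is illustrative, but it has the 'DD-MMM-YYYY ...' shape that the split("-") logic requires):

# Sketch: assumes reformat_time from get_global_co2.py above is in scope.
# Zero-padding the month makes the output sort lexicographically, which is
# why sorted(by_date.keys()) yields chronological order.
assert reformat_time("15-Jan-1850 00:00:00") == "1850-01-15"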
-------------------------------------------------------------------------------- /datasets/iceland_tourism/get_iceland_tourism.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Collect the iceland_tourism dataset 6 | 7 | See the README file for more information. 8 | 9 | Author: G.J.J. van den Burg 10 | License: This file is part of TCPD, see the top-level LICENSE file. 11 | Copyright: 2019, The Alan Turing Institute 12 | 13 | """ 14 | 15 | import argparse 16 | import hashlib 17 | import json 18 | import openpyxl 19 | import os 20 | import sys 21 | import time 22 | 23 | from functools import wraps 24 | from urllib.request import urlretrieve 25 | from urllib.error import URLError 26 | 27 | XLSX_URL = "https://web.archive.org/web/20191121170223if_/https://www.ferdamalastofa.is/static/files/ferdamalastofa/Frettamyndir/2019/nov/visitors-to-iceland-2002-2019-oct.xlsx" 28 | 29 | MD5_XLSX = "ec777afd95b01ca901aa00475fc284e5" 30 | MD5_JSON = "8bbac4ca95319a865f2d58ff564f063d" 31 | 32 | NAME_XLSX = "visitors-to-iceland-2002-2019-oct.xlsx" 33 | NAME_JSON = "iceland_tourism.json" 34 | 35 | MONTHS = { 36 | "January": 1, 37 | "February": 2, 38 | "March": 3, 39 | "April": 4, 40 | "May": 5, 41 | "June": 6, 42 | "July": 7, 43 | "August": 8, 44 | "September": 9, 45 | "October": 10, 46 | "November": 11, 47 | "December": 12, 48 | } 49 | 50 | 51 | class ValidationError(Exception): 52 | def __init__(self, filename): 53 | self.message = ( 54 | "Validating the file '%s' failed. \n" 55 | "Please raise an issue on the GitHub page for this project \n" 56 | "if the error persists." % filename 57 | ) 58 | super().__init__(self.message) 59 | 60 | def check_md5sum(filename, checksum): 61 | with open(filename, "rb") as fp: 62 | data = fp.read() 63 | h = hashlib.md5(data).hexdigest() 64 | return h == checksum 65 | 66 | 67 | def validate(checksum): 68 | """Decorator that validates the target file.""" 69 | 70 | def validate_decorator(func): 71 | @wraps(func) 72 | def wrapper(*args, **kwargs): 73 | target = kwargs.get("target_path", None) 74 | if os.path.exists(target) and check_md5sum(target, checksum): 75 | return 76 | out = func(*args, **kwargs) 77 | if not os.path.exists(target): 78 | raise FileNotFoundError("Target file expected at: %s" % target) 79 | if not check_md5sum(target, checksum): 80 | raise ValidationError(target) 81 | return out 82 | 83 | return wrapper 84 | 85 | return validate_decorator 86 | 87 | 88 | @validate(MD5_XLSX) 89 | def download_xlsx(target_path=None): 90 | count = 0 91 | while count < 5: 92 | count += 1 93 | try: 94 | urlretrieve(XLSX_URL, target_path) 95 | return 96 | except URLError as err: 97 | print( 98 | "Error occurred (%r) when trying to download xlsx. Retrying in 5 seconds" 99 | % err, 100 | file=sys.stderr, 101 | ) 102 | time.sleep(5) 103 |
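A quick check of the month formatting used by write_json below (the arguments are illustrative):

# Sketch: assumes format_ym from get_iceland_tourism.py is in scope.
assert format_ym(2003, "March") == "2003-03"  # zero-padded to match "%Y-%m"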
104 | 105 | def format_ym(year, month): 106 | midx = MONTHS[month] 107 | return "%i-%02d" % (int(year), midx) 108 | 109 | 110 | @validate(MD5_JSON) 111 | def write_json(xlsx_path, target_path=None): 112 | wb = openpyxl.load_workbook(xlsx_path) 113 | ws = wb.worksheets[2] 114 | 115 | rows = list(ws.rows) 116 | 117 | # hardcoding these row indices, not worth doing it nicely 118 | header = rows[2] 119 | 120 | column_idx = [ 121 | i 122 | for i, c in enumerate(header) 123 | if c.data_type == "n" and c.value and 2003 <= c.value < 2020 124 | ] 125 | 126 | visitors = [] 127 | 128 | r_offset = 4 129 | for c in column_idx: 130 | for r in range(r_offset, r_offset + 12): 131 | cell = ws.cell(r, c + 1)  # openpyxl rows and columns are 1-indexed 132 | if cell.value is None or str(cell.value) == "": 133 | continue 134 | year = header[c].value 135 | month = ws.cell(r, 1).value 136 | datestr = format_ym(year, month) 137 | # eliminate some observations that were not in the original dataset 138 | if datestr in ["2019-08", "2019-09", "2019-10"]: 139 | continue 140 | item = {"time": datestr, "value": int(cell.value)} 141 | visitors.append(item) 142 | 143 | name = "iceland_tourism" 144 | longname = "Iceland Tourism" 145 | 146 | data = { 147 | "name": name, 148 | "longname": longname, 149 | "n_obs": len(visitors), 150 | "n_dim": 1, 151 | "time": { 152 | "format": "%Y-%m", 153 | "index": list(range(len(visitors))), 154 | "raw": [v["time"] for v in visitors], 155 | }, 156 | "series": [ 157 | { 158 | "label": "Visitor Number", 159 | "type": "int", 160 | "raw": [v["value"] for v in visitors], 161 | } 162 | ], 163 | } 164 | 165 | with open(target_path, "w") as fp: 166 | json.dump(data, fp, indent="\t") 167 | 168 | 169 | def collect(output_dir="."): 170 | xlsx_path = os.path.join(output_dir, NAME_XLSX) 171 | json_path = os.path.join(output_dir, NAME_JSON) 172 | 173 | download_xlsx(target_path=xlsx_path) 174 | write_json(xlsx_path, target_path=json_path) 175 | 176 | 177 | def clean(output_dir="."): 178 | xlsx_path = os.path.join(output_dir, NAME_XLSX) 179 | json_path = os.path.join(output_dir, NAME_JSON) 180 | 181 | if os.path.exists(xlsx_path): 182 | os.unlink(xlsx_path) 183 | if os.path.exists(json_path): 184 | os.unlink(json_path) 185 | 186 | 187 | def parse_args(): 188 | parser = argparse.ArgumentParser() 189 | parser.add_argument( 190 | "-o", "--output-dir", help="output directory to use", default="." 191 | ) 192 | parser.add_argument( 193 | "action", 194 | choices=["collect", "clean"], 195 | help="Action to perform", 196 | default="collect", 197 | nargs="?", 198 | ) 199 | return parser.parse_args() 200 | 201 | 202 | def main(output_dir="."): 203 | args = parse_args() 204 | if args.action == "collect": 205 | collect(output_dir=args.output_dir) 206 | elif args.action == "clean": 207 | clean(output_dir=args.output_dir) 208 | 209 | 210 | if __name__ == "__main__": 211 | main() 212 | --------------------------------------------------------------------------------
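All of the dataset JSON files above share the structure that utils/validate_dataset.py enforces: name, longname, n_obs, n_dim, an optional time block, and a list of series. A minimal loading sketch using only the standard library; this is not the bundled examples/python/load_dataset.py, and "measles.json" is just an example filename:

# Sketch: read a collected TCPD dataset and pair time labels with values.
import json

with open("measles.json") as fp:
    data = json.load(fp)

print("%s: %d observations, %d series" % (data["longname"], data["n_obs"], data["n_dim"]))
t = data.get("time", {})
labels = t.get("raw") or t.get("index") or list(range(data["n_obs"]))
for series in data["series"]:
    # Missing values, if any, arrive as None (JSON null) after parsing.
    print(series["label"], list(zip(labels, series["raw"]))[:3], "...")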