├── datasets ├── Setup.hs ├── src │ └── Numeric │ │ ├── Datasets │ │ ├── Michelson.hs │ │ ├── CO2.hs │ │ ├── Quakes.hs │ │ ├── Vocabulary.hs │ │ ├── Coal.hs │ │ ├── Sunspots.hs │ │ ├── UN.hs │ │ ├── Nightingale.hs │ │ ├── Abalone.hs │ │ ├── Wine.hs │ │ ├── Iris.hs │ │ ├── States.hs │ │ ├── Anscombe.hs │ │ ├── Gapminder.hs │ │ ├── BostonHousing.hs │ │ ├── WineQuality.hs │ │ ├── Car.hs │ │ ├── Adult.hs │ │ ├── BreastCancerWisconsin.hs │ │ └── OldFaithful.hs │ │ └── Datasets.hs ├── datafiles │ ├── michelson.json │ ├── nightingale.json │ └── iris.data ├── changelog.md ├── LICENSE ├── stack.yaml └── datasets.cabal ├── README.md ├── .gitignore └── .travis.yml /datasets/Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This project has moved to https://github.com/filopodia/open/tree/master/datasets 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | dist-* 3 | cabal-dev 4 | *.o 5 | *.hi 6 | *.chi 7 | *.chs.h 8 | *.dyn_o 9 | *.dyn_hi 10 | .hpc 11 | .hsenv 12 | .cabal-sandbox/ 13 | cabal.sandbox.config 14 | *.prof 15 | *.aux 16 | *.hp 17 | *.eventlog 18 | .stack-work/ 19 | cabal.project.local 20 | *~ -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Michelson.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, TemplateHaskell #-} 2 | 3 | {-| 4 | 5 | Michelson's speed of light dataset - five repeated measurements of the speed of light. 6 | 7 | Data from 8 | 9 | The embedded dataset is Copyright (c) 2015 The Compute.io Authors. 
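Since @michelson@ is embedded at compile time, it is a pure value and needs no IO.
A minimal usage sketch (the @mean@ helper below is ad hoc, not part of this package):

> mean :: [Double] -> Double
> mean xs = sum xs / fromIntegral (length xs)
>
> experimentMeans :: [Double]
> experimentMeans = map mean michelson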
10 | 11 | -} 12 | 13 | module Numeric.Datasets.Michelson where 14 | 15 | import Numeric.Datasets 16 | import Data.FileEmbed 17 | import Data.ByteString.Lazy (fromStrict) 18 | 19 | 20 | michelson :: [[Double]] 21 | michelson = readDataset JSON (fromStrict $(embedFile "datafiles/michelson.json")) 22 | -------------------------------------------------------------------------------- /datasets/datafiles/michelson.json: -------------------------------------------------------------------------------- 1 | [[299.85,299.74,299.9,300.07,299.93,299.85,299.95,299.98,299.98,299.88,300,299.98,299.93,299.65,299.76,299.81,300,300,299.96,299.96],[299.96,299.94,299.96,299.94,299.88,299.8,299.85,299.88,299.9,299.84,299.83,299.79,299.81,299.88,299.88,299.83,299.8,299.79,299.76,299.8],[299.88,299.88,299.88,299.86,299.72,299.72,299.62,299.86,299.97,299.95,299.88,299.91,299.85,299.87,299.84,299.84,299.85,299.84,299.84,299.84],[299.89,299.81,299.81,299.82,299.8,299.77,299.76,299.74,299.75,299.76,299.91,299.92,299.89,299.86,299.88,299.72,299.84,299.85,299.85,299.78],[299.89,299.84,299.78,299.81,299.76,299.81,299.79,299.81,299.82,299.85,299.87,299.87,299.81,299.74,299.81,299.94,299.95,299.8,299.81,299.87]] 2 | -------------------------------------------------------------------------------- /datasets/changelog.md: -------------------------------------------------------------------------------- 1 | 0.2.3 2 | 3 | * Coal dataset 4 | 5 | * New internal API 6 | 7 | * Ord instance for IrisClass 8 | 9 | 0.2.2 10 | 11 | * Enum, bounded instances for IrisClass 12 | 13 | * Gapminder dataset 14 | 15 | * Use wreq for HTTP and HTTPS requests 16 | 17 | 0.2.1 18 | 19 | * Wine quality datasets 20 | 21 | * Vocabulary, UN, States datasets 22 | 23 | * CO2, Sunspots and Quakes datasets 24 | 25 | 0.2.0.3 26 | 27 | * Further GHC portability 28 | 29 | 0.2.0.2 30 | 31 | * Improve GHC portability 32 | 33 | 0.2.0.1 34 | 35 | * Bugfix: include embedded data files in cabal extra-source-files 36 | 37 | 0.2 38 | 39 | * iris dataset is a pure value (with file-embed) 40 | 41 | * Michelson, Nightingale and BostonHousing datasets 42 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/CO2.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings #-} 2 | 3 | {-| 4 | 5 | Mauna Loa CO2 time-series 6 | 7 | 8 | Listed as co2 here: http://vincentarelbundock.github.io/Rdatasets/datasets.html 9 | 10 | See 11 | 12 | -} 13 | 14 | module Numeric.Datasets.CO2 where 15 | 16 | import Numeric.Datasets 17 | 18 | import Data.Csv 19 | import GHC.Generics 20 | 21 | data CO2 = CO2 22 | { time :: Double 23 | , co2 :: Double 24 | } deriving (Show, Read, Generic) 25 | 26 | instance FromNamedRecord CO2 27 | 28 | maunaLoaCO2 :: Dataset CO2 29 | maunaLoaCO2 = csvHdrDataset 30 | $ URL "http://vincentarelbundock.github.io/Rdatasets/csv/datasets/co2.csv" 31 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Quakes.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings #-} 2 | 3 | {-| 4 | 5 | Locations of Earthquakes off Fiji 6 | 7 | Listed as quakes here: http://vincentarelbundock.github.io/Rdatasets/datasets.html 8 | 9 | 10 | -} 11 | 12 | module Numeric.Datasets.Quakes where 13 | 14 | import Numeric.Datasets 15 | 16 | import Data.Csv 17 | import GHC.Generics 18 | 19 | data Quake = Quake 20 | { lat :: 
Double 21 | , long :: Double 22 | , depth :: Double 23 | , mag :: Double 24 | , stations :: Int 25 | } deriving (Show, Read, Generic) 26 | 27 | instance FromNamedRecord Quake 28 | 29 | quakes :: Dataset Quake 30 | quakes = csvHdrDataset 31 | $ URL "http://vincentarelbundock.github.io/Rdatasets/csv/datasets/quakes.csv" 32 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Vocabulary.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings #-} 2 | 3 | {-| 4 | 5 | Vocabulary and Education 6 | 7 | 8 | 9 | -} 10 | 11 | module Numeric.Datasets.Vocabulary where 12 | 13 | import Numeric.Datasets 14 | 15 | import Data.Csv 16 | import GHC.Generics 17 | 18 | data Sex = Female | Male 19 | deriving (Show, Read, Eq, Generic) 20 | 21 | instance FromField Sex where 22 | parseField = parseReadField 23 | 24 | data Vocab = Vocab 25 | { year :: Integer 26 | , sex :: Sex 27 | , education :: Int 28 | , vocabulary :: Int 29 | } deriving (Show, Read, Generic) 30 | 31 | instance FromNamedRecord Vocab 32 | 33 | vocab :: Dataset Vocab 34 | vocab = csvHdrDataset 35 | $ URL "http://vincentarelbundock.github.io/Rdatasets/csv/car/Vocab.csv" 36 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Coal.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings #-} 2 | {-# LANGUAGE GADTs, QuasiQuotes, ViewPatterns, FlexibleContexts #-} 3 | 4 | {-| 5 | 6 | Coal data set 7 | 8 | Dates of mining disasters, from the `coal` dataset in the R package `boot`. 9 | 10 | For further information, see 11 | 12 | -} 13 | 14 | module Numeric.Datasets.Coal ( Coal, coal, date ) where 15 | 16 | import Numeric.Datasets 17 | 18 | import Data.Csv 19 | import GHC.Generics 20 | import Control.Applicative 21 | 22 | data Coal = Coal 23 | { date :: Double 24 | } deriving (Show, Read, Generic) 25 | 26 | instance FromRecord Coal where 27 | parseRecord v = Coal <$> v .! 
1 28 | 29 | coal :: Dataset Coal 30 | coal = let src = URL "http://vincentarelbundock.github.io/Rdatasets/csv/boot/coal.csv" 31 | in Dataset src Nothing Nothing $ CSVRecord HasHeader defaultDecodeOptions 32 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Sunspots.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings #-} 2 | 3 | {-| 4 | 5 | Monthly sunspots from 1749 6 | 7 | Listed as sunspot.month here: http://vincentarelbundock.github.io/Rdatasets/datasets.html 8 | 9 | See 10 | 11 | -} 12 | 13 | module Numeric.Datasets.Sunspots where 14 | 15 | import Numeric.Datasets 16 | 17 | import Data.Csv 18 | import GHC.Generics 19 | import Control.Applicative 20 | 21 | data Sunspot = Sunspot 22 | { time :: Double 23 | , sunspotMonth :: Double 24 | } deriving (Show, Read, Generic) 25 | 26 | instance FromNamedRecord Sunspot where 27 | parseNamedRecord m = Sunspot <$> 28 | m .: "time" <*> 29 | m .: "sunspot.month" 30 | 31 | sunspots :: Dataset Sunspot 32 | sunspots = csvHdrDataset 33 | $ URL "http://vincentarelbundock.github.io/Rdatasets/csv/datasets/sunspot.month.csv" 34 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/UN.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings #-} 2 | 3 | {-| 4 | 5 | GDP and infant mortality 6 | 7 | 8 | 9 | -} 10 | 11 | module Numeric.Datasets.UN where 12 | 13 | import Numeric.Datasets 14 | 15 | import Data.Csv 16 | import GHC.Generics 17 | import Control.Applicative 18 | 19 | data GdpMortality = GdpMortality 20 | { country :: String 21 | , infantMortality :: Maybe Int 22 | , gdp :: Maybe Int 23 | } deriving (Show, Read, Generic) 24 | 25 | instance FromNamedRecord GdpMortality where 26 | parseNamedRecord m = GdpMortality <$> 27 | m .: "" <*> 28 | (m .: "infant.mortality" <|> return Nothing) <*> 29 | (m .: "gdp" <|> return Nothing) 30 | 31 | gdpMortalityUN :: Dataset GdpMortality 32 | gdpMortalityUN = csvHdrDataset 33 | $ URL "http://vincentarelbundock.github.io/Rdatasets/csv/car/UN.csv" 34 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Nightingale.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, TemplateHaskell #-} 2 | 3 | {-| 4 | 5 | Florence Nightingale's count of injuries in the Crimean War, used for her rose plots 6 | 7 | Data from 8 | 9 | The embedded dataset is Copyright (c) 2015 The Compute.io Authors. 
10 | 11 | -} 12 | 13 | module Numeric.Datasets.Nightingale where 14 | 15 | import Numeric.Datasets 16 | import Data.FileEmbed 17 | import Data.ByteString.Lazy (fromStrict) 18 | import Data.Aeson hiding (parseJSON) 19 | import Data.Time (UTCTime) 20 | import GHC.Generics 21 | 22 | 23 | data Nightingale = Nightingale 24 | { date :: UTCTime 25 | , army_size :: Int 26 | , disease :: Int 27 | , wounds :: Int 28 | , other :: Int 29 | } deriving (Show, Read, Generic) 30 | 31 | instance FromJSON Nightingale 32 | 33 | nightingale :: [Nightingale] 34 | nightingale = readDataset JSON $ fromStrict $(embedFile "datafiles/nightingale.json") 35 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Abalone.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings #-} 2 | 3 | {-| 4 | 5 | Abalone data set 6 | 7 | UCI ML Repository link 8 | 9 | -} 10 | 11 | module Numeric.Datasets.Abalone where 12 | 13 | import Numeric.Datasets 14 | 15 | import Data.Csv 16 | import GHC.Generics 17 | 18 | data Sex = M | F | I 19 | deriving (Show, Read, Eq, Generic) 20 | 21 | instance FromField Sex where 22 | parseField = parseReadField 23 | 24 | data Abalone = Abalone 25 | { sex :: Sex 26 | , abaloneLength :: Double 27 | , diameter :: Double 28 | , height :: Double 29 | , wholeWeight :: Double 30 | , shuckedWeight :: Double 31 | , visceraWeight :: Double 32 | , shellWeight :: Double 33 | , rings :: Int 34 | } deriving (Show, Read, Generic) 35 | 36 | instance FromRecord Abalone 37 | 38 | abalone :: Dataset Abalone 39 | abalone = csvDataset $ URL "http://mlr.cs.umass.edu/ml/machine-learning-databases/abalone/abalone.data" 40 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Wine.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings #-} 2 | 3 | {-| 4 | 5 | Wine Data set 6 | 7 | UCI ML Repository link 8 | 9 | -} 10 | 11 | module Numeric.Datasets.Wine where 12 | 13 | import Numeric.Datasets 14 | 15 | import Data.Csv 16 | import GHC.Generics 17 | 18 | data Wine = Wine 19 | { wineClass :: Int 20 | , alcohol :: Double 21 | , malicAcid :: Double 22 | , ash :: Double 23 | , ashAlcalinity :: Double 24 | , magnesium :: Double 25 | , totalPhenols :: Double 26 | , flavanoids :: Double 27 | , nonflavanoidPhenols :: Double 28 | , proanthocyanins :: Double 29 | , colorIntensity :: Double 30 | , hue :: Double 31 | , dilutedOD280toOD315 :: Double 32 | , proline :: Int 33 | } deriving (Show, Read, Generic) 34 | 35 | instance FromRecord Wine 36 | 37 | wine :: Dataset Wine 38 | wine = csvDatasetPreprocess 39 | fixAmericanDecimals 40 | $ URL "http://mlr.cs.umass.edu/ml/machine-learning-databases/wine/wine.data" 41 | -------------------------------------------------------------------------------- /datasets/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Tom Nielsen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject 
to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Iris.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, TemplateHaskell #-} 2 | 3 | {-| 4 | 5 | The classical Iris dataset, due to R.A. Fisher. 6 | 7 | UCI ML Repository link 8 | 9 | -} 10 | 11 | module Numeric.Datasets.Iris where 12 | 13 | import Numeric.Datasets 14 | 15 | import Data.Csv 16 | import GHC.Generics 17 | import Data.FileEmbed 18 | import Data.ByteString.Lazy (fromStrict) 19 | 20 | 21 | data IrisClass = Setosa | Versicolor | Virginica 22 | deriving (Show, Read, Eq, Ord, Generic, Enum, Bounded) 23 | 24 | instance FromField IrisClass where 25 | parseField "Iris-setosa" = return Setosa 26 | parseField "Iris-versicolor" = return Versicolor 27 | parseField "Iris-virginica" = return Virginica 28 | parseField _ = fail "unknown iris class" 29 | 30 | data Iris = Iris 31 | { sepalLength :: Double 32 | , sepalWidth :: Double 33 | , petalLength :: Double 34 | , petalWidth :: Double 35 | , irisClass :: IrisClass 36 | } deriving (Show, Read, Generic) 37 | 38 | instance FromRecord Iris 39 | 40 | iris :: [Iris] 41 | iris = readDataset csvRecord (fromStrict $(embedFile "datafiles/iris.data")) 42 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/States.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings #-} 2 | 3 | {-| 4 | 5 | Data on education in US states 6 | 7 | 8 | 9 | -} 10 | 11 | module Numeric.Datasets.States where 12 | 13 | import Numeric.Datasets 14 | 15 | import Data.Csv 16 | import GHC.Generics 17 | import Control.Applicative 18 | 19 | data StateEdu = StateEdu 20 | { state :: String 21 | , region :: String 22 | , population :: Int 23 | , satVerbal :: Int 24 | , satMath :: Int 25 | , satPercent :: Int 26 | , dollarSpend :: Double 27 | , teacherPay :: Int 28 | } deriving (Show, Read, Generic) 29 | 30 | instance FromNamedRecord StateEdu where 31 | parseNamedRecord m = StateEdu <$> 32 | m .: "" <*> 33 | m .: "region" <*> 34 | m .: "pop" <*> 35 | m .: "SATV" <*> 36 | m .: "SATM" <*> 37 | m .: "percent" <*> 38 | m .: "dollars" <*> 39 | m .: "pay" 40 | 41 | states :: Dataset StateEdu 42 | states = csvHdrDataset 43 | $ URL "http://vincentarelbundock.github.io/Rdatasets/csv/car/States.csv" 44 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Anscombe.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings #-} 2 | 3 | {-| 4 | 5 | Anscombe's quartet 6 | 7 | Four datasets with nearly identical statistical properties 8 | 9 | Wikipedia article: 10 | 11 | 
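The point of the quartet is that simple summary statistics barely distinguish the
four datasets. A minimal sketch computing the per-dataset means (the @meanXY@
helper is ad hoc, not part of this package):

> meanXY :: [(Double, Double)] -> (Double, Double)
> meanXY ps = (sum (map fst ps) / n, sum (map snd ps) / n)
>   where n = fromIntegral (length ps)
>
> quartetMeans :: [(Double, Double)]
> quartetMeans = map meanXY anscombe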
-} 12 | 13 | module Numeric.Datasets.Anscombe where 14 | 15 | anscombe :: [[(Double,Double)]] 16 | anscombe = [anscombe1, anscombe2, anscombe3, anscombe4] 17 | 18 | anscombe1, anscombe2, anscombe3, anscombe4 :: [(Double, Double)] 19 | anscombe1 = [ 20 | (10,8.04), 21 | (8,6.95), 22 | (13,7.58), 23 | (9,8.81), 24 | (11,8.33), 25 | (14,9.96), 26 | (6,7.24), 27 | (4,4.26), 28 | (12,10.84), 29 | (7,4.82), 30 | (5,5.68) 31 | ] 32 | 33 | anscombe2 = [ 34 | (10,9.14), 35 | (8,8.14), 36 | (13,8.74), 37 | (9,8.77), 38 | (11,9.26), 39 | (14,8.1), 40 | (6,6.13), 41 | (4,3.1), 42 | (12,9.13), 43 | (7,7.26), 44 | (5,4.74) 45 | ] 46 | 47 | anscombe3 = [ 48 | (10,7.46), 49 | (8,6.77), 50 | (13,12.74), 51 | (9,7.11), 52 | (11,7.81), 53 | (14,8.84), 54 | (6,6.08), 55 | (4,5.39), 56 | (12,8.15), 57 | (7,6.42), 58 | (5,5.73) 59 | ] 60 | 61 | anscombe4 = [ 62 | (8,6.58), 63 | (8,5.76), 64 | (8,7.71), 65 | (8,8.84), 66 | (8,8.47), 67 | (8,7.04), 68 | (8,5.25), 69 | (19,12.5), 70 | (8,5.56), 71 | (8,7.91), 72 | (8,6.89) 73 | ] 74 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Gapminder.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings #-} 2 | 3 | {-| 4 | 5 | Gapminder dataset - Life expectancy, GDP, population every five years per country 6 | 7 | Source: 8 | 9 | More information: https://cran.r-project.org/web/packages/gapminder/gapminder.pdf 10 | 11 | -} 12 | 13 | module Numeric.Datasets.Gapminder where 14 | 15 | import Numeric.Datasets 16 | 17 | import Data.Csv 18 | import GHC.Generics 19 | import Control.Applicative 20 | import Data.Text (Text) 21 | 22 | data Gapminder = Gapminder 23 | { country :: Text 24 | , year :: Int 25 | , pop :: Integer 26 | , continent :: Text 27 | , lifeExp :: Double 28 | , gdpPercap :: Double 29 | } deriving (Show, Read, Generic) 30 | 31 | instance FromNamedRecord Gapminder where 32 | parseNamedRecord m = Gapminder <$> 33 | m .: "country" <*> 34 | m .: "year" <*> 35 | (roundIt <$> m .: "pop") <*> 36 | m .: "continent" <*> 37 | m .: "lifeExp" <*> 38 | m .: "gdpPercap" 39 | where roundIt :: Double -> Integer 40 | roundIt = round 41 | 42 | gapminder :: Dataset Gapminder 43 | gapminder = csvHdrDataset 44 | $ URL "https://raw.githubusercontent.com/plotly/datasets/master/gapminderDataFiveYear.csv" 45 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/BostonHousing.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings #-} 2 | 3 | {-| 4 | 5 | BostonHousing Data set 6 | 7 | scikit-learn calls this "boston" and UCI calls it "Housing" 8 | 9 | UCI ML Repository link 10 | 11 | -} 12 | 13 | module Numeric.Datasets.BostonHousing where 14 | 15 | import Numeric.Datasets 16 | 17 | import Data.Csv 18 | import GHC.Generics 19 | import Control.Applicative 20 | 21 | 22 | data BostonHousing = BostonHousing 23 | { crimeRate :: Double 24 | , zoned :: Double 25 | , industrial :: Double 26 | , charlesRiver :: Bool 27 | , nitricOxides :: Double 28 | , rooms :: Double 29 | , age :: Double 30 | , distance :: Double 31 | , radialHwy :: Double 32 | , tax :: Double 33 | , ptRatio :: Double 34 | , b :: Double 35 | , lowerStatus :: Double 36 | , medianValue :: Double 37 | } deriving (Show, Read, Generic) 38 | 39 | instance FromRecord BostonHousing where 40 | parseRecord v = BostonHousing <$> 41 | v .! 0 <*> 42 | v .! 
1 <*> 43 | v .! 2 <*> 44 | (intToBool <$> v .! 3) <*> 45 | v .! 4 <*> 46 | v .! 5 <*> 47 | v .! 6 <*> 48 | v .! 7 <*> 49 | v .! 8 <*> 50 | v .! 9 <*> 51 | v .! 10 <*> 52 | v .! 11 <*> 53 | v .! 12 <*> 54 | v .! 13 55 | where intToBool :: Int -> Bool 56 | intToBool 0 = False 57 | intToBool 1 = True 58 | intToBool _ = error "intToBool" 59 | bostonHousing :: Dataset BostonHousing 60 | bostonHousing = csvDatasetPreprocess 61 | fixedWidthToCSV 62 | $ URL "http://mlr.cs.umass.edu/ml/machine-learning-databases/housing/housing.data" 63 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/WineQuality.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings #-} 2 | 3 | {-| 4 | 5 | Quality of red and white wines based on physicochemical properties 6 | 7 | See 8 | 9 | -} 10 | 11 | module Numeric.Datasets.WineQuality where 12 | 13 | import Numeric.Datasets 14 | 15 | import Data.Csv 16 | import GHC.Generics 17 | import Control.Applicative 18 | 19 | data WineQuality = WineQuality 20 | { fixedAcidity :: Double 21 | , volatileAcidity :: Double 22 | , citricAcid :: Double 23 | , residualSugar :: Double 24 | , chlorides :: Double 25 | , freeSulfurDioxide :: Double 26 | , totalSulfurDioxide :: Double 27 | , density :: Double 28 | , pH :: Double 29 | , sulphates :: Double 30 | , alcohol :: Double 31 | , quality :: Int 32 | } deriving (Show, Read, Generic) 33 | 34 | instance FromNamedRecord WineQuality where 35 | parseNamedRecord m = WineQuality <$> 36 | m .: "fixed acidity" <*> 37 | m .: "volatile acidity" <*> 38 | m .: "citric acid" <*> 39 | m .: "residual sugar" <*> 40 | m .: "chlorides" <*> 41 | m .: "free sulfur dioxide" <*> 42 | m .: "total sulfur dioxide" <*> 43 | m .: "density" <*> 44 | m .: "pH" <*> 45 | m .: "sulphates" <*> 46 | m .: "alcohol" <*> 47 | m .: "quality" 48 | 49 | redWineQuality, whiteWineQuality :: Dataset WineQuality 50 | redWineQuality = csvHdrDatasetSep ';' 51 | $ URL "http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 52 | 53 | whiteWineQuality = csvHdrDatasetSep ';' 54 | $ URL "http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv" 55 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Car.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings #-} 2 | 3 | {-| 4 | 5 | Car dataset 6 | 7 | UCI ML Repository link 8 | 9 | -} 10 | 11 | module Numeric.Datasets.Car where 12 | 13 | import Numeric.Datasets 14 | 15 | import Data.Csv 16 | import GHC.Generics 17 | import Control.Applicative 18 | 19 | data RelScore = Low | Med | High | VeryHigh deriving (Show, Read, Eq, Generic) 20 | 21 | instance FromField RelScore where 22 | parseField "vhigh" = pure VeryHigh 23 | parseField "high" = pure High 24 | parseField "med" = pure Med 25 | parseField "low" = pure Low 26 | parseField _ = fail "unknown relative score" 27 | 28 | data RelSize = Small | Medium | Big deriving (Show, Read, Eq, Generic) 29 | 30 | instance FromField RelSize where 31 | parseField "small" = pure Small 32 | parseField "med" = pure Medium 33 | parseField "big" = pure Big 34 | parseField _ = fail "unknown relative size" 35 | 36 | data Acceptability = Unacceptable | Acceptable | Good | VeryGood deriving (Show, Read, Eq, Generic) 37 | 38 | instance FromField Acceptability where 39 | 
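  -- car.data encodes the class column as "unacc" / "acc" / "good" / "vgood".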
parseField "unacc" = pure Unacceptable 40 | parseField "acc" = pure Acceptable 41 | parseField "good" = pure Good 42 | parseField "vgood" = pure VeryGood 43 | parseField _ = fail "unknown acceptability" 44 | 45 | data Count = N Int | NOrMore Int | More deriving (Show, Read, Eq, Generic) 46 | 47 | instance FromField Count where 48 | parseField "more" = pure More 49 | parseField "5more" = pure (NOrMore 5) 50 | parseField "2" = pure (N 2) 51 | parseField "3" = pure (N 3) 52 | parseField "4" = pure (N 4) 53 | parseField _ = fail "unknown count" 54 | 55 | data Car = Car 56 | { buying :: RelScore 57 | , maintenance :: RelScore 58 | , doors :: Count 59 | , persons :: Count 60 | , luggageBoot :: RelSize 61 | , safety :: RelScore 62 | , acceptability:: Acceptability 63 | } deriving (Show, Read, Generic) 64 | 65 | instance FromRecord Car 66 | 67 | car :: Dataset Car 68 | car = csvDataset 69 | $ URL "http://mlr.cs.umass.edu/ml/machine-learning-databases/car/car.data" 70 | -------------------------------------------------------------------------------- /datasets/datafiles/nightingale.json: -------------------------------------------------------------------------------- 1 | [{"date":"1854-04-01T07:00:00.000Z","army_size":8571,"disease":1,"wounds":0,"other":5},{"date":"1854-05-01T07:00:00.000Z","army_size":23333,"disease":12,"wounds":0,"other":9},{"date":"1854-06-01T07:00:00.000Z","army_size":28333,"disease":11,"wounds":0,"other":6},{"date":"1854-07-01T07:00:00.000Z","army_size":28722,"disease":359,"wounds":0,"other":23},{"date":"1854-08-01T07:00:00.000Z","army_size":30246,"disease":828,"wounds":1,"other":30},{"date":"1854-09-01T07:00:00.000Z","army_size":30290,"disease":788,"wounds":81,"other":70},{"date":"1854-10-01T07:00:00.000Z","army_size":30643,"disease":503,"wounds":132,"other":128},{"date":"1854-11-01T07:00:00.000Z","army_size":29736,"disease":844,"wounds":287,"other":106},{"date":"1854-12-01T08:00:00.000Z","army_size":32779,"disease":1725,"wounds":114,"other":131},{"date":"1855-01-01T08:00:00.000Z","army_size":32393,"disease":2761,"wounds":83,"other":324},{"date":"1855-02-01T08:00:00.000Z","army_size":30919,"disease":2120,"wounds":42,"other":361},{"date":"1855-03-01T08:00:00.000Z","army_size":30107,"disease":1205,"wounds":32,"other":172},{"date":"1855-04-01T07:00:00.000Z","army_size":32252,"disease":477,"wounds":48,"other":57},{"date":"1855-05-01T07:00:00.000Z","army_size":35473,"disease":508,"wounds":49,"other":37},{"date":"1855-06-01T07:00:00.000Z","army_size":38863,"disease":802,"wounds":209,"other":31},{"date":"1855-07-01T07:00:00.000Z","army_size":42647,"disease":382,"wounds":134,"other":33},{"date":"1855-08-01T07:00:00.000Z","army_size":44614,"disease":483,"wounds":164,"other":25},{"date":"1855-09-01T07:00:00.000Z","army_size":47751,"disease":189,"wounds":276,"other":20},{"date":"1855-10-01T07:00:00.000Z","army_size":46852,"disease":128,"wounds":53,"other":18},{"date":"1855-11-01T07:00:00.000Z","army_size":37853,"disease":178,"wounds":33,"other":32},{"date":"1855-12-01T08:00:00.000Z","army_size":43217,"disease":91,"wounds":18,"other":28},{"date":"1856-01-01T08:00:00.000Z","army_size":44212,"disease":42,"wounds":2,"other":48},{"date":"1856-02-01T08:00:00.000Z","army_size":43485,"disease":24,"wounds":0,"other":19},{"date":"1856-03-01T08:00:00.000Z","army_size":46140,"disease":15,"wounds":0,"other":35}] 2 | -------------------------------------------------------------------------------- /datasets/stack.yaml: -------------------------------------------------------------------------------- 1 | 
# This file was automatically generated by 'stack init' 2 | # 3 | # Some commonly used options have been documented as comments in this file. 4 | # For advanced use and comprehensive documentation of the format, please see: 5 | # http://docs.haskellstack.org/en/stable/yaml_configuration/ 6 | 7 | # Resolver to choose a 'specific' stackage snapshot or a compiler version. 8 | # A snapshot resolver dictates the compiler version and the set of packages 9 | # to be used for project dependencies. For example: 10 | # 11 | # resolver: lts-3.5 12 | # resolver: nightly-2015-09-21 13 | # resolver: ghc-7.10.2 14 | # resolver: ghcjs-0.1.0_ghc-7.10.2 15 | # resolver: 16 | # name: custom-snapshot 17 | # location: "./custom-snapshot.yaml" 18 | resolver: lts-6.2 19 | 20 | # User packages to be built. 21 | # Various formats can be used as shown in the example below. 22 | # 23 | # packages: 24 | # - some-directory 25 | # - https://example.com/foo/bar/baz-0.0.2.tar.gz 26 | # - location: 27 | # git: https://github.com/commercialhaskell/stack.git 28 | # commit: e7b331f14bcffb8367cd58fbfc8b40ec7642100a 29 | # - location: https://github.com/commercialhaskell/stack/commit/e7b331f14bcffb8367cd58fbfc8b40ec7642100a 30 | # extra-dep: true 31 | # subdirs: 32 | # - auto-update 33 | # - wai 34 | # 35 | # A package marked 'extra-dep: true' will only be built if demanded by a 36 | # non-dependency (i.e. a user package), and its test suites and benchmarks 37 | # will not be run. This is useful for tweaking upstream packages. 38 | packages: 39 | - '.' 40 | 41 | # Dependency packages to be pulled from upstream that are not in the resolver 42 | # (e.g., acme-missiles-0.3) 43 | extra-deps: [] 44 | 45 | # Override default flag values for local packages and extra-deps 46 | flags: {} 47 | 48 | # Extra package databases containing global packages 49 | extra-package-dbs: [] 50 | 51 | # Control whether we use the GHC we find on the path 52 | # system-ghc: true 53 | # 54 | # Require a specific version of stack, using version ranges 55 | # require-stack-version: -any # Default 56 | # require-stack-version: ">=1.1" 57 | # 58 | # Override the architecture used by stack, especially useful on Windows 59 | # arch: i386 60 | # arch: x86_64 61 | # 62 | # Extra directories used by stack for building 63 | # extra-include-dirs: [/path/to/dir] 64 | # extra-lib-dirs: [/path/to/dir] 65 | # 66 | # Allow a newer minor version of GHC than the snapshot specifies 67 | # compiler-check: newer-minor -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Adult.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings #-} 2 | 3 | {-| 4 | 5 | Adult (AKA Census Income) dataset. 
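This dataset is fetched over the network rather than embedded; a minimal usage
sketch, assuming @getDataset@ from "Numeric.Datasets" is in scope:

> main :: IO ()
> main = do
>   rows <- getDataset adult
>   print (length rows)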
6 | 7 | UCI ML Repository link 8 | 9 | -} 10 | 11 | module Numeric.Datasets.Adult where 12 | 13 | import Numeric.Datasets 14 | 15 | import Data.Csv 16 | import GHC.Generics 17 | import Control.Applicative 18 | import Data.Text (Text, strip) 19 | 20 | data WorkClass = Private | SelfEmpNotInc | SelfEmpInc | FederalGov 21 | | LocalGov | StateGov | WithoutPay | NeverWorked 22 | deriving (Show, Read, Eq, Generic) 23 | 24 | instance FromField WorkClass where 25 | parseField = parseDashToCamelField 26 | 27 | 28 | data MaritalStatus = MarriedCivSpouse | Divorced | NeverMarried 29 | | Separated | Widowed | MarriedSpouseAbsent | MarriedAFSpouse 30 | deriving (Show, Read, Eq, Generic) 31 | 32 | instance FromField MaritalStatus where 33 | -- parseField "Married-AF-spouse" = pure MarriedAFSpouse 34 | parseField s = parseDashToCamelField s 35 | 36 | data Occupation = TechSupport | CraftRepair | OtherService | Sales | ExecManagerial | ProfSpecialty 37 | | HandlersCleaners | MachineOpInspct | AdmClerical | FarmingFishing | TransportMoving 38 | | PrivHouseServ | ProtectiveServ | ArmedForces 39 | deriving (Show, Read, Eq, Generic) 40 | 41 | instance FromField Occupation where 42 | -- parseField "ArmedForces" = pure ArmedForces 43 | parseField s = parseDashToCamelField s 44 | 45 | data Relationship = Wife | OwnChild | Husband | NotInFamily | OtherRelative | Unmarried 46 | deriving (Show, Read, Eq, Generic) 47 | 48 | instance FromField Relationship where 49 | parseField s = parseDashToCamelField s 50 | 51 | data Race = White | AsianPacIslander | AmerIndianEskimo | Other | Black 52 | deriving (Show, Read, Eq, Generic) 53 | 54 | instance FromField Race where 55 | parseField s = parseDashToCamelField s 56 | 57 | data Sex = Female | Male 58 | deriving (Show, Read, Eq, Generic) 59 | 60 | instance FromField Sex where 61 | parseField s = parseDashToCamelField s 62 | 63 | data Income = GT50K | LE50K 64 | deriving (Show, Read, Eq, Generic) 65 | 66 | instance FromField Income where 67 | parseField " >50K" = pure GT50K 68 | parseField " <=50K" = pure LE50K 69 | parseField " >50K." = pure GT50K 70 | parseField " <=50K." = pure LE50K 71 | parseField ">50K" = pure GT50K 72 | parseField "<=50K" = pure LE50K 73 | parseField _ = fail "unknown income" 74 | 75 | data Adult = Adult 76 | { age :: Int 77 | , workClass :: Maybe WorkClass 78 | , finalWeight :: Int 79 | , education :: Text 80 | , educationNum :: Int 81 | , maritalStatus :: MaritalStatus 82 | , occupation :: Maybe Occupation 83 | , relationship :: Relationship 84 | , race :: Race 85 | , sex :: Sex 86 | , capitalGain :: Int 87 | , capitalLoss :: Int 88 | , hoursPerWeek :: Int 89 | , nativeCountry :: Text 90 | , income :: Income 91 | } deriving (Show, Read, Generic) 92 | 93 | instance FromRecord Adult where 94 | parseRecord v = Adult <$> v .! 0 <*> (v.! 
1 <|> return Nothing) <*> v.!2 <*> (strip <$> v.!3) 95 | <*> v.!4 <*> v.!5<*> (v.!6 <|> return Nothing) <*> v.!7 <*> v.!8 96 | <*> v.!9 <*> v.!10 <*> v.!11 <*> v.!12<*> v.!13<*> v.!14 97 | 98 | adult :: Dataset Adult 99 | adult = csvDataset $ URL "http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.data" 100 | 101 | adultTestSet :: Dataset Adult 102 | adultTestSet = csvDatasetPreprocess (dropLines 1) $ URL "http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.test" 103 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/BreastCancerWisconsin.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings #-} 2 | 3 | {-| 4 | 5 | Breast Cancer Wisconsin (Diagnostic) Data Set 6 | 7 | Repository link: 8 | 9 | -} 10 | 11 | module Numeric.Datasets.BreastCancerWisconsin where 12 | 13 | import Numeric.Datasets 14 | 15 | import Data.Csv 16 | import GHC.Generics 17 | import Control.Applicative 18 | 19 | 20 | data Diagnosis = Malignant | Benign deriving (Show, Read, Eq, Generic) 21 | 22 | data Prognosis = Recurrent | Nonrecurrent deriving (Show, Read, Eq, Generic) 23 | 24 | intToDiagnosis :: Int -> Diagnosis 25 | intToDiagnosis 2 = Benign 26 | intToDiagnosis 4 = Malignant 27 | intToDiagnosis _ = error "unknown diagnosis code" 28 | 29 | data BreastCancerEntry = BreastCancerEntry 30 | { sampleCodeNumber :: Int 31 | , clumpThickness :: Int 32 | , uniformityCellSize :: Int 33 | , uniformityCellShape :: Int 34 | , marginalAdhesion :: Int 35 | , singleEpithelialCellSize :: Int 36 | , bareNuclei :: Maybe Int 37 | , blandChromatin :: Int 38 | , normalNucleoli :: Int 39 | , mitosis :: Int 40 | , sampleClass :: Diagnosis 41 | } deriving (Show, Read, Generic) 42 | 43 | instance FromRecord BreastCancerEntry where 44 | parseRecord v = BreastCancerEntry <$> v .! 0 <*> v .! 1 <*> v .! 2 <*> v .! 3 <*> v .! 4 <*> v .! 5 <*> (v .! 6 <|> return Nothing) <*> v .! 7 <*> v .! 8 <*> v .! 9 <*> (intToDiagnosis <$> v .! 10) 45 | 46 | breastCancerDatabase :: Dataset BreastCancerEntry 47 | breastCancerDatabase = csvDataset 48 | $ URL "http://mlr.cs.umass.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data" 49 | 50 | data DiagnosticBreastCancer = DiagnosticBreastCancer 51 | { diagnosticID :: Int 52 | , diagnosis :: Diagnosis 53 | , diagnosticCells :: CellFeatures 54 | } deriving (Show, Read, Generic) 55 | 56 | data PrognosticBreastCancer = PrognosticBreastCancer 57 | { prognosticID :: Int 58 | , prognosis :: Prognosis 59 | , prognosticCells :: CellFeatures 60 | } deriving (Show, Read, Generic) 61 | 62 | data CellFeatures = CellFeatures 63 | { radius :: Double 64 | , perimeter :: Double 65 | , area :: Double 66 | , smoothness :: Double 67 | , compactness :: Double 68 | , concavity :: Double 69 | , concavePoints :: Double 70 | , symmetry :: Double 71 | , fractalDimension :: Double 72 | } deriving (Show, Read, Generic) 73 | 74 | charToDiagnosis :: String -> Diagnosis 75 | charToDiagnosis "M" = Malignant 76 | charToDiagnosis "B" = Benign 77 | charToDiagnosis _ = error "unknown diagnosis" 78 | 79 | charToPrognosis :: String -> Prognosis 80 | charToPrognosis "N" = Nonrecurrent 81 | charToPrognosis "R" = Recurrent 82 | charToPrognosis _ = error "unknown diagnosis" 83 | 84 | instance FromRecord DiagnosticBreastCancer where 85 | parseRecord v = DiagnosticBreastCancer <$> v .! 0 <*> (charToDiagnosis <$> v .! 
1) <*> parseRecord v 86 | 87 | instance FromRecord PrognosticBreastCancer where 88 | parseRecord v = PrognosticBreastCancer <$> v .! 0 <*> (charToPrognosis <$> v .! 1) <*> parseRecord v 89 | 90 | instance FromRecord CellFeatures where 91 | parseRecord v = CellFeatures <$> v .! 2 <*> v .! 3 <*> v .! 4 <*> v .! 5 <*> v .! 6 <*> v .! 7 <*> v .! 8 <*> v .! 9 <*> v .! 10 92 | 93 | diagnosticBreastCancer :: Dataset DiagnosticBreastCancer 94 | diagnosticBreastCancer = csvDataset 95 | $ URL "http://mlr.cs.umass.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data" 96 | 97 | prognosticBreastCancer :: Dataset PrognosticBreastCancer 98 | prognosticBreastCancer = csvDataset 99 | $ URL "http://mlr.cs.umass.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wpbc.data" 100 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # This Travis job script has been generated by a script via 2 | # 3 | # make_travis_yml_2.hs 'datasets.cabal' 4 | # 5 | # For more information, see https://github.com/hvr/multi-ghc-travis 6 | # 7 | language: c 8 | sudo: false 9 | 10 | git: 11 | submodules: false # whether to recursively clone submodules 12 | 13 | cache: 14 | directories: 15 | - $HOME/.cabal/packages 16 | - $HOME/.cabal/store 17 | 18 | before_cache: 19 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/build-reports.log 20 | # remove files that are regenerated by 'cabal update' 21 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/00-index.* 22 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/*.json 23 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/01-index.cache 24 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/01-index.tar 25 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/01-index.tar.idx 26 | 27 | matrix: 28 | include: 29 | - compiler: "ghc-7.6.3" 30 | # env: TEST=--disable-tests BENCH=--disable-benchmarks 31 | addons: {apt: {packages: [ghc-ppa-tools,cabal-install-head,ghc-7.6.3], sources: [hvr-ghc]}} 32 | - compiler: "ghc-7.8.4" 33 | # env: TEST=--disable-tests BENCH=--disable-benchmarks 34 | addons: {apt: {packages: [ghc-ppa-tools,cabal-install-head,ghc-7.8.4], sources: [hvr-ghc]}} 35 | - compiler: "ghc-7.10.2" 36 | # env: TEST=--disable-tests BENCH=--disable-benchmarks 37 | addons: {apt: {packages: [ghc-ppa-tools,cabal-install-head,ghc-7.10.2], sources: [hvr-ghc]}} 38 | - compiler: "ghc-7.10.3" 39 | # env: TEST=--disable-tests BENCH=--disable-benchmarks 40 | addons: {apt: {packages: [ghc-ppa-tools,cabal-install-head,ghc-7.10.3], sources: [hvr-ghc]}} 41 | - compiler: "ghc-8.0.1" 42 | # env: TEST=--disable-tests BENCH=--disable-benchmarks 43 | addons: {apt: {packages: [ghc-ppa-tools,cabal-install-head,ghc-8.0.1], sources: [hvr-ghc]}} 44 | 45 | before_install: 46 | - HC=${CC} 47 | - unset CC 48 | - PATH=/opt/ghc/bin:/opt/ghc-ppa-tools/bin:$PATH 49 | - PKGNAME='datasets' 50 | 51 | install: 52 | - cabal --version 53 | - echo "$(${HC} --version) [$(${HC} --print-project-git-commit-id 2> /dev/null || echo '?')]" 54 | - BENCH=${BENCH---enable-benchmarks} 55 | - TEST=${TEST---enable-tests} 56 | - travis_retry cabal update -v 57 | - sed -i 's/^jobs:/-- jobs:/' ${HOME}/.cabal/config 58 | - rm -fv cabal.project.local 59 | - "echo 'packages: .' 
> cabal.project" 60 | - rm -f cabal.project.freeze 61 | - cabal new-build -w ${HC} ${TEST} ${BENCH} --dep -j2 62 | - cabal new-build -w ${HC} --disable-tests --disable-benchmarks --dep -j2 63 | 64 | # Here starts the actual work to be performed for the package under test; 65 | # any command which exits with a non-zero exit code causes the build to fail. 66 | script: 67 | - if [ -f configure.ac ]; then autoreconf -i; fi 68 | - rm -rf dist/ 69 | - cabal sdist # test that a source-distribution can be generated 70 | - cd dist/ 71 | - SRCTAR=(${PKGNAME}-*.tar.gz) 72 | - SRC_BASENAME="${SRCTAR/%.tar.gz}" 73 | - tar -xvf "./$SRC_BASENAME.tar.gz" 74 | - cd "$SRC_BASENAME/" 75 | ## from here on, CWD is inside the extracted source-tarball 76 | - rm -fv cabal.project.local 77 | - "echo 'packages: .' > cabal.project" 78 | # this builds all libraries and executables (without tests/benchmarks) 79 | - rm -f cabal.project.freeze 80 | - cabal new-build -w ${HC} --disable-tests --disable-benchmarks 81 | # this builds all libraries and executables (including tests/benchmarks) 82 | # - rm -rf ./dist-newstyle 83 | - cabal new-build -w ${HC} ${TEST} ${BENCH} 84 | 85 | # there's no 'cabal new-test' yet, so let's emulate for now 86 | - TESTS=( $(awk 'tolower($0) ~ /^test-suite / { print $2 }' *.cabal) ) 87 | - if [ "$TEST" != "--enable-tests" ]; then TESTS=(); fi 88 | - shopt -s globstar; 89 | RC=true; for T in ${TESTS[@]}; do echo "== $T =="; 90 | if dist-newstyle/build/**/$SRC_BASENAME/**/build/$T/$T; then echo "= $T OK ="; 91 | else echo "= $T FAILED ="; RC=false; fi; done; $RC 92 | 93 | # EOF 94 | -------------------------------------------------------------------------------- /datasets/datasets.cabal: -------------------------------------------------------------------------------- 1 | Name: datasets 2 | Version: 0.2.3 3 | Synopsis: Classical data sets for statistics and machine learning 4 | Description: Classical machine learning and statistics datasets from 5 | the UCI Machine Learning Repository and other sources. 6 | . 7 | The datasets package defines two different kinds of datasets: 8 | . 9 | * small data sets which are directly (or indirectly with `file-embed`) 10 | embedded in the package as pure values and do not require network or IO to download 11 | the data set. This includes Iris, Anscombe and OldFaithful. 12 | . 13 | * other data sets which need to be fetched over the network with 14 | `Numeric.Datasets.getDataset` and are cached in a local temporary directory. 15 | . 
16 | > import Numeric.Datasets (getDataset) 17 | > import Numeric.Datasets.Iris (iris) 18 | > import Numeric.Datasets.Abalone (abalone) 19 | > 20 | > main = do 21 | > -- The Iris data set is embedded 22 | > print (length iris) 23 | > print (head iris) 24 | > -- The Abalone dataset is fetched 25 | > abas <- getDataset abalone 26 | > print (length abas) 27 | > print (head abas) 28 | 29 | License: MIT 30 | License-file: LICENSE 31 | Author: Tom Nielsen 32 | Maintainer: Tom Nielsen 33 | build-type: Simple 34 | Cabal-Version: >= 1.8 35 | homepage: https://github.com/glutamate/datasets 36 | bug-reports: https://github.com/glutamate/datasets/issues 37 | category: Statistics, Machine Learning, Data Mining, Data 38 | Tested-With: GHC == 7.6.3, GHC == 7.8.4, GHC == 7.10.2, GHC == 7.10.3, GHC == 8.0.1 39 | extra-source-files: 40 | changelog.md 41 | datafiles/iris.data 42 | datafiles/michelson.json 43 | datafiles/nightingale.json 44 | 45 | source-repository head 46 | type: git 47 | location: https://github.com/glutamate/datasets 48 | 49 | Library 50 | ghc-options: -Wall 51 | hs-source-dirs: src 52 | other-extensions: TemplateHaskell 53 | Exposed-modules: 54 | Numeric.Datasets 55 | , Numeric.Datasets.Anscombe 56 | , Numeric.Datasets.BostonHousing 57 | , Numeric.Datasets.OldFaithful 58 | , Numeric.Datasets.Abalone 59 | , Numeric.Datasets.Adult 60 | , Numeric.Datasets.BreastCancerWisconsin 61 | , Numeric.Datasets.Car 62 | , Numeric.Datasets.Coal 63 | , Numeric.Datasets.CO2 64 | , Numeric.Datasets.Gapminder 65 | , Numeric.Datasets.Iris 66 | , Numeric.Datasets.Michelson 67 | , Numeric.Datasets.Nightingale 68 | , Numeric.Datasets.Quakes 69 | , Numeric.Datasets.States 70 | , Numeric.Datasets.Sunspots 71 | , Numeric.Datasets.UN 72 | , Numeric.Datasets.Vocabulary 73 | , Numeric.Datasets.Wine 74 | , Numeric.Datasets.WineQuality 75 | Build-depends: 76 | base >= 4.6 && < 5 77 | , cassava 78 | , wreq 79 | , hashable 80 | , filepath 81 | , bytestring 82 | , directory 83 | , vector 84 | , text 85 | , stringsearch 86 | , file-embed 87 | , aeson 88 | , time 89 | , microlens -------------------------------------------------------------------------------- /datasets/datafiles/iris.data: -------------------------------------------------------------------------------- 1 | 5.1,3.5,1.4,0.2,Iris-setosa 2 | 4.9,3.0,1.4,0.2,Iris-setosa 3 | 4.7,3.2,1.3,0.2,Iris-setosa 4 | 4.6,3.1,1.5,0.2,Iris-setosa 5 | 5.0,3.6,1.4,0.2,Iris-setosa 6 | 5.4,3.9,1.7,0.4,Iris-setosa 7 | 4.6,3.4,1.4,0.3,Iris-setosa 8 | 5.0,3.4,1.5,0.2,Iris-setosa 9 | 4.4,2.9,1.4,0.2,Iris-setosa 10 | 4.9,3.1,1.5,0.1,Iris-setosa 11 | 5.4,3.7,1.5,0.2,Iris-setosa 12 | 4.8,3.4,1.6,0.2,Iris-setosa 13 | 4.8,3.0,1.4,0.1,Iris-setosa 14 | 4.3,3.0,1.1,0.1,Iris-setosa 15 | 5.8,4.0,1.2,0.2,Iris-setosa 16 | 5.7,4.4,1.5,0.4,Iris-setosa 17 | 5.4,3.9,1.3,0.4,Iris-setosa 18 | 5.1,3.5,1.4,0.3,Iris-setosa 19 | 5.7,3.8,1.7,0.3,Iris-setosa 20 | 5.1,3.8,1.5,0.3,Iris-setosa 21 | 5.4,3.4,1.7,0.2,Iris-setosa 22 | 5.1,3.7,1.5,0.4,Iris-setosa 23 | 4.6,3.6,1.0,0.2,Iris-setosa 24 | 5.1,3.3,1.7,0.5,Iris-setosa 25 | 4.8,3.4,1.9,0.2,Iris-setosa 26 | 5.0,3.0,1.6,0.2,Iris-setosa 27 | 5.0,3.4,1.6,0.4,Iris-setosa 28 | 5.2,3.5,1.5,0.2,Iris-setosa 29 | 5.2,3.4,1.4,0.2,Iris-setosa 30 | 4.7,3.2,1.6,0.2,Iris-setosa 31 | 4.8,3.1,1.6,0.2,Iris-setosa 32 | 5.4,3.4,1.5,0.4,Iris-setosa 33 | 5.2,4.1,1.5,0.1,Iris-setosa 34 | 5.5,4.2,1.4,0.2,Iris-setosa 35 | 4.9,3.1,1.5,0.1,Iris-setosa 36 | 5.0,3.2,1.2,0.2,Iris-setosa 37 | 5.5,3.5,1.3,0.2,Iris-setosa 38 | 4.9,3.1,1.5,0.1,Iris-setosa 39 | 4.4,3.0,1.3,0.2,Iris-setosa 
40 | 5.1,3.4,1.5,0.2,Iris-setosa 41 | 5.0,3.5,1.3,0.3,Iris-setosa 42 | 4.5,2.3,1.3,0.3,Iris-setosa 43 | 4.4,3.2,1.3,0.2,Iris-setosa 44 | 5.0,3.5,1.6,0.6,Iris-setosa 45 | 5.1,3.8,1.9,0.4,Iris-setosa 46 | 4.8,3.0,1.4,0.3,Iris-setosa 47 | 5.1,3.8,1.6,0.2,Iris-setosa 48 | 4.6,3.2,1.4,0.2,Iris-setosa 49 | 5.3,3.7,1.5,0.2,Iris-setosa 50 | 5.0,3.3,1.4,0.2,Iris-setosa 51 | 7.0,3.2,4.7,1.4,Iris-versicolor 52 | 6.4,3.2,4.5,1.5,Iris-versicolor 53 | 6.9,3.1,4.9,1.5,Iris-versicolor 54 | 5.5,2.3,4.0,1.3,Iris-versicolor 55 | 6.5,2.8,4.6,1.5,Iris-versicolor 56 | 5.7,2.8,4.5,1.3,Iris-versicolor 57 | 6.3,3.3,4.7,1.6,Iris-versicolor 58 | 4.9,2.4,3.3,1.0,Iris-versicolor 59 | 6.6,2.9,4.6,1.3,Iris-versicolor 60 | 5.2,2.7,3.9,1.4,Iris-versicolor 61 | 5.0,2.0,3.5,1.0,Iris-versicolor 62 | 5.9,3.0,4.2,1.5,Iris-versicolor 63 | 6.0,2.2,4.0,1.0,Iris-versicolor 64 | 6.1,2.9,4.7,1.4,Iris-versicolor 65 | 5.6,2.9,3.6,1.3,Iris-versicolor 66 | 6.7,3.1,4.4,1.4,Iris-versicolor 67 | 5.6,3.0,4.5,1.5,Iris-versicolor 68 | 5.8,2.7,4.1,1.0,Iris-versicolor 69 | 6.2,2.2,4.5,1.5,Iris-versicolor 70 | 5.6,2.5,3.9,1.1,Iris-versicolor 71 | 5.9,3.2,4.8,1.8,Iris-versicolor 72 | 6.1,2.8,4.0,1.3,Iris-versicolor 73 | 6.3,2.5,4.9,1.5,Iris-versicolor 74 | 6.1,2.8,4.7,1.2,Iris-versicolor 75 | 6.4,2.9,4.3,1.3,Iris-versicolor 76 | 6.6,3.0,4.4,1.4,Iris-versicolor 77 | 6.8,2.8,4.8,1.4,Iris-versicolor 78 | 6.7,3.0,5.0,1.7,Iris-versicolor 79 | 6.0,2.9,4.5,1.5,Iris-versicolor 80 | 5.7,2.6,3.5,1.0,Iris-versicolor 81 | 5.5,2.4,3.8,1.1,Iris-versicolor 82 | 5.5,2.4,3.7,1.0,Iris-versicolor 83 | 5.8,2.7,3.9,1.2,Iris-versicolor 84 | 6.0,2.7,5.1,1.6,Iris-versicolor 85 | 5.4,3.0,4.5,1.5,Iris-versicolor 86 | 6.0,3.4,4.5,1.6,Iris-versicolor 87 | 6.7,3.1,4.7,1.5,Iris-versicolor 88 | 6.3,2.3,4.4,1.3,Iris-versicolor 89 | 5.6,3.0,4.1,1.3,Iris-versicolor 90 | 5.5,2.5,4.0,1.3,Iris-versicolor 91 | 5.5,2.6,4.4,1.2,Iris-versicolor 92 | 6.1,3.0,4.6,1.4,Iris-versicolor 93 | 5.8,2.6,4.0,1.2,Iris-versicolor 94 | 5.0,2.3,3.3,1.0,Iris-versicolor 95 | 5.6,2.7,4.2,1.3,Iris-versicolor 96 | 5.7,3.0,4.2,1.2,Iris-versicolor 97 | 5.7,2.9,4.2,1.3,Iris-versicolor 98 | 6.2,2.9,4.3,1.3,Iris-versicolor 99 | 5.1,2.5,3.0,1.1,Iris-versicolor 100 | 5.7,2.8,4.1,1.3,Iris-versicolor 101 | 6.3,3.3,6.0,2.5,Iris-virginica 102 | 5.8,2.7,5.1,1.9,Iris-virginica 103 | 7.1,3.0,5.9,2.1,Iris-virginica 104 | 6.3,2.9,5.6,1.8,Iris-virginica 105 | 6.5,3.0,5.8,2.2,Iris-virginica 106 | 7.6,3.0,6.6,2.1,Iris-virginica 107 | 4.9,2.5,4.5,1.7,Iris-virginica 108 | 7.3,2.9,6.3,1.8,Iris-virginica 109 | 6.7,2.5,5.8,1.8,Iris-virginica 110 | 7.2,3.6,6.1,2.5,Iris-virginica 111 | 6.5,3.2,5.1,2.0,Iris-virginica 112 | 6.4,2.7,5.3,1.9,Iris-virginica 113 | 6.8,3.0,5.5,2.1,Iris-virginica 114 | 5.7,2.5,5.0,2.0,Iris-virginica 115 | 5.8,2.8,5.1,2.4,Iris-virginica 116 | 6.4,3.2,5.3,2.3,Iris-virginica 117 | 6.5,3.0,5.5,1.8,Iris-virginica 118 | 7.7,3.8,6.7,2.2,Iris-virginica 119 | 7.7,2.6,6.9,2.3,Iris-virginica 120 | 6.0,2.2,5.0,1.5,Iris-virginica 121 | 6.9,3.2,5.7,2.3,Iris-virginica 122 | 5.6,2.8,4.9,2.0,Iris-virginica 123 | 7.7,2.8,6.7,2.0,Iris-virginica 124 | 6.3,2.7,4.9,1.8,Iris-virginica 125 | 6.7,3.3,5.7,2.1,Iris-virginica 126 | 7.2,3.2,6.0,1.8,Iris-virginica 127 | 6.2,2.8,4.8,1.8,Iris-virginica 128 | 6.1,3.0,4.9,1.8,Iris-virginica 129 | 6.4,2.8,5.6,2.1,Iris-virginica 130 | 7.2,3.0,5.8,1.6,Iris-virginica 131 | 7.4,2.8,6.1,1.9,Iris-virginica 132 | 7.9,3.8,6.4,2.0,Iris-virginica 133 | 6.4,2.8,5.6,2.2,Iris-virginica 134 | 6.3,2.8,5.1,1.5,Iris-virginica 135 | 6.1,2.6,5.6,1.4,Iris-virginica 136 | 7.7,3.0,6.1,2.3,Iris-virginica 137 | 
6.3,3.4,5.6,2.4,Iris-virginica 138 | 6.4,3.1,5.5,1.8,Iris-virginica 139 | 6.0,3.0,4.8,1.8,Iris-virginica 140 | 6.9,3.1,5.4,2.1,Iris-virginica 141 | 6.7,3.1,5.6,2.4,Iris-virginica 142 | 6.9,3.1,5.1,2.3,Iris-virginica 143 | 5.8,2.7,5.1,1.9,Iris-virginica 144 | 6.8,3.2,5.9,2.3,Iris-virginica 145 | 6.7,3.3,5.7,2.5,Iris-virginica 146 | 6.7,3.0,5.2,2.3,Iris-virginica 147 | 6.3,2.5,5.0,1.9,Iris-virginica 148 | 6.5,3.0,5.2,2.0,Iris-virginica 149 | 6.2,3.4,5.4,2.3,Iris-virginica 150 | 5.9,3.0,5.1,1.8,Iris-virginica 151 | 152 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/OldFaithful.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings #-} 2 | 3 | {-| 4 | 5 | Old Faithful Geyser Eruption data 6 | 7 | Article: http://web.pdx.edu/~jfreder/M212/oldfaithful.pdf 8 | 9 | These data from: 10 | 11 | For more data, see 12 | 13 | -} 14 | 15 | module Numeric.Datasets.OldFaithful where 16 | 17 | data OldFaithful = OF 18 | { recordingDay :: Int -- ^ Recording day. 1-8: 1978, 16-23: 1979 19 | , waiting :: Double -- ^ waiting time until next eruption 20 | , duration :: Double -- ^ duration of eruption in minutes 21 | } deriving Show 22 | 23 | -- The first comp 24 | oldFaithful :: [OldFaithful] 25 | oldFaithful = 26 | [ OF 1 78 4.4 27 | , OF 1 74 3.9 28 | , OF 1 68 4.0 29 | , OF 1 76 4.0 30 | , OF 1 80 3.5 31 | , OF 1 84 4.1 32 | , OF 1 50 2.3 33 | , OF 1 93 4.7 34 | , OF 1 55 1.7 35 | , OF 1 76 4.9 36 | , OF 1 58 1.7 37 | , OF 1 74 4.6 38 | , OF 1 75 3.4 39 | , OF 2 80 4.3 40 | , OF 2 56 1.7 41 | , OF 2 80 3.9 42 | , OF 2 69 3.7 43 | , OF 2 57 3.1 44 | , OF 2 90 4.0 45 | , OF 2 42 1.8 46 | , OF 2 91 4.1 47 | , OF 2 51 1.8 48 | , OF 2 79 3.2 49 | , OF 2 53 1.9 50 | , OF 2 82 4.6 51 | , OF 2 51 2.0 52 | , OF 3 76 4.5 53 | , OF 3 82 3.9 54 | , OF 3 84 4.3 55 | , OF 3 53 2.3 56 | , OF 3 86 3.8 57 | , OF 3 51 1.9 58 | , OF 3 85 4.6 59 | , OF 3 45 1.8 60 | , OF 3 88 4.7 61 | , OF 3 51 1.8 62 | , OF 3 80 4.6 63 | , OF 3 49 1.9 64 | , OF 3 82 3.5 65 | , OF 4 75 4.0 66 | , OF 4 73 3.7 67 | , OF 4 67 3.7 68 | , OF 4 68 4.3 69 | , OF 4 86 3.6 70 | , OF 4 72 3.8 71 | , OF 4 75 3.8 72 | , OF 4 75 3.8 73 | , OF 4 66 2.5 74 | , OF 4 84 4.5 75 | , OF 4 70 4.1 76 | , OF 4 79 3.7 77 | , OF 4 60 3.8 78 | , OF 4 86 3.4 79 | , OF 5 71 4.0 80 | , OF 5 67 2.3 81 | , OF 5 81 4.4 82 | , OF 5 76 4.1 83 | , OF 5 83 4.3 84 | , OF 5 76 3.3 85 | , OF 5 55 2.0 86 | , OF 5 73 4.3 87 | , OF 5 56 2.9 88 | , OF 5 83 4.6 89 | , OF 5 57 1.9 90 | , OF 5 71 3.6 91 | , OF 5 72 3.7 92 | , OF 5 77 3.7 93 | , OF 6 55 1.8 94 | , OF 6 75 4.6 95 | , OF 6 73 3.5 96 | , OF 6 70 4.0 97 | , OF 6 83 3.7 98 | , OF 6 50 1.7 99 | , OF 6 95 4.6 100 | , OF 6 51 1.7 101 | , OF 6 82 4.0 102 | , OF 6 54 1.8 103 | , OF 6 83 4.4 104 | , OF 6 51 1.9 105 | , OF 6 80 4.6 106 | , OF 6 78 2.9 107 | , OF 7 81 3.5 108 | , OF 7 53 2.0 109 | , OF 7 89 4.3 110 | , OF 7 44 1.8 111 | , OF 7 78 4.1 112 | , OF 7 61 1.8 113 | , OF 7 73 4.7 114 | , OF 7 75 4.2 115 | , OF 7 73 3.9 116 | , OF 7 76 4.3 117 | , OF 7 55 1.8 118 | , OF 7 86 4.5 119 | , OF 7 48 2.0 120 | , OF 8 77 4.2 121 | , OF 8 73 4.4 122 | , OF 8 70 4.1 123 | , OF 8 88 4.1 124 | , OF 8 75 4.0 125 | , OF 8 83 4.1 126 | , OF 8 61 2.7 127 | , OF 8 78 4.6 128 | , OF 8 61 1.9 129 | , OF 8 81 4.5 130 | , OF 8 51 2.0 131 | , OF 8 80 4.8 132 | , OF 8 79 4.1 133 | , OF 16 82 4.1 134 | , OF 16 80 4.2 135 | , OF 16 76 4.5 136 | , OF 16 56 1.9 137 | , OF 16 82 4.7 138 | , OF 16 47 2.0 139 | 
, OF 16 76 4.7 140 | , OF 16 61 2.5 141 | , OF 16 75 4.3 142 | , OF 16 72 4.4 143 | , OF 16 74 4.4 144 | , OF 16 69 4.3 145 | , OF 16 78 4.6 146 | , OF 16 52 2.1 147 | , OF 17 91 4.8 148 | , OF 17 66 4.1 149 | , OF 17 71 4.0 150 | , OF 17 75 4.0 151 | , OF 17 81 4.4 152 | , OF 17 77 4.1 153 | , OF 17 74 4.3 154 | , OF 17 70 4.0 155 | , OF 17 83 3.9 156 | , OF 17 53 3.2 157 | , OF 17 82 4.5 158 | , OF 17 62 2.2 159 | , OF 17 73 4.7 160 | , OF 17 84 4.6 161 | , OF 18 58 2.2 162 | , OF 18 82 4.8 163 | , OF 18 77 4.3 164 | , OF 18 75 3.8 165 | , OF 18 77 4.0 166 | , OF 18 77 4.1 167 | , OF 18 53 1.8 168 | , OF 18 75 4.4 169 | , OF 18 78 4.0 170 | , OF 18 51 2.2 171 | , OF 18 81 5.1 172 | , OF 18 52 1.9 173 | , OF 18 76 5.0 174 | , OF 18 73 4.4 175 | , OF 19 84 4.5 176 | , OF 19 72 3.8 177 | , OF 19 89 4.3 178 | , OF 19 75 4.4 179 | , OF 19 57 2.2 180 | , OF 19 81 4.8 181 | , OF 19 49 1.9 182 | , OF 19 87 4.7 183 | , OF 19 43 1.8 184 | , OF 19 94 4.8 185 | , OF 19 45 2.0 186 | , OF 19 81 4.4 187 | , OF 19 59 2.5 188 | , OF 19 82 4.3 189 | , OF 20 80 4.4 190 | , OF 20 54 1.9 191 | , OF 20 75 4.7 192 | , OF 20 73 4.3 193 | , OF 20 57 2.2 194 | , OF 20 80 4.7 195 | , OF 20 51 2.3 196 | , OF 20 77 4.6 197 | , OF 20 66 3.3 198 | , OF 20 77 4.2 199 | , OF 20 60 2.9 200 | , OF 20 86 4.6 201 | , OF 20 62 3.3 202 | , OF 20 75 4.2 203 | , OF 20 67 2.6 204 | , OF 20 69 4.6 205 | , OF 21 84 3.7 206 | , OF 21 58 1.8 207 | , OF 21 90 4.7 208 | , OF 21 82 4.5 209 | , OF 21 71 4.5 210 | , OF 21 80 4.8 211 | , OF 21 51 2.0 212 | , OF 21 80 4.8 213 | , OF 21 62 1.9 214 | , OF 21 84 4.7 215 | , OF 21 51 2.0 216 | , OF 21 81 5.1 217 | , OF 21 83 4.3 218 | , OF 21 84 4.8 219 | , OF 22 72 3.0 220 | , OF 22 54 2.1 221 | , OF 22 75 4.6 222 | , OF 22 74 4.0 223 | , OF 22 51 2.2 224 | , OF 22 91 5.1 225 | , OF 22 60 2.9 226 | , OF 22 80 4.3 227 | , OF 22 54 2.1 228 | , OF 22 80 4.7 229 | , OF 22 70 4.5 230 | , OF 22 60 1.7 231 | , OF 22 86 4.2 232 | , OF 22 78 4.3 233 | , OF 23 51 1.7 234 | , OF 23 83 4.4 235 | , OF 23 76 4.2 236 | , OF 23 51 2.2 237 | , OF 23 90 4.7 238 | , OF 23 71 4.0 239 | , OF 23 49 1.8 240 | , OF 23 88 4.7 241 | , OF 23 52 1.8 242 | , OF 23 79 4.5 243 | , OF 23 61 2.1 244 | , OF 23 81 4.2 245 | , OF 23 48 2.1 246 | , OF 23 84 5.2 247 | , OF 23 63 2.0 ] 248 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets.hs: -------------------------------------------------------------------------------- 1 | {- | 2 | 3 | The datasets package defines two different kinds of datasets: 4 | 5 | * small data sets which are directly (or indirectly with `file-embed`) 6 | embedded in the package as pure values and do not require 7 | network or IO to download the data set. 8 | 9 | * other data sets which need to be fetched over the network with 10 | `getDataset` and are cached in a local temporary directory 11 | 12 | This module defines the `getDataset` function for fetching datasets 13 | and utilies for defining new data sets. It is only necessary to import 14 | this module when using fetched data sets. Embedded data sets can be 15 | imported directly. 
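A minimal sketch of fetching a dataset, along the lines of the example in the
package description:

> import Numeric.Datasets (getDataset)
> import Numeric.Datasets.Abalone (abalone)
>
> main :: IO ()
> main = do
>   abas <- getDataset abalone
>   print (length abas)
>   print (head abas)

New datasets can be defined with the helpers in this module. A sketch for a CSV
file with a header row, in the style of "Numeric.Datasets.CO2" (the @Point@ type
and the URL are hypothetical, for illustration only):

> {-# LANGUAGE DeriveGeneric #-}
> import GHC.Generics (Generic)
> import Data.Csv (FromNamedRecord)
>
> data Point = Point { x :: Double, y :: Double } deriving (Show, Generic)
>
> instance FromNamedRecord Point
>
> points :: Dataset Point
> points = csvHdrDataset (URL "http://example.com/points.csv")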
16 | 17 | -} 18 | 19 | {-# LANGUAGE OverloadedStrings, GADTs #-} 20 | 21 | module Numeric.Datasets where 22 | 23 | import Data.Csv 24 | import System.FilePath 25 | import System.Directory 26 | import Data.Hashable 27 | import Data.Monoid 28 | import qualified Data.ByteString.Lazy as BL 29 | import qualified Data.Vector as V 30 | import Data.Aeson as JSON 31 | import Control.Applicative 32 | import Data.Time 33 | import Data.Char (ord) 34 | import qualified Network.Wreq as Wreq 35 | import Lens.Micro ((^.)) 36 | 37 | import Data.Char (toUpper) 38 | import Text.Read (readMaybe) 39 | import Data.Maybe (fromMaybe) 40 | import Data.ByteString.Char8 (unpack) 41 | import qualified Data.ByteString.Lazy.Char8 as BL8 42 | import Data.ByteString.Lazy.Search (replace) 43 | 44 | -- * Using datasets 45 | 46 | -- |Load a dataset, using the system temporary directory as a cache 47 | getDataset :: Dataset a -> IO [a] 48 | getDataset ds = do 49 | dir <- case temporaryDirectory ds of 50 | Nothing -> getTemporaryDirectory 51 | Just tdir -> return tdir 52 | bs <- fmap (fromMaybe id $ preProcess ds) $ getFileFromSource dir $ source ds 53 | return $ readDataset (readAs ds) bs 54 | 55 | -- |Read a ByteString into a Haskell value 56 | readDataset :: ReadAs a -> BL.ByteString -> [a] 57 | readDataset JSON bs = 58 | case JSON.decode bs of 59 | Just theData -> theData 60 | Nothing -> error "failed to parse json" 61 | readDataset (CSVRecord hhdr opts) bs = 62 | case decodeWith opts hhdr bs of 63 | Right theData -> V.toList theData 64 | Left err -> error err 65 | readDataset (CSVNamedRecord opts) bs = 66 | case decodeByNameWith opts bs of 67 | Right (_,theData) -> V.toList theData 68 | Left err -> error err 69 | 70 | data Source = URL String 71 | 72 | -- | A dataset is a record telling us how to load the data 73 | 74 | data Dataset a = Dataset 75 | { source :: Source 76 | , temporaryDirectory :: Maybe FilePath 77 | , preProcess :: Maybe (BL.ByteString -> BL.ByteString) 78 | , readAs :: ReadAs a 79 | } 80 | 81 | -- | ReadAs is a datatype to describe data formats that hold data sets 82 | 83 | data ReadAs a where 84 | JSON :: FromJSON a => ReadAs a 85 | CSVRecord :: FromRecord a => HasHeader -> DecodeOptions -> ReadAs a 86 | CSVNamedRecord :: FromNamedRecord a => DecodeOptions -> ReadAs a 87 | 88 | csvRecord :: FromRecord a => ReadAs a 89 | csvRecord = CSVRecord NoHeader defaultDecodeOptions 90 | 91 | -- * Defining datasets 92 | 93 | -- |Define a dataset from a pre-processing function and a source for a CSV file 94 | csvDatasetPreprocess :: FromRecord a => (BL.ByteString -> BL.ByteString) -> Source -> Dataset a 95 | csvDatasetPreprocess preF src = (csvDataset src) { preProcess = Just preF } 96 | -- parseCSV preF <$> getFileFromSource cacheDir src 97 | 98 | -- |Define a dataset from a source for a CSV file 99 | csvDataset :: FromRecord a => Source -> Dataset a 100 | csvDataset src = Dataset src Nothing Nothing $ CSVRecord NoHeader defaultDecodeOptions 101 | 102 | -- |Define a dataset from a source for a CSV file with a known header 103 | csvHdrDataset :: FromNamedRecord a => Source -> Dataset a 104 | csvHdrDataset src = Dataset src Nothing Nothing $ CSVNamedRecord defaultDecodeOptions 105 | 106 | -- |Define a dataset from a source for a CSV file with a known header and separator 107 | csvHdrDatasetSep :: FromNamedRecord a => Char -> Source -> Dataset a 108 | csvHdrDatasetSep sepc src 109 | = Dataset src Nothing Nothing 110 | $ CSVNamedRecord defaultDecodeOptions { decDelimiter = fromIntegral (ord sepc)} 111 | 112 | -- 
|Define a dataset from a source for a JSON file -- data file must be accessible with HTTP, not HTTPS 113 | jsonDataset :: FromJSON a => Source -> Dataset a 114 | jsonDataset src = Dataset src Nothing Nothing JSON 115 | 116 | -- | Get a ByteString from the specified Source 117 | getFileFromSource :: FilePath -> Source -> IO (BL.ByteString) 118 | getFileFromSource cacheDir (URL url) = do 119 | createDirectoryIfMissing True cacheDir 120 | let fnm = cacheDir "ds" <> show (hash url) 121 | 122 | ex <- doesFileExist fnm 123 | if ex 124 | then BL.readFile fnm 125 | else do 126 | rsp <- Wreq.get url 127 | let bs = rsp ^. Wreq.responseBody 128 | BL.writeFile fnm bs 129 | return bs 130 | 131 | -- * Helper functions for parsing 132 | 133 | -- |Turn dashes to CamlCase 134 | dashToCamelCase :: String -> String 135 | dashToCamelCase ('-':c:cs) = toUpper c : dashToCamelCase cs 136 | dashToCamelCase (c:cs) = c : dashToCamelCase cs 137 | dashToCamelCase [] = [] 138 | 139 | -- | Parse a field, first turning dashes to CamlCase 140 | parseDashToCamelField :: Read a => Field -> Parser a 141 | parseDashToCamelField s = 142 | case readMaybe (dashToCamelCase $ unpack s) of 143 | Just wc -> pure wc 144 | Nothing -> fail "unknown" 145 | 146 | -- | parse somethign, based on its read instance 147 | parseReadField :: Read a => Field -> Parser a 148 | parseReadField s = 149 | case readMaybe (unpack s) of 150 | Just wc -> pure wc 151 | Nothing -> fail "unknown" 152 | 153 | -- |Drop lines from a bytestring 154 | dropLines :: Int -> BL.ByteString -> BL.ByteString 155 | dropLines 0 s = s 156 | dropLines n s = dropLines (n-1) $ BL.tail $ BL8.dropWhile (/='\n') s 157 | 158 | -- | Turn US-style decimals starting with a period (e.g. .2) into something Haskell can parse (e.g. 0.2) 159 | fixAmericanDecimals :: BL.ByteString -> BL.ByteString 160 | fixAmericanDecimals = replace ",." (",0."::BL.ByteString) 161 | 162 | -- | Convert a Fixed-width format to a CSV 163 | fixedWidthToCSV :: BL.ByteString -> BL.ByteString 164 | fixedWidthToCSV = BL8.pack . fnl . BL8.unpack where 165 | f [] = [] 166 | f (' ':cs) = ',':f (chomp cs) 167 | f ('\n':cs) = '\n':fnl cs 168 | f (c:cs) = c:f cs 169 | fnl cs = f (chomp cs) --newline 170 | chomp (' ':cs) = chomp cs 171 | chomp (c:cs) = c:cs 172 | chomp [] = [] 173 | 174 | -- * Helper functions for data analysis 175 | 176 | -- | convert a fractional year to UTCTime with second-level precision (due to not taking into account leap seconds) 177 | yearToUTCTime :: Double -> UTCTime 178 | yearToUTCTime yearDbl = 179 | let (yearn,yearFrac) = properFraction yearDbl 180 | dayYearBegin = fromGregorian yearn 1 1 181 | (dayn, dayFrac) = properFraction $ yearFrac * (if isLeapYear yearn then 366 else 365) 182 | day = addDays dayn dayYearBegin 183 | dt = secondsToDiffTime $ round $ dayFrac * 86400 184 | in UTCTime day dt 185 | --------------------------------------------------------------------------------