├── .gitignore ├── Dockerfile ├── INTRO.md ├── LICENSE ├── README.md ├── Setup.hs ├── karps.cabal ├── notebooks ├── 00_Intro.ipynb ├── 01_Datasets_Dataframes_Observable_DynObservable.ipynb ├── 02_Organizing_workflows.ipynb ├── 03_Caching_data.ipynb ├── 04_Reading_data.ipynb ├── 05_Displaying_data_with_Python.ipynb ├── 06_Column_operations.ipynb ├── 07_Spark_program_introspection.ipynb ├── KarpsDagDisplay.hs ├── KarpsDisplays.hs ├── ihaskell-tensorboard.png └── rendered │ ├── 00_Intro.html │ ├── 01_Datasets_Dataframes_Observable_DynObservable.html │ ├── 02_Organizing_workflows.html │ ├── 03_Caching_data.html │ └── 06_Column_operations.html ├── src └── Spark │ ├── Core.hs │ ├── Core │ ├── Column.hs │ ├── ColumnFunctions.hs │ ├── Context.hs │ ├── Dataset.hs │ ├── Functions.hs │ ├── Internal │ │ ├── AggregationFunctions.hs │ │ ├── AlgebraStructures.hs │ │ ├── Arithmetics.hs │ │ ├── ArithmeticsImpl.hs │ │ ├── BasicStructures.hs │ │ ├── Caching.hs │ │ ├── CachingUntyped.hs │ │ ├── CanRename.hs │ │ ├── Client.hs │ │ ├── ColumnFunctions.hs │ │ ├── ColumnStandard.hs │ │ ├── ColumnStructures.hs │ │ ├── ComputeDag.hs │ │ ├── ContextIOInternal.hs │ │ ├── ContextInteractive.hs │ │ ├── ContextInternal.hs │ │ ├── ContextStructures.hs │ │ ├── DAGFunctions.hs │ │ ├── DAGStructures.hs │ │ ├── DatasetFunctions.hs │ │ ├── DatasetStructures.hs │ │ ├── Filter.hs │ │ ├── FunctionsInternals.hs │ │ ├── Groups.hs │ │ ├── Joins.hs │ │ ├── LocalDataFunctions.hs │ │ ├── LocatedBase.hs │ │ ├── ObservableStandard.hs │ │ ├── OpFunctions.hs │ │ ├── OpStructures.hs │ │ ├── Paths.hs │ │ ├── PathsUntyped.hs │ │ ├── Projections.hs │ │ ├── Pruning.hs │ │ ├── RowGenerics.hs │ │ ├── RowGenericsFrom.hs │ │ ├── RowStructures.hs │ │ ├── RowUtils.hs │ │ ├── TypesFunctions.hs │ │ ├── TypesGenerics.hs │ │ ├── TypesStructures.hs │ │ ├── TypesStructuresRepr.hs │ │ └── Utilities.hs │ ├── Row.hs │ ├── StructuresInternal.hs │ ├── Try.hs │ └── Types.hs │ ├── IO │ ├── Inputs.hs │ └── Internal │ │ ├── InputGeneric.hs │ │ ├── Json.hs │ │ └── OutputCommon.hs │ └── Inputs │ └── Inputs.hs ├── stack-ihaskell.yaml ├── stack.yaml ├── test-integration ├── Spark │ ├── Core │ │ ├── CachingSpec.hs │ │ ├── CollectSpec.hs │ │ ├── ColumnSpec.hs │ │ ├── GroupsSpec.hs │ │ ├── IntegrationUtilities.hs │ │ ├── JoinsSpec.hs │ │ ├── PruningSpec.hs │ │ ├── SimpleAddSpec.hs │ │ └── Spec.hs │ └── IO │ │ ├── JsonSpec.hs │ │ └── StampSpec.hs └── Spec.hs └── test ├── Spark └── Core │ ├── ColumnSpec.hs │ ├── ContextSpec.hs │ ├── DatasetSpec.hs │ ├── Internal │ ├── CachingSpec.hs │ ├── DAGFunctionsSpec.hs │ ├── GroupsSpec.hs │ ├── LocalDataFunctionsSpec.hs │ ├── OpFunctionsSpec.hs │ ├── PathsSpec.hs │ └── RowUtilsSpec.hs │ ├── PathSpec.hs │ ├── ProjectionsSpec.hs │ ├── RowToSQLSpec.hs │ ├── SimpleExamplesSpec.hs │ └── TypesSpec.hs └── Spec.hs /.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | dist-* 3 | cabal-dev 4 | *.o 5 | *.hi 6 | *.chi 7 | *.chs.h 8 | *.dyn_o 9 | *.dyn_hi 10 | .hpc 11 | .hsenv 12 | .cabal-sandbox/ 13 | cabal.sandbox.config 14 | *.prof 15 | *.aux 16 | *.hp 17 | *.eventlog 18 | .stack-work/ 19 | cabal.project.local 20 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gibiansky/ihaskell:latest 2 | 3 | # Build: 4 | # docker build -t ihaskell-karps . 
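#
# Run the built image (a suggested invocation, not part of the original
# repository; adjust the host port and the mounted notebook directory as
# needed):
# docker run -it -p 8888:8888 -v $(pwd)/notebooks:/karps/notebooks ihaskell-karps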
5 | 6 | # Install pandas for nice visualizations in Python 7 | WORKDIR /tmp 8 | RUN pip install pandas 9 | RUN pip install requests 10 | 11 | RUN rm /ihaskell/.stack-work/install/x86_64-linux/nightly-2015-08-15/7.10.2/bin/ihaskell 12 | 13 | RUN mkdir /karps 14 | WORKDIR /karps 15 | 16 | COPY stack-ihaskell.yaml stack.yaml 17 | COPY karps.cabal karps.cabal 18 | COPY src src 19 | COPY test test 20 | COPY LICENSE LICENSE 21 | 22 | RUN stack setup 7.10.2 23 | RUN stack clean 24 | RUN stack update 25 | RUN stack install ipython-kernel-0.8.3.0 26 | RUN stack install ihaskell-0.8.3.0 27 | RUN stack install ihaskell-blaze-0.3.0.0 28 | RUN stack install ihaskell-basic-0.3.0.0 29 | RUN stack install 30 | 31 | 32 | # Run the notebook 33 | ENV PATH /karps/.stack-work/install/x86_64-linux/nightly-2015-08-15/7.10.2/bin:/root/.stack/snapshots/x86_64-linux/nightly-2015-08-15/7.10.2/bin:/root/.stack/programs/x86_64-linux/ghc-7.10.2/bin:/root/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin 34 | RUN ihaskell install --stack 35 | WORKDIR /karps 36 | ENTRYPOINT stack exec -- jupyter notebook --NotebookApp.port=8888 '--NotebookApp.ip=*' --NotebookApp.notebook_dir=/karps 37 | EXPOSE 8888 38 | -------------------------------------------------------------------------------- /INTRO.md: -------------------------------------------------------------------------------- 1 | # Karps-Haskell - Haskell bindings for Spark Datasets and Dataframes 2 | 3 | This project is an exploration vehicle for developing safe, robust and reliable 4 | data pipelines over Apache Spark. It consists of multiple sub-projects: 5 | - a specification to describe data pipelines in a language-agnostic manner, 6 | and a communication protocol to submit these pipelines to Spark. The 7 | specification is currently defined in [this repository](https://github.com/krapsh/karps-interface), using 8 | [Protocol Buffers 3](https://developers.google.com/protocol-buffers/docs/proto3) ( 9 | which is also compatible with JSON). 10 | - a serving library, called 11 | [karps-server](https://github.com/krapsh/kraps-server), that implements this specification on top of Spark. 12 | It is written in Scala and is loaded as a standard Spark package. 13 | - a client written in Haskell that sends pipelines to Spark for execution. In 14 | addition, this client serves as an experimental platform for whole-program optimization and verification, as well as compiler-enforced type checking. 15 | 16 | There is also a separate set of utilities to visualize such pipelines using 17 | Jupyter notebooks and IHaskell. 18 | 19 | This is a preview; the API may (and will) change in the future. 20 | 21 | The name is a play on a tasty fish of the family Cyprinidae, and an anagram of Spark. The programming model is strongly influenced by the 22 | [TensorFlow project](https://www.tensorflow.org/) and follows a similar design. 23 | 24 | ## Introduction 25 | 26 | This project explores an alternative API to run complex workflows on top of 27 | Apache Spark. Note that it is neither endorsed nor supported by the Apache 28 | Foundation or by Databricks, Inc. 29 | 30 | For the developers of Spark bindings: 31 | 32 | By using this API, rich transforms can be expressed on top of Spark using various programming languages, without having to implement elaborate socket protocols to communicate with Java objects.
Each programming language only needs to implement a relatively small interface that does not rely on features specific to the Java virtual machine and that can be implemented using standard REST technologies. Each language is then free to express computations in the most idiomatic way. 33 | 34 | As a reference, a set of bindings is being developed in the Haskell programming language and serves as the reference implementation. Despite Haskell's limited adoption in data science, it is a useful tool for designing strongly principled APIs that work across various programming languages. 35 | 36 | 37 | For the user: 38 | 39 | The user may be interested in a few features unique to this interface. 40 | 41 | 1. Lazy computations. 42 | No call to Spark is issued until a result is absolutely required. Unlike standard Spark interfaces, even aggregation operations such as `collect()` or `sum()` are lazy. This allows Karps to perform whole-program analysis of the computation and to make optimizations that are currently beyond the reach of Spark. 43 | 44 | 2. Strong checks. 45 | Thanks to lazy evaluation, a complete data science pipeline can be checked for correctness before it is evaluated. This is useful when composing multiple notebooks or commands together. For example, many interesting operations in Spark, such as machine learning algorithms, involve an aggregation step. In Spark, such a step would break the analysis of the program and prevent the Spark analyzer from checking further transforms. Karps does not suffer from this limitation and checks the whole pipeline at once. 46 | 47 | 3. Automatic resource management. 48 | Because Karps has a global view of the pipeline, it can determine when data needs to be cached or uncached. It is able to schedule caching and uncaching operations automatically, and it refuses to run programs that may be incorrect with respect to caching (for example, when uncaching happens before the data is accessed again). 49 | 50 | 4. Complex nested pipelines. 51 | Computations can be arbitrarily nested and composed, allowing complex sequences of operations to be conceptually condensed into a single high-level operation. This is useful for debugging and understanding a pipeline at a high level without being distracted by the implementation details of each step. This follows the same approach as TensorFlow. 52 | 53 | 5. Stable, language-agnostic format. 54 | A complex pipeline may be stored as a JSON file in one programming language and read/restored in a different programming language. If this program is run in the same session, the other language can access the cached data. 55 | 56 | ## Installation 57 | 58 | You need the `stack` tool to build and install Karps. Additionally, you will need the Karps server running in order to run queries against Spark. 59 | 60 | 61 | ## Development ideas 62 | 63 | These are notes for developers interested in understanding the general philosophy behind Karps. 64 | 65 | Doing data science at scale is hard and requires multiple iterations. Accessing the data can be a long operation (minutes or hours), even in Spark. Because of that, data should only be accessed if we are reasonably confident that the program is going to finish. 66 | 67 | As a result, Karps is designed to detect a number of common programming mistakes in Spark before even attempting to access the data. These checks are enforced either by the runtime or by Haskell's compiler when using the typed API. This is possible thanks to whole-program analysis and lazy evaluation.
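
As a rough illustration of the laziness and whole-pipeline checking described above, here is a minimal sketch written in the style of the notebooks under `notebooks/`. It is not taken from the project itself: the session name, the node names and the tiny integer dataset are invented for the example. Nothing is sent to Spark until `exec1Def` is called.

```haskell
{-# LANGUAGE OverloadedStrings #-}

import Spark.Core.Context
import Spark.Core.Functions

main :: IO ()
main = do
  -- Open a session against a locally running Karps server.
  createSparkSessionDef $ defaultConf { confRequestedSessionName = "intro_example" }
  let ds = dataset [1, 2, 3, 4 :: Int] @@ "small_ints"  -- lazy: only builds a node
  let n  = count ds @@ "num_elements"                   -- still lazy, even though it aggregates
  -- Only this call checks the whole pipeline and triggers a computation on Spark.
  r <- exec1Def n
  print r
```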
68 | 69 | The execution model is heavily inspired by TensorFlow: 70 | 71 | 1. Deterministic operations: 72 | All the operations are deterministic, and some non-deterministic operations such as `currentTime` in SQL are forbidden. While this may seem like a restriction, it provides multiple benefits such as aggressive caching and computation reuse. 73 | 74 | 2. Stateless operations on a graph. A lot of the transforms operate as simple graph transforms and will eventually be merged into the server, making them available to all languages. 75 | 76 | 3. Simple JSON-based format for encoding transforms. It may be switched to Protocol Buffers (v3) if I figure out how to use it with Haskell and Spray-Can. 77 | 78 | 4. Separation of the description of the transform from the operational details required by Spark. 79 | 80 | 5. Using the type system in the most effective way (Haskell interface): making sure that the type system can enforce simple rules such as not adding an integer to a string, while still giving understandable error messages to the user when something goes wrong. 81 | 82 | 83 | ## Differences from Spark 84 | 85 | As mentioned, Karps attempts to provide strong semantic guarantees at the expense of some flexibility. 86 | 87 | In order to guarantee determinism, some operations are/will be forbidden: 88 | - randn() and rand() without a seed 89 | - get_current_time 90 | 91 | Also, all the operations are meant to be expressed in a way that does not depend on the partitioning of the data. This is a significant departure from Spark, in which the PartitionID is available. It is possible in most cases to replace these operations at the expense of extra shuffles. Here, this is considered worth it because of the strong guarantees it offers with respect to reproducibility. In any case, it is a matter of debate. 92 | 93 | ## Current status 94 | 95 | Most of the underlying engine is working. Some important pieces are 96 | still incomplete or missing though: 97 | - simple JSON ingest with data specification 98 | - debug mode for the backend (check for correctness) 99 | - autogeneration of accessors with Template Haskell 100 | - better IHaskell interface (especially for reporting errors) 101 | - Python frontend 102 | - pandas backend 103 | 104 | Advanced features that require more thought will be considered after that: 105 | - SQL commands (interned strings) 106 | - user-defined functions (Scala only) 107 | - simple ML pipelines 108 | - metadata and user-defined types 109 | -------------------------------------------------------------------------------- /Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /notebooks/05_Displaying_data_with_Python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Displaying data with Python\n", 8 | "\n", 9 | "Haskell is a great language for complex processing, but it lacks the visualization libraries that R and Python users have come to enjoy. This tutorial shows how to integrate both together when doing interactive analysis."
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "collapsed": true 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | ":extension DeriveGeneric\n", 21 | ":extension FlexibleContexts\n", 22 | ":extension OverloadedStrings\n", 23 | ":extension GeneralizedNewtypeDeriving\n", 24 | ":extension FlexibleInstances\n", 25 | ":extension MultiParamTypeClasses" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "data": { 35 | "text/plain": [ 36 | "[Debug] Creating spark session at url: http://10.0.2.2:8081/sessions/session05_python @(: :0:0)" 37 | ] 38 | }, 39 | "metadata": {}, 40 | "output_type": "display_data" 41 | } 42 | ], 43 | "source": [ 44 | "import GHC.Generics (Generic)\n", 45 | "\n", 46 | "import Spark.Core.Dataset\n", 47 | "import Spark.Core.Context\n", 48 | "import Spark.Core.Functions\n", 49 | "import Spark.Core.Column\n", 50 | "import Spark.Core.Types\n", 51 | "import Spark.Core.Row\n", 52 | "import Spark.Core.ColumnFunctions\n", 53 | "\n", 54 | "conf = defaultConf {\n", 55 | " confEndPoint = \"http://10.0.2.2\",\n", 56 | " confRequestedSessionName = \"session05_python\" }\n", 57 | "createSparkSessionDef conf" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": { 64 | "collapsed": true 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "import Spark.Core.Types\n" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "metadata": { 75 | "collapsed": true 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "data MyData = MyData {\n", 80 | " aBigId :: Int,\n", 81 | " importantData :: Int } deriving (Show, Eq, Generic, Ord)\n", 82 | "\n", 83 | "instance SQLTypeable MyData\n", 84 | "instance FromSQL MyData\n", 85 | "instance ToSQL MyData" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 5, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "[Debug] executeCommand1: computing observable collected_data@org.spark.Collect![{aBigId:int importantData:int}] @(: :0:0)\n", 97 | "[Info] Sending computations at url: http://10.0.2.2:8081/computations/session05_python/0/create @(: :0:0)\n", 98 | "[Debug] executeCommand1: Tracked nodes are [(9c12a..,NPath(collected_data),[{aBigId:int importantData:int}],collected_data)] @(: :0:0)\n", 99 | "[Info] _computationMultiStatus: /collected_data finished @(: :0:0)" 100 | ] 101 | }, 102 | "metadata": {}, 103 | "output_type": "display_data" 104 | } 105 | ], 106 | "source": [ 107 | "let collection = [MyData 1 2, MyData 3 2, MyData 5 4]\n", 108 | "\n", 109 | "let ds = dataset collection @@ \"dataset\"\n", 110 | "let c = collect (asCol ds) @@ \"collected_data\"\n", 111 | "_ <- exec1Def c" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 2, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "ename": "ImportError", 121 | "evalue": "No module named kraps", 122 | "output_type": "error", 123 | "traceback": [ 124 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 125 | "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", 126 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mkraps\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mks\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mconnectSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"session05_python\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maddress\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'localhost'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 127 | "\u001b[0;31mImportError\u001b[0m: No module named kraps" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "from kraps import *\n", 133 | "ks = connectSession(\"session05_python\", address='localhost')\n", 134 | "ks" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "collapsed": true 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "ks.pandas(\"collected_data\")" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": { 152 | "collapsed": true 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "print ks.url('collected_data')" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": { 163 | "collapsed": true 164 | }, 165 | "outputs": [], 166 | "source": [] 167 | } 168 | ], 169 | "metadata": { 170 | "kernelspec": { 171 | "display_name": "Python 2", 172 | "language": "python", 173 | "name": "python2" 174 | }, 175 | "language_info": { 176 | "codemirror_mode": { 177 | "name": "ipython", 178 | "version": 2 179 | }, 180 | "file_extension": ".py", 181 | "mimetype": "text/x-python", 182 | "name": "python", 183 | "nbconvert_exporter": "python", 184 | "pygments_lexer": "ipython2", 185 | "version": "2.7.12+" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 1 190 | } 191 | -------------------------------------------------------------------------------- /notebooks/KarpsDisplays.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleContexts #-} 2 | {-# LANGUAGE TypeSynonymInstances #-} 3 | {-# LANGUAGE FlexibleInstances #-} 4 | {-# LANGUAGE OverloadedStrings #-} 5 | 6 | -- This module is meant to be loaded from the IHaskell REPL. 7 | module KarpsDisplays where 8 | 9 | import IHaskell.Display 10 | import Formatting 11 | import Data.Text(unpack) 12 | import qualified Data.Text 13 | import Data.Foldable(toList) 14 | 15 | import KarpsDagDisplay(nodeToDisplayGraph, tfIFrame, DisplayGraph, exportNodes) 16 | 17 | import Spark.Core.Internal.Utilities(forceRight) 18 | import Spark.Core.Internal.PathsUntyped 19 | import Spark.Core.Internal.ComputeDag 20 | import Spark.Core.Internal.DAGStructures 21 | import Spark.Core.Internal.DatasetFunctions(untyped) 22 | import Spark.Core.Internal.DatasetStructures(ComputeNode) 23 | import Spark.Core.Internal.Utilities(forceRight) 24 | import Spark.Core.Internal.Client 25 | import Spark.Core.StructuresInternal(ComputationID(..)) 26 | import Spark.Core.Try 27 | 28 | instance IHaskellDisplay DisplayGraph where 29 | display g = return $ Display [html code] 30 | where 31 | code = unpack . tfIFrame . exportNodes $ g 32 | 33 | 34 | showGraph = display . forceRight . nodeToDisplayGraph 35 | 36 | showGraph' = showGraph . forceRight 37 | 38 | showNameGraph node = showGraph . vertexData . last . 
forceRight $ x where 39 | x = do 40 | cg <- tryEither $ buildCGraph (untyped node) 41 | nameCg <- assignPathsUntyped cg 42 | let tiedCg = tieNodes nameCg 43 | return $ toList (cdVertices tiedCg) 44 | -------------------------------------------------------------------------------- /notebooks/ihaskell-tensorboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krapsh/kraps-haskell/c3a03afe9a4a5ddcedf994756d9bc5967818cec2/notebooks/ihaskell-tensorboard.png -------------------------------------------------------------------------------- /src/Spark/Core.hs: -------------------------------------------------------------------------------- 1 | {-| 2 | Module : Spark.Core 3 | Description : Core functions and data structures to communicate with the Karps 4 | server. 5 | Copyright : (c) Karps contributors, 2016 6 | License : Apache-2.0 7 | Maintainer : krapsh@yandex.com 8 | Stability : experimental 9 | Portability : POSIX 10 | 11 | -} 12 | module Spark.Core where 13 | -------------------------------------------------------------------------------- /src/Spark/Core/Column.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleContexts #-} 2 | 3 | {- | 4 | Module : Spark.Core.Column 5 | Description : Column types and basic operations. 6 | 7 | Operations on columns. 8 | -} 9 | module Spark.Core.Column( 10 | -- * Types 11 | Column, 12 | DynColumn, 13 | GenericColumn, 14 | -- * Extractions and collations 15 | asCol, 16 | asCol', 17 | pack1, 18 | pack, 19 | pack', 20 | struct, 21 | struct', 22 | castCol, 23 | castCol', 24 | colRef, 25 | (//), 26 | (/-), 27 | -- ToStaticProjectable, 28 | StaticColProjection, 29 | DynamicColProjection, 30 | unsafeStaticProjection, 31 | -- * Column type manipulations 32 | dropColType, 33 | -- * Column functions 34 | colType, 35 | untypedCol, 36 | colFromObs, 37 | colFromObs', 38 | applyCol1, 39 | ) where 40 | 41 | import Spark.Core.Internal.ColumnStructures 42 | import Spark.Core.Internal.ColumnFunctions 43 | import Spark.Core.Internal.FunctionsInternals 44 | import Spark.Core.Internal.Projections 45 | -------------------------------------------------------------------------------- /src/Spark/Core/ColumnFunctions.hs: -------------------------------------------------------------------------------- 1 | 2 | {-| 3 | Module : Spark.Core.ColumnFunctions 4 | Description : Column operations 5 | 6 | The standard library of functions that operate on 7 | data columns. 8 | -} 9 | module Spark.Core.ColumnFunctions( 10 | -- * Reductions 11 | sumCol, 12 | sumCol', 13 | countCol, 14 | countCol', 15 | -- * Casting 16 | asDoubleCol 17 | ) where 18 | 19 | import Spark.Core.Internal.ArithmeticsImpl() 20 | import Spark.Core.Internal.ColumnStandard 21 | import Spark.Core.Internal.AggregationFunctions 22 | import Spark.Core.Internal.Projections() 23 | -------------------------------------------------------------------------------- /src/Spark/Core/Context.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE ScopedTypeVariables #-} 3 | 4 | {- | This module defines session objects that act as entry points to spark. 5 | 6 | There are two ways to interact with Spark: using an explicit state object, 7 | or using the default state object (interactive session). 8 | 9 | While the interactive session is the most convenient, it should not be 10 | used for more than quick experimentations. 
Any complex code should use 11 | the SparkSession and SparkState objects. 12 | -} 13 | module Spark.Core.Context( 14 | SparkSessionConf(..), 15 | SparkSession, 16 | SparkState, 17 | SparkInteractiveException, 18 | FromSQL, 19 | defaultConf, 20 | executeCommand1, 21 | executeCommand1', 22 | computationStats, 23 | createSparkSessionDef, 24 | closeSparkSessionDef, 25 | currentSessionDef, 26 | computationStatsDef, 27 | exec1Def, 28 | exec1Def', 29 | execStateDef 30 | ) where 31 | 32 | import Data.Text(pack) 33 | 34 | import Spark.Core.Internal.ContextStructures 35 | import Spark.Core.Internal.ContextIOInternal 36 | import Spark.Core.Internal.ContextInteractive 37 | import Spark.Core.Internal.RowGenericsFrom(FromSQL) 38 | 39 | 40 | -- | The default configuration if the Karps server is being run locally. 41 | defaultConf :: SparkSessionConf 42 | defaultConf = 43 | SparkSessionConf { 44 | confEndPoint = pack "http://127.0.0.1", 45 | confPort = 8081, 46 | confPollingIntervalMillis = 500, 47 | confRequestedSessionName = "", 48 | confUseNodePrunning = False -- Disable graph pruning by default 49 | } 50 | -------------------------------------------------------------------------------- /src/Spark/Core/Dataset.hs: -------------------------------------------------------------------------------- 1 | 2 | {- | 3 | Module : Spark.Core.Dataset 4 | Description : Dataset types and basic operations. 5 | 6 | This module describes the core data types (Dataset, DataFrame, 7 | Observable and DynObservable), and some basic operations to relate them. 8 | -} 9 | module Spark.Core.Dataset( 10 | -- * Common data structures 11 | -- TODO Should it be hidden? 12 | ComputeNode, 13 | LocLocal, 14 | LocDistributed, 15 | LocUnknown, 16 | UntypedNode, 17 | -- * Distributed data structures 18 | Dataset, 19 | DataFrame, 20 | -- * Local data structures 21 | LocalData, 22 | LocalFrame, 23 | -- * Conversions 24 | asDF, 25 | asDS, 26 | asLocalObservable, 27 | castType, 28 | castType', 29 | -- * Relations 30 | parents, 31 | untyped, 32 | untyped', 33 | depends, 34 | logicalParents, 35 | logicalParents', 36 | -- * Attributes 37 | nodeLogicalParents, 38 | nodeLogicalDependencies, 39 | nodeParents, 40 | nodeOp, 41 | nodeId, 42 | nodeName, 43 | nodeType, 44 | ) where 45 | 46 | import Spark.Core.Internal.DatasetStructures 47 | import Spark.Core.Internal.DatasetFunctions 48 | import Spark.Core.Internal.Projections() 49 | -------------------------------------------------------------------------------- /src/Spark/Core/Functions.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | 4 | module Spark.Core.Functions( 5 | -- * Creation 6 | dataset, 7 | dataframe, 8 | constant, 9 | -- * Standard conversions 10 | asLocalObservable, 11 | asDouble, 12 | -- * Arithmetic operations 13 | (.+), 14 | (.-), 15 | (./), 16 | div', 17 | -- * Utilities 18 | (@@), 19 | _1, 20 | _2, 21 | -- * Standard library 22 | collect, 23 | collect', 24 | count, 25 | identity, 26 | autocache, 27 | cache, 28 | uncache, 29 | joinInner, 30 | joinInner', 31 | broadcastPair 32 | ) where 33 | 34 | 35 | import Data.Aeson(toJSON) 36 | import qualified Data.Vector as V 37 | 38 | import Spark.Core.Dataset 39 | import Spark.Core.Types 40 | import Spark.Core.Row 41 | import Spark.Core.Internal.ArithmeticsImpl 42 | import Spark.Core.Internal.DatasetFunctions 43 | import Spark.Core.Internal.Joins 44 | import Spark.Core.Internal.Utilities 45 | import 
Spark.Core.Internal.LocalDataFunctions 46 | import Spark.Core.Internal.ObservableStandard 47 | import Spark.Core.Internal.FunctionsInternals() 48 | import Spark.Core.Internal.OpStructures 49 | import Spark.Core.Internal.AggregationFunctions 50 | import Spark.Core.Internal.TypesStructures(SQLType(..)) 51 | import Spark.Core.Internal.Projections 52 | import Spark.Core.Internal.CanRename 53 | 54 | dataset :: (ToSQL a, SQLTypeable a, HasCallStack) => [a] -> Dataset a 55 | dataset l = emptyDataset op tp where 56 | tp = buildType 57 | op = NodeDistributedLit (unSQLType tp) (V.fromList ((toJSON . valueToCell) <$> l)) 58 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/AggregationFunctions.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ScopedTypeVariables #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | {-# LANGUAGE RankNTypes #-} 4 | {-# LANGUAGE FlexibleContexts #-} 5 | 6 | -- A number of standard aggregation functions. 7 | 8 | module Spark.Core.Internal.AggregationFunctions( 9 | -- Standard library 10 | collect, 11 | collect', 12 | count, 13 | count', 14 | countCol, 15 | countCol', 16 | sumCol, 17 | sumCol', 18 | -- Developer functions 19 | AggTry, 20 | UniversalAggregator(..), 21 | applyUAOUnsafe, 22 | applyUntypedUniAgg3 23 | ) where 24 | 25 | import Data.Aeson(Value(Null)) 26 | import qualified Data.Text as T 27 | import qualified Data.Vector as V 28 | 29 | import Spark.Core.Internal.DatasetStructures 30 | import Spark.Core.Internal.ColumnStructures 31 | import Spark.Core.Internal.ColumnFunctions(colType, untypedCol) 32 | import Spark.Core.Internal.DatasetFunctions 33 | import Spark.Core.Internal.RowGenerics(ToSQL) 34 | import Spark.Core.Internal.LocalDataFunctions() 35 | import Spark.Core.Internal.FunctionsInternals 36 | import Spark.Core.Internal.OpStructures 37 | import Spark.Core.Internal.TypesStructures 38 | import Spark.Core.Internal.Utilities 39 | import Spark.Core.Internal.TypesFunctions(arrayType') 40 | import Spark.Core.StructuresInternal(emptyFieldPath) 41 | import Spark.Core.Types 42 | import Spark.Core.Try 43 | 44 | {-| The sum of all the elements in a column. 45 | 46 | If the data type is too small to represent the sum, the value being returned is 47 | undefined. 48 | -} 49 | sumCol :: forall ref a. (Num a, SQLTypeable a, ToSQL a) => 50 | Column ref a -> LocalData a 51 | sumCol = applyUAOUnsafe _sumAgg' 52 | 53 | sumCol' :: DynColumn -> LocalFrame 54 | sumCol' = applyUntypedUniAgg3 _sumAgg' 55 | 56 | {-| The number of elements in a column. 57 | 58 | -} 59 | -- TODO use Long for the return data type. 60 | count :: forall a. Dataset a -> LocalData Int 61 | count = countCol . asCol 62 | 63 | count' :: DataFrame -> LocalFrame 64 | count' = countCol' . asCol' 65 | 66 | countCol :: Column ref a -> LocalData Int 67 | countCol = applyUAOUnsafe _countAgg' 68 | 69 | countCol' :: DynColumn -> LocalFrame 70 | countCol' = applyUntypedUniAgg3 _countAgg' 71 | 72 | 73 | {-| Collects all the elements of a column into a list. 74 | 75 | NOTE: 76 | This list is sorted in the canonical ordering of the data type: however the 77 | data may be stored by Spark, the result will always be in the same order. 78 | This is a departure from Spark, which does not guarantee an ordering on 79 | the returned data. 80 | -} 81 | collect :: forall ref a. (SQLTypeable a) => Column ref a -> LocalData [a] 82 | collect = applyUAOUnsafe _collectAgg' 83 | 84 | {-| See the documentation of collect. 
-} 85 | collect' :: DynColumn -> LocalFrame 86 | collect' = applyUntypedUniAgg3 _collectAgg' 87 | 88 | type AggTry a = Either T.Text a 89 | 90 | {-| 91 | This is the universal aggregator: the invariant aggregator and 92 | some extra laws to combine multiple outputs. 93 | It is useful for combining the results over multiple passes. 94 | A real implementation in Spark has also an inner pass. 95 | -} 96 | data UniversalAggregator a buff = UniversalAggregator { 97 | uaMergeType :: SQLType buff, 98 | -- The result is partioning invariant 99 | uaInitialOuter :: Dataset a -> LocalData buff, 100 | -- This operation is associative and commutative 101 | -- The logical parents of the final observable have to be the 2 inputs 102 | uaMergeBuffer :: LocalData buff -> LocalData buff -> LocalData buff 103 | } 104 | 105 | -- TODO(kps) check the coming type for non-summable types 106 | _sumAgg' :: DataType -> AggTry UniversalAggregatorOp 107 | _sumAgg' dt = pure UniversalAggregatorOp { 108 | uaoMergeType = dt, 109 | uaoInitialOuter = InnerAggOp $ AggFunction "SUM" (V.singleton emptyFieldPath), 110 | uaoMergeBuffer = ColumnSemiGroupLaw "SUM_SL" 111 | } 112 | 113 | _countAgg' :: DataType -> AggTry UniversalAggregatorOp 114 | -- Counting will always succeed. 115 | _countAgg' _ = pure UniversalAggregatorOp { 116 | -- TODO(kps) switch to BigInt 117 | uaoMergeType = StrictType IntType, 118 | uaoInitialOuter = InnerAggOp $ AggFunction "COUNT" (V.singleton emptyFieldPath), 119 | uaoMergeBuffer = ColumnSemiGroupLaw "SUM" 120 | } 121 | 122 | _collectAgg' :: DataType -> AggTry UniversalAggregatorOp 123 | -- Counting will always succeed. 124 | _collectAgg' dt = 125 | let ldt = arrayType' dt 126 | soMerge = StandardOperator { 127 | soName = "org.spark.Collect", 128 | soOutputType = ldt, 129 | soExtra = Null 130 | } 131 | soMono = StandardOperator { 132 | soName = "org.spark.CatSorted", 133 | soOutputType = ldt, 134 | soExtra = Null 135 | } 136 | in pure UniversalAggregatorOp { 137 | -- TODO(kps) switch to BigInt 138 | uaoMergeType = ldt, 139 | uaoInitialOuter = OpaqueAggTransform soMerge, 140 | uaoMergeBuffer = OpaqueSemiGroupLaw soMono 141 | } 142 | 143 | applyUntypedUniAgg3 :: (DataType -> AggTry UniversalAggregatorOp) -> DynColumn -> LocalFrame 144 | applyUntypedUniAgg3 f dc = do 145 | c <- dc 146 | let uaot = f . unSQLType . colType $ c 147 | uao <- tryEither uaot 148 | let no = NodeAggregatorReduction uao 149 | let ds = pack1 c 150 | return $ emptyLocalData no (SQLType (uaoMergeType uao)) `parents` [untyped ds] 151 | 152 | applyUAOUnsafe :: forall a b ref. 
(SQLTypeable b, HasCallStack) => (DataType -> AggTry UniversalAggregatorOp) -> Column ref a -> LocalData b 153 | applyUAOUnsafe f c = 154 | let lf = applyUntypedUniAgg3 f (untypedCol c) 155 | in forceRight (asObservable lf) 156 | 157 | -- _guardType :: DataType -> (UntypedDataset -> UntypedLocalData) -> (UntypedDataset -> LocalFrame) 158 | -- _guardType dt f ds = 159 | -- if unSQLType (nodeType ds) == dt 160 | -- then 161 | -- pure $ f ds 162 | -- else 163 | -- tryError $ sformat ("Expected type "%sh%" but got type "%sh) dt (nodeType ds) 164 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/AlgebraStructures.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE RankNTypes #-} 2 | {-# LANGUAGE FunctionalDependencies #-} 3 | {-# LANGUAGE ScopedTypeVariables #-} 4 | {-# LANGUAGE MultiParamTypeClasses #-} 5 | {-# LANGUAGE FlexibleInstances #-} 6 | {-# LANGUAGE TypeFamilies #-} 7 | {-# LANGUAGE OverloadedStrings #-} 8 | {-# LANGUAGE FlexibleContexts #-} 9 | {-# LANGUAGE FunctionalDependencies #-} 10 | {-# LANGUAGE RankNTypes #-} 11 | 12 | -- TODO remove this file 13 | module Spark.Core.Internal.AlgebraStructures where 14 | 15 | -- | Algebraic structures that are common to columns and observables. 16 | 17 | 18 | data BinaryOpFun in1 in2 to = BinaryOpFun { 19 | bodLift1 :: in1 -> to, 20 | bodLift2 :: in2 -> to, 21 | bodOp :: to -> to -> to 22 | } 23 | 24 | 25 | class HomoBinaryOp2 in1 in2 to | in1 in2 -> to where 26 | _liftFun :: (to -> to -> to) -> BinaryOpFun in1 in2 to 27 | 28 | _applyBinOp0 :: forall in1 in2 to. in1 -> in2 -> BinaryOpFun in1 in2 to -> to 29 | _applyBinOp0 i1 i2 (BinaryOpFun l1 l2 bo) = bo (l1 i1) (l2 i2) 30 | 31 | applyBinOp :: forall in1 in2 to. (HomoBinaryOp2 in1 in2 to) => (to -> to -> to) -> in1 -> in2 -> to 32 | applyBinOp f i1 i2 = 33 | _applyBinOp0 i1 i2 (_liftFun f) 34 | 35 | -- -- | Overloaded operator for operationts that are guaranteed to succeed. 
36 | -- (.+) :: (Num out, HomoBinaryOp2 a1 a2 out) => a1 -> a2 -> out 37 | -- (.+) = applyBinOp (+) 38 | 39 | (.-) :: (Num out, HomoBinaryOp2 a1 a2 out) => a1 -> a2 -> out 40 | (.-) = applyBinOp (-) 41 | 42 | (.*) :: (Num out, HomoBinaryOp2 a1 a2 out) => a1 -> a2 -> out 43 | (.*) = applyBinOp (*) 44 | 45 | -- TODO(kps) add here the rest of the Integral operations 46 | div' :: (Integral out, HomoBinaryOp2 a1 a2 out) => a1 -> a2 -> out 47 | div' = applyBinOp div 48 | 49 | -- **** Fractional **** 50 | 51 | (./) :: (Fractional out, HomoBinaryOp2 a1 a2 out) => a1 -> a2 -> out 52 | (./) = applyBinOp (/) 53 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/Arithmetics.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE FlexibleInstances #-} 3 | {-# LANGUAGE TypeFamilies #-} 4 | -- Required by old versions 5 | {-# LANGUAGE FlexibleContexts #-} 6 | 7 | module Spark.Core.Internal.Arithmetics( 8 | GeneralizedHomoReturn, 9 | GeneralizedHomo2, 10 | HomoColOp2, 11 | -- | Developer API 12 | performOp, 13 | ) where 14 | 15 | 16 | import Spark.Core.Internal.ColumnFunctions 17 | import Spark.Core.Internal.ColumnStructures 18 | import Spark.Core.Internal.DatasetStructures 19 | import Spark.Core.Internal.FunctionsInternals(projectColFunction2') 20 | import Spark.Core.Internal.Utilities 21 | 22 | {-| All the automatic conversions supported when lifting a -} 23 | type family GeneralizedHomoReturn x1 x2 where 24 | GeneralizedHomoReturn (Column ref x1) (Column ref x1) = Column ref x1 25 | GeneralizedHomoReturn (Column ref x1) DynColumn = DynColumn 26 | GeneralizedHomoReturn (Column ref x1) (LocalData x1) = Column ref x1 27 | GeneralizedHomoReturn (Column ref x1) LocalFrame = DynColumn 28 | GeneralizedHomoReturn DynColumn (Column ref x1) = DynColumn 29 | GeneralizedHomoReturn DynColumn DynColumn = DynColumn 30 | GeneralizedHomoReturn DynColumn (LocalData x1) = DynColumn 31 | GeneralizedHomoReturn DynColumn LocalFrame = DynColumn 32 | GeneralizedHomoReturn (LocalData x1) (Column ref x1) = Column ref x1 33 | GeneralizedHomoReturn (LocalData x1) DynColumn = DynColumn 34 | GeneralizedHomoReturn (LocalData x1) (LocalData x1) = LocalData x1 35 | GeneralizedHomoReturn (LocalData x1) LocalFrame = LocalFrame 36 | GeneralizedHomoReturn LocalFrame (Column ref x1) = DynColumn 37 | GeneralizedHomoReturn LocalFrame LocalFrame = LocalFrame 38 | 39 | -- The type of an homogeneous operation. 40 | -- TODO it would be nice to enforce this contstraint at the type level, 41 | -- but it is a bit more complex to do. 42 | type HomoColOp2 = UntypedColumnData -> UntypedColumnData -> UntypedColumnData 43 | 44 | {-| The class of types that can be lifted to operations onto Karps types. 45 | 46 | This is the class for operations on homogeneous types (the inputs and the 47 | output have the same underlying type). 48 | 49 | At its core, it takes a broadcasted operation that works on columns, and 50 | makes that operation available on other shapes. 51 | -} 52 | class GeneralizedHomo2 x1 x2 where 53 | _projectHomo :: x1 -> x2 -> HomoColOp2 -> GeneralizedHomoReturn x1 x2 54 | 55 | {-| Performs an operation, using a reference operation defined on columns. 
56 | -} 57 | performOp :: (GeneralizedHomo2 x1 x2) => 58 | HomoColOp2 -> 59 | x1 -> 60 | x2 -> 61 | GeneralizedHomoReturn x1 x2 62 | performOp f x1 x2 = _projectHomo x1 x2 f 63 | 64 | -- ******* INSTANCES ********* 65 | 66 | instance GeneralizedHomo2 DynColumn DynColumn where 67 | _projectHomo = _performDynDyn 68 | 69 | instance GeneralizedHomo2 (Column ref x) (Column ref x) where 70 | _projectHomo = _performCC 71 | 72 | instance GeneralizedHomo2 DynColumn (Column ref x) where 73 | _projectHomo dc1 c2 = _performDynDyn dc1 (untypedCol c2) 74 | 75 | instance GeneralizedHomo2 (Column ref x) DynColumn where 76 | _projectHomo c1 = _performDynDyn (untypedCol c1) 77 | 78 | instance GeneralizedHomo2 (Column ref x) (LocalData x) where 79 | _projectHomo c1 o2 = _projectHomo c1 (broadcast o2 c1) 80 | 81 | instance GeneralizedHomo2 (LocalData x) (Column ref x) where 82 | _projectHomo o1 c2 = _projectHomo (broadcast o1 c2) c2 83 | 84 | instance GeneralizedHomo2 (Column ref x) LocalFrame where 85 | _projectHomo c1 o2' = _projectHomo c1 (broadcast' o2' (untypedCol c1)) 86 | 87 | instance GeneralizedHomo2 LocalFrame (Column ref x) where 88 | _projectHomo o1' c2 = _projectHomo (broadcast' o1' (untypedCol c2)) c2 89 | 90 | instance GeneralizedHomo2 LocalFrame LocalFrame where 91 | _projectHomo o1' o2' f = 92 | let f' x y = f <$> x <*> y 93 | in projectColFunction2' f' o1' o2' 94 | 95 | 96 | _performDynDyn :: 97 | DynColumn -> DynColumn -> HomoColOp2 -> DynColumn 98 | _performDynDyn dc1 dc2 f = do 99 | c1 <- dc1 100 | c2 <- dc2 101 | -- TODO: add type guard 102 | let c = f c1 c2 103 | -- TODO: add dynamic check on the type of the return 104 | return (dropColType c) 105 | 106 | _performCC :: (HasCallStack) => 107 | Column ref x -> Column ref x -> HomoColOp2 -> Column ref x 108 | _performCC c1 c2 f = 109 | let sqlt = colType c1 110 | c = f (iUntypedColData c1) (iUntypedColData c2) 111 | c' = forceRight $ castCol (colRef c1) sqlt (pure c) 112 | in c' 113 | 114 | _performCO :: (HasCallStack) => 115 | Column ref x -> LocalData x -> HomoColOp2 -> Column ref x 116 | _performCO c1 o2 = _performCC c1 (broadcast o2 c1) 117 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/ArithmeticsImpl.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -fno-warn-orphans #-} 2 | -- Disabled for old versions 3 | -- {-# OPTIONS_GHC -fno-warn-redundant-constraints #-} 4 | {-# LANGUAGE RankNTypes #-} 5 | {-# LANGUAGE FlexibleInstances #-} 6 | {-# LANGUAGE OverloadedStrings #-} 7 | -- Required by old versions 8 | {-# LANGUAGE FlexibleContexts #-} 9 | 10 | {-| This module contains all the class instances and operators related 11 | to arithmetics with Datasets, Dataframes, Columns and Observables. 12 | -} 13 | module Spark.Core.Internal.ArithmeticsImpl( 14 | (.+), 15 | (.-), 16 | (./), 17 | div' 18 | ) where 19 | 20 | import Spark.Core.Internal.ColumnFunctions 21 | import Spark.Core.Internal.DatasetFunctions 22 | import Spark.Core.Internal.DatasetStructures 23 | import Spark.Core.Internal.LocalDataFunctions(constant) 24 | import Spark.Core.Internal.FunctionsInternals(projectColFunction2', projectColFunction') 25 | import Spark.Core.Internal.Arithmetics 26 | 27 | 28 | {-| A generalization of the addition for the Karps types. 29 | -} 30 | (.+) :: forall a1 a2. 
(Num a1, Num a2, GeneralizedHomo2 a1 a2) => 31 | a1 -> a2 -> GeneralizedHomoReturn a1 a2 32 | (.+) = performOp (homoColOp2 "+") 33 | 34 | {-| A generalization of the negation for the Karps types. 35 | -} 36 | (.-) :: forall a1 a2. (Num a1, Num a2, GeneralizedHomo2 a1 a2) => 37 | a1 -> a2 -> GeneralizedHomoReturn a1 a2 38 | (.-) = performOp (homoColOp2 "-") 39 | 40 | (./) :: (Fractional a1, Fractional a2, GeneralizedHomo2 a1 a2) => 41 | a1 -> a2 -> GeneralizedHomoReturn a1 a2 42 | (./) = performOp (homoColOp2 "/") 43 | 44 | div' :: forall a1 a2. (Num a1, Num a2, GeneralizedHomo2 a1 a2) => 45 | a1 -> a2 -> GeneralizedHomoReturn a1 a2 46 | div' = performOp (homoColOp2 "/") 47 | 48 | -- All the operations are defined from column operations 49 | -- This adds a little overhead, but it can be optimized by the backend. 50 | instance Num LocalFrame where 51 | (+) = projectColFunction2' (+) 52 | (-) = projectColFunction2' (-) 53 | (*) = projectColFunction2' (*) 54 | abs = projectColFunction' abs 55 | signum = projectColFunction' signum 56 | -- It will choose by default to use the Int type, which may not be 57 | -- what the user wants. 58 | -- In case there is some doubt, user should use typed operations. 59 | fromInteger x = asLocalObservable $ constant (fromInteger x :: Int) 60 | negate = projectColFunction' negate 61 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/BasicStructures.hs: -------------------------------------------------------------------------------- 1 | -- Some basic structures shared across most modules. 2 | 3 | -- TODO remove this file 4 | module Spark.Core.Internal.BasicStructures where 5 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/CachingUntyped.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ScopedTypeVariables #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | 5 | 6 | {-| Implementation of the caching interfaces for the compute data structures. 
7 | -} 8 | module Spark.Core.Internal.CachingUntyped( 9 | cachingType, 10 | autocacheGen 11 | ) where 12 | 13 | import Control.Monad.Except 14 | 15 | import Spark.Core.Internal.Caching 16 | import Spark.Core.Internal.DatasetStructures 17 | import Spark.Core.Internal.DatasetFunctions 18 | import Spark.Core.Internal.OpStructures 19 | import Spark.Core.Internal.PathsUntyped() 20 | import Spark.Core.Internal.DAGStructures 21 | import Spark.Core.StructuresInternal 22 | 23 | cachingType :: UntypedNode -> CacheTry NodeCachingType 24 | cachingType n = case nodeOp n of 25 | NodeLocalOp _ -> pure Stop 26 | NodeAggregatorReduction _ -> pure Stop 27 | NodeAggregatorLocalReduction _ -> pure Stop 28 | NodeOpaqueAggregator _ -> pure Stop 29 | NodeLocalLit _ _ -> pure Stop 30 | NodeStructuredTransform _ -> pure Through 31 | NodeDistributedLit _ _ -> pure Through 32 | NodeDistributedOp so | soName so == opnameCache -> 33 | pure $ CacheOp (vertexToId n) 34 | NodeDistributedOp so | soName so == opnameUnpersist -> 35 | case nodeParents n of 36 | [n'] -> pure $ UncacheOp (vertexToId n) (vertexToId n') 37 | _ -> throwError "Node is not valid uncache node" 38 | NodeDistributedOp so | soName so == opnameAutocache -> 39 | pure $ AutocacheOp (vertexToId n) 40 | NodeDistributedOp _ -> pure Through -- Nothing special for the other operations 41 | NodeBroadcastJoin -> pure Through 42 | NodeGroupedReduction _ -> pure Stop 43 | NodeReduction _ -> pure Stop 44 | NodePointer _ -> pure Stop -- It is supposed to be an observable 45 | 46 | autocacheGen :: AutocacheGen UntypedNode 47 | autocacheGen = AutocacheGen { 48 | deriveUncache = deriveUncache', 49 | deriveIdentity = deriveIdentity' 50 | } where 51 | -- TODO: use path-based identification in the future 52 | -- f :: String -> VertexId -> VertexId 53 | -- f s (VertexId bs) = VertexId . C8.pack . (++s) . C8.unpack $ bs 54 | deriveIdentity' (Vertex _ un) = 55 | let x = identity un 56 | vid' = VertexId . unNodeId . nodeId $ x -- f "_identity" vid 57 | in Vertex vid' x 58 | deriveUncache' (Vertex _ un) = 59 | let x = uncache un 60 | vid' = VertexId . unNodeId . nodeId $ x -- f "_uncache" vid 61 | in Vertex vid' x 62 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/CanRename.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -fno-warn-orphans #-} 2 | {-# LANGUAGE IncoherentInstances #-} 3 | {-# LANGUAGE ScopedTypeVariables #-} 4 | {-# LANGUAGE MultiParamTypeClasses #-} 5 | {-# LANGUAGE FlexibleInstances #-} 6 | {-# LANGUAGE TypeFamilies #-} 7 | {-# LANGUAGE OverloadedStrings #-} 8 | {-# LANGUAGE FlexibleContexts #-} 9 | {-# LANGUAGE RankNTypes #-} 10 | 11 | -- TODO(kps): this module stretches my understanding of Haskell. 12 | -- There is probably better than that. 13 | 14 | {-| Defines the notion of renaming something. 15 | 16 | This is closed over a few well-defined types. 17 | -} 18 | module Spark.Core.Internal.CanRename where 19 | 20 | import qualified Data.Text as T 21 | import Formatting 22 | 23 | import Spark.Core.Try 24 | import Spark.Core.StructuresInternal 25 | import Spark.Core.Internal.ColumnFunctions() 26 | import Spark.Core.Internal.ColumnStructures 27 | import Spark.Core.Internal.DatasetStructures 28 | import Spark.Core.Internal.Utilities 29 | 30 | -- | The class of types that can be renamed. 
31 | -- It is made generic because it covers 2 notions: 32 | -- - the name of a compute node that will eventually determine its compute path 33 | -- - the name of field (which may become an object path) 34 | -- This syntax tries to be convenient and will fail immediately 35 | -- for basic errors such as illegal characters. 36 | -- 37 | -- This could be revisited in the future, but it is a compromise 38 | -- on readability. 39 | class CanRename a txt where 40 | (@@) :: a -> txt -> a 41 | 42 | infixl 1 @@ 43 | 44 | 45 | instance forall ref a. CanRename (ColumnData ref a) FieldName where 46 | c @@ fn = c { _cReferingPath = Just fn } 47 | 48 | 49 | instance forall ref a s. (s ~ String) => CanRename (Column ref a) s where 50 | c @@ str = case fieldName (T.pack str) of 51 | Right fp -> c @@ fp 52 | Left msg -> 53 | -- The syntax check here is pretty lenient, so it fails, it has 54 | -- some good reasons. We stop here. 55 | failure $ sformat ("Could not make a field path out of string "%shown%" for column "%shown%":"%shown) str c msg 56 | 57 | instance CanRename DynColumn FieldName where 58 | (Right cd) @@ fn = Right (cd @@ fn) 59 | -- TODO better error handling 60 | x @@ _ = x 61 | 62 | instance forall s. (s ~ String) => CanRename DynColumn s where 63 | -- An error could happen when making a path out of a string. 64 | (Right cd) @@ str = case fieldName (T.pack str) of 65 | Right fp -> Right $ cd @@ fp 66 | Left msg -> 67 | -- The syntax check here is pretty lenient, so it fails, it has 68 | -- some good reasons. We stop here. 69 | tryError $ sformat ("Could not make a field path out of string "%shown%" for column "%shown%":"%shown) str cd msg 70 | -- TODO better error handling 71 | x @@ _ = x 72 | 73 | 74 | instance forall loc a s. (s ~ String) => CanRename (ComputeNode loc a) s where 75 | -- There is no need to update the id, as this field is not involved 76 | -- in the calculation of the id. 77 | -- TODO: make this fail immediately? If the name is wrong, it is 78 | -- harder to figure out what is happening. 79 | (@@) cn name = cn { _cnName = Just nn } where 80 | nn = NodeName . T.pack $ name 81 | 82 | instance forall loc a s. (s ~ String) => CanRename (Try (ComputeNode loc a)) s where 83 | (Right n) @@ str = Right (n @@ str) 84 | (Left n) @@ _ = Left n 85 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/Client.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | -- The communication protocol with the server 4 | 5 | module Spark.Core.Internal.Client where 6 | 7 | import Spark.Core.StructuresInternal 8 | import Spark.Core.Dataset(UntypedNode) 9 | import Spark.Core.Internal.Utilities 10 | import Spark.Core.Internal.TypesStructures(DataType) 11 | import Spark.Core.Internal.TypesFunctions() 12 | 13 | import Data.Text(Text, pack) 14 | import Data.Aeson 15 | import Data.Aeson.Types(Parser) 16 | import GHC.Generics 17 | 18 | 19 | -- Imports for the client 20 | 21 | {-| The ID of an RDD in Spark. 22 | -} 23 | data RDDId = RDDId { 24 | unRDDId :: !Int 25 | } deriving (Eq, Show, Ord) 26 | 27 | data LocalSessionId = LocalSessionId { 28 | unLocalSession :: !Text 29 | } deriving (Eq, Show) 30 | 31 | data Computation = Computation { 32 | cSessionId :: !LocalSessionId, 33 | cId :: !ComputationID, 34 | cNodes :: ![UntypedNode], 35 | -- Non-empty 36 | cTerminalNodes :: ![NodePath], 37 | -- The node at the top of the computation. 
38 | -- Must be part of the terminal nodes. 39 | cCollectingNode :: !NodePath, 40 | -- This redundant information is not serialized. 41 | -- It is used internally to track the resulting nodes. 42 | cTerminalNodeIds :: ![NodeId] 43 | } deriving (Show, Generic) 44 | 45 | data BatchComputationKV = BatchComputationKV { 46 | bckvLocalPath :: !NodePath, 47 | bckvDeps :: ![NodePath], 48 | bckvResult :: !PossibleNodeStatus 49 | } deriving (Show, Generic) 50 | 51 | data BatchComputationResult = BatchComputationResult { 52 | bcrTargetLocalPath :: !NodePath, 53 | bcrResults :: ![(NodePath, [NodePath], PossibleNodeStatus)] 54 | } deriving (Show, Generic) 55 | 56 | data RDDInfo = RDDInfo { 57 | rddiId :: !RDDId, 58 | rddiClassName :: !Text, 59 | rddiRepr :: !Text, 60 | rddiParents :: ![RDDId] 61 | } deriving (Show, Generic) 62 | 63 | data SparkComputationItemStats = SparkComputationItemStats { 64 | scisRddInfo :: ![RDDInfo] 65 | } deriving (Show, Generic) 66 | 67 | data PossibleNodeStatus = 68 | NodeQueued 69 | | NodeRunning 70 | | NodeFinishedSuccess !(Maybe NodeComputationSuccess) !(Maybe SparkComputationItemStats) 71 | | NodeFinishedFailure NodeComputationFailure deriving (Show, Generic) 72 | 73 | data NodeComputationSuccess = NodeComputationSuccess { 74 | -- Because Row requires additional information to be deserialized. 75 | ncsData :: Value, 76 | -- The data type is also available, but it is not going to be parsed for now. 77 | ncsDataType :: DataType 78 | } deriving (Show, Generic) 79 | 80 | data NodeComputationFailure = NodeComputationFailure { 81 | ncfMessage :: !Text 82 | } deriving (Show, Generic) 83 | 84 | 85 | -- **** AESON INSTANCES *** 86 | 87 | instance ToJSON LocalSessionId where 88 | toJSON = toJSON . unLocalSession 89 | 90 | instance FromJSON RDDId where 91 | parseJSON x = RDDId <$> parseJSON x 92 | 93 | instance FromJSON RDDInfo where 94 | parseJSON = withObject "RDDInfo" $ \o -> do 95 | _id <- o .: "id" 96 | className <- o .: "className" 97 | repr <- o .: "repr" 98 | parents <- o .: "parents" 99 | return $ RDDInfo _id className repr parents 100 | 101 | instance FromJSON SparkComputationItemStats where 102 | parseJSON = withObject "SparkComputationItemStats" $ \o -> do 103 | rddinfo <- o .: "rddInfo" 104 | return $ SparkComputationItemStats rddinfo 105 | 106 | instance FromJSON BatchComputationKV where 107 | parseJSON = withObject "BatchComputationKV" $ \o -> do 108 | np <- o .: "localPath" 109 | deps <- o .: "pathDependencies" 110 | res <- o .: "result" 111 | return $ BatchComputationKV np deps res 112 | 113 | instance FromJSON BatchComputationResult where 114 | parseJSON = withObject "BatchComputationResult" $ \o -> do 115 | kvs <- o .: "results" 116 | tlp <- o .: "targetLocalPath" 117 | let f (BatchComputationKV k d v) = (k, d, v) 118 | return $ BatchComputationResult tlp (f <$> kvs) 119 | 120 | instance FromJSON NodeComputationSuccess where 121 | parseJSON = withObject "NodeComputationSuccess" $ \o -> NodeComputationSuccess 122 | <$> o .: "content" 123 | <*> o .: "type" 124 | 125 | -- Because we get a row back, we need to supply a SQLType for deserialization. 126 | instance FromJSON PossibleNodeStatus where 127 | parseJSON = 128 | let parseSuccess :: Object -> Parser PossibleNodeStatus 129 | parseSuccess o = NodeFinishedSuccess 130 | <$> o .:? "finalResult" 131 | <*> o .:? "stats" 132 | parseFailure :: Object -> Parser PossibleNodeStatus 133 | parseFailure o = 134 | (NodeFinishedFailure . 
NodeComputationFailure) <$> o .: pack "finalError" 135 | in 136 | withObject "PossibleNodeStatus" $ \o -> do 137 | status <- o .: pack "status" 138 | case pack status of 139 | "running" -> return NodeRunning 140 | "finished_success" -> parseSuccess o 141 | "finished_failure" -> parseFailure o 142 | "scheduled" -> return NodeQueued 143 | _ -> failure $ pack ("FromJSON PossibleNodeStatus " ++ show status) 144 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/ColumnStandard.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -fno-warn-orphans #-} 2 | {-# LANGUAGE RankNTypes #-} 3 | {-# LANGUAGE ScopedTypeVariables #-} 4 | {-# LANGUAGE MultiParamTypeClasses #-} 5 | {-# LANGUAGE FlexibleInstances #-} 6 | {-# LANGUAGE TypeFamilies #-} 7 | {-# LANGUAGE OverloadedStrings #-} 8 | {-# LANGUAGE FlexibleContexts #-} 9 | 10 | {-| The standard library of functions operating on columns only. 11 | -} 12 | module Spark.Core.Internal.ColumnStandard( 13 | asDoubleCol 14 | ) where 15 | 16 | 17 | import Spark.Core.Internal.ColumnStructures 18 | import Spark.Core.Internal.ColumnFunctions 19 | import Spark.Core.Internal.TypesGenerics(buildType) 20 | 21 | asDoubleCol :: (Num a) => Column ref a -> Column ref Double 22 | asDoubleCol = makeColOp1 "double" buildType 23 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/ColumnStructures.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE RankNTypes #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | {-# LANGUAGE FlexibleInstances #-} 5 | {-# LANGUAGE ScopedTypeVariables #-} 6 | 7 | module Spark.Core.Internal.ColumnStructures where 8 | 9 | import Control.Arrow ((&&&)) 10 | import Data.Function(on) 11 | import Data.Vector(Vector) 12 | 13 | import Spark.Core.Internal.DatasetStructures 14 | import Spark.Core.Internal.DatasetFunctions() 15 | import Spark.Core.Internal.RowStructures 16 | import Spark.Core.Internal.TypesStructures 17 | import Spark.Core.Internal.OpStructures 18 | import Spark.Core.StructuresInternal 19 | import Spark.Core.Try 20 | 21 | {-| The data structure that implements the notion of data columns. 22 | 23 | The type on this one may either be a Cell or a proper type. 24 | 25 | A column of data from a dataset 26 | The ref is a reference potentially to the originating 27 | dataset, but it may be more general than that to perform 28 | type-safe tricks. 29 | 30 | Unlike Spark, columns are always attached to a reference dataset or dataframe. 31 | One cannot materialize a column out of thin air. In order to broadcast a value 32 | along a given column, the `broadcast` function is provided. 33 | 34 | TODO: try something like this https://www.vidarholen.net/contents/junk/catbag.html 35 | -} 36 | data ColumnData ref a = ColumnData { 37 | _cOrigin :: !UntypedDataset, 38 | _cType :: !DataType, 39 | _cOp :: !GeneralizedColOp, 40 | -- The name in the dataset. 41 | -- If not set, it will be deduced from the operation. 42 | _cReferingPath :: !(Maybe FieldName) 43 | } 44 | 45 | {-| A generalization of the column operation. 
46 | 47 | This structure is useful to perform some extra operations not supported by 48 | the Spark engine: 49 | - express joins with an observable 50 | - keep track of DAGs of column operations (not implemented yet) 51 | -} 52 | data GeneralizedColOp = 53 | GenColExtraction !FieldPath 54 | | GenColFunction !SqlFunctionName !(Vector GeneralizedColOp) 55 | | GenColLit !DataType !Cell 56 | -- This is the extra operation that needs to be flattened with a broadcast. 57 | | BroadcastColOp !UntypedLocalData 58 | | GenColStruct !(Vector GeneralizedTransField) 59 | deriving (Eq, Show) 60 | 61 | data GeneralizedTransField = GeneralizedTransField { 62 | gtfName :: !FieldName, 63 | gtfValue :: !GeneralizedColOp 64 | } deriving (Eq, Show) 65 | 66 | {-| A column of data from a dataset or a dataframe. 67 | 68 | This column is typed: the operations on this column will be 69 | validated by Haskell's type inference. 70 | -} 71 | type Column ref a = ColumnData ref a 72 | 73 | {-| An untyped column of data from a dataset or a dataframe. 74 | 75 | This column is untyped and may not be properly constructed. Any error 76 | will be found during the analysis phase at runtime. 77 | -} 78 | type DynColumn = Try (ColumnData UnknownReference Cell) 79 | 80 | 81 | -- | (dev) 82 | -- The type of untyped column data. 83 | type UntypedColumnData = ColumnData UnknownReference Cell 84 | 85 | {-| (dev) 86 | A column for which the type of the cells is unavailable (at the type level), 87 | but for which the origin is available at the type level. 88 | -} 89 | type GenericColumn ref = Column ref Cell 90 | 91 | {-| A dummy data type that indicates the data reference is missing. 92 | -} 93 | data UnknownReference 94 | 95 | {-| A tag that carries the reference information of a column at a 96 | type level. This is useful when creating columns. 97 | 98 | See ref and colRef. 99 | -} 100 | data ColumnReference a = ColumnReference 101 | 102 | instance forall ref a. Eq (ColumnData ref a) where 103 | (==) = (==) `on` (_cOrigin &&& _cType &&& _cOp &&& _cReferingPath) 104 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/ComputeDag.hs: -------------------------------------------------------------------------------- 1 | 2 | 3 | module Spark.Core.Internal.ComputeDag where 4 | 5 | import Data.Foldable(toList) 6 | import qualified Data.Map.Strict as M 7 | import qualified Data.Vector as V 8 | import Data.Vector(Vector) 9 | 10 | import Spark.Core.Internal.DAGStructures 11 | import Spark.Core.Internal.DAGFunctions 12 | 13 | {-| A DAG of computation nodes. 14 | 15 | At a high level, it is a total function with a number of inputs and a number 16 | of outputs. 17 | 18 | Note about the edges: the edges flow along the path of dependencies: 19 | the inputs are the start points, and the outputs are the end points of the 20 | graph. 21 | 22 | -} 23 | data ComputeDag v e = ComputeDag { 24 | -- The edges that make up the DAG 25 | cdEdges :: !(AdjacencyMap v e), 26 | -- All the vertices of the graph 27 | -- Sorted by lexicographic order + node id for uniqueness 28 | cdVertices :: !(Vector (Vertex v)), 29 | -- The inputs of the computation graph. These correspond to the 30 | -- sinks of the dependency graph. 31 | cdInputs :: !(Vector (Vertex v)), 32 | -- The outputs of the computation graph. These correspond to the 33 | -- sources of the dependency graph.
34 | cdOutputs :: !(Vector (Vertex v)) 35 | } deriving (Show) 36 | 37 | 38 | -- | Conversion 39 | computeGraphToGraph :: ComputeDag v e -> Graph v e 40 | computeGraphToGraph cg = 41 | Graph (cdEdges cg) (cdVertices cg) 42 | 43 | -- | Conversion 44 | graphToComputeGraph :: Graph v e -> ComputeDag v e 45 | graphToComputeGraph g = 46 | ComputeDag { 47 | cdEdges = gEdges g, 48 | cdVertices = gVertices g, 49 | -- We work on the graph of dependencies (not flows) 50 | -- The sources correspond to the outputs. 51 | cdInputs = V.fromList $ graphSinks g, 52 | cdOutputs = V.fromList $ graphSources g 53 | } 54 | 55 | _mapVerticesAdj :: (Vertex v -> v') -> AdjacencyMap v e -> AdjacencyMap v' e 56 | _mapVerticesAdj f m = 57 | let f1 ve = 58 | let vx = veEndVertex ve 59 | d' = f vx in 60 | ve { veEndVertex = vx { vertexData = d' } } 61 | f' v = f1 <$> v 62 | in M.map f' m 63 | 64 | mapVertices :: (Vertex v -> v') -> ComputeDag v e -> ComputeDag v' e 65 | mapVertices f cd = 66 | let f' vx = vx { vertexData = f vx } 67 | in ComputeDag { 68 | cdEdges = _mapVerticesAdj f (cdEdges cd), 69 | cdVertices = f' <$> cdVertices cd, 70 | cdInputs = f' <$> cdInputs cd, 71 | cdOutputs = f' <$> cdOutputs cd 72 | } 73 | 74 | mapVertexData :: (v -> v') -> ComputeDag v e -> ComputeDag v' e 75 | mapVertexData f = mapVertices (f . vertexData) 76 | 77 | buildCGraph :: (GraphOperations v e, Show v) => 78 | v -> DagTry (ComputeDag v e) 79 | buildCGraph n = graphToComputeGraph <$> buildGraph n 80 | 81 | graphDataLexico :: ComputeDag v e -> [v] 82 | graphDataLexico cd = vertexData <$> toList (cdVertices cd) 83 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/ContextInteractive.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | 4 | -- | Functions to create and manipulate one default context. 5 | -- 6 | -- This is most appropriate when working in an interactive session, 7 | -- during which it is usually clear that there is a single 8 | -- Spark context in use. 9 | -- 10 | -- This module uses unsafe Haskell code that should not be used 11 | -- outside prototyping in an interactive REPL. In any good case, 12 | -- you should use the SparkState monad. 13 | module Spark.Core.Internal.ContextInteractive( 14 | SparkInteractiveException, 15 | createSparkSessionDef, 16 | exec1Def, 17 | exec1Def', 18 | closeSparkSessionDef, 19 | execStateDef, 20 | computationStatsDef, 21 | currentSessionDef 22 | ) where 23 | 24 | import qualified Data.Vector as V 25 | import Control.Exception 26 | import Control.Monad.Catch(throwM) 27 | import Data.IORef 28 | import Data.Typeable 29 | import Control.Monad.State(runStateT) 30 | import Data.Text 31 | import System.IO.Unsafe(unsafePerformIO) 32 | import Control.Monad.Logger(runStdoutLoggingT) 33 | 34 | 35 | import Spark.Core.Internal.Client(BatchComputationResult) 36 | import Spark.Core.Internal.ContextStructures 37 | import Spark.Core.Internal.DatasetStructures 38 | import Spark.Core.Internal.DatasetFunctions(untypedLocalData) 39 | import Spark.Core.Internal.ContextIOInternal 40 | import Spark.Core.Internal.RowGenericsFrom(FromSQL, cellToValue) 41 | import Spark.Core.Internal.RowStructures(Cell) 42 | import Spark.Core.StructuresInternal 43 | import Spark.Core.Try 44 | 45 | -- The global session reference. Should not be accessed outside 46 | -- this file. 
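-- The functions below all go through this single global reference. A typical
-- interactive (GHCi) session could look like the following sketch; the
-- configuration values are purely illustrative, and `constant` comes from
-- Spark.Core.Internal.LocalDataFunctions:
--
-- > let conf = SparkSessionConf {
-- >              confEndPoint = "http://localhost",
-- >              confPort = 8081,
-- >              confPollingIntervalMillis = 500,
-- >              confRequestedSessionName = "demo",
-- >              confUseNodePrunning = False }
-- > createSparkSessionDef conf
-- > x <- exec1Def (constant (2 :: Int) + constant (3 :: Int))
-- > -- x == 5
-- > closeSparkSessionDef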
47 | _globalSessionRef :: IORef (Maybe SparkSession) 48 | {-# NOINLINE _globalSessionRef #-} 49 | _globalSessionRef = unsafePerformIO (newIORef Nothing) 50 | 51 | -- | The exception thrown when a request cannot be completed 52 | -- in an interactive session. 53 | data SparkInteractiveException = SparkInteractiveException { 54 | _sieInner :: NodeError 55 | } deriving Typeable 56 | 57 | instance Show SparkInteractiveException where 58 | show (SparkInteractiveException inner) = 59 | show inner 60 | 61 | instance Exception SparkInteractiveException 62 | 63 | {- | Creates a spark session that will be used as the default session. 64 | 65 | If a session already exists, an exception will be thrown. 66 | -} 67 | createSparkSessionDef :: SparkSessionConf -> IO () 68 | createSparkSessionDef conf = do 69 | current <- _currentSession 70 | case current of 71 | Nothing -> 72 | return () 73 | Just _ -> 74 | -- TODO let users change the state 75 | _throw "A default context already exist. If you wish to modify the exsting context, you must use modifySparkConfDef" 76 | new <- createSparkSession' conf 77 | _setSession new 78 | return () 79 | 80 | {- | Executes a command using the default spark session. 81 | 82 | This is the most unsafe way of running a command: 83 | it executes a command using the default spark session, and 84 | throws an exception if any error happens. 85 | -} 86 | exec1Def :: (FromSQL a) => LocalData a -> IO a 87 | exec1Def ld = do 88 | c <- exec1Def' (pure (untypedLocalData ld)) 89 | _forceEither $ cellToValue c 90 | 91 | exec1Def' :: LocalFrame -> IO Cell 92 | exec1Def' lf = do 93 | ld <- _getOrThrow lf 94 | res <- execStateDef (executeCommand1' ld) 95 | _getOrThrow res 96 | 97 | {-| Runs the computation described in the state transform, using the default 98 | Spark session. 99 | 100 | Will throw an exception if no session currently exists. 101 | -} 102 | execStateDef :: SparkState a -> IO a 103 | execStateDef s = do 104 | ctx <- _currentSessionOrThrow 105 | (res, newSt) <- (runStateT . runStdoutLoggingT) s ctx 106 | _setSession newSt 107 | return res 108 | 109 | {-| Closes the default session. The default session is empty after this call 110 | completes. 111 | 112 | NOTE: This does not currently clear up the resources! It is a stub implementation 113 | used in testing. 114 | -} 115 | closeSparkSessionDef :: IO () 116 | closeSparkSessionDef = do 117 | _ <- _removeSession 118 | return () 119 | 120 | computationStatsDef :: ComputationID -> IO BatchComputationResult 121 | computationStatsDef compId = execStateDef (computationStats compId) 122 | 123 | currentSessionDef :: IO (Maybe SparkSession) 124 | currentSessionDef = _currentSession 125 | 126 | _currentSession :: IO (Maybe SparkSession) 127 | _currentSession = readIORef _globalSessionRef 128 | 129 | _setSession :: SparkSession -> IO () 130 | _setSession st = writeIORef _globalSessionRef (Just st) 131 | 132 | _removeSession :: IO (Maybe SparkSession) 133 | _removeSession = do 134 | current <- _currentSession 135 | _ <- writeIORef _globalSessionRef Nothing 136 | return current 137 | 138 | _currentSessionOrThrow :: IO SparkSession 139 | _currentSessionOrThrow = do 140 | mCtx <- _currentSession 141 | case mCtx of 142 | Nothing -> 143 | _throw "No default context found. 
You must first create a default spark context with createSparkSessionDef" 144 | Just ctx -> return ctx 145 | 146 | 147 | _getOrThrow :: Try a -> IO a 148 | _getOrThrow (Right x) = return x 149 | _getOrThrow (Left err) = throwM (SparkInteractiveException err) 150 | 151 | _forceEither :: Either Text a -> IO a 152 | _forceEither = _getOrThrow . tryEither 153 | 154 | _throw :: Text -> IO a 155 | _throw txt = throwM $ 156 | SparkInteractiveException Error { 157 | ePath = NodePath V.empty, 158 | eMessage = txt 159 | } 160 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/ContextStructures.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | module Spark.Core.Internal.ContextStructures( 4 | SparkSessionConf(..), 5 | SparkSession(..), 6 | SparkState, 7 | SparkStatePure, 8 | ComputeGraph, 9 | HdfsPath(..), 10 | NodeCacheInfo(..), 11 | NodeCacheStatus(..), 12 | SparkStateT, 13 | SparkStatePureT 14 | ) where 15 | 16 | import Data.Text(Text) 17 | import Control.Monad.State(StateT, State) 18 | import Control.Monad.Logger(LoggingT) 19 | 20 | import Spark.Core.Internal.Client(LocalSessionId) 21 | import Spark.Core.Internal.ComputeDag(ComputeDag) 22 | import Spark.Core.Internal.OpStructures(HdfsPath(..)) 23 | import Spark.Core.Internal.Pruning 24 | import Spark.Core.Internal.DatasetStructures(UntypedNode, StructureEdge) 25 | 26 | -- | The configuration of a remote spark session in Karps. 27 | data SparkSessionConf = SparkSessionConf { 28 | -- | The URL of the end point. 29 | confEndPoint :: !Text, 30 | -- | The port used to configure the end point. 31 | confPort :: !Int, 32 | -- | (internal) the polling interval 33 | confPollingIntervalMillis :: !Int, 34 | -- | (optional) the requested name of the session. 35 | -- This name must obey a number of rules: 36 | -- - it must consist in alphanumerical and -,_: [a-zA-Z0-9\-_] 37 | -- - if it already exists on the server, it will be reconnected to 38 | -- 39 | -- The default value is "" (a new random context name will be chosen). 40 | confRequestedSessionName :: !Text, 41 | {-| If enabled, attempts to prune the computation graph as much as possible. 42 | 43 | This option is useful in interactive sessions when long chains of computations 44 | are extracted. This forces the execution of only the missing parts. 45 | The algorithm is experimental, so disabling it is a safe option. 46 | 47 | Disabled by default. 48 | -} 49 | confUseNodePrunning :: !Bool 50 | } deriving (Show) 51 | 52 | -- | A session in Spark. 53 | -- Encapsualates all the state needed to communicate with Spark 54 | -- and to perfor some simple optimizations on the code. 55 | data SparkSession = SparkSession { 56 | ssConf :: !SparkSessionConf, 57 | ssId :: !LocalSessionId, 58 | ssCommandCounter :: !Integer, 59 | ssNodeCache :: !NodeCache 60 | } deriving (Show) 61 | 62 | 63 | 64 | -- | Represents the state of a session and accounts for the communication 65 | -- with the server. 66 | type SparkState a = SparkStateT IO a 67 | 68 | -- More minimalistic state transforms when doing pure evaluation. 69 | -- (internal type) 70 | -- TODO: use the transformer 71 | type SparkStatePure x = State SparkSession x 72 | 73 | type SparkStatePureT m = StateT SparkSession m 74 | type SparkStateT m = LoggingT (SparkStatePureT m) 75 | 76 | {-| internal 77 | 78 | A graph of computations. This graph is a direct acyclic graph. Each node is 79 | associated to a global path. 
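When a plain 'Graph' is needed (for example by the pruning pass in
Spark.Core.Internal.Pruning), the conversions from Spark.Core.Internal.ComputeDag
are used. A minimal sketch, assuming `cg` is a 'ComputeGraph' already in scope:

> g :: Graph UntypedNode StructureEdge
> g = computeGraphToGraph cg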
80 | -} 81 | type ComputeGraph = ComputeDag UntypedNode StructureEdge 82 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/DAGStructures.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE DeriveGeneric #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | 5 | {-| Data structures to represent Directed Acyclic Graphs (DAGs). 6 | 7 | -} 8 | module Spark.Core.Internal.DAGStructures where 9 | 10 | import qualified Data.Map.Strict as M 11 | import qualified Data.Text as T 12 | import Data.ByteString(ByteString) 13 | import Data.Vector(Vector) 14 | import Data.Foldable(toList) 15 | import Data.Hashable(Hashable) 16 | import GHC.Generics(Generic) 17 | import Formatting 18 | 19 | import Spark.Core.Internal.Utilities 20 | 21 | -- | The unique ID of a vertex. 22 | newtype VertexId = VertexId { unVertexId :: ByteString } deriving (Eq, Ord, Generic) 23 | 24 | 25 | -- | An edge in a graph, parametrized by some payload. 26 | data Edge e = Edge { 27 | edgeFrom :: !VertexId, 28 | edgeTo :: !VertexId, 29 | edgeData :: !e 30 | } 31 | 32 | -- | A vertex in a graph, parametrized by some payload. 33 | data Vertex v = Vertex { 34 | vertexId :: !VertexId, 35 | vertexData :: !v 36 | } 37 | 38 | {-| An edge, along with its end node. 39 | -} 40 | data VertexEdge e v = VertexEdge { 41 | veEndVertex :: !(Vertex v), 42 | veEdge :: !(Edge e) } 43 | 44 | {-| The adjacency map of a graph. 45 | 46 | The node Id corresponds to the start node, the pairs are the end node and 47 | and the edge to reach to the node. There may be multiple edges leading to the 48 | same node. 49 | -} 50 | type AdjacencyMap v e = M.Map VertexId (Vector (VertexEdge e v)) 51 | 52 | -- | The representation of a graph. 53 | -- 54 | -- In all the project, it is considered as a DAG. 55 | data Graph v e = Graph { 56 | gEdges :: !(AdjacencyMap v e), 57 | gVertices :: !(Vector (Vertex v)) 58 | } 59 | 60 | -- | Graph operations on types that are supposed to 61 | -- represent vertices. 62 | class GraphVertexOperations v where 63 | vertexToId :: v -> VertexId 64 | expandVertexAsVertices :: v -> [v] 65 | 66 | -- | Graph operations on types that are supposed to represent 67 | -- edges. 
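-- For instance, a toy vertex type that stores its own neighbors could be made
-- usable by the graph-building functions as follows (sketch; `MyNode` is a
-- hypothetical type, not part of this module):
--
-- > data MyNode = MyNode { myId :: ByteString, myChildren :: [MyNode] }
-- >
-- > instance GraphVertexOperations MyNode where
-- >   vertexToId = VertexId . myId
-- >   expandVertexAsVertices = myChildren
-- >
-- > instance GraphOperations MyNode () where
-- >   expandVertex n = [((), c) | c <- myChildren n]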
68 | class (GraphVertexOperations v) => GraphOperations v e where 69 | expandVertex :: v -> [(e,v)] 70 | 71 | instance Functor Vertex where 72 | fmap f vx = vx { vertexData = f (vertexData vx) } 73 | 74 | instance Functor Edge where 75 | fmap f ed = ed { edgeData = f (edgeData ed) } 76 | 77 | instance (Show v) => Show (Vertex v) where 78 | show vx = "Vertex(vId=" ++ show (vertexId vx) ++ " v=" ++ show (vertexData vx) ++ ")" 79 | 80 | instance (Show e) => Show (Edge e) where 81 | show ed = "Edge(from=" ++ show (edgeFrom ed) ++ " to=" ++ show (edgeTo ed) ++ " e=" ++ show (edgeData ed) ++ ")" 82 | 83 | instance (Show v, Show e) => Show (VertexEdge e v) where 84 | show (VertexEdge v e) = "(" ++ show v ++ ", " ++ show e ++ ")" 85 | 86 | instance (Show v, Show e) => Show (Graph v e) where 87 | show g = 88 | let vxs = toList $ gVertices g <&> \(Vertex vid x) -> 89 | sformat (sh%":"%sh) vid x 90 | vedges = foldMap toList (M.elems (gEdges g)) 91 | edges = (veEdge <$> vedges) <&> \(Edge efrom eto x) -> 92 | sformat (sh%"->"%sh%"->"%sh) efrom x eto 93 | -- eds = (M.elems (gEdges g)) `foldMap` \v -> 94 | -- (toList v) <&> 95 | vxs' = T.intercalate "," vxs 96 | eds' = T.intercalate " " edges 97 | str = T.concat ["Graph{", vxs', ", ", eds', "}"] 98 | in T.unpack str 99 | 100 | instance Hashable VertexId 101 | 102 | instance Show VertexId where 103 | show (VertexId bs) = let s = show bs in 104 | if length s > 9 then 105 | (drop 1 . take 6) s ++ ".." 106 | else 107 | s 108 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/DatasetStructures.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE RankNTypes #-} 3 | {-# LANGUAGE TypeSynonymInstances #-} 4 | {-# LANGUAGE FlexibleInstances #-} 5 | {-# LANGUAGE TypeFamilies #-} 6 | 7 | module Spark.Core.Internal.DatasetStructures where 8 | 9 | import Data.Vector(Vector) 10 | 11 | import Spark.Core.StructuresInternal 12 | import Spark.Core.Try 13 | import Spark.Core.Row 14 | import Spark.Core.Internal.OpStructures 15 | import Spark.Core.Internal.TypesStructures 16 | 17 | {-| (internal) The main data structure that represents a data node in the 18 | computation graph. 19 | 20 | This data structure forms the backbone of computation graphs expressed 21 | with spark operations. 22 | 23 | loc is a typed locality tag. 24 | a is the type of the data, as seen by the Haskell compiler. If erased, it 25 | will be a Cell type. 26 | -} 27 | -- TODO: separate the topology info from the node info. It will help when 28 | -- building the graphs. 29 | data ComputeNode loc a = ComputeNode { 30 | -- | The id of the node. 31 | -- 32 | -- Non strict because it may be expensive. 33 | _cnNodeId :: NodeId, 34 | -- The following fields are used to build a unique ID to 35 | -- a compute node: 36 | 37 | -- | The operation associated to this node. 38 | _cnOp :: !NodeOp, 39 | -- | The type of the node 40 | _cnType :: !DataType, 41 | -- | The direct parents of the node. The order of the parents is important 42 | -- for the semantics of the operation. 43 | _cnParents :: !(Vector UntypedNode), 44 | -- | A set of extra dependencies that can be added to force an order between 45 | -- the nodes. 46 | -- 47 | -- The order is not important, they are sorted by ID. 48 | -- 49 | -- TODO(kps) add this one to the id 50 | _cnLogicalDeps :: !(Vector UntypedNode), 51 | -- | The locality of this node. 
52 | -- 53 | -- TODO(kps) add this one to the id 54 | _cnLocality :: !Locality, 55 | -- Attributes that are not included in the id 56 | -- These attributes are mostly for the user to relate to the nodes. 57 | -- They are not necessary for the computation. 58 | -- 59 | -- | The name 60 | _cnName :: !(Maybe NodeName), 61 | -- | A set of nodes considered as the logical input for this node. 62 | -- This has no influence on the calculation of the id and is used 63 | -- for organization purposes only. 64 | _cnLogicalParents :: !(Maybe (Vector UntypedNode)), 65 | -- | The path of this node in a computation flow. 66 | -- 67 | -- This path includes the node name. 68 | -- Not strict because it may be expensive to compute. 69 | -- By default it only contains the name of the node (i.e. the node is 70 | -- attached to the root) 71 | _cnPath :: NodePath 72 | } deriving (Eq) 73 | 74 | -- (internal) Phantom type tags for the locality 75 | data TypedLocality loc = TypedLocality { unTypedLocality :: !Locality } deriving (Eq, Show) 76 | data LocLocal 77 | data LocDistributed 78 | data LocUnknown 79 | 80 | -- (developer) The type for which we drop all the information expressed in 81 | -- types. 82 | -- 83 | -- This is useful to express parent dependencies (pending a more type-safe 84 | -- interface) 85 | type UntypedNode = ComputeNode LocUnknown Cell 86 | 87 | -- (internal) A dataset for which we have dropped type information. 88 | -- Used internally by columns. 89 | type UntypedDataset = Dataset Cell 90 | 91 | type UntypedLocalData = LocalData Cell 92 | 93 | {-| A typed collection of distributed data. 94 | 95 | Most operations on datasets are type-checked by the Haskell 96 | compiler: the type tag associated to this dataset is guaranteed 97 | to be convertible to a proper Haskell type. In particular, building 98 | a Dataset of dynamic cells is guaranteed to never happen. 99 | 100 | If you want to do untyped operations and gain 101 | some flexibility, consider using UDataFrames instead. 102 | 103 | Computations with Datasets and observables are generally checked for 104 | correctness using the type system of Haskell. 105 | -} 106 | type Dataset a = ComputeNode LocDistributed a 107 | 108 | {-| 109 | A unit of data that can be accessed by the user. 110 | 111 | This is a typed unit of data. The type is guaranteed to be a proper 112 | type accessible by the Haskell compiler (instead of simply a Cell 113 | type, which represents types only accessible at runtime). 114 | 115 | TODO(kps) rename to Observable 116 | -} 117 | type LocalData a = ComputeNode LocLocal a 118 | 119 | 120 | {-| 121 | The dataframe type. Any dataset can be converted to a dataframe. 122 | 123 | For the Spark users: this is different from the definition of the 124 | dataframe in Spark, which is a dataset of rows. Because the support 125 | for single columns is more awkward in the case of rows, it is more 126 | natural to generalize datasets to contain cells. 127 | When communicating with Spark, though, single cells are wrapped 128 | into rows with a single field, as Spark does. 129 | -} 130 | type DataFrame = Try UntypedDataset 131 | 132 | {-| Observable, whose type can only be inferred at runtime and 133 | that can fail to be computed at runtime. 134 | 135 | Any observable can be converted to an untyped 136 | observable. 137 | 138 | Untyped observables are more flexible and can be combined in 139 | an arbitrary manner, but they will fail during the validation of 140 | the Spark computation graph.
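For example, any typed observable can be injected into this untyped world
(sketch; `obs` stands for some 'LocalData' value already in scope, and
'untypedLocalData' comes from Spark.Core.Internal.DatasetFunctions):

> lf :: LocalFrame
> lf = pure (untypedLocalData obs)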
141 | 142 | TODO(kps) rename to DynObservable 143 | -} 144 | type LocalFrame = Try UntypedLocalData 145 | 146 | type UntypedNode' = Try UntypedNode 147 | 148 | {-| The different paths of edges in the compute DAG of nodes, at the 149 | start of computations. 150 | 151 | - scope edges specify the scope of a node for naming. They are not included in 152 | the id. 153 | 154 | -} 155 | data NodeEdge = ScopeEdge | DataStructureEdge StructureEdge deriving (Show, Eq) 156 | 157 | {-| The edges in a compute DAG, after name resolution (which is where most of 158 | the checks and computations are being done) 159 | 160 | - parent edges are the direct parents of a node, the only ones required for 161 | defining computations. They are included in the id. 162 | - logical edges define logical dependencies between nodes to force a specific 163 | ordering of the nodes. They are included in the id. 164 | -} 165 | data StructureEdge = ParentEdge | LogicalEdge deriving (Show, Eq) 166 | 167 | 168 | class CheckedLocalityCast loc where 169 | _validLocalityValues :: [TypedLocality loc] 170 | 171 | -- Class to retrieve the locality associated to a type. 172 | -- Is it better to use type classes? 173 | class (CheckedLocalityCast loc) => IsLocality loc where 174 | _getTypedLocality :: TypedLocality loc 175 | 176 | instance CheckedLocalityCast LocLocal where 177 | _validLocalityValues = [TypedLocality Local] 178 | 179 | instance CheckedLocalityCast LocDistributed where 180 | _validLocalityValues = [TypedLocality Distributed] 181 | 182 | -- LocLocal is a locality associated to Local 183 | instance IsLocality LocLocal where 184 | _getTypedLocality = TypedLocality Local 185 | 186 | -- LocDistributed is a locality associated to Distributed 187 | instance IsLocality LocDistributed where 188 | _getTypedLocality = TypedLocality Distributed 189 | 190 | instance CheckedLocalityCast LocUnknown where 191 | _validLocalityValues = [TypedLocality Distributed, TypedLocality Local] 192 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/Filter.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | module Spark.Core.Internal.Filter where 4 | 5 | import Data.Aeson(Value(Null)) 6 | import qualified Data.Text as T 7 | import Formatting 8 | 9 | import Spark.Core.Internal.DatasetStructures 10 | import Spark.Core.Internal.ColumnStructures 11 | import Spark.Core.Internal.ColumnFunctions(colType, untypedCol, iUntypedColData, colOrigin) 12 | import Spark.Core.Internal.DatasetFunctions 13 | import Spark.Core.Internal.RowGenerics(ToSQL) 14 | import Spark.Core.Internal.LocalDataFunctions() 15 | import Spark.Core.Internal.FunctionsInternals 16 | import Spark.Core.Internal.OpStructures 17 | import Spark.Core.Internal.TypesStructures 18 | import Spark.Core.Internal.Utilities 19 | import Spark.Core.Internal.TypesFunctions(arrayType') 20 | import Spark.Core.Internal.RowStructures(Cell) 21 | import Spark.Core.Types 22 | import Spark.Core.Try 23 | 24 | 25 | {-| Performs a filtering operation on columns of a dataset. 26 | 27 | The first column is the reference column that is used to filter out values of 28 | the second column. 29 | -} 30 | filterCol :: Column ref Bool -> Column ref a -> Dataset a 31 | filterCol = missing "filterCol" 32 | 33 | {-| Filters a column depending to only keep the strict data. 34 | 35 | This function is useful to filter out some data within a structure, some of which 36 | may not be strict. 
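For example, the intent is to be able to write the following (illustrative
only, since the function is not implemented yet; `maybeKeys` and `payloads`
are placeholder columns over the same dataset):

> pairs :: Dataset (Int, Text)
> pairs = filterOpt maybeKeys payloads

with `maybeKeys :: Column ref (Maybe Int)` and `payloads :: Column ref Text`.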
37 | -} 38 | filterOpt :: Column ref (Maybe a) -> Column ref b -> Dataset (a, b) 39 | filterOpt = missing "filterOpt" 40 | 41 | {-| Only keeps the strict values of a column. 42 | -} 43 | filterMaybe :: Column ref (Maybe a) -> Dataset a 44 | filterMaybe = missing "filterMaybe" 45 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/Joins.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | {-| Exposes some of Spark's joining algorithms. 4 | -} 5 | module Spark.Core.Internal.Joins( 6 | join, 7 | join', 8 | joinInner, 9 | joinInner', 10 | joinObs, 11 | joinObs' 12 | ) where 13 | 14 | import qualified Data.Aeson as A 15 | import qualified Data.Vector as V 16 | import Control.Arrow 17 | 18 | import Spark.Core.Internal.ColumnStructures 19 | import Spark.Core.Internal.ColumnFunctions 20 | import Spark.Core.Internal.DatasetStructures 21 | import Spark.Core.Internal.DatasetFunctions 22 | import Spark.Core.Internal.FunctionsInternals 23 | import Spark.Core.Internal.OpStructures 24 | import Spark.Core.Internal.TypesStructures 25 | import Spark.Core.Internal.Utilities 26 | import Spark.Core.Internal.TypesFunctions(structTypeFromFields) 27 | import Spark.Core.Try 28 | import Spark.Core.StructuresInternal(unsafeFieldName) 29 | 30 | {-| Standard (inner) join on two sets of data. 31 | -} 32 | join :: Column ref1 key -> Column ref1 value1 -> Column ref2 key -> Column ref2 value2 -> Dataset (key, value1, value2) 33 | join = joinInner 34 | 35 | {-| Untyped version of the standard join. 36 | -} 37 | join' :: DynColumn -> DynColumn -> DynColumn -> DynColumn -> DataFrame 38 | join' = joinInner' 39 | 40 | {-| Explicit inner join. 41 | -} 42 | joinInner :: Column ref1 key -> Column ref1 value1 -> Column ref2 key -> Column ref2 value2 -> Dataset (key, value1, value2) 43 | joinInner key1 val1 key2 val2 = unsafeCastDataset (forceRight df) where 44 | df = joinInner' (untypedCol key1) (untypedCol val1) (untypedCol key2) (untypedCol val2) 45 | 46 | {-| Untyped version of the inner join. 47 | -} 48 | joinInner' :: DynColumn -> DynColumn -> DynColumn -> DynColumn -> DataFrame 49 | joinInner' key1 val1 key2 val2 = do 50 | df1 <- pack' (struct' [key1, val1]) 51 | df2 <- pack' (struct' [key2, val2]) 52 | dt <- _joinTypeInner key1 val1 val2 53 | let so = StandardOperator { soName = "org.spark.Join", soOutputType = dt, soExtra = A.String "inner" } 54 | let ds = emptyDataset (NodeDistributedOp so) (SQLType dt) 55 | let f ds' = ds' { _cnParents = V.fromList [untyped df1, untyped df2] } 56 | return $ updateNode ds f 57 | 58 | {-| Broadcasts an observable alongside a dataset to make it available as an 59 | extra column. 60 | -} 61 | -- This is the low-level operation that is used to implement the other 62 | -- broadcast operations. 63 | joinObs :: (HasCallStack) => Column ref val -> LocalData val' -> Dataset (val, val') 64 | joinObs c ld = 65 | -- TODO: has a forcing at the last moment so that we can at least 66 | -- have stronger guarantees in the type coercion. 67 | unsafeCastDataset $ forceRight $ joinObs' (untypedCol c) (pure (untypedLocalData ld)) 68 | 69 | {-| Broadcasts an observable along side a dataset to make it available as 70 | an extra column. 71 | 72 | The resulting dataframe has 2 columns: 73 | - one column called 'values' 74 | - one column called 'broadcast' 75 | 76 | Note: this is a low-level operation. Users may want to use broadcastObs instead. 
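A minimal sketch (where `someCol` and `someObs` are placeholders for a column
and an observable already in scope):

> df :: DataFrame
> df = joinObs' (untypedCol someCol) (pure (untypedLocalData someObs))

This is essentially how the typed wrapper 'joinObs' above is implemented.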
77 | -} 78 | -- TODO: what is the difference with broadcastPair??? 79 | joinObs' :: DynColumn -> LocalFrame -> DataFrame 80 | joinObs' dc lf = do 81 | let df = pack' dc 82 | dc' <- df 83 | c <- asCol' df 84 | o <- lf 85 | st <- structTypeFromFields [(unsafeFieldName "values", unSQLType (colType c)), (unsafeFieldName "broadcast", unSQLType (nodeType o))] 86 | let sqlt = SQLType (StrictType (Struct st)) 87 | return $ emptyDataset NodeBroadcastJoin sqlt `parents` [untyped dc', untyped o] 88 | 89 | _joinTypeInner :: DynColumn -> DynColumn -> DynColumn -> Try DataType 90 | _joinTypeInner kcol col1 col2 = do 91 | cs <- sequence [kcol, col1, col2] 92 | st <- structTypeFromFields $ (colFieldName &&& unSQLType . colType) <$> cs 93 | return $ StrictType (Struct st) 94 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/LocalDataFunctions.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -fno-warn-orphans #-} 2 | {-# LANGUAGE TypeSynonymInstances #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | {-# LANGUAGE FlexibleInstances #-} 5 | {-# LANGUAGE ScopedTypeVariables #-} 6 | 7 | -- A number of functions related to local data. 8 | 9 | module Spark.Core.Internal.LocalDataFunctions( 10 | constant, 11 | iPackTupleObs 12 | ) where 13 | 14 | import Data.Aeson(toJSON, Value(Null)) 15 | import qualified Data.Text as T 16 | import qualified Data.List.NonEmpty as N 17 | import Control.Exception.Base(assert) 18 | 19 | import Spark.Core.Internal.DatasetFunctions 20 | import Spark.Core.Internal.DatasetStructures 21 | import Spark.Core.Internal.TypesFunctions 22 | import Spark.Core.Internal.TypesStructures 23 | import Spark.Core.Internal.OpStructures 24 | import Spark.Core.Internal.Utilities 25 | import Spark.Core.Internal.TypesGenerics(SQLTypeable, buildType) 26 | import Spark.Core.Row 27 | 28 | constant :: (ToSQL a, SQLTypeable a) => a -> LocalData a 29 | constant cst = 30 | let 31 | sqlt = buildType 32 | dt = unSQLType sqlt 33 | in emptyLocalData (NodeLocalLit dt (toJSON (valueToCell cst))) sqlt 34 | 35 | {-| (developer API) 36 | 37 | This function takes a non-empty list of observables and puts them 38 | into a structure. The names of each element is _0 ... _(n-1) 39 | -} 40 | iPackTupleObs :: N.NonEmpty UntypedLocalData -> UntypedLocalData 41 | iPackTupleObs ulds = 42 | let dt = structTypeTuple' (unSQLType . nodeType <$> ulds) 43 | so = StandardOperator { 44 | soName = "org.spark.LocalPack", 45 | soOutputType = dt, 46 | soExtra = Null } 47 | op = NodeLocalOp so 48 | in emptyLocalData op (SQLType dt) 49 | `parents` (untyped <$> N.toList ulds) 50 | 51 | instance (Num a, ToSQL a, SQLTypeable a) => Num (LocalData a) where 52 | -- TODO: convert all that to use column operations 53 | (+) = _binOp "org.spark.LocalPlus" 54 | (-) = _binOp "org.spark.LocalMinus" 55 | (*) = _binOp "org.spark.LocalMult" 56 | abs = _unaryOp "org.spark.LocalAbs" 57 | signum = _unaryOp "org.spark.LocalSignum" 58 | fromInteger x = constant (fromInteger x :: a) 59 | negate = _unaryOp "org.spark.LocalNegate" 60 | 61 | instance forall a. (ToSQL a, Enum a, SQLTypeable a) => Enum (LocalData a) where 62 | toEnum x = constant (toEnum x :: a) 63 | fromEnum = failure "Cannot use fromEnum against a local data object" 64 | -- TODO(kps) some of the others are still available for implementation 65 | 66 | instance (Num a, Ord a) => Ord (LocalData a) where 67 | compare = failure "You cannot compare instances of LocalData. (yet)." 
68 | min = _binOp "org.spark.LocalMin" 69 | max = _binOp "org.spark.LocalMax" 70 | 71 | instance forall a. (Real a, ToSQL a, SQLTypeable a) => Real (LocalData a) where 72 | toRational = failure "Cannot convert LocalData to rational" 73 | 74 | instance (ToSQL a, Integral a, SQLTypeable a) => Integral (LocalData a) where 75 | quot = _binOp "org.spark.LocalQuotient" 76 | rem = _binOp "org.spark.LocalReminder" 77 | div = _binOp "org.spark.LocalDiv" 78 | mod = _binOp "org.spark.LocalMod" 79 | quotRem = failure "quotRem is not implemented (yet). Use quot and rem." 80 | divMod = failure "divMod is not implemented (yet). Use div and mod." 81 | toInteger = failure "Cannot convert LocalData to integer" 82 | 83 | instance (ToSQL a, SQLTypeable a, Fractional a) => Fractional (LocalData a) where 84 | fromRational x = constant (fromRational x :: a) 85 | (/) = _binOp "org.spark.LocalDiv" 86 | 87 | 88 | _unaryOp :: T.Text -> LocalData a -> LocalData a 89 | _unaryOp optxt ld = 90 | let so = StandardOperator { 91 | soName = optxt, 92 | soOutputType = unSQLType $ nodeType ld, 93 | soExtra = Null } 94 | op = NodeLocalOp so in 95 | emptyLocalData op (nodeType ld) 96 | `parents` [untyped ld] 97 | 98 | _binOp :: T.Text -> LocalData a -> LocalData a -> LocalData a 99 | _binOp optxt ld1 ld2 = assert (nodeType ld1 == nodeType ld2) $ 100 | let so = StandardOperator { 101 | soName = optxt, 102 | soOutputType = unSQLType $ nodeType ld1, 103 | soExtra = Null } 104 | op = NodeLocalOp so in 105 | emptyLocalData op (nodeType ld1) 106 | `parents` [untyped ld1, untyped ld2] 107 | 108 | -- TODO(kps) more input tests 109 | _binOp' :: StandardOperator -> LocalData a -> LocalData a -> LocalData a 110 | _binOp' so ld1 ld2 = assert (nodeType ld1 == nodeType ld2) $ 111 | let op = NodeLocalOp so in 112 | emptyLocalData op (nodeType ld1) 113 | `parents` [untyped ld1, untyped ld2] 114 | 115 | _intOperator :: T.Text -> StandardOperator 116 | _intOperator optxt = StandardOperator { 117 | soName = optxt, 118 | soOutputType = intType, 119 | soExtra = Null 120 | } 121 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/LocatedBase.hs: -------------------------------------------------------------------------------- 1 | -- Taken from https://hackage.haskell.org/package/located-base-0.1.1.0/docs/src/GHC-Err-Located.html 2 | 3 | {-# LANGUAGE CPP #-} 4 | {-# LANGUAGE ConstraintKinds #-} 5 | {-# LANGUAGE ImplicitParams #-} 6 | {-# LANGUAGE OverloadedStrings #-} 7 | module Spark.Core.Internal.LocatedBase (error, undefined, HasCallStack, showCallStack) where 8 | 9 | #if __GLASGOW_HASKELL__ < 800 10 | 11 | import GHC.SrcLoc 12 | import GHC.Stack (CallStack, getCallStack) 13 | import Prelude hiding (error, undefined) 14 | import qualified Prelude 15 | import Text.Printf 16 | import Data.Text(Text, unpack) 17 | 18 | type HasCallStack = (?callStack :: CallStack) 19 | 20 | error :: HasCallStack => Text -> a 21 | error msg = Prelude.error (unpack msg ++ "\n" ++ showCallStack ?callStack) 22 | 23 | undefined :: HasCallStack => a 24 | undefined = error "Prelude.undefined" 25 | 26 | showCallStack :: CallStack -> String 27 | showCallStack stk = case getCallStack stk of 28 | _:locs -> unlines $ "Callstack:" : map format locs 29 | _ -> Prelude.error "showCallStack: empty call-stack" 30 | where 31 | format (fn, loc) = printf " %s, called at %s" fn (showSrcLoc loc) 32 | 33 | #else 34 | 35 | import GHC.Stack(HasCallStack, CallStack, prettyCallStack) 36 | import qualified GHC.Stack() 37 | import 
Data.Text(Text, unpack) 38 | import qualified Prelude 39 | import Prelude((.)) 40 | 41 | {-# DEPRECATED showCallStack "use GHC.Stack.prettyCallStack instead" #-} 42 | showCallStack :: CallStack -> Prelude.String 43 | showCallStack = prettyCallStack 44 | 45 | error :: HasCallStack => Text -> a 46 | error = Prelude.error . unpack 47 | 48 | undefined :: HasCallStack => a 49 | undefined = error "Prelude.undefined" 50 | 51 | #endif 52 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/ObservableStandard.hs: -------------------------------------------------------------------------------- 1 | 2 | module Spark.Core.Internal.ObservableStandard( 3 | asDouble) where 4 | 5 | import Spark.Core.Internal.ColumnStandard 6 | import Spark.Core.Internal.DatasetStructures 7 | import Spark.Core.Internal.FunctionsInternals 8 | import Spark.Core.Internal.TypesGenerics(SQLTypeable) 9 | 10 | {-| Casts a local data as a double. 11 | -} 12 | asDouble :: (Num a, SQLTypeable a) => LocalData a -> LocalData Double 13 | asDouble = projectColFunction asDoubleCol 14 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/Paths.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -fno-warn-orphans #-} 2 | {-# LANGUAGE TypeSynonymInstances #-} 3 | {-# LANGUAGE FlexibleInstances #-} 4 | {-# LANGUAGE MultiParamTypeClasses #-} 5 | {-# LANGUAGE ScopedTypeVariables #-} 6 | {-# LANGUAGE OverloadedStrings #-} 7 | {-# LANGUAGE FlexibleContexts #-} 8 | 9 | module Spark.Core.Internal.Paths( 10 | HasNodeName(..), 11 | PathEdge(..), 12 | computePaths, 13 | assignPaths', 14 | -- For testing: 15 | Scopes, 16 | ParentSplit(..), 17 | mergeScopes, 18 | gatherPaths, 19 | iGetScopes0, 20 | ) where 21 | 22 | import qualified Data.Map.Strict as M 23 | import qualified Data.Set as S 24 | import qualified Data.Vector as V 25 | import Data.List(sort) 26 | import Data.Maybe(fromMaybe, catMaybes) 27 | import Data.Foldable(foldr', foldl', toList) 28 | import Formatting 29 | 30 | import Spark.Core.Try 31 | import Spark.Core.Internal.DAGStructures 32 | import Spark.Core.Internal.ComputeDag 33 | import Spark.Core.StructuresInternal 34 | 35 | class HasNodeName v where 36 | -- Retrieves the name of the node 37 | getNodeName :: v -> NodeName 38 | -- Assigns a path to the node 39 | assignPath :: v -> NodePath -> v 40 | 41 | {-| The types of edges for the calculation of paths. 42 | - same level parent -> the node should have the same prefix as its parents 43 | - inner edge -> the parent defines the scope of this node 44 | -} 45 | data PathEdge = SameLevelEdge | InnerEdge deriving (Show, Eq) 46 | 47 | -- Assigns paths in a graph. 48 | -- 49 | computePaths :: (HasNodeName v) => 50 | ComputeDag v PathEdge -> Try (M.Map VertexId NodePath) 51 | computePaths cd = 52 | let nodecg = mapVertexData getNodeName cd 53 | in _computePaths nodecg 54 | 55 | assignPaths' :: (HasNodeName v) => 56 | M.Map VertexId NodePath -> ComputeDag v e -> ComputeDag v e 57 | assignPaths' m cd = 58 | let f vx = 59 | let old = NodePath . V.singleton $ getNodeName (vertexData vx) 60 | new = M.findWithDefault old (vertexId vx) m 61 | in assignPath (vertexData vx) new 62 | in mapVertices f cd 63 | 64 | -- The main function to perform the pass assignments. 65 | -- It starts from the graph of dependencies and from the local name info, 66 | -- and computes the complete paths (if possible), starting from the fringe. 
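-- From the outside, computePaths and assignPaths' are meant to be chained, as
-- done in Spark.Core.Internal.PathsUntyped. A sketch, assuming `cd` is a
-- ComputeDag over PathEdge edges whose vertices have a HasNodeName instance:
--
-- > do paths <- computePaths cd
-- >    return (assignPaths' paths cd)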
67 | _computePaths :: ComputeDag NodeName PathEdge -> Try (M.Map VertexId NodePath) 68 | _computePaths cg = 69 | let 70 | scopes = iGetScopes0 (toList . cdOutputs $ cg) (_splitParents' (cdEdges cg)) 71 | paths = gatherPaths scopes 72 | nodeNames = M.fromList [(vertexId vx, vertexData vx)| vx <- toList . cdVertices $ cg] 73 | lookup' nid = M.lookup nid nodeNames 74 | f :: VertexId -> [[VertexId]] -> Try NodePath 75 | f nid ls = case ls of 76 | [l] -> 77 | return . NodePath . V.fromList . catMaybes $ lookup' <$> (l ++ [nid]) 78 | x -> 79 | tryError $ sformat ("Node has too many paths: node="%shown%" discovered paths ="%shown) nid x 80 | nodePaths = M.traverseWithKey f paths 81 | in nodePaths 82 | 83 | -- (private) 84 | -- The top-level scope may not have an ID associated to it 85 | type Scopes = M.Map (Maybe VertexId) (S.Set VertexId) 86 | 87 | 88 | -- (internal) 89 | -- The separation of parents into logical and inner parents 90 | data ParentSplit a = ParentSplit { 91 | psLogical :: ![Vertex a], 92 | psInner :: ![Vertex a] 93 | } deriving (Show) 94 | 95 | _lookupOrEmpty :: Scopes -> Maybe VertexId -> [VertexId] 96 | _lookupOrEmpty scopes mnid = 97 | S.toList $ fromMaybe S.empty (M.lookup mnid scopes) 98 | 99 | mergeScopes :: Scopes -> Scopes -> Scopes 100 | mergeScopes = M.unionWith S.union 101 | 102 | _singleScope :: Maybe VertexId -> VertexId -> Scopes 103 | _singleScope mKey nid = M.singleton mKey (S.singleton nid) 104 | 105 | -- For each node, finds the one, or more than one if possible, path(s) 106 | -- from the root to the node (which is itself not included at the end) 107 | -- The gathering of paths may not be exaustive. 108 | gatherPaths :: Scopes -> M.Map VertexId [[VertexId]] 109 | gatherPaths scopes = M.map sort $ _gatherPaths0 scopes start where 110 | start = _lookupOrEmpty scopes Nothing 111 | 112 | _gatherPaths0 :: Scopes -> [VertexId] -> M.Map VertexId [[VertexId]] 113 | _gatherPaths0 _ [] = M.empty 114 | _gatherPaths0 scopes (nid : t) = 115 | let 116 | inner = _lookupOrEmpty scopes (Just nid) 117 | innerPaths = _gatherPaths0 scopes inner 118 | innerWithHead = M.map (\l -> (nid : ) <$> l) innerPaths 119 | thisPaths = M.singleton nid [[]] 120 | innerPaths2 = M.unionWith (++) innerWithHead thisPaths 121 | in M.unionWith (++) innerPaths2 (_gatherPaths0 scopes t) 122 | 123 | 124 | iGetScopes0 :: forall a. (Show a) => 125 | [Vertex a] -> 126 | (Vertex a -> ParentSplit a) -> 127 | Scopes 128 | iGetScopes0 [] _splitter = M.empty 129 | iGetScopes0 (h : t) splitter = 130 | let 131 | startScope = _singleScope Nothing (vertexId h) 132 | folder :: Scopes -> Vertex a -> Scopes 133 | folder current un = 134 | if M.member (Just (vertexId un)) current then 135 | current 136 | else 137 | let split = _getScopes' splitter Nothing S.empty un current 138 | in mergeScopes split current 139 | -- Important here to use a left folder, as we want to start with the head 140 | -- and move down the list. 141 | in foldl' folder startScope (h : t) 142 | 143 | _splitParents' :: AdjacencyMap v PathEdge -> Vertex v -> ParentSplit v 144 | _splitParents' m vx = 145 | let ves = V.toList $ M.findWithDefault V.empty (vertexId vx) m 146 | scope = [veEndVertex ve | ve <- ves, edgeData (veEdge ve) == SameLevelEdge] 147 | parents' = [veEndVertex ve | ve <- ves, edgeData (veEdge ve) == InnerEdge] 148 | in ParentSplit { psLogical = scope, psInner = parents' } 149 | 150 | 151 | -- TODO(kps) this recursive code is most probably going to explode for deep stacks 152 | _getScopes' :: forall a. 
(Show a) => 153 | (Vertex a -> ParentSplit a) -> -- The expansion of a node into logical and inner nodes 154 | Maybe VertexId -> -- the current parent (if any) 155 | S.Set VertexId -> -- the current boundary to respect 156 | Vertex a -> -- the current node to expand 157 | Scopes -> -- the scopes seen so far 158 | Scopes 159 | _getScopes' splitter mScopeId boundary un scopes = 160 | if S.member (vertexId un) boundary then 161 | scopes 162 | else 163 | let 164 | split = splitter un 165 | logParents = psLogical split 166 | innerParents = psInner split 167 | -- A fold on the parents 168 | parF :: Vertex a -> Scopes -> Scopes 169 | parF = 170 | -- Same boundary and parent, but update the scopes 171 | _getScopes' splitter mScopeId boundary 172 | scopesPar = foldr' parF scopes logParents 173 | -- Now work on the inner nodes: 174 | vid = vertexId un 175 | boundary' = S.fromList (vertexId <$> logParents) 176 | inF :: Vertex a -> Scopes -> Scopes 177 | inF = 178 | -- parent is current, boundary is current logical 179 | _getScopes' splitter (Just vid) boundary' 180 | scopesIn = foldr' inF scopesPar innerParents 181 | scopesFinal = scopesIn 182 | `mergeScopes` _singleScope mScopeId vid 183 | `mergeScopes` M.singleton (Just vid) S.empty 184 | in scopesFinal 185 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/PathsUntyped.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -fno-warn-orphans #-} 2 | {-# LANGUAGE MultiParamTypeClasses #-} 3 | {-# LANGUAGE TypeSynonymInstances #-} 4 | {-# LANGUAGE FlexibleInstances #-} 5 | {-# LANGUAGE FlexibleContexts #-} 6 | {-# LANGUAGE OverloadedStrings #-} 7 | 8 | module Spark.Core.Internal.PathsUntyped( 9 | assignPathsUntyped, 10 | tieNodes 11 | ) where 12 | 13 | import qualified Data.Vector as V 14 | import qualified Data.Map.Strict as M 15 | import Data.Maybe(fromMaybe) 16 | import Data.Foldable(toList) 17 | import Data.List(nub) 18 | import Control.Arrow((&&&)) 19 | import Formatting 20 | import Control.Monad.Identity 21 | 22 | import Spark.Core.Internal.DAGStructures 23 | import Spark.Core.Internal.DAGFunctions 24 | import Spark.Core.Internal.ComputeDag 25 | import Spark.Core.Internal.DatasetStructures 26 | import Spark.Core.Internal.DatasetFunctions 27 | import Spark.Core.Internal.Paths 28 | import Spark.Core.Internal.Utilities 29 | import Spark.Core.Try 30 | import Spark.Core.StructuresInternal(unNodeId) 31 | 32 | instance GraphVertexOperations UntypedNode where 33 | vertexToId = VertexId . unNodeId . nodeId 34 | expandVertexAsVertices n = 35 | nodeParents n 36 | ++ fromMaybe [] (nodeLogicalParents n) 37 | ++ nodeLogicalDependencies n 38 | 39 | instance GraphOperations UntypedNode NodeEdge where 40 | expandVertex n = 41 | -- The logical parents are more important than the parents 42 | let 43 | -- If the logical parents are not specified, the logical parents are the 44 | -- direct parents. 45 | scopeNodes = fromMaybe (nodeParents n) (nodeLogicalParents n) 46 | loParents = [(ScopeEdge, v) | v <- scopeNodes] 47 | -- The direct parents. They may overload with the scoping parents, but 48 | -- this will be checked during the name analysis. 
49 | parents' = (const (DataStructureEdge ParentEdge) &&& id) <$> nodeParents n 50 | loDeps = (const (DataStructureEdge LogicalEdge) &&& id) <$> nodeLogicalDependencies n 51 | in loParents ++ parents' ++ loDeps 52 | 53 | instance HasNodeName UntypedNode where 54 | getNodeName = nodeName 55 | assignPath n p = updateNode n $ \n' -> n' { _cnPath = p } 56 | 57 | 58 | -- Stitches the nodes together to make sure that the edges in the graph also 59 | -- correspond to the dependencies in the nodes themselves. 60 | -- This does not update the nodeIds 61 | -- This must happen before the pruning is performed, otherwise the node IDs will 62 | -- not match. 63 | tieNodes :: ComputeDag UntypedNode StructureEdge -> ComputeDag UntypedNode StructureEdge 64 | tieNodes cd = 65 | let g = computeGraphToGraph cd 66 | f :: UntypedNode -> [(UntypedNode, StructureEdge)] -> Identity UntypedNode 67 | f v l = 68 | let parents' = V.fromList [n | (n, e) <- l, e == ParentEdge] 69 | logDeps = V.fromList [n | (n, e) <- l, e == LogicalEdge] 70 | res = updateNode v $ \n -> n { 71 | _cnParents = parents', 72 | _cnLogicalDeps = logDeps, 73 | _cnLogicalParents = Nothing } 74 | in return res 75 | g2 = runIdentity $ graphMapVertices g f 76 | in graphToComputeGraph g2 77 | 78 | -- Assigs the paths, and drops the scoping edges. 79 | assignPathsUntyped :: (HasCallStack) => 80 | ComputeDag UntypedNode NodeEdge -> Try (ComputeDag UntypedNode StructureEdge) 81 | assignPathsUntyped cd = do 82 | let pathCGraph = _getPathCDag cd 83 | paths <- computePaths pathCGraph 84 | let g = computeGraphToGraph $ assignPaths' paths cd 85 | let f ScopeEdge = [] 86 | f (DataStructureEdge x) = [x] 87 | let g' = graphFlatMapEdges g f 88 | return $ graphToComputeGraph g' 89 | 90 | 91 | -- transforms node edges into path edges 92 | _cleanEdges :: (HasCallStack) => [VertexEdge NodeEdge v] -> [VertexEdge PathEdge v] 93 | _cleanEdges [] = [] 94 | _cleanEdges (h : t) = 95 | let vid = vertexId (veEndVertex h) 96 | others = [ve | ve <- t, (vertexId . veEndVertex $ ve) /= vid] 97 | sames = [ve | ve <- t, (vertexId . veEndVertex $ ve) == vid] 98 | rest = _cleanEdges others 99 | e = veEdge h 100 | -- If there multiple edges between nodes, they are dropped. 101 | -- This distinction is not required for names. 102 | eData = nub $ edgeData . veEdge <$> (h : sames) 103 | eData' = case eData of 104 | [DataStructureEdge ParentEdge] -> Just InnerEdge 105 | [DataStructureEdge ParentEdge, ScopeEdge] -> Just SameLevelEdge 106 | [ScopeEdge, DataStructureEdge ParentEdge] -> Just SameLevelEdge 107 | [ScopeEdge] -> Just SameLevelEdge 108 | [DataStructureEdge LogicalEdge] -> Nothing 109 | l -> failure (sformat ("Could not understand combination "%shown) l) 110 | res = case eData' of 111 | Just v -> (h { veEdge = e { edgeData = v } }) : rest 112 | Nothing -> rest 113 | in res 114 | 115 | 116 | _getPathCDag :: (HasCallStack) => ComputeDag v NodeEdge -> ComputeDag v PathEdge 117 | _getPathCDag cd = 118 | let adj' = M.map (V.fromList . _cleanEdges . toList) (cdEdges cd) 119 | in cd { cdEdges = adj' } 120 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/Pruning.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ScopedTypeVariables #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | 5 | {-| Methods to prune the computation graph. 
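Nodes whose value is already known to the server (according to the node cache
kept in the session) are cut from the graph and replaced by lightweight pointer
nodes, so that only the missing parts of a computation are resubmitted. A
minimal sketch, assuming a cache `c` and a compute graph `g` are in scope:

> pruned = pruneGraphDefault c g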
6 | -} 7 | module Spark.Core.Internal.Pruning( 8 | NodeCacheStatus(..), 9 | NodeCacheInfo(..), 10 | NodeCache, 11 | pruneGraph, 12 | pruneGraphDefault, 13 | emptyNodeCache 14 | ) where 15 | 16 | import Data.HashMap.Strict as HM 17 | 18 | import Spark.Core.StructuresInternal(NodeId, NodePath, ComputationID) 19 | import Spark.Core.Internal.DatasetStructures(UntypedNode, StructureEdge) 20 | import Spark.Core.Internal.DAGFunctions 21 | import Spark.Core.Internal.DAGStructures 22 | import Spark.Core.Internal.DatasetFunctions 23 | import Spark.Core.Internal.OpStructures 24 | 25 | 26 | {-| The status of a node being computed. 27 | 28 | On purpose, it does not store data. This is meant to be 29 | only the control plane of the compuations. 30 | -} 31 | data NodeCacheStatus = 32 | NodeCacheRunning 33 | | NodeCacheError 34 | | NodeCacheSuccess 35 | deriving (Eq, Show) 36 | 37 | {-| This structure describes the last time a node was observed by the 38 | controller, and the state it was in. 39 | 40 | This information is used to do smart computation pruning, by assuming 41 | that the observables are kept by the Spark processes. 42 | -} 43 | data NodeCacheInfo = NodeCacheInfo { 44 | nciStatus :: !NodeCacheStatus, 45 | nciComputation :: !ComputationID, 46 | nciPath :: !NodePath 47 | } deriving (Eq, Show) 48 | 49 | type NodeCache = HM.HashMap NodeId NodeCacheInfo 50 | 51 | emptyNodeCache :: NodeCache 52 | emptyNodeCache = HM.empty 53 | 54 | {-| It assumes a compute graph, NOT a dependency dag. 55 | -} 56 | pruneGraph :: (Show v) => 57 | -- The current cache 58 | NodeCache -> 59 | (v -> NodeId) -> 60 | -- A function to create a node replacement 61 | (v -> NodeCacheInfo -> v) -> 62 | -- The graph 63 | Graph v StructureEdge -> 64 | Graph v StructureEdge 65 | pruneGraph c getNodeId f g = 66 | -- Prune the node that we do not want 67 | let depGraph = reverseGraph g 68 | fop v = if HM.member (getNodeId v) c 69 | then CutChildren 70 | else Keep 71 | filtered = graphFilterVertices fop depGraph 72 | -- Bring back to normal flow. 73 | comFiltered = reverseGraph filtered 74 | -- Replace the nodes in the cache by place holders. 75 | -- This is done on the compute graph. 76 | repOp v = case HM.lookup (getNodeId v) c of 77 | Just nci -> f v nci 78 | Nothing -> v 79 | g' = graphMapVertices' repOp comFiltered 80 | in g' 81 | 82 | pruneGraphDefault :: 83 | NodeCache -> Graph UntypedNode StructureEdge -> Graph UntypedNode StructureEdge 84 | pruneGraphDefault c = pruneGraph c nodeId _createNodeCache 85 | 86 | _createNodeCache :: UntypedNode -> NodeCacheInfo -> UntypedNode 87 | _createNodeCache n nci = 88 | let name = "org.spark.PlaceholderCache" 89 | no = NodePointer (Pointer (nciComputation nci) (nciPath nci)) 90 | n2 = emptyNodeStandard (nodeLocality n) (nodeType n) name 91 | `updateNodeOp` no 92 | in n2 93 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/RowGenerics.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE PolyKinds #-} 2 | {-# LANGUAGE TypeOperators #-} 3 | {-# LANGUAGE DeriveAnyClass #-} 4 | {-# LANGUAGE FlexibleContexts #-} 5 | {-# LANGUAGE FlexibleInstances #-} 6 | {-# LANGUAGE DefaultSignatures #-} 7 | {-# LANGUAGE ScopedTypeVariables #-} 8 | {-# LANGUAGE BangPatterns #-} 9 | {-# LANGUAGE MultiParamTypeClasses #-} 10 | 11 | 12 | -- The generic implementation for the protocol that converts to 13 | -- and from SQL cells. 
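-- For example, a record type only needs a GHC.Generics instance for the
-- conversion to work (sketch; `MyPair` is a hypothetical user type):
--
-- > data MyPair = MyPair { mpKey :: Int, mpValue :: Text } deriving (Generic)
-- > instance ToSQL MyPair
-- >
-- > c :: Cell
-- > c = valueToCell (MyPair 1 (pack "a"))
-- > -- c is a RowArray containing an IntElement and a StringElement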
14 | -- Going through JSON is not recommended because of precision loss 15 | -- for the numbers, and other issues related to numbers. 16 | module Spark.Core.Internal.RowGenerics( 17 | ToSQL, 18 | valueToCell, 19 | ) where 20 | 21 | import GHC.Generics 22 | import qualified Data.Vector as V 23 | import Data.Text(pack, Text) 24 | 25 | import Spark.Core.Internal.RowStructures 26 | import Spark.Core.Internal.Utilities 27 | 28 | -- We need to differentiate between the list built for the 29 | -- constructor and an inner object. 30 | data CurrentBuffer = 31 | ConsData ![Cell] 32 | | BuiltCell !Cell deriving (Show) 33 | 34 | _cellOrError :: CurrentBuffer -> Cell 35 | _cellOrError (BuiltCell cell) = cell 36 | _cellOrError x = let msg = "Expected built cell, received " ++ show x in 37 | failure (pack msg) 38 | 39 | -- All the types that can be converted to a SQL value. 40 | class ToSQL a where 41 | _valueToCell :: a -> Cell 42 | 43 | default _valueToCell :: (Generic a, GToSQL (Rep a)) => a -> Cell 44 | _valueToCell !x = _g2cell (from x) 45 | 46 | valueToCell :: (ToSQL a) => a -> Cell 47 | valueToCell = _valueToCell 48 | 49 | -- class FromSQL a where 50 | -- _cellToValue :: Cell -> Try a 51 | 52 | instance ToSQL a => ToSQL (Maybe a) where 53 | _valueToCell (Just x) = _valueToCell x 54 | _valueToCell Nothing = Empty 55 | 56 | instance (ToSQL a, ToSQL b) => ToSQL (a, b) where 57 | _valueToCell (x, y) = RowArray (V.fromList [valueToCell x, valueToCell y]) 58 | 59 | instance ToSQL Int where 60 | _valueToCell = IntElement 61 | 62 | instance ToSQL Double where 63 | _valueToCell = DoubleElement 64 | 65 | instance ToSQL Text where 66 | _valueToCell = StringElement 67 | 68 | 69 | class GToSQL r where 70 | _g2buffer :: r a -> CurrentBuffer 71 | _g2cell :: r a -> Cell 72 | _g2cell = _cellOrError . 
_g2buffer 73 | 74 | instance GToSQL U1 where 75 | _g2buffer U1 = failure $ pack "GToSQL UI called" 76 | 77 | -- | Constants, additional parameters and recursion of kind * 78 | instance (GToSQL a, GToSQL b) => GToSQL (a :*: b) where 79 | _g2buffer (a :*: b) = case (_g2buffer a, _g2buffer b) of 80 | (ConsData l1, ConsData l2) -> ConsData (l1 ++ l2) 81 | (y1, y2) -> failure $ pack $ "GToSQL (a :*: b): Expected buffers, received " ++ show y1 ++ " and " ++ show y2 82 | 83 | instance (GToSQL a, GToSQL b) => GToSQL (a :+: b) where 84 | _g2buffer (L1 x) = _g2buffer x 85 | _g2buffer (R1 x) = let !y = _g2buffer x in y 86 | 87 | -- -- | Sums: encode choice between constructors 88 | -- instance (GToSQL a) => GToSQL (M1 i c a) where 89 | -- _g2cell !(M1 x) = let !y = _g2cell x in 90 | -- trace ("GToSQL M1: y = " ++ show y) y 91 | 92 | instance (GToSQL a) => GToSQL (M1 C c a) where 93 | _g2buffer (M1 x) = let !y = _g2buffer x in y 94 | 95 | instance (GToSQL a) => GToSQL (M1 S c a) where 96 | _g2buffer (M1 x) = let !y = ConsData [_g2cell x] in y 97 | 98 | instance (GToSQL a) => GToSQL (M1 D c a) where 99 | _g2buffer (M1 x) = 100 | case _g2buffer x of 101 | ConsData cs -> BuiltCell $ RowArray (V.fromList cs) 102 | BuiltCell cell -> BuiltCell cell 103 | 104 | -- | Products: encode multiple arguments to constructors 105 | instance (ToSQL a) => GToSQL (K1 i a) where 106 | _g2buffer (K1 x) = let !y = _valueToCell x in BuiltCell y 107 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/RowGenericsFrom.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE PolyKinds #-} 2 | {-# LANGUAGE TypeOperators #-} 3 | {-# LANGUAGE DeriveAnyClass #-} 4 | {-# LANGUAGE FlexibleContexts #-} 5 | {-# LANGUAGE FlexibleInstances #-} 6 | {-# LANGUAGE DefaultSignatures #-} 7 | {-# LANGUAGE ScopedTypeVariables #-} 8 | {-# LANGUAGE MultiParamTypeClasses #-} 9 | {-# LANGUAGE OverloadedStrings #-} 10 | 11 | 12 | -- The generic implementation for the protocol that converts to 13 | -- and from SQL cells. 14 | -- Going through JSON is not recommended because of precision loss 15 | -- for the numbers, and other issues related to numbers. 16 | module Spark.Core.Internal.RowGenericsFrom( 17 | FromSQL(_cellToValue), 18 | TryS, 19 | cellToValue, 20 | ) where 21 | 22 | import GHC.Generics 23 | import Data.Text(Text, pack) 24 | import Control.Applicative(liftA2) 25 | import Control.Monad.Except 26 | import Formatting 27 | import qualified Data.Vector as V 28 | 29 | import Spark.Core.Internal.RowStructures 30 | import Spark.Core.Internal.Utilities 31 | import Spark.Core.Internal.TypesStructuresRepr(DataTypeRepr, DataTypeElementRepr) 32 | 33 | -- Convert a cell to a value (if possible) 34 | cellToValue :: (FromSQL a) => Cell -> Either Text a 35 | cellToValue = _cellToValue 36 | 37 | type TryS = Either Text 38 | 39 | -- Because of the way the generic decoders work, 40 | -- an array of cell needs special treatment when it is 41 | -- decoded as the constructor of an object. Then it should 42 | -- be interpreted as a stateful tape, for which we read a 43 | -- few cells (number unknown) and return some value from the 44 | -- cells that have been consumed. 45 | data Decode2 = 46 | -- A tape with some potentially remaining cells 47 | D2Cons ![Cell] 48 | -- Just a normal cell. 49 | | D2Normal !Cell 50 | deriving (Eq, Show) 51 | 52 | 53 | -- All the types that can be converted to a SQL value. 
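-- For example, decoding a pair of integers from a row cell (sketch):
--
-- > decoded :: Either Text (Int, Int)
-- > decoded = cellToValue (RowArray (V.fromList [IntElement 1, IntElement 2]))
-- > -- decoded == Right (1, 2)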
54 | class FromSQL a where 55 | _cellToValue :: Cell -> TryS a 56 | 57 | default _cellToValue :: (Generic a, GFromSQL (Rep a)) => Cell -> TryS a 58 | _cellToValue cell = let 59 | x = undefined :: a 60 | x1r = _gFcell (from x) (D2Normal cell) :: InterResult (Decode2, Rep a a) 61 | x2r = snd <$> x1r 62 | x1t = to <$> x2r 63 | in _toTry x1t 64 | 65 | -- ******** Basic instance ******** 66 | 67 | instance FromSQL a => FromSQL (Maybe a) where 68 | _cellToValue Empty = pure Nothing 69 | _cellToValue x = pure <$> _cellToValue x 70 | 71 | instance FromSQL Int where 72 | _cellToValue (IntElement x) = pure x 73 | _cellToValue x = throwError $ sformat ("FromSQL: Decoding an int from "%shown) x 74 | 75 | instance FromSQL Double where 76 | _cellToValue (DoubleElement x) = pure x 77 | _cellToValue x = throwError $ sformat ("FromSQL: Decoding a double from "%shown) x 78 | 79 | instance FromSQL Text where 80 | _cellToValue (StringElement txt) = pure txt 81 | _cellToValue x = throwError $ sformat ("FromSQL: Decoding a unicode text from "%shown) x 82 | 83 | instance FromSQL Cell where 84 | _cellToValue = pure 85 | 86 | instance FromSQL Bool where 87 | _cellToValue (BoolElement b) = pure b 88 | _cellToValue x = throwError $ sformat ("FromSQL: Decoding a boolean from "%shown) x 89 | 90 | instance FromSQL DataTypeRepr 91 | instance FromSQL DataTypeElementRepr 92 | 93 | instance FromSQL a => FromSQL [a] where 94 | _cellToValue (RowArray xs) = 95 | sequence (_cellToValue <$> V.toList xs) 96 | _cellToValue x = throwError $ sformat ("FromSQL[]: Decoding array from "%shown) x 97 | 98 | instance (FromSQL a1, FromSQL a2) => FromSQL (a1, a2) where 99 | _cellToValue (RowArray xs) = case V.toList xs of 100 | [x1, x2] -> 101 | liftA2 (,) (_cellToValue x1) (_cellToValue x2) 102 | l -> throwError $ sformat ("FromSQL: Expected 2 elements but got "%sh) l 103 | _cellToValue x = throwError $ sformat ("FromSQL(,): Decoding array from "%shown) x 104 | 105 | -- ******* GENERIC ******** 106 | 107 | -- A final message at the bottom 108 | -- A path in the elements to get there 109 | data FailureInfo = FailureInfo !Text ![Text] deriving (Eq, Show) 110 | 111 | type InterResult a = Either FailureInfo a 112 | 113 | 114 | class GFromSQL r where 115 | -- An evidence about the type (in order to have info about the field names) 116 | -- The current stuff that has been decoded 117 | _gFcell :: r a -> Decode2 -> InterResult (Decode2, r a) 118 | 119 | _toTry :: InterResult a -> TryS a 120 | _toTry (Right x) = pure x 121 | _toTry (Left (FailureInfo msg p)) = Left $ show' (reverse p) <> " : " <> msg 122 | 123 | _fromTry :: TryS a -> InterResult a 124 | _fromTry (Right x) = Right x 125 | _fromTry (Left x) = Left $ FailureInfo x [] 126 | 127 | instance GFromSQL U1 where 128 | _gFcell x = failure $ pack $ "GFromSQL UI called" ++ show x 129 | 130 | instance (GFromSQL a, GFromSQL b) => GFromSQL (a :*: b) where 131 | -- Switching to tape-reading mode 132 | _gFcell ev (D2Normal (RowArray arr)) = _gFcell ev (D2Cons (V.toList arr)) 133 | -- Advancing into the reader 134 | _gFcell ev (D2Cons l) = do 135 | let (ev1 :*: ev2) = ev 136 | (d1, x1) <- _gFcell ev1 (D2Cons l) 137 | (d2, x2) <- _gFcell ev2 d1 138 | return (d2, x1 :*: x2) 139 | _gFcell _ x = failure $ pack ("GFromSQL (a :*: b) " ++ show x) 140 | 141 | 142 | instance (GFromSQL a, GFromSQL b) => GFromSQL (a :+: b) where 143 | _gFcell _ x = failure $ pack $ "GFromSQL (a :+: b)" ++ show x 144 | 145 | instance (GFromSQL a, Constructor c) => GFromSQL (M1 C c a) where 146 | _gFcell _ (D2Cons x) = failure 
$ pack ("GFromSQL (M1 C c a)" ++ " FAILED CONS: " ++ show x) 147 | _gFcell ev (D2Normal cell) = do 148 | let ev' = unM1 ev 149 | (d, x) <- _withHint (pack (conName ev)) $ _gFcell ev' (D2Normal cell) 150 | return (d, M1 x) 151 | 152 | instance (GFromSQL a, Selector c) => GFromSQL (M1 S c a) where 153 | _gFcell ev (D2Normal (RowArray arr)) = do 154 | let ev' = unM1 ev 155 | let l = V.toList arr 156 | (d, x) <- _withHint ("(1)" <> pack (selName ev)) $ _gFcell ev' (D2Cons l) 157 | return (d, M1 x) 158 | _gFcell ev d = do 159 | let ev' = unM1 ev 160 | (d', x) <- _withHint ("(2)" <> pack (selName ev)) $ _gFcell ev' d 161 | return (d', M1 x) 162 | 163 | instance (GFromSQL a, Datatype c) => GFromSQL (M1 D c a) where 164 | _gFcell ev (z @ (D2Normal (RowArray _))) = do 165 | let ev' = unM1 ev 166 | (d, x) <- _gFcell ev' z 167 | return (d, M1 x) 168 | _gFcell _ x = failure $ pack $ "FAIL GFromSQL (M1 D c a)" ++ show x 169 | 170 | -- | Products: encode multiple arguments to constructors 171 | instance (FromSQL a) => GFromSQL (K1 i a) where 172 | -- It is just a normal cell. 173 | -- Read one element and move on. 174 | _gFcell _ (D2Cons (cell : r)) = do 175 | x <- _fromTry $ _cellToValue cell 176 | return (D2Cons r, K1 x) 177 | -- Just reading a normal cell, return no tape. 178 | _gFcell _ (D2Normal cell) = do 179 | x <- _fromTry $ _cellToValue cell 180 | return (D2Cons [], K1 x) 181 | _gFcell _ x = failure $ pack ("GFromSQLK FAIL " ++ show x) 182 | 183 | _withHint :: Text -> InterResult a -> InterResult a 184 | _withHint extra (Left (FailureInfo msg l)) = Left (FailureInfo msg (extra : l)) 185 | _withHint _ (Right x) = Right x 186 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/RowStructures.hs: -------------------------------------------------------------------------------- 1 | module Spark.Core.Internal.RowStructures where 2 | 3 | import Data.Aeson 4 | import Data.Vector(Vector) 5 | import qualified Data.Text as T 6 | 7 | -- | The basic representation of one row of data. This is a standard type that comes out of the 8 | -- SQL engine in Spark. 9 | 10 | -- | An element in a Row object. 11 | -- All objects manipulated by the Spark framework are assumed to 12 | -- be convertible to cells. 13 | -- 14 | -- This is usually handled by generic transforms. 15 | data Cell = 16 | Empty -- To represent maybe 17 | | IntElement !Int 18 | | DoubleElement !Double 19 | | StringElement !T.Text 20 | | BoolElement !Bool 21 | | RowArray !(Vector Cell) deriving (Show, Eq) 22 | 23 | -- | A Row of data: the basic data structure to transport information 24 | -- TODO rename to rowCells 25 | data Row = Row { 26 | cells :: !(Vector Cell) 27 | } deriving (Show, Eq) 28 | 29 | 30 | -- AESON INSTANCES 31 | 32 | -- TODO(kps) add some workaround to account for the restriction of 33 | -- JSON types: 34 | -- int32 -> int32 35 | -- double -> double 36 | -- weird double -> string? 37 | -- long/bigint -> string? 
38 | 39 | -- | Cell 40 | instance ToJSON Cell where 41 | toJSON Empty = Null 42 | toJSON (DoubleElement d) = toJSON d 43 | toJSON (IntElement i) = toJSON i 44 | toJSON (BoolElement b) = toJSON b 45 | toJSON (StringElement s) = toJSON s 46 | toJSON (RowArray arr) = toJSON arr 47 | 48 | -- | Row 49 | instance ToJSON Row where 50 | toJSON (Row x) = toJSON x 51 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/RowUtils.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | module Spark.Core.Internal.RowUtils( 4 | jsonToCell, 5 | checkCell, 6 | rowArray 7 | ) where 8 | 9 | import Data.Aeson 10 | import Data.Text(Text) 11 | import Data.Maybe(catMaybes, listToMaybe) 12 | import Formatting 13 | import qualified Data.Vector as V 14 | import qualified Data.HashMap.Strict as HM 15 | import Data.Scientific(floatingOrInteger, toRealFloat) 16 | import Control.Monad.Except 17 | 18 | import Spark.Core.Internal.TypesStructures 19 | import Spark.Core.Internal.TypesFunctions 20 | import Spark.Core.Internal.RowStructures 21 | import Spark.Core.StructuresInternal(FieldName(..)) 22 | import Spark.Core.Internal.Utilities 23 | 24 | type TryCell = Either Text Cell 25 | 26 | -- | Decodes a JSON into a row. 27 | -- This operation requires a SQL type that describes 28 | -- the schema. 29 | jsonToCell :: DataType -> Value -> Either Text Cell 30 | jsonToCell dt v = withContext ("jsonToCell: dt="<>show' dt<>" v="<>show' v) $ 31 | _j2Cell v dt 32 | 33 | {-| Given a datatype, ensures that the cell has the corresponding type. 34 | -} 35 | checkCell :: DataType -> Cell -> Either Text Cell 36 | checkCell dt c = case _checkCell dt c of 37 | Nothing -> pure c 38 | Just txt -> throwError txt 39 | 40 | {-| Convenience constructor for an array of cells. 41 | -} 42 | rowArray :: [Cell] -> Cell 43 | rowArray = RowArray . 
V.fromList 44 | 45 | 46 | -- Returns an error message if something wrong is found 47 | _checkCell :: DataType -> Cell -> Maybe Text 48 | _checkCell dt c = case (dt, c) of 49 | (NullableType _, Empty) -> Nothing 50 | (StrictType _, Empty) -> 51 | pure $ sformat ("Expected a strict value of type "%sh%" but no value") dt 52 | (StrictType sdt, x) -> _checkCell' sdt x 53 | (NullableType sdt, x) -> _checkCell' sdt x 54 | 55 | -- Returns an error message if something wrong is found 56 | _checkCell' :: StrictDataType -> Cell -> Maybe Text 57 | _checkCell' sdt c = case (sdt, c) of 58 | (_, Empty) -> 59 | pure $ sformat ("Expected a strict value of type "%sh%" but no value") sdt 60 | (IntType, IntElement _) -> Nothing 61 | (StringType, StringElement _) -> Nothing 62 | (Struct (StructType fields), RowArray cells') -> 63 | if V.length fields == V.length cells' 64 | then 65 | let types = V.toList $ structFieldType <$> fields 66 | res = uncurry _checkCell <$> (types `zip` V.toList cells') 67 | in listToMaybe (catMaybes res) 68 | else 69 | pure $ sformat ("Struct "%sh%" has "%sh%" fields, asked to be matched with "%sh%" cells") sdt (V.length fields) (V.length cells') 70 | (ArrayType dt, RowArray cells') -> 71 | let res = uncurry _checkCell <$> (repeat dt `zip` V.toList cells') 72 | in listToMaybe (catMaybes res) 73 | (_, _) -> 74 | pure $ sformat ("Type "%sh%" is incompatible with cell content "%sh) sdt c 75 | 76 | 77 | _j2Cell :: Value -> DataType -> TryCell 78 | _j2Cell Null (StrictType t) = 79 | throwError $ sformat ("_j2Cell: Expected "%shown%", got null") t 80 | _j2Cell Null (NullableType _) = pure Empty 81 | _j2Cell x (StrictType t) = _j2CellS x t 82 | -- We do not express optional types at cell level. They have to be 83 | -- encoded in the data type. 84 | _j2Cell x (NullableType t) = _j2CellS x t 85 | --_j2Cell x t = throwError $ sformat ("_j2Cell: Could not match value "%shown%" with type "%shown) x t 86 | 87 | _j2CellS :: Value -> StrictDataType -> TryCell 88 | _j2CellS (String t) StringType = pure . StringElement $ t 89 | _j2CellS (Bool t) BoolType = pure . BoolElement $ t 90 | _j2CellS (Array v) (ArrayType t) = 91 | let trys = flip _j2Cell t <$> v in 92 | RowArray <$> sequence trys 93 | _j2CellS (Number s) IntType = case floatingOrInteger s :: Either Double Int of 94 | Left _ -> throwError $ sformat ("_j2CellS: Could not cast as int "%shown) s 95 | Right i -> pure (IntElement i) 96 | _j2CellS (Number s) DoubleType = pure . DoubleElement . toRealFloat $ s 97 | -- Normal representation as object. 98 | _j2CellS (Object o) (Struct struct) = 99 | let 100 | o2f :: StructField -> TryCell 101 | o2f field = 102 | let nullable = isNullable $ structFieldType field 103 | val = HM.lookup (unFieldName $ structFieldName field) o in 104 | case val of 105 | Nothing -> 106 | if nullable then 107 | pure Empty 108 | else throwError $ sformat ("_j2CellS: Could not find key "%shown%" in object "%shown) field o 109 | Just x -> _j2Cell x (structFieldType field) 110 | fields = o2f <$> structFields struct 111 | in RowArray <$> sequence fields 112 | -- Compact array-based representation. 
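-- For example (illustrative values), given the struct type {xx:int yy:string},
-- the object form {"xx": 1, "yy": "a"} handled above and the compact form
-- [1, "a"] handled below both decode to
--   RowArray [IntElement 1, StringElement "a"]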
113 | _j2CellS (Array v) (Struct (StructType fields)) = 114 | if V.length v == V.length fields 115 | then 116 | let dts = structFieldType <$> fields 117 | inner = uncurry _j2Cell <$> V.zip v dts 118 | in RowArray <$> sequence inner 119 | else throwError $ sformat ("_j2CellS: Compact object format a different number of fields '"%shown%"' compared "%shown) v fields 120 | _j2CellS x t = throwError $ sformat ("_j2CellS: Could not match value '"%shown%"' with type "%shown) x t 121 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/TypesGenerics.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE PolyKinds #-} 2 | {-# LANGUAGE TypeOperators #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | {-# LANGUAGE DeriveAnyClass #-} 5 | {-# LANGUAGE FlexibleContexts #-} 6 | {-# LANGUAGE FlexibleInstances #-} 7 | {-# LANGUAGE DefaultSignatures #-} 8 | {-# LANGUAGE ScopedTypeVariables #-} 9 | {-# LANGUAGE BangPatterns #-} 10 | {-# LANGUAGE MultiParamTypeClasses #-} 11 | 12 | module Spark.Core.Internal.TypesGenerics where 13 | 14 | import qualified Data.Vector as V 15 | import qualified Data.Text as T 16 | import GHC.Generics 17 | import Formatting 18 | 19 | import Spark.Core.Internal.TypesStructures 20 | import Spark.Core.Internal.TypesFunctions 21 | import Spark.Core.Internal.Utilities 22 | import Spark.Core.StructuresInternal(FieldName(..), unsafeFieldName) 23 | import Spark.Core.Internal.TypesStructuresRepr(DataTypeRepr, DataTypeElementRepr) 24 | 25 | -- The 3rd attempt to get generics conversions. 26 | 27 | -- Given a tag on a type, returns the equivalent SQL type. 28 | -- This is the type for a cell, not for a row. 29 | -- TODO(kps) more documentation 30 | buildType :: (HasCallStack, SQLTypeable a) => SQLType a 31 | buildType = _buildType 32 | 33 | 34 | -- The class of all the types for which the SQL type can be inferred 35 | -- from the Haskell type only. 36 | -- Two notable exceptions are Row and Cell, which are the dynamic types 37 | -- used by Spark. 38 | -- See also buildType on how to use it. 39 | class SQLTypeable a where 40 | _genericTypeFromValue :: (HasCallStack) => a -> GenericType 41 | default _genericTypeFromValue :: (HasCallStack, Generic a, GenSQLTypeable (Rep a)) => a -> GenericType 42 | _genericTypeFromValue x = genTypeFromProxy (from x) 43 | 44 | -- Generic SQLTypeable 45 | class GenSQLTypeable f where 46 | genTypeFromProxy :: (HasCallStack) => f a -> GenericType 47 | 48 | 49 | -- | The only function that should matter for users in this file. 50 | -- Given a type, returns the SQL representation of this type. 51 | _buildType :: forall a. 
(HasCallStack, SQLTypeable a) => SQLType a 52 | _buildType = 53 | let dt = _genericTypeFromValue (undefined :: a) 54 | in SQLType dt 55 | 56 | type GenericType = DataType 57 | 58 | instance SQLTypeable Int where 59 | _genericTypeFromValue _ = StrictType IntType 60 | 61 | instance SQLTypeable Double where 62 | _genericTypeFromValue _ = StrictType DoubleType 63 | 64 | instance SQLTypeable T.Text where 65 | _genericTypeFromValue _ = StrictType StringType 66 | 67 | instance SQLTypeable Bool where 68 | _genericTypeFromValue _ = StrictType BoolType 69 | 70 | instance SQLTypeable DataTypeRepr 71 | instance SQLTypeable DataTypeElementRepr 72 | 73 | instance SQLTypeable DataType where 74 | _genericTypeFromValue _ = _genericTypeFromValue (undefined :: DataTypeRepr) 75 | 76 | 77 | -- instance {-# INCOHERENT #-} SQLTypeable String where 78 | -- _genericTypeFromValue _ = StrictType StringType 79 | 80 | instance SQLTypeable a => SQLTypeable (Maybe a) where 81 | _genericTypeFromValue _ = let SQLType dt = buildType :: (SQLType a) in 82 | (NullableType . iInnerStrictType) dt 83 | 84 | instance {-# OVERLAPPABLE #-} SQLTypeable a => SQLTypeable [a] where 85 | _genericTypeFromValue _ = 86 | let SQLType dt = buildType :: (SQLType a) in 87 | (StrictType . ArrayType) dt 88 | 89 | 90 | instance forall a1 a2. ( 91 | SQLTypeable a2, 92 | SQLTypeable a1) => SQLTypeable (a1, a2) where 93 | _genericTypeFromValue _ = 94 | let 95 | SQLType t1 = buildType :: SQLType a1 96 | SQLType t2 = buildType :: SQLType a2 97 | in _buildTupleStruct [t1, t2] 98 | 99 | _buildTupleStruct :: [GenericType] -> GenericType 100 | _buildTupleStruct dts = 101 | let fnames = unsafeFieldName . T.pack. ("_" ++) . show <$> ([1..] :: [Int]) 102 | fs = uncurry StructField <$> zip fnames dts 103 | in StrictType . Struct . 
StructType $ V.fromList fs 104 | 105 | -- instance (SQLTypeable a, SQLTypeable b) => SQLTypeable (a,b) where 106 | -- _genericTypeFromValue _ = _genericTypeFromValue (undefined :: a) ++ _genericTypeFromValue (undefined :: b) 107 | 108 | instance (GenSQLTypeable f) => GenSQLTypeable (M1 D c f) where 109 | genTypeFromProxy m = genTypeFromProxy (unM1 m) 110 | 111 | instance (GenSQLTypeable f, Constructor c) => GenSQLTypeable (M1 C c f) where 112 | genTypeFromProxy m 113 | | conIsRecord m = 114 | let x = unM1 m 115 | dt = genTypeFromProxy x in 116 | dt 117 | | otherwise = 118 | -- It is assumed to be a newtype and we are going to unwrap it 119 | let !dt1 = genTypeFromProxy (unM1 m) 120 | in case iSingleField dt1 of 121 | Just dt -> dt 122 | Nothing -> 123 | failure $ sformat ("M1 C "%sh%" dt1="%sh) n dt1 124 | where n = conName m 125 | 126 | -- Selector Metadata 127 | instance (GenSQLTypeable f, Selector c) => GenSQLTypeable (M1 S c f) where 128 | genTypeFromProxy m = 129 | let st = genTypeFromProxy (unM1 m) 130 | n = selName m 131 | field = StructField { structFieldName = FieldName $ T.pack n, structFieldType = st } 132 | st2 = StructType (V.singleton field) in 133 | StrictType $ Struct st2 134 | 135 | instance (SQLTypeable a) => GenSQLTypeable (K1 R a) where 136 | genTypeFromProxy m = _genericTypeFromValue (unK1 m) 137 | 138 | -- Sum branch 139 | instance (GenSQLTypeable a, GenSQLTypeable b) => GenSQLTypeable (a :+: b) where 140 | genTypeFromProxy (L1 x) = genTypeFromProxy x 141 | genTypeFromProxy (R1 x) = genTypeFromProxy x 142 | 143 | -- Product branch 144 | instance (GenSQLTypeable a, GenSQLTypeable b) => GenSQLTypeable (a :*: b) where 145 | genTypeFromProxy z = 146 | -- Due to optimizations that I do not understand, the decomposition has to 147 | -- be done inside the function. 148 | -- Otherwise, the value (which is undefined) gets to be evaluated, and breaks 149 | -- the code. 150 | let (x1 :*: x2) = z 151 | y1 = genTypeFromProxy x1 152 | y2 = genTypeFromProxy x2 in case (y1, y2) of 153 | (StrictType (Struct s1), StrictType (Struct s2)) -> 154 | (StrictType . Struct) s where 155 | fs = structFields s1 V.++ structFields s2 156 | s = StructType fs 157 | _ -> failure $ sformat ("should not happen: left="%sh%" right="%sh) y1 y2 158 | 159 | -- Void branch 160 | instance GenSQLTypeable U1 where 161 | genTypeFromProxy _ = failure "U1" 162 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/TypesStructures.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | {-| The structures of data types in Karps. 5 | 6 | For a detailed description of the supported types, see 7 | http://spark.apache.org/docs/latest/sql-programming-guide.html#data-types 8 | 9 | At a high-level, Spark DataFrames and Datasets are equivalent to lists of 10 | objects whose type can be mapped to the same StructType: 11 | Dataset a ~ ArrayType StructType (...) 
12 | Columns of a dataset are equivalent to lists of object whose type can be 13 | mapped to the same DataType (either Strict or Nullable) 14 | Local data (or "blobs") are single elements whose type can be mapped to a 15 | DataType (either strict or nullable) 16 | -} 17 | module Spark.Core.Internal.TypesStructures where 18 | 19 | import Data.Aeson 20 | import Data.Vector(Vector) 21 | import Control.Monad(guard) 22 | import qualified Data.Vector as V 23 | import qualified Data.Aeson as A 24 | import qualified Data.Text as T 25 | import GHC.Generics(Generic) 26 | import Test.QuickCheck 27 | 28 | import Spark.Core.StructuresInternal(FieldName(..)) 29 | import Spark.Core.Internal.Utilities 30 | 31 | -- The core type algebra 32 | 33 | -- | The data types that are guaranteed to not be null: evaluating them will return a value. 34 | data StrictDataType = 35 | IntType 36 | | DoubleType 37 | | StringType 38 | | BoolType 39 | | Struct !StructType 40 | | ArrayType !DataType 41 | deriving (Eq) 42 | 43 | -- | All the data types supported by the Spark engine. 44 | -- The data types can either be nullable (they may contain null values) or strict (all the values are present). 45 | -- There are a couple of differences with the algebraic data types in Haskell: 46 | -- Maybe (Maybe a) ~ Maybe a which implies that arbitrary nesting of values will be flattened to a top-level Nullable 47 | -- Similarly, [[]] ~ [] 48 | data DataType = 49 | StrictType !StrictDataType 50 | | NullableType !StrictDataType deriving (Eq) 51 | 52 | -- | A field in a structure 53 | data StructField = StructField { 54 | structFieldName :: !FieldName, 55 | structFieldType :: !DataType 56 | } deriving (Eq) 57 | 58 | -- | The main structure of a dataframe or a dataset 59 | data StructType = StructType { 60 | structFields :: !(Vector StructField) 61 | } deriving (Eq) 62 | 63 | 64 | -- Convenience types 65 | 66 | -- | Represents the choice between a strict and a nullable field 67 | data Nullable = CanNull | NoNull deriving (Show, Eq) 68 | 69 | -- | Encodes the type of all the nullable data types 70 | data NullableDataType = NullableDataType !StrictDataType deriving (Eq) 71 | 72 | -- | A tagged datatype that encodes the sql types 73 | -- This is the main type information that should be used by users. 74 | data SQLType a = SQLType { 75 | -- | The underlying data type. 76 | unSQLType :: !DataType 77 | } deriving (Eq, Generic) 78 | 79 | 80 | instance Show DataType where 81 | show (StrictType x) = show x 82 | show (NullableType x) = show x ++ "?" 83 | 84 | instance Show StrictDataType where 85 | show StringType = "string" 86 | show DoubleType = "double" 87 | show IntType = "int" 88 | show BoolType = "bool" 89 | show (Struct struct) = show struct 90 | show (ArrayType at) = "[" ++ show at ++ "]" 91 | 92 | instance Show StructField where 93 | show field = (T.unpack . unFieldName . structFieldName) field ++ ":" ++ s where 94 | s = show $ structFieldType field 95 | 96 | instance Show StructType where 97 | show struct = "{" ++ unwords (map show (V.toList . structFields $ struct)) ++ "}" 98 | 99 | instance Show (SQLType a) where 100 | show (SQLType dt) = show dt 101 | 102 | 103 | -- QUICKCHECK INSTANCES 104 | 105 | 106 | instance Arbitrary StructField where 107 | arbitrary = do 108 | name <- elements ["_1", "a", "b", "abc"] 109 | dt <- arbitrary :: Gen DataType 110 | return $ StructField (FieldName $ T.pack name) dt 111 | 112 | instance Arbitrary StructType where 113 | arbitrary = do 114 | fields <- listOf arbitrary 115 | return . StructType . 
V.fromList $ fields 116 | 117 | instance Arbitrary StrictDataType where 118 | arbitrary = do 119 | idx <- elements [1,2] :: Gen Int 120 | return $ case idx of 121 | 1 -> StringType 122 | 2 -> IntType 123 | _ -> failure "Arbitrary StrictDataType" 124 | 125 | instance Arbitrary DataType where 126 | arbitrary = do 127 | x <- arbitrary 128 | u <- arbitrary 129 | return $ if x then 130 | StrictType u 131 | else 132 | NullableType u 133 | 134 | -- AESON INSTANCES 135 | 136 | -- This follows the same structure as the JSON generated by Spark. 137 | instance ToJSON StrictDataType where 138 | toJSON IntType = "integer" 139 | toJSON DoubleType = "double" 140 | toJSON StringType = "string" 141 | toJSON BoolType = "bool" 142 | toJSON (Struct struct) = toJSON struct 143 | toJSON (ArrayType (StrictType dt)) = 144 | object [ "type" .= A.String "array" 145 | , "elementType" .= toJSON dt 146 | , "containsNull" .= A.Bool False ] 147 | toJSON (ArrayType (NullableType dt)) = 148 | object [ "type" .= A.String "array" 149 | , "elementType" .= toJSON dt 150 | , "containsNull" .= A.Bool True ] 151 | 152 | instance ToJSON StructType where 153 | toJSON (StructType fields) = 154 | let 155 | fs = (snd . _fieldToJson) <$> V.toList fields 156 | in object [ "type" .= A.String "struct" 157 | , "fields" .= fs ] 158 | 159 | -- Spark drops the info at the highest level. 160 | instance ToJSON DataType where 161 | toJSON (StrictType dt) = object [ 162 | "nullable" .= A.Bool False, 163 | "dt" .= toJSON dt] 164 | toJSON (NullableType dt) = object [ 165 | "nullable" .= A.Bool True, 166 | "dt" .= toJSON dt] 167 | 168 | instance FromJSON DataType where 169 | parseJSON = withObject "DataType" $ \o -> do 170 | nullable <- o .: "nullable" 171 | dt <- o .: "dt" 172 | let c = if nullable then NullableType else StrictType 173 | return (c dt) 174 | 175 | instance FromJSON StructField where 176 | parseJSON = withObject "StructField" $ \o -> do 177 | n <- o .: "name" 178 | dt <- o .: "type" 179 | nullable <- o .: "nullable" 180 | let c = if nullable then NullableType else StrictType 181 | return $ StructField (FieldName n) (c dt) 182 | 183 | instance FromJSON StructType where 184 | parseJSON = withObject "StructType" $ \o -> do 185 | tp <- o .: "type" 186 | guard (tp == T.pack "struct") 187 | fs <- o .: "fields" 188 | return (StructType fs) 189 | 190 | instance FromJSON StrictDataType where 191 | parseJSON (A.String s) = case s of 192 | "integer" -> return IntType 193 | "double" -> return DoubleType 194 | "string" -> return StringType 195 | "bool" -> return BoolType 196 | -- TODO: figure out which one is correct 197 | "boolean" -> return BoolType 198 | _ -> fail ("StrictDataType: unknown type " ++ T.unpack s) 199 | parseJSON (Object o) = do 200 | tp <- o .: "type" 201 | case T.pack tp of 202 | "struct" -> Struct <$> parseJSON (Object o) 203 | "array" -> do 204 | dt <- o .: "elementType" 205 | containsNull <- o .: "containsNull" 206 | let c = if containsNull then NullableType else StrictType 207 | return $ ArrayType (c dt) 208 | s -> fail ("StrictDataType: unknown type " ++ T.unpack s) 209 | 210 | parseJSON x = fail ("StrictDataType: cannot parse " ++ show x) 211 | 212 | 213 | _fieldToJson :: StructField -> (T.Text, A.Value) 214 | _fieldToJson (StructField (FieldName n) (StrictType dt)) = 215 | (n, object [ "name" .= A.String n 216 | , "type" .= toJSON dt 217 | , "nullable" .= A.Bool False]) 218 | _fieldToJson (StructField (FieldName n) (NullableType dt)) = 219 | (n, object [ "name" .= A.String n 220 | , "type" .= toJSON dt 221 | , 
"nullable" .= A.Bool True]) 222 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/TypesStructuresRepr.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | 5 | module Spark.Core.Internal.TypesStructuresRepr( 6 | DataTypeElementRepr(..), 7 | DataTypeRepr(..) 8 | ) where 9 | 10 | import qualified Data.Text as T 11 | import GHC.Generics(Generic) 12 | 13 | -- The inner representation of a dataype as a Row object. 14 | -- This representation is meant to be internal. 15 | -- Because the Spark data types do not support recursive types (trees), 16 | -- This is a flattened representation of types. 17 | data DataTypeElementRepr = DataTypeElementRepr { 18 | fieldPath :: ![T.Text], 19 | isNullable :: !Bool, 20 | typeId :: !Int, 21 | fieldIndex :: !Int 22 | } deriving (Eq, Show, Generic) 23 | 24 | data DataTypeRepr = DataTypeRepr { 25 | rows :: [DataTypeElementRepr] 26 | } deriving (Eq, Show, Generic) 27 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/Utilities.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | {-# LANGUAGE BangPatterns #-} 4 | {-# LANGUAGE OverloadedStrings #-} 5 | 6 | 7 | {-| A collection of small utility functions. 8 | -} 9 | module Spark.Core.Internal.Utilities( 10 | LB.HasCallStack, 11 | UnknownType, 12 | pretty, 13 | myGroupBy, 14 | myGroupBy', 15 | missing, 16 | failure, 17 | failure', 18 | forceRight, 19 | show', 20 | encodeDeterministicPretty, 21 | withContext, 22 | strictList, 23 | traceHint, 24 | SF.sh, 25 | (<&>), 26 | (<>) 27 | ) where 28 | 29 | import Data.Aeson 30 | import Data.Aeson.Encode.Pretty 31 | import qualified Data.ByteString.Lazy.Char8 as Char8 32 | import qualified Data.ByteString.Lazy as LBS 33 | import qualified Data.Text as T 34 | import qualified Formatting.ShortFormatters as SF 35 | import Control.Arrow ((&&&)) 36 | import Data.List 37 | import Data.Function 38 | import Data.Text(Text) 39 | import Formatting 40 | import Debug.Trace(trace) 41 | import qualified Data.Map.Strict as M 42 | import Data.Monoid((<>)) 43 | 44 | import qualified Spark.Core.Internal.LocatedBase as LB 45 | 46 | (<&>) :: Functor f => f a -> (a -> b) -> f b 47 | (<&>) = flip fmap 48 | 49 | -- | A type that is is not known and that is not meant to be exposed to the 50 | -- user. 51 | data UnknownType 52 | 53 | {-| Pretty printing for Aeson values (and deterministic output) 54 | -} 55 | pretty :: Value -> Text 56 | pretty = T.pack . Char8.unpack . encodeDeterministicPretty 57 | 58 | -- | Produces a bytestring output of a JSON value that is deterministic 59 | -- and that is invariant to the insertion order of the keys. 60 | -- (i.e the keys are stored in alphabetic order) 61 | -- This is to ensure that all id computations are stable and reproducible 62 | -- on the server part. 63 | -- TODO(kps) use everywhere JSON is converted 64 | encodeDeterministicPretty :: Value -> LBS.ByteString 65 | encodeDeterministicPretty = 66 | encodePretty' (defConfig { confIndent = Spaces 0, confCompare = compare }) 67 | 68 | -- | group by 69 | -- TODO: have a non-empty list instead 70 | myGroupBy' :: (Ord b) => (a -> b) -> [a] -> [(b, [a])] 71 | myGroupBy' f = map (f . head &&& id) 72 | . groupBy ((==) `on` f) 73 | . 
sortBy (compare `on` f) 74 | 75 | -- | group by 76 | -- TODO: have a non-empty list instead 77 | myGroupBy :: (Ord a) => [(a, b)] -> M.Map a [b] 78 | myGroupBy l = let 79 | l2 = myGroupBy' fst l in 80 | M.map (snd <$>) $ M.fromList l2 81 | 82 | 83 | -- | Missing implementations in the code base. 84 | missing :: (LB.HasCallStack) => Text -> a 85 | missing msg = LB.error $ T.concat ["MISSING IMPLEMENTATION: ", msg] 86 | 87 | {-| The function that is used to trigger exception due to internal programming 88 | errors. 89 | 90 | Currently, all programming errors simply trigger an exception. All these 91 | impure functions are tagged with an implicit call stack argument. 92 | -} 93 | failure :: (LB.HasCallStack) => Text -> a 94 | failure msg = LB.error (T.concat ["FAILURE in Spark. Hint: ", msg]) 95 | 96 | failure' :: (LB.HasCallStack) => Format Text (a -> Text) -> a -> c 97 | failure' x = failure . sformat x 98 | 99 | 100 | {-| Given a DataFrame or a LocalFrame, attempts to get the value, 101 | or throws the error. 102 | 103 | This function is not total. 104 | -} 105 | forceRight :: (LB.HasCallStack, Show a) => Either a b -> b 106 | forceRight (Right b) = b 107 | forceRight (Left a) = LB.error $ 108 | sformat ("Failure from either, got instead a left: "%shown) a 109 | 110 | -- | Force the complete evaluation of a list to WNF. 111 | strictList :: (Show a) => [a] -> [a] 112 | strictList [] = [] 113 | strictList (h : t) = let !t' = strictList t in (h : t') 114 | 115 | -- | (internal) prints a hint with a value 116 | traceHint :: (Show a) => Text -> a -> a 117 | traceHint hint x = trace (T.unpack hint ++ show x) x 118 | 119 | -- | show with Text 120 | show' :: (Show a) => a -> Text 121 | show' x = T.pack (show x) 122 | 123 | withContext :: Text -> Either Text a -> Either Text a 124 | withContext _ (Right x) = Right x 125 | withContext msg (Left other) = Left (msg <> "\n>>" <> other) 126 | -------------------------------------------------------------------------------- /src/Spark/Core/Row.hs: -------------------------------------------------------------------------------- 1 | module Spark.Core.Row( 2 | module Spark.Core.Internal.RowStructures, 3 | ToSQL, 4 | FromSQL, 5 | valueToCell, 6 | cellToValue, 7 | jsonToCell, 8 | rowArray 9 | ) where 10 | 11 | import Spark.Core.Internal.RowStructures 12 | import Spark.Core.Internal.RowGenerics 13 | import Spark.Core.Internal.RowGenericsFrom 14 | import Spark.Core.Internal.RowUtils 15 | -------------------------------------------------------------------------------- /src/Spark/Core/StructuresInternal.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | 5 | -- Some basic structures about nodes in a graph, etc. 
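-- For instance (illustrative, using the definitions below): a node stored under
-- the path parent/child is represented as
--   NodePath (V.fromList [NodeName "parent", NodeName "child"])
-- and is rendered by catNodePath as "parent/child" and by prettyNodePath as
-- "/parent/child".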
6 | 7 | module Spark.Core.StructuresInternal( 8 | NodeName(..), 9 | NodePath(..), 10 | NodeId(..), 11 | FieldName(..), 12 | FieldPath(..), 13 | ComputationID(..), 14 | catNodePath, 15 | fieldName, 16 | unsafeFieldName, 17 | emptyFieldPath, 18 | nullFieldPath, 19 | headFieldPath, 20 | fieldPath, 21 | prettyNodePath, 22 | ) where 23 | 24 | import qualified Data.Text as T 25 | import Data.ByteString(ByteString) 26 | import GHC.Generics (Generic) 27 | import Data.Hashable(Hashable) 28 | import Data.List(intercalate) 29 | import qualified Data.Aeson as A 30 | import Data.String(IsString(..)) 31 | import Data.Vector(Vector) 32 | import qualified Data.Vector as V 33 | 34 | import Spark.Core.Internal.Utilities 35 | 36 | -- | The name of a node (without path information) 37 | newtype NodeName = NodeName { unNodeName :: T.Text } deriving (Eq, Ord) 38 | 39 | -- | The user-defined path of the node in the hierarchical representation of the graph. 40 | newtype NodePath = NodePath { unNodePath :: Vector NodeName } deriving (Eq, Ord) 41 | 42 | -- | The unique ID of a node. It is based on the parents of the node 43 | -- and all the relevant intrinsic values of the node. 44 | newtype NodeId = NodeId { unNodeId :: ByteString } deriving (Eq, Ord, Generic) 45 | 46 | -- | The name of a field in a sql structure 47 | -- This structure ensures that proper escaping happens if required. 48 | -- TODO: prevent the constructor from being used, it should be checked first. 49 | newtype FieldName = FieldName { unFieldName :: T.Text } deriving (Eq) 50 | 51 | -- | A path to a nested field an a sql structure. 52 | -- This structure ensures that proper escaping happens if required. 53 | newtype FieldPath = FieldPath { unFieldPath :: Vector FieldName } deriving (Eq) 54 | 55 | {-| A unique identifier for a computation (a batch of nodes sent for execution 56 | to Spark). 57 | -} 58 | data ComputationID = ComputationID { 59 | unComputationID :: !T.Text 60 | } deriving (Eq, Show, Generic) 61 | 62 | 63 | 64 | -- | A safe constructor for field names that fixes all the issues relevant to 65 | -- SQL escaping 66 | -- TODO: proper implementation 67 | fieldName :: T.Text -> Either String FieldName 68 | fieldName = Right . FieldName 69 | 70 | -- | Constructs the field name, but will fail if the content is not correct. 71 | unsafeFieldName :: (HasCallStack) => T.Text -> FieldName 72 | unsafeFieldName = forceRight . fieldName 73 | 74 | -- | A safe constructor for field names that fixes all the issues relevant to SQL escaping 75 | -- TODO: proper implementation 76 | fieldPath :: T.Text -> Either String FieldPath 77 | fieldPath x = Right . FieldPath . V.singleton $ FieldName x 78 | 79 | emptyFieldPath :: FieldPath 80 | emptyFieldPath = FieldPath V.empty 81 | 82 | nullFieldPath :: FieldPath -> Bool 83 | nullFieldPath = V.null . unFieldPath 84 | 85 | headFieldPath :: FieldPath -> Maybe FieldName 86 | headFieldPath (FieldPath v) | V.null v = Nothing 87 | headFieldPath (FieldPath v) = Just $ V.head v 88 | 89 | -- | The concatenated path. This is the inverse function of fieldPath. 90 | -- | TODO: this one should be hidden? 
91 | catNodePath :: NodePath -> T.Text 92 | catNodePath (NodePath np) = 93 | T.intercalate "/" (unNodeName <$> V.toList np) 94 | 95 | prettyNodePath :: NodePath -> T.Text 96 | -- Only a single slash, double slashes are reserved for the case 97 | -- of global paths (including session and computation) 98 | prettyNodePath np = "/" <> catNodePath np 99 | 100 | instance Show NodeId where 101 | show (NodeId bs) = let s = show bs in 102 | if length s > 9 then 103 | (drop 1 . take 6) s ++ ".." 104 | else 105 | s 106 | 107 | instance Show NodeName where 108 | show (NodeName nn) = T.unpack nn 109 | 110 | instance Show NodePath where 111 | show np = T.unpack $ T.concat ["NPath(", catNodePath np, ")" ] 112 | 113 | instance Show FieldPath where 114 | show (FieldPath l) = 115 | intercalate "." (show <$> V.toList l) 116 | 117 | instance Show FieldName where 118 | -- TODO(kps) escape the '.' characters in the field name 119 | show (FieldName fn) = T.unpack fn 120 | 121 | instance Hashable NodeId 122 | 123 | instance IsString FieldName where 124 | fromString = FieldName . T.pack 125 | 126 | instance A.ToJSON NodeName where 127 | toJSON = A.toJSON . unNodeName 128 | 129 | instance A.FromJSON NodeName where 130 | -- TODO: more parse checks 131 | parseJSON x = NodeName <$> A.parseJSON x 132 | 133 | instance A.ToJSON NodePath where 134 | toJSON = A.toJSON . unNodePath 135 | 136 | instance A.FromJSON NodePath where 137 | parseJSON x = NodePath <$> A.parseJSON x 138 | 139 | instance A.ToJSON FieldName where 140 | toJSON = A.toJSON . unFieldName 141 | 142 | instance A.ToJSON FieldPath where 143 | toJSON = A.toJSON . unFieldPath 144 | 145 | instance Ord FieldName where 146 | compare f1 f2 = compare (unFieldName f1) (unFieldName f2) 147 | 148 | instance A.ToJSON ComputationID where 149 | toJSON = A.toJSON . unComputationID 150 | -------------------------------------------------------------------------------- /src/Spark/Core/Try.hs: -------------------------------------------------------------------------------- 1 | 2 | {-| 3 | Useful classes and functions to deal with failures 4 | within the Karps framework. 5 | 6 | This is a developer API. Users should not have to invoke functions 7 | from this module. 8 | -} 9 | module Spark.Core.Try( 10 | NodeError(..), 11 | Try, 12 | tryError, 13 | tryEither 14 | ) where 15 | 16 | import qualified Data.Text as T 17 | import qualified Data.Vector as V 18 | 19 | import Spark.Core.StructuresInternal 20 | 21 | -- | An error associated to a particular node (an observable or a dataset). 22 | data NodeError = Error { 23 | ePath :: NodePath, 24 | eMessage :: T.Text 25 | } deriving (Eq, Show) 26 | 27 | -- | The common result of attempting to build something. 28 | type Try = Either NodeError 29 | 30 | 31 | -- TODO: rename to tryError 32 | _error :: T.Text -> Try a 33 | _error txt = Left Error { 34 | ePath = NodePath V.empty, 35 | eMessage = txt 36 | } 37 | 38 | -- | Returns an error object given a text clue. 39 | tryError :: T.Text -> Try a 40 | tryError = _error 41 | 42 | -- | Returns an error object given a string clue. 43 | --Remove this method 44 | --tryError' :: String -> Try a 45 | --tryError' = _error . T.pack 46 | 47 | -- | (internal) 48 | -- Given a potentially errored object, converts it to a Try. 
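-- For example (illustrative only):
--   tryEither (Right (3 :: Int)) == Right 3
--   tryEither (Left "boom")      == Left (Error { ePath = NodePath V.empty,
--                                                 eMessage = "boom" })
-- i.e. errors coming from a plain 'Either Text' get attached to the empty node
-- path.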
49 | tryEither :: Either T.Text a -> Try a 50 | tryEither (Left msg) = tryError msg 51 | tryEither (Right x) = Right x 52 | -------------------------------------------------------------------------------- /src/Spark/Core/Types.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE ScopedTypeVariables #-} 3 | 4 | module Spark.Core.Types( 5 | DataType, 6 | Nullable(..), 7 | TupleEquivalence(..), 8 | NameTuple(..), 9 | -- intType, 10 | -- canNull, 11 | -- noNull, 12 | -- stringType, 13 | -- arrayType', 14 | -- cellType, 15 | -- structField, 16 | -- structType, 17 | -- arrayType, 18 | SQLType, 19 | columnType, 20 | SQLTypeable, 21 | buildType, 22 | StructField, 23 | StructType, 24 | -- castType, 25 | catNodePath, 26 | unSQLType 27 | ) where 28 | 29 | import Spark.Core.Internal.TypesStructures 30 | import Spark.Core.Internal.TypesGenerics 31 | import Spark.Core.Internal.TypesFunctions 32 | import Spark.Core.StructuresInternal 33 | import Spark.Core.Internal.FunctionsInternals(TupleEquivalence(..), NameTuple(..)) 34 | 35 | -- | Description of types supported in DataSets 36 | -- Karps supports a restrictive subset of Algebraic Datatypes that is amenable to SQL 37 | -- transformations. This file contains the description of all the supported types, and some 38 | -- conversion tools. 39 | -------------------------------------------------------------------------------- /src/Spark/IO/Inputs.hs: -------------------------------------------------------------------------------- 1 | 2 | module Spark.IO.Inputs( 3 | SparkPath, 4 | JsonMode, 5 | DataSchema, 6 | JsonOptions, 7 | SourceDescription, 8 | json', 9 | json, 10 | jsonInfer 11 | ) where 12 | 13 | import Spark.IO.Internal.Json 14 | import Spark.IO.Internal.InputGeneric 15 | -------------------------------------------------------------------------------- /src/Spark/IO/Internal/InputGeneric.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ScopedTypeVariables #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module Spark.IO.Internal.InputGeneric( 5 | SparkPath(..), 6 | DataSchema(..), 7 | InputOptionValue(..), 8 | InputOptionKey(..), 9 | DataFormat(..), 10 | SourceDescription(..), 11 | generic', 12 | genericWithSchema', 13 | genericWithSchema 14 | ) where 15 | 16 | import Data.Text(Text) 17 | import Data.String(IsString(..)) 18 | import qualified Data.Map.Strict as M 19 | import qualified Data.Aeson as A 20 | import qualified Data.Text as T 21 | import Data.Aeson(toJSON, (.=)) 22 | -- import Debug.Trace 23 | 24 | import Spark.Core.Types 25 | import Spark.Core.Context 26 | import Spark.Core.Try 27 | import Spark.Core.Dataset 28 | 29 | import Spark.Core.Internal.Utilities(forceRight) 30 | import Spark.Core.Internal.DatasetFunctions(asDF, emptyDataset, emptyLocalData) 31 | import Spark.Core.Internal.TypesStructures(SQLType(..)) 32 | import Spark.Core.Internal.OpStructures 33 | 34 | {-| A path to some data that can be read by Spark. 35 | -} 36 | newtype SparkPath = SparkPath Text deriving (Show, Eq) 37 | 38 | {-| The schema policty with respect to a data source. It should either 39 | request Spark to infer the schema from the source, or it should try to 40 | match the source against a schema provided by the user. 41 | -} 42 | data DataSchema = InferSchema | UseSchema DataType deriving (Show, Eq) 43 | 44 | {-| The low-level option values accepted by the Spark reader API. 
45 | -} 46 | data InputOptionValue = 47 | InputIntOption Int 48 | | InputDoubleOption Double 49 | | InputStringOption Text 50 | | InputBooleanOption Bool 51 | deriving (Eq, Show) 52 | 53 | instance A.ToJSON InputOptionValue where 54 | toJSON (InputIntOption i) = toJSON i 55 | toJSON (InputDoubleOption d) = toJSON d 56 | toJSON (InputStringOption s) = toJSON s 57 | toJSON (InputBooleanOption b) = toJSON b 58 | 59 | newtype InputOptionKey = InputOptionKey { unInputOptionKey :: Text } deriving (Eq, Show, Ord) 60 | 61 | {-| The type of the source. 62 | 63 | This enumeration contains all the data formats that are natively supported by 64 | Spark, either for input or for output, and allows the users to express their 65 | own format if requested. 66 | -} 67 | data DataFormat = 68 | JsonFormat 69 | | TextFormat 70 | | CsvFormat 71 | | CustomSourceFormat !Text 72 | deriving (Eq, Show) 73 | -- data InputSource = JsonSource | TextSource | CsvSource | InputSource SparkPath 74 | 75 | {-| A description of a data source, following Spark's reader API version 2. 76 | 77 | Each source consists of an input source (json, xml, etc.), an optional schema 78 | for this source, and a number of options specific to this source. 79 | 80 | Since this description is rather low-level, a number of wrappers are provided 81 | for each of the most popular sources that are already built into Spark. 82 | -} 83 | data SourceDescription = SourceDescription { 84 | inputPath :: !SparkPath, 85 | inputSource :: !DataFormat, 86 | inputSchema :: !DataSchema, 87 | sdOptions :: !(M.Map InputOptionKey InputOptionValue), 88 | inputStamp :: !(Maybe DataInputStamp) 89 | } deriving (Eq, Show) 90 | 91 | instance IsString SparkPath where 92 | fromString = SparkPath . T.pack 93 | 94 | {-| Generates a dataframe from a source description. 95 | 96 | This may trigger some calculations on the Spark side if schema inference is 97 | required. 98 | -} 99 | generic' :: SourceDescription -> SparkState DataFrame 100 | generic' sd = do 101 | dtt <- _inferSchema sd 102 | return $ dtt >>= \dt -> genericWithSchema' dt sd 103 | 104 | {-| Generates a dataframe from a source description, and assumes a given schema. 105 | 106 | This schema overrides whatever may have been given in the source description. If 107 | the source description specified that the schema must be checked or inferred, 108 | this instruction is overridden. 109 | 110 | While this is convenient, it may lead to runtime errors that are hard to 111 | understand if the data does not follow the given schema. 112 | -} 113 | genericWithSchema' :: DataType -> SourceDescription -> DataFrame 114 | genericWithSchema' dt sd = asDF $ emptyDataset no (SQLType dt) where 115 | sd' = sd { inputSchema = UseSchema dt } 116 | so = StandardOperator { 117 | soName = "org.spark.GenericDatasource", 118 | soOutputType = dt, 119 | soExtra = A.toJSON sd' 120 | } 121 | no = NodeDistributedOp so 122 | 123 | {-| Generates a dataframe from a source description, and assumes a certain 124 | schema on the source. 125 | -} 126 | genericWithSchema :: forall a. (SQLTypeable a) => SourceDescription -> Dataset a 127 | genericWithSchema sd = 128 | let sqlt = buildType :: SQLType a 129 | dt = unSQLType sqlt in 130 | forceRight $ castType sqlt =<< genericWithSchema' dt sd 131 | 132 | -- Wraps the action of inferring the schema. 133 | -- This is not particularly efficient here: it does a first pass to get the 134 | -- schema, and then will do a second pass in order to read the data. 
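-- For reference, an illustrative source description that would trigger such an
-- inference pass (the name and values below are hypothetical, not part of the
-- original module):
_exampleJsonSource :: SourceDescription
_exampleJsonSource = SourceDescription {
    inputPath = SparkPath "/data/events.json",
    inputSource = JsonFormat,
    inputSchema = InferSchema,
    sdOptions = M.fromList [(InputOptionKey "mode", InputStringOption "PERMISSIVE")],
    inputStamp = Nothing
  }
-- Passing it to generic' above first runs the schema inference below and then
-- builds the dataframe with the inferred schema.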
135 | _inferSchema :: SourceDescription -> SparkState (Try DataType) 136 | _inferSchema = executeCommand1 . _inferSchemaCmd 137 | 138 | -- TODO: this is a monoidal operation, it could be turned into a universal 139 | -- aggregator. 140 | _inferSchemaCmd :: SourceDescription -> LocalData DataType 141 | _inferSchemaCmd sd = emptyLocalData no sqlt where 142 | sqlt = buildType :: SQLType DataType 143 | dt = unSQLType sqlt 144 | so = StandardOperator { 145 | soName = "org.spark.InferSchema", 146 | soOutputType = dt, 147 | soExtra = A.toJSON sd 148 | } 149 | no = NodeOpaqueAggregator so 150 | 151 | instance A.ToJSON SparkPath where 152 | toJSON (SparkPath p) = toJSON p 153 | 154 | instance A.ToJSON DataSchema where 155 | toJSON InferSchema = "infer_schema" 156 | toJSON (UseSchema dt) = toJSON dt 157 | 158 | instance A.ToJSON DataFormat where 159 | toJSON JsonFormat = "json" 160 | toJSON TextFormat = "text" 161 | toJSON CsvFormat = "csv" 162 | toJSON (CustomSourceFormat s) = toJSON s 163 | 164 | instance A.ToJSON SourceDescription where 165 | toJSON sd = A.object [ 166 | "inputPath" .= toJSON (inputPath sd), 167 | "inputSource" .= toJSON (inputSource sd), 168 | "inputSchema" .= toJSON (inputSchema sd), 169 | "inputStamp" .= A.Null, 170 | "options" .= A.object (f <$> M.toList (sdOptions sd)) 171 | ] where 172 | f (k, v) = unInputOptionKey k .= toJSON v 173 | -------------------------------------------------------------------------------- /src/Spark/IO/Internal/Json.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE ScopedTypeVariables #-} 3 | 4 | module Spark.IO.Internal.Json( 5 | JsonMode, 6 | JsonOptions(..), 7 | json', 8 | json, 9 | jsonInfer, 10 | jsonOpt', 11 | jsonOpt, 12 | defaultJsonOptions 13 | ) where 14 | 15 | import qualified Data.Map.Strict as M 16 | import Data.Text(pack) 17 | 18 | 19 | import Spark.Core.Types 20 | import Spark.Core.Dataset(DataFrame, Dataset, castType') 21 | import Spark.Core.Context 22 | import Spark.Core.Try 23 | 24 | import Spark.IO.Internal.InputGeneric 25 | 26 | {-| 27 | -} 28 | data JsonMode = Permissive | DropMalformed | FailFast 29 | 30 | {-| The options for the json input. 31 | -} 32 | data JsonOptions = JsonOptions { 33 | mode :: !JsonMode, 34 | jsonSchema :: !DataSchema 35 | } 36 | 37 | 38 | {-| Declares a source of data of the given data type. 39 | 40 | The source is not read at this point, it is just declared. It may be found to be 41 | invalid in subsequent computations. 42 | -} 43 | json' :: DataType -> String -> DataFrame 44 | json' dt p = genericWithSchema' dt (_jsonSourceDescription (SparkPath (pack p)) defaultJsonOptions) 45 | 46 | {-| Declares a source of data of the given data type. 47 | 48 | The source is not read at this point, it is just declared. 49 | -} 50 | json :: (SQLTypeable a) => String -> Dataset a 51 | json p = genericWithSchema (_jsonSourceDescription (SparkPath (pack p)) defaultJsonOptions) 52 | 53 | {-| Reads a source of data expected to be in the JSON format. 54 | 55 | The schema is not required and Spark will infer the schema of the source. 56 | However, all the data contained in the source may end up being read in the 57 | process. 58 | -} 59 | jsonInfer :: SparkPath -> SparkState DataFrame 60 | jsonInfer = jsonOpt' defaultJsonOptions 61 | 62 | {-| Reads a source of data expected to be in the JSON format. 63 | 64 | The schema is not required and Spark will infer the schema of the source. 
65 | However, all the data contained in the source may end up being read in the 66 | process. 67 | -} 68 | jsonOpt' :: JsonOptions -> SparkPath -> SparkState DataFrame 69 | jsonOpt' jo sp = generic' (_jsonSourceDescription sp jo) 70 | 71 | {-| Reads a source of data expected to be in the JSON format. 72 | 73 | The schema is not required and Spark will infer the schema of the source. 74 | However, all the data contained in the source may end up being read in the 75 | process. 76 | -} 77 | jsonOpt :: forall a. (SQLTypeable a) => JsonOptions -> SparkPath -> SparkState (Try (Dataset a)) 78 | jsonOpt jo sp = 79 | let sqlt = buildType :: SQLType a 80 | dt = unSQLType sqlt 81 | jo' = jo { jsonSchema = UseSchema dt } 82 | in castType' sqlt <$> jsonOpt' jo' sp 83 | 84 | defaultJsonOptions :: JsonOptions 85 | defaultJsonOptions = JsonOptions { 86 | -- Fail fast by default, to be conservative about errors, 87 | -- and respect the strictness arguments. 88 | mode = FailFast, 89 | jsonSchema = InferSchema 90 | } 91 | 92 | _jsonSourceDescription :: SparkPath -> JsonOptions -> SourceDescription 93 | _jsonSourceDescription sp jo = SourceDescription { 94 | inputSource = JsonFormat, 95 | inputPath = sp, 96 | inputSchema = jsonSchema jo, 97 | sdOptions = _jsonOptions jo, 98 | inputStamp = Nothing 99 | } 100 | 101 | _jsonOptions :: JsonOptions -> M.Map InputOptionKey InputOptionValue 102 | _jsonOptions jo = M.fromList [(InputOptionKey "mode", _mode (mode jo))] 103 | 104 | _mode :: JsonMode -> InputOptionValue 105 | _mode Permissive = InputStringOption "PERMISSIVE" 106 | _mode DropMalformed = InputStringOption "DROPMALFORMED" 107 | _mode FailFast = InputStringOption "FAILFAST" 108 | -------------------------------------------------------------------------------- /src/Spark/IO/Internal/OutputCommon.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ScopedTypeVariables #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | 5 | module Spark.IO.Internal.OutputCommon( 6 | SaveMode(..), 7 | OutputBucket, 8 | DynOutputBucket, 9 | OutputPartition, 10 | DynOutputPartition, 11 | SavingDescription(..), 12 | partition, 13 | partition', 14 | bucket, 15 | bucket', 16 | saveDefaults, 17 | saveCol 18 | ) where 19 | 20 | -- import Data.Text(Text) 21 | -- import qualified Data.Map.Strict as M 22 | -- import qualified Data.Aeson as A 23 | -- import Data.Aeson(toJSON, (.=)) 24 | 25 | -- import Spark.Core.Types 26 | -- import Spark.Core.Context 27 | import Spark.Core.Try 28 | import Spark.Core.Column 29 | -- import Spark.Core.ColumnFunctions 30 | -- import Spark.Core.Row 31 | import Spark.Core.Dataset 32 | 33 | import Spark.Core.Internal.ColumnStructures(UnknownReference, UntypedColumnData) 34 | import Spark.Core.Internal.ColumnFunctions(dropColReference) 35 | import Spark.Core.Internal.Utilities 36 | import Spark.IO.Internal.InputGeneric 37 | 38 | {-| The mode when saving the data. 39 | -} 40 | data SaveMode = 41 | Overwrite 42 | | Append 43 | | Ignore 44 | | ErrorIfExists deriving(Eq, Show) 45 | 46 | data OutputPartition ref = OutputPartition UntypedColumnData 47 | 48 | type DynOutputPartition = Try (OutputPartition UnknownReference) 49 | 50 | data OutputBucket ref = OutputBucket UntypedColumnData 51 | 52 | type DynOutputBucket = Try (OutputBucket UnknownReference) 53 | 54 | partition :: Column ref a -> OutputPartition ref 55 | partition = OutputPartition . dropColType . 
dropColReference 56 | 57 | partition' :: DynColumn -> DynOutputPartition 58 | partition' = fmap partition 59 | 60 | bucket :: Column ref a -> OutputBucket ref 61 | bucket = OutputBucket . dropColType . dropColReference 62 | 63 | bucket' :: DynColumn -> DynOutputBucket 64 | bucket' = fmap bucket 65 | 66 | 67 | data SavingDescription ref a = SavingDescription { 68 | partitions :: ![OutputPartition ref], 69 | buckets :: ![OutputBucket ref], 70 | savedCol :: !(Column ref a), 71 | saveFormat :: !DataFormat, 72 | savePath :: !SparkPath 73 | } 74 | 75 | saveDefaults :: SparkPath -> DataFormat -> Column ref a -> SavingDescription ref a 76 | saveDefaults sp f c = SavingDescription { 77 | partitions = [], 78 | buckets = [], 79 | savedCol = c, 80 | saveFormat = f, 81 | savePath = sp 82 | } 83 | 84 | {-| Inserts an action to store the given dataframe in the graph of computations. 85 | 86 | NOTE: Because of some limitations in Spark, all the columns used when forming 87 | the buckets and the parttions must be present inside the column being written. 88 | These columns will be appended to the column being written if they happen to be 89 | missing. The consequence is that more data may be written than expected. 90 | 91 | It returns true if the update was successful. The return type is subject to 92 | change. 93 | -} 94 | saveCol :: SavingDescription ref a -> LocalData Bool 95 | saveCol _ = missing "saveCol" 96 | 97 | -- test :: Int 98 | -- test = 99 | -- let c = undefined :: Column Int Int 100 | -- ld = saveCol (saveDefaults undefined JsonFormat c) { partitions = [partition c, partition c] } 101 | -- in 3 102 | -- 103 | -- repeatDS :: Column ref Int -> Column ref a -> Dataset a 104 | -- 105 | -- repeatFast :: Column ref Int -> Column ref a -> Dataset a 106 | -- 107 | -- repeatScatter :: Int -> Column ref Int -> Column ref a -> Dataset a 108 | -------------------------------------------------------------------------------- /src/Spark/Inputs/Inputs.hs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krapsh/kraps-haskell/c3a03afe9a4a5ddcedf994756d9bc5967818cec2/src/Spark/Inputs/Inputs.hs -------------------------------------------------------------------------------- /stack-ihaskell.yaml: -------------------------------------------------------------------------------- 1 | flags: {} 2 | extra-package-dbs: [] 3 | packages: 4 | - '.' 5 | #- ./ihaskell-display/ihaskell-blaze 6 | #- ./ihaskell-display/ihaskell-basic 7 | extra-deps: 8 | - generator-0.5.5 9 | - cryptohash-sha256-0.11.100.1 10 | - aeson-pretty-0.8.0 11 | - system-argv0-0.1.1 # Necessary for LTS 2.22 (GHC 7.8) 12 | - ihaskell-0.8.3.0 13 | - ipython-kernel-0.8.3.0 14 | resolver: nightly-2015-08-15 #lts-6.2 15 | -------------------------------------------------------------------------------- /stack.yaml: -------------------------------------------------------------------------------- 1 | flags: {} 2 | extra-package-dbs: [] 3 | packages: 4 | - '.' 
5 | extra-deps: 6 | - generator-0.5.5 7 | - cryptohash-sha256-0.11.100.1 8 | - aeson-pretty-0.8.2 9 | resolver: lts-7.13 10 | pvp-bounds: both 11 | build: 12 | library-profiling: true 13 | executable-profiling: true 14 | -------------------------------------------------------------------------------- /test-integration/Spark/Core/CachingSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module Spark.Core.CachingSpec where 5 | 6 | import Test.Hspec 7 | import qualified Data.Text 8 | 9 | import Spark.Core.Context 10 | import Spark.Core.Functions 11 | import Spark.Core.Column 12 | import Spark.Core.ColumnFunctions 13 | import Spark.Core.StructuresInternal(ComputationID(..)) 14 | 15 | 16 | -- Collecting a dataset made from a list should yield the same list (modulo 17 | -- some reordering) 18 | collectIdempotent :: [Int] -> IO () 19 | collectIdempotent l = do 20 | -- stats <- computationStatsDef (ComputationID "0") 21 | -- print "STATS" 22 | -- print (show stats) 23 | let ds = dataset l 24 | let ds' = autocache ds 25 | let c1 = asCol ds' 26 | let s1 = sumCol c1 27 | let s2 = count ds' 28 | let x = s1 + s2 29 | l2 <- exec1Def x 30 | l2 `shouldBe` (sum l + length l) 31 | 32 | run :: String -> IO () -> SpecWith (Arg (IO ())) 33 | run s f = it s $ do 34 | createSparkSessionDef $ defaultConf { confRequestedSessionName = Data.Text.pack s } 35 | f 36 | -- This is horribly not robust to any sort of failure, but it will do for now 37 | -- TODO(kps) make more robust 38 | closeSparkSessionDef 39 | return () 40 | 41 | spec :: Spec 42 | spec = do 43 | describe "Integration test - caching on ints" $ do 44 | run "cache_sum1" $ 45 | collectIdempotent ([1,2,3] :: [Int]) 46 | -------------------------------------------------------------------------------- /test-integration/Spark/Core/CollectSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module Spark.Core.CollectSpec where 5 | 6 | import Test.Hspec 7 | import qualified Data.Text 8 | import Data.List(sort) 9 | 10 | import Spark.Core.Context 11 | import Spark.Core.Types 12 | import Spark.Core.Row 13 | import Spark.Core.Functions 14 | import Spark.Core.Column 15 | import Spark.Core.IntegrationUtilities 16 | import Spark.Core.Internal.Utilities 17 | 18 | 19 | -- Collecting a dataset made from a list should yield the same list (modulo 20 | -- some reordering) 21 | -- TODO: replace the ordering by the canonical ordering over the data 22 | collectIdempotent :: (Ord a, Eq a, Show a, SQLTypeable a, ToSQL a, FromSQL a) => [a] -> IO () 23 | collectIdempotent l = do 24 | let ds = dataset l 25 | l2 <- exec1Def $ collect (asCol ds) 26 | l2 `shouldBe` sort l 27 | 28 | run :: String -> IO () -> SpecWith (Arg (IO ())) 29 | run s f = it s $ do 30 | createSparkSessionDef $ defaultConf { confRequestedSessionName = Data.Text.pack s } 31 | f 32 | -- This is horribly not robust to any sort of failure, but it will do for now 33 | -- TODO(kps) make more robust 34 | closeSparkSessionDef 35 | return () 36 | 37 | spec :: Spec 38 | spec = do 39 | describe "Integration test - collect on ints" $ do 40 | run "running_twice" $ do 41 | let ds = dataset [1::Int,2] 42 | let c = traceHint "c=" $ collect (asCol ds) 43 | l2 <- exec1Def $ c 44 | l2' <- exec1Def $ collect (asCol ds) 45 | l2 `shouldBe` l2' 46 | run "empty_ints1" $ 47 | 
collectIdempotent ([] :: [Int]) 48 | run "ints1" $ 49 | collectIdempotent ([4,5,1,2,3] :: [Int]) 50 | run "ints1_opt" $ 51 | collectIdempotent ([Just 1, Nothing] :: [Maybe Int]) 52 | run "nothing_ints_opt" $ 53 | collectIdempotent ([Nothing] :: [Maybe Int]) 54 | run "ints1_opt" $ 55 | collectIdempotent ([Just 1, Just 2] :: [Maybe Int]) 56 | run "empty_ints_opt" $ 57 | collectIdempotent ([] :: [Maybe Int]) 58 | describe "Integration test - collect on TestStruct5" $ do 59 | run "empty_TestStruct5" $ 60 | collectIdempotent ([] :: [TestStruct5]) 61 | run "single_TestStruct5" $ 62 | collectIdempotent ([TestStruct5 1 2] :: [TestStruct5]) 63 | -------------------------------------------------------------------------------- /test-integration/Spark/Core/ColumnSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module Spark.Core.ColumnSpec where 5 | 6 | import Test.Hspec 7 | import Data.List.NonEmpty(NonEmpty( (:|) )) 8 | 9 | import Spark.Core.Context 10 | import Spark.Core.Dataset 11 | import Spark.Core.Column 12 | import Spark.Core.Row 13 | import Spark.Core.Functions 14 | import Spark.Core.ColumnFunctions 15 | import Spark.Core.SimpleAddSpec(run) 16 | import Spark.Core.Internal.LocalDataFunctions(iPackTupleObs) 17 | import Spark.Core.Internal.DatasetFunctions(untypedLocalData) 18 | 19 | myScaler :: Column ref Double -> Column ref Double 20 | myScaler col = 21 | let cnt = asDouble (countCol col) 22 | m = sumCol col / cnt 23 | centered = col .- m 24 | stdDev = sumCol (centered * centered) / cnt 25 | in centered ./ stdDev 26 | 27 | 28 | spec :: Spec 29 | spec = do 30 | describe "local data operations" $ do 31 | run "broadcastPair_struct" $ do 32 | let ds = dataset [1] :: Dataset Int 33 | let cnt = countCol (asCol ds) 34 | let c = collect (asCol ds .+ cnt) 35 | res <- exec1Def c 36 | res `shouldBe` [2] 37 | run "LocalPack (doubles)" $ do 38 | let x = untypedLocalData (1 :: LocalData Double) 39 | let x2 = iPackTupleObs (x :| [x]) 40 | res <- exec1Def x2 41 | res `shouldBe` rowArray [DoubleElement 1, DoubleElement 1] 42 | run "LocalPack" $ do 43 | let x = untypedLocalData (1 :: LocalData Int) 44 | let x2 = iPackTupleObs (x :| [x]) 45 | res <- exec1Def x2 46 | res `shouldBe` rowArray [IntElement 1, IntElement 1] 47 | run "BroadcastPair" $ do 48 | let x = 1 :: LocalData Int 49 | let ds = dataset [2, 3] :: Dataset Int 50 | let ds2 = broadcastPair ds x 51 | res <- exec1Def (collect (asCol ds2)) 52 | res `shouldBe` [(2, 1), (3, 1)] 53 | -- TODO: this combines a lot of elements together. 
54 | describe "columns - integration" $ do 55 | run "mean" $ do 56 | let ds = dataset [-1, 1] :: Dataset Double 57 | let c = myScaler (asCol ds) 58 | res <- exec1Def (collect c) 59 | res `shouldBe` [-1, 1] 60 | -------------------------------------------------------------------------------- /test-integration/Spark/Core/GroupsSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module Spark.Core.GroupsSpec where 5 | 6 | import Test.Hspec 7 | import Data.Text(Text) 8 | 9 | import Spark.Core.Context 10 | import Spark.Core.Functions 11 | import Spark.Core.ColumnFunctions 12 | import Spark.Core.Column 13 | import Spark.Core.IntegrationUtilities 14 | import Spark.Core.CollectSpec(run) 15 | import Spark.Core.Internal.Groups 16 | 17 | sumGroup :: [MyPair] -> [(Text, Int)] -> IO () 18 | sumGroup l lexp = do 19 | let ds = dataset l 20 | let keys = ds // myKey' 21 | let values = ds // myVal' 22 | let g = groupByKey keys values 23 | let ds2 = g `aggKey` sumCol 24 | l2 <- exec1Def $ collect (asCol ds2) 25 | l2 `shouldBe` lexp 26 | 27 | spec :: Spec 28 | spec = do 29 | describe "Integration test - groups on (text, int)" $ do 30 | run "empty" $ 31 | sumGroup [] [] 32 | run "one" $ 33 | sumGroup [MyPair "x" 1] [("x", 1)] 34 | run "two" $ 35 | sumGroup [MyPair "x" 1, MyPair "x" 2, MyPair "y" 1] [("x", 3), ("y", 1)] 36 | -------------------------------------------------------------------------------- /test-integration/Spark/Core/IntegrationUtilities.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | {-# LANGUAGE DeriveGeneric #-} 4 | {-# LANGUAGE GeneralizedNewtypeDeriving #-} 5 | 6 | module Spark.Core.IntegrationUtilities where 7 | 8 | import GHC.Generics (Generic) 9 | import Data.Text(Text) 10 | import Data.Aeson(ToJSON) 11 | 12 | import Spark.Core.Context 13 | import Spark.Core.Types 14 | import Spark.Core.Row 15 | import Spark.Core.Column 16 | 17 | data TestStruct1 = TestStruct1 { 18 | ts1f1 :: Int, 19 | ts1f2 :: Maybe Int } deriving (Show, Eq, Generic) 20 | instance ToSQL TestStruct1 21 | instance FromSQL TestStruct1 22 | instance SQLTypeable TestStruct1 23 | 24 | data TestStruct2 = TestStruct2 { 25 | ts2f1 :: [Int] 26 | } deriving (Show, Generic) 27 | instance SQLTypeable TestStruct2 28 | 29 | data TestStruct3 = TestStruct3 { 30 | ts3f1 :: Int 31 | } deriving (Show, Eq, Generic) 32 | instance ToSQL TestStruct3 33 | instance SQLTypeable TestStruct3 34 | 35 | data TestStruct4 = TestStruct4 { 36 | ts4f1 :: TestStruct3 37 | } deriving (Show, Eq, Generic) 38 | 39 | data TestStruct5 = TestStruct5 { 40 | ts5f1 :: Int, 41 | ts5f2 :: Int 42 | } deriving (Show, Eq, Generic, Ord) 43 | -- instance ToJSON TestStruct5 44 | instance SQLTypeable TestStruct5 45 | instance FromSQL TestStruct5 46 | instance ToSQL TestStruct5 47 | 48 | data TestStruct6 = TestStruct6 { 49 | ts6f1 :: Int, 50 | ts6f2 :: Int, 51 | ts6f3 :: TestStruct3 52 | } deriving (Show, Eq, Generic) 53 | 54 | data TestStruct7 = TestStruct7 { 55 | ts7f1 :: Text 56 | } deriving (Show, Eq, Generic) 57 | instance ToSQL TestStruct7 58 | instance SQLTypeable TestStruct7 59 | instance ToJSON TestStruct7 60 | 61 | newtype TestT1 = TestT1 { 62 | unTestT1 :: Int 63 | } deriving (Eq, Show, Generic, Num) 64 | 65 | data MyPair = MyPair { 66 | myKey :: Text, 67 | myVal :: Int } deriving (Generic, Show) 68 | 69 | myKey' :: 
StaticColProjection MyPair Text 70 | myKey' = unsafeStaticProjection buildType "myKey" 71 | myVal' :: StaticColProjection MyPair Int 72 | myVal' = unsafeStaticProjection buildType "myVal" 73 | instance SQLTypeable MyPair 74 | instance ToSQL MyPair 75 | -------------------------------------------------------------------------------- /test-integration/Spark/Core/JoinsSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module Spark.Core.JoinsSpec where 5 | 6 | import Test.Hspec 7 | 8 | import Spark.Core.Context 9 | import Spark.Core.Dataset 10 | import Spark.Core.Column 11 | import Spark.Core.ColumnFunctions 12 | import Spark.Core.Row 13 | import Spark.Core.Functions 14 | import Spark.Core.SimpleAddSpec(run) 15 | 16 | spec :: Spec 17 | spec = do 18 | describe "Path test" $ do 19 | run "test_path1" $ do 20 | let ds1 = dataset [1] :: Dataset Int 21 | let x1 = sumCol (asCol ds1) @@ "x1" 22 | let x2 = ((x1 + 1) @@ "x2") `logicalParents` [untyped ds1] 23 | res <- exec1Def x2 24 | res `shouldBe` 2 25 | -- describe "Join test - join on ints" $ do 26 | -- run "empty_ints1" $ do 27 | -- let ds1 = dataset [(1,2)] :: Dataset (Int, Int) 28 | -- let ds2 = dataset [(1,3)] :: Dataset (Int, Int) 29 | -- let df1 = asDF ds1 30 | -- let df2 = asDF ds2 31 | -- let df = joinInner' (df1/-"_1") (df1/-"_2") (df2/-"_1") (df2/-"_2" @@ "_3") 32 | -- res <- exec1Def' (collect' (asCol' df)) 33 | -- res `shouldBe` rowArray [rowArray [IntElement 1, IntElement 2, IntElement 3]] 34 | -------------------------------------------------------------------------------- /test-integration/Spark/Core/PruningSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module Spark.Core.PruningSpec where 5 | 6 | import Test.Hspec 7 | import qualified Data.Text as T 8 | import Data.List(sort) 9 | 10 | import Spark.Core.Context 11 | import Spark.Core.Types 12 | import Spark.Core.Row 13 | import Spark.Core.Functions 14 | import Spark.Core.Column 15 | import Spark.Core.IntegrationUtilities 16 | import Spark.Core.CollectSpec(run) 17 | 18 | run2 :: T.Text -> IO () -> SpecWith (Arg (IO ())) 19 | run2 s f = it (T.unpack s) $ do 20 | createSparkSessionDef $ defaultConf { 21 | confRequestedSessionName = s, 22 | confUseNodePrunning = True } 23 | f 24 | -- This is horribly not robust to any sort of failure, but it will do for now 25 | -- TODO(kps) make more robust 26 | closeSparkSessionDef 27 | return () 28 | 29 | 30 | spec :: Spec 31 | spec = do 32 | describe "Integration test - pruning" $ do 33 | run2 "running_twice" $ do 34 | let ds = dataset [1::Int,2] 35 | l2 <- exec1Def $ collect (asCol ds) 36 | l2' <- exec1Def $ collect (asCol ds) 37 | l2 `shouldBe` l2' 38 | -------------------------------------------------------------------------------- /test-integration/Spark/Core/SimpleAddSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module Spark.Core.SimpleAddSpec where 5 | 6 | import Test.Hspec 7 | import qualified Data.Text 8 | 9 | import Spark.Core.Context 10 | import Spark.Core.Types 11 | import Spark.Core.Row 12 | import Spark.Core.Functions 13 | 14 | 15 | smallSum :: (Eq a, Show a, SQLTypeable a, ToSQL a, FromSQL a, Num a) => a -> a -> IO () 16 | smallSum x y = do 17 | let 
x' = constant x 18 | let y' = constant y 19 | let z1' = x' + y' 20 | z1 <- exec1Def z1' 21 | z1 `shouldBe` (x + y) 22 | let z2' = y' + x' 23 | z2 <- exec1Def z2' 24 | z2 `shouldBe` (x + y) 25 | 26 | negation :: (Eq a, Show a, SQLTypeable a, ToSQL a, FromSQL a, Num a) => a -> a -> IO () 27 | negation x y = do 28 | let x' = constant x 29 | let y' = constant y 30 | let z1' = x' - y' 31 | z1 <- exec1Def z1' 32 | z1 `shouldBe` (x - y) 33 | let z2' = y' - x' 34 | z2 <- exec1Def z2' 35 | z2 `shouldBe` (y - x) 36 | 37 | checkNegate :: (Eq a, Show a, SQLTypeable a, ToSQL a, FromSQL a, Num a) => a -> IO () 38 | checkNegate x = do 39 | let x' = constant x 40 | let z1' = negate x' 41 | z1 <- exec1Def z1' 42 | z1 `shouldBe` negate x 43 | 44 | run :: String -> IO () -> SpecWith (Arg (IO ())) 45 | run s f = it s $ do 46 | createSparkSessionDef $ defaultConf { 47 | confRequestedSessionName = Data.Text.pack s, 48 | confPollingIntervalMillis = 100, 49 | confUseNodePrunning = False } -- Disabling caching for now, it causes issues. 50 | f 51 | -- This is horribly not robust to any sort of failure, but it will do for now 52 | -- TODO(kps) make more robust 53 | closeSparkSessionDef 54 | return () 55 | 56 | spec :: Spec 57 | spec = do 58 | describe "Integration test - sum on ints" $ do 59 | run "empty_ints1" $ 60 | smallSum (1 :: Int) (2 :: Int) 61 | run "zero_ints1" $ 62 | smallSum (0 :: Int) (2 :: Int) 63 | run "negation_ints1" $ 64 | negation (1 :: Int) (2 :: Int) 65 | run "negate_ints1" $ 66 | checkNegate (1 :: Int) 67 | -------------------------------------------------------------------------------- /test-integration/Spark/Core/Spec.hs: -------------------------------------------------------------------------------- 1 | -- Not working??? 2 | {-# OPTIONS_GHC -F -pgmF hspec-discover #-} 3 | -------------------------------------------------------------------------------- /test-integration/Spark/IO/JsonSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module Spark.IO.JsonSpec where 5 | 6 | import Test.Hspec 7 | import Data.Aeson(encode) 8 | import qualified Data.ByteString.Lazy 9 | -- import System.IO 10 | 11 | import Spark.Core.Context 12 | import Spark.Core.Types 13 | import Spark.Core.Row 14 | import Spark.Core.Functions 15 | import Spark.Core.Column 16 | import Spark.IO.Inputs 17 | import Spark.Core.IntegrationUtilities 18 | import Spark.Core.SimpleAddSpec(run) 19 | 20 | spec :: Spec 21 | spec = do 22 | describe "Read a json file" $ do 23 | run "simple read" $ do 24 | let xs = [TestStruct7 "x"] 25 | let js = encode xs 26 | _ <- Data.ByteString.Lazy.writeFile "/tmp/x.json" js 27 | let dt = unSQLType (buildType :: SQLType TestStruct7) 28 | let df = json' dt "/tmp/x.json" 29 | let c = collect' (asCol' df) 30 | c1 <- exec1Def' c 31 | c1 `shouldBe` rowArray [rowArray [StringElement "x"]] 32 | c2 <- exec1Def' c 33 | c2 `shouldBe` rowArray [rowArray [StringElement "x"]] 34 | run "simple inference" $ do 35 | let xs = [TestStruct7 "x"] 36 | let js = encode xs 37 | _ <- Data.ByteString.Lazy.writeFile "/tmp/x.json" js 38 | df <- execStateDef $ jsonInfer "/tmp/x.json" 39 | let c = collect' (asCol' df) 40 | c1 <- exec1Def' c 41 | c1 `shouldBe` rowArray [rowArray [StringElement "x"]] 42 | -------------------------------------------------------------------------------- /test-integration/Spark/IO/StampSpec.hs: -------------------------------------------------------------------------------- 1 | 
{- 2 | let s = "s" 3 | 4 | createSparkSessionDef $ defaultConf { confRequestedSessionName = Data.Text.pack s } 5 | 6 | execStateDef (checkDataStamps [HdfsPath (Data.Text.pack "/tmp/")]) 7 | 8 | -} 9 | 10 | module Spark.IO.StampSpec where 11 | 12 | import Test.Hspec 13 | 14 | -- import Spark.Core.Context 15 | -- import Spark.Core.Types 16 | -- import Spark.Core.Row 17 | -- import Spark.Core.Functions 18 | -- import Spark.Core.Column 19 | -- import Spark.IO.Inputs 20 | -- import Spark.Core.IntegrationUtilities 21 | import Spark.Core.SimpleAddSpec(run) 22 | 23 | spec :: Spec 24 | spec = do 25 | describe "Read a json file" $ do 26 | run "simple read" $ do 27 | let x = 1 :: Int 28 | x `shouldBe` x 29 | -------------------------------------------------------------------------------- /test-integration/Spec.hs: -------------------------------------------------------------------------------- 1 | -- Not working??? 2 | {-# OPTIONS_GHC -F -pgmF hspec-discover #-} 3 | -------------------------------------------------------------------------------- /test/Spark/Core/ColumnSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | module Spark.Core.ColumnSpec where 4 | 5 | import Test.Hspec 6 | 7 | import Spark.Core.Column 8 | import Spark.Core.Dataset 9 | import Spark.Core.Functions 10 | import Spark.Core.Types 11 | import Spark.Core.ColumnFunctions 12 | import Spark.Core.Internal.Utilities 13 | 14 | data Z 15 | data Y 16 | 17 | myScaler :: Column ref Double -> Column ref Double 18 | myScaler col = 19 | let cnt = asDouble (countCol col) 20 | m = sumCol col / cnt 21 | centered = col .- m 22 | stdDev = sumCol (centered * centered) / cnt 23 | in centered ./ stdDev 24 | 25 | 26 | spec :: Spec 27 | spec = do 28 | describe "ColumnSpec: ensure rules compile correctly" $ do 29 | let ds = dataset [(1,2)] :: Dataset (Int, Int) 30 | let c1 = ds // _1 31 | let c2 = ds // _2 32 | let c1' = untypedCol c1 33 | let c2' = untypedCol c2 34 | let i1 = 3 :: Int 35 | let o1 = constant 4 :: LocalData Int 36 | let o2 = 5 :: LocalData Int 37 | let o1' = asLocalObservable o1 38 | let o2' = asLocalObservable o2 39 | it "+ should not blow up" $ do 40 | let z1 = c1 + c2 41 | let z2 = c1' + c2' 42 | let z3 = c1 + 1 43 | let z4 = 1 + c1 44 | 'a' `shouldBe` 'a' 45 | it ".+ should not blow up with colums" $ do 46 | let z1 = c1 .+ c2 47 | let z2 = c1' .+ c2' 48 | let z3 = c1 .+ c2' 49 | let z4 = c1' .+ c2 50 | let z5 = c1 .+ o1 51 | let z6 = c1 .+ o1' 52 | 'a' `shouldBe` 'a' 53 | it "simple aggregations" $ do 54 | let c3 = c1 + (c2 .+ sumCol c2) 55 | let ds2 = pack1 c3 56 | nodeType ds2 `shouldBe` (buildType :: SQLType Int) 57 | it "mean" $ do 58 | let ds' = dataset [1, 2] :: Dataset Double 59 | let c = asCol ds' 60 | let cnt = asDouble (countCol c) 61 | let m = traceHint "m=" $ sumCol c / cnt 62 | let centered = c .- m 63 | let stdDev = sumCol (centered * centered) / cnt 64 | let scaled = traceHint "scaled=" $ centered ./ stdDev 65 | let ds2 = pack1 scaled 66 | nodeType ds2 `shouldBe` (buildType :: SQLType Double) 67 | -------------------------------------------------------------------------------- /test/Spark/Core/ContextSpec.hs: -------------------------------------------------------------------------------- 1 | 2 | 3 | module Spark.Core.ContextSpec where 4 | 5 | import Test.Hspec 6 | 7 | import Spark.Core.Functions 8 | 9 | spec :: Spec 10 | spec = do 11 | describe "Basic routines to get something out" $ do 12 | it "should print a node" $ do 13 | let x = 
dataset ([1 ,2, 3, 4]::[Int]) 14 | x `shouldBe` x 15 | -- b = nodeToBundle (untyped x) in 16 | -- trace (pretty b) $ 17 | -- 1 `shouldBe` 1 18 | -------------------------------------------------------------------------------- /test/Spark/Core/DatasetSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | module Spark.Core.DatasetSpec where 4 | 5 | import qualified Data.Text as T 6 | import Test.Hspec 7 | import qualified Data.Vector as V 8 | 9 | import Spark.Core.Dataset 10 | import Spark.Core.Functions 11 | import Spark.Core.Column 12 | import Spark.Core.StructuresInternal 13 | import Spark.Core.Internal.ContextInternal 14 | import Spark.Core.Internal.Utilities 15 | import Spark.Core.Internal.DAGStructures 16 | import Spark.Core.Internal.ComputeDag 17 | 18 | nName :: String -> NodeName 19 | nName = NodeName . T.pack 20 | 21 | spec :: Spec 22 | spec = do 23 | describe "create a dataframe" $ do 24 | it "should not explode" $ 25 | let x = dataset ([1 ,2, 3, 4]::[Int]) in 26 | nodeName x `shouldBe` nName "distributedliteral_c87697" 27 | 28 | it "renaming should work" $ 29 | let x = dataset ([1 ,2, 3]::[Int]) @@ "ds1" in 30 | nodeName x `shouldBe` nName "ds1" 31 | 32 | describe "check localset" $ do 33 | it "should not explode" $ 34 | let x = dataset ([1 ,2, 3]::[Int]) in 35 | nodeName x `shouldBe` nName "distributedliteral_1ba31e" 36 | 37 | it "renaming should work" $ 38 | let x = dataset ([1 ,2, 3]::[Int]) @@ "ds1" in 39 | nodeName x `shouldBe` nName "ds1" 40 | 41 | describe "column syntax" $ 42 | it "should not explode" $ do 43 | let ds = dataset ([1 ,2, 3]::[Int]) 44 | let c1 = ds/-"c1" 45 | c1 `shouldBe` c1 46 | 47 | describe "Logical dependencies" $ do 48 | it "should work" $ do 49 | let ds = dataset ([1 ,2, 3, 4]::[Int]) 50 | let ds1 = dataset ([1]::[Int]) `depends` [untyped ds] 51 | let g = traceHint (T.pack "g=") $ computeGraphToGraph $ forceRight $ buildComputationGraph ds1 52 | V.length (gVertices g) `shouldBe` 2 53 | 54 | 55 | -- describe "simple test" $ do 56 | -- it "the type should match" $ do 57 | -- let 58 | -- n1 = constant "xxx" 59 | -- n = NodeType $ T.pack "org.spark.Constant" in 60 | -- (nodeOp n1) `shouldBe` n 61 | 62 | -- it "no name" $ do 63 | -- let n1 = constant "xxx" 64 | -- t = NodeName $ T.pack "org.spark.Constant" in 65 | -- (nodeName n1) `shouldBe` t 66 | 67 | -- it "some name" $ do 68 | -- let n1 = constant "xxx" @@ "name" 69 | -- t = NodeName $ T.pack "name" in 70 | -- (nodeName n1) `shouldBe` t 71 | -------------------------------------------------------------------------------- /test/Spark/Core/Internal/CachingSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module Spark.Core.Internal.CachingSpec where 5 | 6 | import Test.Hspec 7 | 8 | import qualified Data.ByteString.Char8 as C8 9 | import Data.Either(isLeft, isRight) 10 | import Control.Arrow((&&&)) 11 | import Data.Text(Text) 12 | import Data.Foldable(toList) 13 | import Formatting 14 | 15 | import Spark.Core.Try 16 | import Spark.Core.Functions 17 | import Spark.Core.Column 18 | import Spark.Core.ColumnFunctions 19 | import Spark.Core.Internal.Caching 20 | -- Required for instance resolution 21 | import Spark.Core.StructuresInternal() 22 | import Spark.Core.Internal.Client(LocalSessionId(..)) 23 | import Spark.Core.Internal.DAGStructures 24 | import Spark.Core.Internal.DAGFunctions 25 | import 
Spark.Core.Internal.DatasetStructures 26 | import Spark.Core.Internal.Utilities 27 | import Spark.Core.Internal.ContextStructures 28 | import Spark.Core.Internal.ContextInternal 29 | import Spark.Core.Internal.Pruning(emptyNodeCache) 30 | 31 | data TestType = AutocacheNode | CacheNode | UncacheNode | Dataset | Row deriving (Eq, Show) 32 | 33 | data TestNode = TestNode { tnId :: VertexId, tnType :: TestType, tnParents :: [(StructureEdge, TestNode)] } deriving (Eq) 34 | 35 | instance Show TestNode where 36 | show v = "TestNode(" ++ (C8.unpack . unVertexId. tnId $ v) ++ ")" 37 | 38 | nid :: String -> VertexId 39 | nid = VertexId . C8.pack 40 | 41 | node :: String -> TestType -> [TestNode] -> TestNode 42 | node s tp p = TestNode (VertexId (C8.pack s)) tp ((const ParentEdge &&& id) <$> p) 43 | 44 | instance GraphVertexOperations TestNode where 45 | vertexToId = tnId 46 | expandVertexAsVertices x = snd <$> tnParents x 47 | 48 | instance GraphOperations TestNode StructureEdge where 49 | expandVertex tn = tnParents tn 50 | 51 | acGen :: AutocacheGen TestNode 52 | acGen = 53 | let deriveUncache' (Vertex (VertexId x) (TestNode _ AutocacheNode _)) = 54 | let vid' = VertexId $ C8.pack . (++ "_uncache") . C8.unpack $ x 55 | in Vertex vid' (TestNode vid' UncacheNode []) 56 | deriveUncache' x = error (show x) 57 | deriveIdentity' (Vertex (VertexId x) (TestNode _ r _)) = 58 | let vid' = VertexId $ C8.pack . (++ "_identity") . C8.unpack $ x 59 | in Vertex vid' (TestNode vid' r []) 60 | in AutocacheGen { 61 | deriveUncache = deriveUncache', 62 | deriveIdentity = deriveIdentity' 63 | } 64 | 65 | expandFun :: TestNode -> CacheTry NodeCachingType 66 | expandFun n = case (tnType n, tnParents n) of 67 | (AutocacheNode, [_]) -> pure $ AutocacheOp (tnId n) 68 | (AutocacheNode, x) -> Left $ sformat ("Node: "%shown%": expected one parent for autocaching, got "%shown) n x 69 | (CacheNode, [_]) -> pure $ CacheOp (tnId n) 70 | (CacheNode, x) -> Left $ sformat ("Node: "%shown%": expected one parent for caching, got "%shown) n x 71 | (UncacheNode, [(ParentEdge, x)]) -> pure $ UncacheOp (tnId n) (tnId x) 72 | (UncacheNode, x) -> Left $ sformat ("Node: "%shown%": Expected one parent for uncaching, got "%shown) n x 73 | (Dataset, _) -> Right Through 74 | (Row, _) -> Right Stop 75 | 76 | errors :: TestNode -> CacheTry [CachingFailure] 77 | errors tn = do 78 | g <- buildGraph tn :: Either Text (Graph TestNode StructureEdge) 79 | checkCaching (graphMapEdges g (const ParentEdge)) expandFun 80 | 81 | errors' :: TestNode -> CacheTry (Graph TestNode StructureEdge) 82 | errors' tn = do 83 | g <- buildGraph tn :: Either Text (Graph TestNode StructureEdge) 84 | fillAutoCache expandFun acGen g 85 | 86 | intErrors :: LocalData a -> Try ComputeGraph 87 | intErrors ld = 88 | let cg = buildComputationGraph ld 89 | in performGraphTransforms emptySession =<< cg 90 | 91 | emptySession :: SparkSession 92 | emptySession = SparkSession c (LocalSessionId "id") 3 emptyNodeCache 93 | where c = SparkSessionConf "end_point" (negate 1) 10 "session_name" True 94 | 95 | spec :: Spec 96 | spec = do 97 | describe "Caching operations" $ do 98 | it "missing parent node" $ do 99 | let n1 = node "1" CacheNode [] 100 | errors n1 `shouldSatisfy` isLeft 101 | it "caching: parent node" $ do 102 | let n0 = node "0" Dataset [] 103 | let n1 = node "1" CacheNode [n0] 104 | errors n1 `shouldBe` Right [] 105 | it "uncaching: missing parent node" $ do 106 | let n1 = node "1" UncacheNode [] 107 | errors n1 `shouldSatisfy` isLeft 108 | it "uncaching: parent node" $ do 
109 | let n0 = node "0" Dataset [] 110 | let n1 = node "1" CacheNode [n0] 111 | let n2 = node "2" CacheNode [n1] 112 | errors n2 `shouldBe` Right [] 113 | it "too many nodes for uncaching" $ do 114 | let n0 = node "0" Dataset [] 115 | let n1 = node "1" CacheNode [n0] 116 | let n2 = node "2" UncacheNode [n1, n2] 117 | errors n2 `shouldSatisfy` isLeft 118 | it "access after uncaching" $ do 119 | let n0 = node "0" Dataset [] 120 | let n1 = node "1" CacheNode [n0] 121 | let n2 = node "2" UncacheNode [n1] 122 | let n3 = node "3" Dataset [n1, n2] 123 | errors n3 `shouldBe` Right [CachingFailure (nid "1") (nid "2") (nid "3")] 124 | it "ambiguous access after uncaching" $ do 125 | let n0 = node "0" Dataset [] 126 | let n1 = node "1" CacheNode [n0] 127 | let n2 = node "2" UncacheNode [n1] 128 | let n3 = node "3" Dataset [n1] 129 | let n4 = node "4" Dataset [n3, n2] 130 | errors n4 `shouldBe` Right [CachingFailure (nid "1") (nid "2") (nid "3") 131 | ,CachingFailure (nid "1") (nid "2") (nid "4")] 132 | describe "Autocaching operations" $ do 133 | it "missing parent node" $ do 134 | let n1 = node "1" AutocacheNode [] 135 | let g = traceHint "g=" (errors' n1) 136 | g `shouldSatisfy` isLeft 137 | it "auto-uncaching with no child should not create uncaching" $ do 138 | let n0 = node "0" Dataset [] 139 | let n1 = node "1" AutocacheNode [n0] 140 | let g = traceHint "g=" (errors' n1) 141 | g `shouldSatisfy` isRight 142 | ((length . toList . gVertices) <$> g) `shouldBe` Right 2 143 | it "access after uncaching" $ do 144 | let n0 = node "0" Dataset [] 145 | let n1 = node "1" AutocacheNode [n0] 146 | let n2 = node "2" Row [n1] 147 | let g = traceHint "g=" (errors' n2) 148 | g `shouldSatisfy` isRight 149 | ((length . toList . gVertices) <$> g) `shouldBe` Right 5 150 | it "access after and scoping" $ do 151 | let n0 = node "0" Dataset [] 152 | let n1 = node "1" AutocacheNode [n0] 153 | let n2a = node "2a" Row [n1] 154 | let n2b = node "2b" Row [n1] 155 | let n3 = node "3" Row [n2a, n2b] 156 | let g = traceHint "g=" (errors' n3) 157 | g `shouldSatisfy` isRight 158 | ((length . toList . gVertices) <$> g) `shouldBe` Right 8 159 | describe "Autocaching integration tests" $ do 160 | it "test 1" $ do 161 | let l = [1,2,3] :: [Int] 162 | let ds = dataset l 163 | let ds' = autocache ds 164 | let c1 = asCol ds' 165 | let s1 = sumCol c1 166 | let s2 = count ds' 167 | let x = s1 + s2 168 | let g = traceHint "g=" (intErrors x) 169 | g `shouldSatisfy` isRight 170 | -------------------------------------------------------------------------------- /test/Spark/Core/Internal/DAGFunctionsSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveAnyClass #-} 2 | {-# LANGUAGE ScopedTypeVariables #-} 3 | {-# LANGUAGE MultiParamTypeClasses #-} 4 | 5 | -- :script test/Spark/Core/Internal/PathsSpec.hs 6 | module Spark.Core.Internal.DAGFunctionsSpec where 7 | 8 | import Test.Hspec 9 | import qualified Data.Map.Strict as M 10 | import qualified Data.Vector as V 11 | import qualified Data.ByteString.Char8 as C8 12 | import Control.Arrow((&&&)) 13 | import Data.Foldable(toList) 14 | 15 | import Spark.Core.Internal.DAGStructures 16 | import Spark.Core.Internal.DAGFunctions 17 | import Spark.Core.Internal.Utilities 18 | 19 | data MyV = MyV { 20 | mvId :: VertexId, 21 | mvParents :: [MyV] 22 | } deriving (Eq) 23 | 24 | id2Str :: VertexId -> String 25 | id2Str = C8.unpack . unVertexId 26 | 27 | instance Show MyV where 28 | show v = "MyV(" ++ (id2Str . 
mvId $ v) ++ ")" 29 | 30 | 31 | instance GraphVertexOperations MyV where 32 | vertexToId = mvId 33 | expandVertexAsVertices = mvParents 34 | 35 | instance GraphOperations MyV () where 36 | expandVertex = ((const () &&& id) <$>) . mvParents 37 | 38 | myv :: String -> [MyV] -> MyV 39 | myv s = MyV (VertexId (C8.pack s)) 40 | 41 | expandNodes :: MyV -> DagTry [String] 42 | expandNodes vx = 43 | let tg = buildGraph vx :: DagTry (Graph MyV ()) 44 | in (id2Str . mvId . vertexData <$>) . toList . gVertices <$> tg 45 | 46 | -- edges: from -> to 47 | expandEdges :: MyV -> DagTry [(String, String)] 48 | expandEdges vx = 49 | let tg = buildGraph vx :: DagTry (Graph MyV ()) 50 | in tg <&> \g -> 51 | concat $ M.assocs (gEdges g) <&> \(vid, v) -> 52 | (C8.unpack . unVertexId . vertexId . veEndVertex &&& 53 | C8.unpack . unVertexId . const vid) <$> V.toList v 54 | 55 | spec :: Spec 56 | spec = do 57 | describe "Tests on paths" $ do 58 | it "no parent" $ do 59 | let v0 = myv "v0" [] 60 | expandNodes v0 `shouldBe` Right ["v0"] 61 | it "common parent" $ do 62 | let v0 = myv "v0" [] 63 | let v0' = myv "v0" [] 64 | let v1 = myv "v1" [v0, v0'] 65 | expandEdges v1 `shouldBe` Right [("v0", "v1"), ("v0", "v1")] 66 | it "diamond" $ do 67 | let va = myv "va" [] 68 | let va' = myv "va" [] 69 | let v0 = myv "v0" [va] 70 | let v0' = myv "v0" [va'] 71 | let v1 = myv "v1" [v0, v0'] 72 | expandEdges v1 `shouldBe` Right [("va", "v0"), ("v0", "v1"), ("v0", "v1")] 73 | it "simple sources" $ do 74 | let v0 = myv "v0" [] 75 | let v1 = myv "v1" [v0] 76 | let tg = buildGraph v1 :: DagTry (Graph MyV ()) 77 | let g = forceRight tg 78 | mvId . vertexData <$> graphSources g `shouldBe` [mvId v1] 79 | it "simple sinks" $ do 80 | let v0 = myv "v0" [] 81 | let v1 = myv "v1" [v0] 82 | let tg = buildGraph v1 :: DagTry (Graph MyV ()) 83 | let g = forceRight tg 84 | mvId . vertexData <$> graphSinks g `shouldBe` [mvId v0] 85 | it "longer sources" $ do 86 | let v0 = myv "v0" [] 87 | let v1 = myv "v1" [v0] 88 | let v2 = myv "v2" [v1] 89 | let tg = buildGraph v2 :: DagTry (Graph MyV ()) 90 | let g = forceRight tg 91 | mvId . vertexData <$> graphSources g `shouldBe` [mvId v2] 92 | it "longer sinks" $ do 93 | let v0 = myv "v0" [] 94 | let v1 = myv "v1" [v0] 95 | let v2 = myv "v2" [v1] 96 | let tg = buildGraph v2 :: DagTry (Graph MyV ()) 97 | let g = forceRight tg 98 | mvId . vertexData <$> graphSinks g `shouldBe` [mvId v0] 99 | describe "building DAGs" $ do 100 | it "2 nodes" $ do 101 | let v0 = myv "v0" [] 102 | let v1 = myv "v1" [v0] 103 | let v2 = myv "v2" [v1] 104 | let l = forceRight $ buildVertexList v2 105 | id2Str . mvId <$> l `shouldBe` ["v0", "v1", "v2"] 106 | it "triangle" $ do 107 | let v0 = myv "v0" [] 108 | let v1 = myv "v1" [v0] 109 | let v2 = myv "v2" [v0, v1] 110 | let l = forceRight $ buildVertexList v2 111 | -- The return order should be in lexicographic order 112 | -- (which is unique in this case). 113 | id2Str . 
mvId <$> l `shouldBe` ["v0", "v1", "v2"] 114 | -------------------------------------------------------------------------------- /test/Spark/Core/Internal/GroupsSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE DeriveGeneric #-} 3 | {-# LANGUAGE MultiParamTypeClasses #-} 4 | 5 | module Spark.Core.Internal.GroupsSpec where 6 | 7 | import Data.Text(Text) 8 | import Test.Hspec 9 | import GHC.Generics 10 | import Data.Either(isRight) 11 | 12 | import Spark.Core.Functions 13 | import Spark.Core.ColumnFunctions 14 | import Spark.Core.Dataset 15 | import Spark.Core.Column 16 | import Spark.Core.Row 17 | import Spark.Core.Types 18 | import Spark.Core.Internal.Groups 19 | 20 | 21 | data MyPair = MyPair { 22 | myKey :: Text, 23 | myVal :: Int } deriving (Generic, Show) 24 | 25 | myKey' :: StaticColProjection MyPair Text 26 | myKey' = unsafeStaticProjection buildType "myKey" 27 | myVal' :: StaticColProjection MyPair Int 28 | myVal' = unsafeStaticProjection buildType "myVal" 29 | instance SQLTypeable MyPair 30 | instance ToSQL MyPair 31 | 32 | -- The tests are really light for now, and just check that the code passes the 33 | -- dynamic type checker. 34 | spec :: Spec 35 | spec = do 36 | describe "typed grouping tests" $ do 37 | let ds = dataset [MyPair "1" 1, MyPair "2" 2] 38 | let keys = ds // myKey' 39 | let values = ds // myVal' 40 | let g = groupByKey keys values 41 | let sqlt1 = buildType :: SQLType MyPair 42 | it "group" $ do 43 | let tds2 = castType sqlt1 (groupAsDS g) 44 | tds2 `shouldSatisfy` isRight 45 | it "map group" $ do 46 | let g2 = g `mapGroup` \c -> c + c 47 | let tds2 = castType sqlt1 (groupAsDS g2) 48 | tds2 `shouldSatisfy` isRight 49 | it "simple reduce" $ do 50 | let ds2 = g `aggKey` sumCol 51 | let tds3 = castType sqlt1 ds2 52 | tds3 `shouldSatisfy` isRight 53 | it "complex reduce" $ do 54 | let ds2 = g `aggKey` \c -> sumCol (c + c) 55 | let tds3 = castType sqlt1 ds2 56 | tds3 `shouldSatisfy` isRight 57 | -------------------------------------------------------------------------------- /test/Spark/Core/Internal/LocalDataFunctionsSpec.hs: -------------------------------------------------------------------------------- 1 | 2 | module Spark.Core.Internal.LocalDataFunctionsSpec where 3 | 4 | import Test.Hspec 5 | 6 | import Spark.Core.Dataset 7 | import Spark.Core.Functions() 8 | 9 | spec :: Spec 10 | spec = do 11 | describe "Arithmetic operations on local data (integers)" $ do 12 | it "ints" $ do 13 | let x1 = 1 :: LocalData Int 14 | let x2 = 2 :: LocalData Int 15 | let y1 = x1 + x2 16 | let y2 = x1 `div` x2 17 | (y2 `shouldBe` y2) 18 | (y1 `shouldBe` y1) 19 | -------------------------------------------------------------------------------- /test/Spark/Core/Internal/OpFunctionsSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveAnyClass #-} 2 | {-# LANGUAGE DeriveGeneric #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | {-# LANGUAGE ScopedTypeVariables #-} 5 | {-# LANGUAGE QuasiQuotes #-} 6 | 7 | module Spark.Core.Internal.OpFunctionsSpec where 8 | 9 | import Data.Aeson 10 | import Test.Hspec 11 | import Text.RawString.QQ 12 | 13 | import Spark.Core.Functions 14 | import Spark.Core.Internal.OpFunctions 15 | import Spark.Core.Internal.DatasetFunctions 16 | 17 | 18 | spec :: Spec 19 | spec = do 20 | describe "extraNodeOpData" $ do 21 | it "should have the content of a constant dataset" $ do 22 | let l = [1,2,3] :: [Int] 23 | let res :: 
Maybe Value 24 | res = decode 25 | ([r|{"content": [1,2,3], 26 | "cellType" : { 27 | "dt": "integer", 28 | "nullable": false 29 | } 30 | }|]) 31 | let ds = dataset l 32 | let d = extraNodeOpData . nodeOp $ ds 33 | Just d `shouldBe` res 34 | -------------------------------------------------------------------------------- /test/Spark/Core/Internal/PathsSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveAnyClass #-} 2 | {-# LANGUAGE DeriveGeneric #-} 3 | {-# LANGUAGE ScopedTypeVariables #-} 4 | 5 | -- :script test/Spark/Core/Internal/PathsSpec.hs 6 | module Spark.Core.Internal.PathsSpec where 7 | 8 | import Test.Hspec 9 | import qualified Data.Map.Strict as M 10 | import qualified Data.Set as S 11 | import qualified Data.ByteString.Char8 as C8 12 | import qualified Data.Text as T 13 | 14 | import Spark.Core.StructuresInternal 15 | import Spark.Core.Functions 16 | import Spark.Core.Dataset 17 | import Spark.Core.Internal.Paths 18 | import Spark.Core.Internal.DAGStructures 19 | import Spark.Core.Internal.DAGFunctions 20 | import Spark.Core.Internal.ComputeDag 21 | import Spark.Core.Internal.PathsUntyped 22 | import Spark.Core.Internal.Utilities 23 | import Spark.Core.Internal.DatasetFunctions 24 | import Spark.Core.Internal.DatasetStructures 25 | 26 | data MyV = MyV { 27 | mvId :: VertexId, 28 | mvLogical :: [MyV], 29 | mvParents :: [MyV] 30 | } deriving (Eq) 31 | 32 | instance Show MyV where 33 | show v = "MyV(" ++ (C8.unpack . unVertexId . mvId $ v) ++ ")" 34 | 35 | 36 | assignPaths :: UntypedNode -> [UntypedNode] 37 | assignPaths n = 38 | let cgt = buildCGraph n :: DagTry (ComputeDag UntypedNode NodeEdge) 39 | cg = forceRight cgt 40 | acgt = assignPathsUntyped cg 41 | ncg = forceRight acgt 42 | in graphDataLexico . tieNodes $ ncg 43 | 44 | 45 | instance GraphVertexOperations MyV where 46 | vertexToId = mvId 47 | expandVertexAsVertices = mvParents 48 | 49 | myv :: String -> [MyV] -> [MyV] -> MyV 50 | myv s logical inner = MyV (VertexId (C8.pack s)) logical inner 51 | 52 | myvToVertex :: MyV -> Vertex MyV 53 | myvToVertex x = Vertex (mvId x) x 54 | 55 | buildScopes :: [MyV] -> Scopes 56 | buildScopes l = iGetScopes0 l' fun where 57 | l' = myvToVertex <$> l 58 | fun vx = ParentSplit { 59 | psLogical = myvToVertex <$> (mvLogical . vertexData $ vx), 60 | psInner = myvToVertex <$> (mvParents . vertexData $ vx) } 61 | 62 | simple :: [(Maybe String, [String])] -> Scopes 63 | simple [] = M.empty 64 | simple ((ms, ss) : t) = 65 | let 66 | key = VertexId . C8.pack <$> ms 67 | vals = VertexId . C8.pack <$> ss 68 | new = M.singleton key (S.fromList vals) 69 | in mergeScopes new (simple t) 70 | 71 | gatherings :: [(String, [[String]])] -> M.Map VertexId [[VertexId]] 72 | gatherings [] = M.empty 73 | gatherings ((key, paths) : t) = 74 | let 75 | k = VertexId . C8.pack $ key 76 | ps = (VertexId . C8.pack <$>) <$> paths 77 | new = M.singleton k ps 78 | in M.unionWith (++) new (gatherings t) 79 | 80 | gatherPaths' :: [MyV] -> M.Map VertexId [[VertexId]] 81 | gatherPaths' = gatherPaths . 
buildScopes 82 | 83 | spec :: Spec 84 | spec = do 85 | describe "Tests on paths" $ do 86 | it "nothing" $ do 87 | buildScopes [] `shouldBe` simple [] 88 | it "no parent" $ do 89 | let v0 = myv "v0" [] [] 90 | let res = [ (Nothing, ["v0"]), (Just "v0", []) ] 91 | buildScopes [v0] `shouldBe` simple res 92 | it "one logical parent" $ do 93 | let v0 = myv "v0" [] [] 94 | let v1 = myv "v1" [v0] [] 95 | let res = [ (Nothing, ["v0", "v1"]) 96 | , (Just "v1", []) 97 | , (Just "v0", []) ] 98 | buildScopes [v1, v0] `shouldBe` simple res 99 | it "one inner parent" $ do 100 | let v0 = myv "v0" [] [] 101 | let v1 = myv "v1" [] [v0] 102 | let res = [ (Nothing, ["v1"]) 103 | , (Just "v1", ["v0"]) 104 | , (Just "v0", []) ] 105 | buildScopes [v1, v0] `shouldBe` simple res 106 | it "logical scoping over a parent" $ do 107 | let v0 = myv "v0" [] [] 108 | let v1 = myv "v1" [v0] [] 109 | let v2 = myv "v2" [v0] [v1] 110 | let res = [ (Nothing, ["v0", "v2"]) 111 | , (Just "v0", []) 112 | , (Just "v1", []) 113 | , (Just "v2", ["v1"]) ] 114 | buildScopes [v2] `shouldBe` simple res 115 | it "common ancestor" $ do 116 | let top = myv "top" [] [] 117 | let inner = myv "inner" [top] [] 118 | let v1 = myv "v1" [top] [inner] 119 | let v2 = myv "v2" [top] [inner] 120 | let res = [ (Nothing, ["top", "v1", "v2"]) 121 | , (Just "inner", []) 122 | , (Just "top", []) 123 | , (Just "v1", ["inner"]) 124 | , (Just "v2", ["inner"]) ] 125 | buildScopes [v1, v2] `shouldBe` simple res 126 | it "common ancestor, unbalanced" $ do 127 | let top = myv "top" [] [] 128 | let inner = myv "inner" [top] [] 129 | let v1 = myv "v1" [top] [inner] 130 | let v2 = myv "v2" [] [inner] 131 | let res = [ (Nothing, ["top", "v1", "v2"]) 132 | , (Just "inner", []) 133 | , (Just "top", []) 134 | , (Just "v1", ["inner"]) 135 | , (Just "v2", ["inner", "top"]) ] 136 | buildScopes [v1, v2] `shouldBe` simple res 137 | describe "Path gatherings" $ do 138 | it "nothing" $ do 139 | gatherPaths' [] `shouldBe` gatherings [] 140 | it "no parent" $ do 141 | let v0 = myv "v0" [] [] 142 | let res = [("v0", [[]])] 143 | gatherPaths' [v0] `shouldBe` gatherings res 144 | it "one logical parent" $ do 145 | let v0 = myv "v0" [] [] 146 | let v1 = myv "v1" [v0] [] 147 | let res = [ ("v1", [[]]) 148 | , ("v0", [[]])] 149 | gatherPaths' [v1] `shouldBe` gatherings res 150 | it "one inner parent" $ do 151 | let v0 = myv "v0" [] [] 152 | let v1 = myv "v1" [] [v0] 153 | let res = [ ("v1", [[]]) 154 | , ("v0", [["v1"]])] 155 | gatherPaths' [v1] `shouldBe` gatherings res 156 | it "logical scoping over a parent" $ do 157 | let v0 = myv "v0" [] [] 158 | let v1 = myv "v1" [v0] [] 159 | let v2 = myv "v2" [v0] [v1] 160 | let res = [ ("v0", [[]]) 161 | , ("v1", [["v2"]]) 162 | , ("v2", [[]]) ] 163 | gatherPaths' [v2] `shouldBe` gatherings res 164 | it "common ancestor" $ do 165 | let top = myv "top" [] [] 166 | let inner = myv "inner" [top] [] 167 | let v1 = myv "v1" [top] [inner] 168 | let v2 = myv "v2" [top] [inner] 169 | let res = [ ("inner", [["v1"], ["v2"]]) 170 | , ("top", [[]]) 171 | , ("v1", [[]]) 172 | , ("v2", [[]]) ] 173 | gatherPaths' [v1, v2] `shouldBe` gatherings res 174 | describe "Real paths" $ do 175 | it "simple test" $ do 176 | let c0 = constant (1 :: Int) @@ "c0" 177 | let c1 = identity c0 @@ "c1" 178 | let c2 = identity c1 `logicalParents` [untyped c0] @@ "c2" 179 | nodeId <$> nodeParents c1 `shouldBe` [nodeId c0] 180 | nodeId <$> nodeParents c2 `shouldBe` [nodeId c1] 181 | let withParents = T.unpack . catNodePath . 
nodePath <$> assignPaths (untyped c2) 182 | withParents `shouldBe` ["c0", "c2/c1", "c2"] 183 | it "simple test 2" $ do 184 | let ds = dataset ([1 ,2, 3, 4]::[Int]) @@ "ds" 185 | let c = count ds @@ "c" 186 | let c2 = (c + (identity c @@ "id")) `logicalParents` [untyped ds] @@ "c2" 187 | let withParents = T.unpack . catNodePath . nodePath <$> assignPaths (untyped c2) 188 | withParents `shouldBe` ["ds", "c2/c","c2/id","c2"] 189 | -------------------------------------------------------------------------------- /test/Spark/Core/Internal/RowUtilsSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveAnyClass #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | {-# LANGUAGE ScopedTypeVariables #-} 4 | 5 | module Spark.Core.Internal.RowUtilsSpec where 6 | 7 | import Data.Aeson 8 | import Data.Maybe(fromJust) 9 | import Test.Hspec 10 | import Data.ByteString.Lazy(ByteString) 11 | import qualified Data.Vector as V 12 | import Data.Either(isRight) 13 | 14 | import Spark.Core.Types 15 | import Spark.Core.Row 16 | import Spark.Core.Internal.TypesFunctions 17 | import Spark.Core.Internal.RowGenericsFrom 18 | import Spark.Core.Internal.TypesStructuresRepr(DataTypeElementRepr) 19 | 20 | fun :: ByteString -> DataType -> Cell -> IO () 21 | fun js dt cell2 = 22 | let 23 | mval = decode js :: Maybe Value 24 | val = fromJust mval 25 | cellt = jsonToCell dt val 26 | in cellt `shouldBe` (Right cell2) 27 | 28 | 29 | spec :: Spec 30 | spec = do 31 | describe "JSON -> Row" $ do 32 | it "ints" $ do 33 | fun "2" intType (IntElement 2) 34 | it "[ints]" $ do 35 | fun "[2]" (arrayType' intType) (RowArray (V.singleton (IntElement 2))) 36 | describe "Decoding data types" $ do 37 | it "should decode DataTypeElementRepr" $ do 38 | let x = rowArray [rowArray [StringElement "ts3f1"],BoolElement True,IntElement 1,IntElement 0] 39 | let elt = cellToValue x :: TryS DataTypeElementRepr 40 | elt `shouldSatisfy` isRight 41 | -------------------------------------------------------------------------------- /test/Spark/Core/PathSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ExistentialQuantification #-} 2 | {-# LANGUAGE FlexibleInstances #-} 3 | 4 | module Spark.Core.PathSpec where 5 | 6 | import Data.Maybe(fromJust) 7 | import Test.Hspec 8 | 9 | import Spark.Core.Functions 10 | import Spark.Core.Dataset 11 | 12 | fun1 :: LocalData Int 13 | fun1 = 14 | let m1 = constant (1::Int) 15 | m2 = constant (2::Int) in 16 | constant (3::Int) 17 | `logicalParents` [untyped m1, untyped m2] 18 | `parents` [] 19 | 20 | 21 | fun2 :: LocalData Int -> LocalData Int 22 | fun2 ld1 = let 23 | m1 = constant (1 :: Int) `parents` [untyped ld1] @@ "m1" 24 | in 25 | constant (3 :: Int) 26 | `logicalParents` [untyped m1] 27 | `parents` [untyped ld1] 28 | @@ "c2" 29 | 30 | -- fun3 :: LocalData Int -> LocalData Int 31 | -- fun3 ld = ld + 3 32 | 33 | spec :: Spec 34 | spec = do 35 | describe "Tests with nodes" $ do 36 | it "should get a node" $ do 37 | let n1 = fun1 38 | let l = fromJust $ nodeLogicalParents n1 39 | (length l) `shouldBe` 2 40 | -- it "should work with ints" $ do 41 | -- let n2 = (fun3 4) @@ "" in 42 | -- (length $ nodeDependencies n2) `shouldBe` 2 43 | -------------------------------------------------------------------------------- /test/Spark/Core/ProjectionsSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ExistentialQuantification #-} 2 | {-# LANGUAGE FlexibleInstances #-} 3 | 
{-# LANGUAGE FlexibleContexts #-} 4 | {-# LANGUAGE DeriveGeneric #-} 5 | {-# LANGUAGE GeneralizedNewtypeDeriving #-} 6 | {-# LANGUAGE MultiParamTypeClasses #-} 7 | {-# LANGUAGE OverloadedStrings #-} 8 | 9 | module Spark.Core.ProjectionsSpec where 10 | 11 | import Test.Hspec 12 | import GHC.Generics 13 | import Data.List(isPrefixOf) 14 | import Data.Either(isRight, isLeft) 15 | import qualified Data.Vector as V 16 | import qualified Data.Text as T 17 | 18 | import Spark.Core.Functions 19 | import Spark.Core.Dataset 20 | import Spark.Core.Column 21 | import Spark.Core.Row 22 | import Spark.Core.Types 23 | import Spark.Core.Try 24 | import Spark.Core.Internal.Utilities 25 | import Spark.Core.Internal.TypesFunctions 26 | 27 | 28 | data Tree = Tree { 29 | treeId :: Int, 30 | treeWidth :: Int, 31 | treeHeight :: Int } deriving (Generic, Show) 32 | 33 | treeId' :: StaticColProjection Tree Int 34 | treeId' = unsafeStaticProjection buildType "treeId" 35 | treeWidth' :: StaticColProjection Tree Int 36 | treeWidth' = unsafeStaticProjection buildType "treeWidth" 37 | instance SQLTypeable Tree 38 | instance ToSQL Tree 39 | 40 | newtype MyId = MyId Int deriving (Generic, Show, Num) 41 | instance SQLTypeable MyId 42 | instance ToSQL MyId 43 | 44 | newtype Height = Height Int deriving (Generic, Num, Show) 45 | instance SQLTypeable Height 46 | instance ToSQL Height 47 | 48 | data STree = STree { 49 | sTreeId :: MyId, 50 | sTreeWidth :: Height, 51 | sTreeHeight :: Int } deriving (Generic, Show) 52 | 53 | instance SQLTypeable STree 54 | instance ToSQL STree 55 | sTreeId' :: StaticColProjection STree MyId 56 | sTreeId' = unsafeStaticProjection buildType "sTreeId" 57 | sTreeWidth' :: StaticColProjection STree Height 58 | sTreeWidth' = unsafeStaticProjection buildType "sTreeWidth" 59 | instance TupleEquivalence STree (MyId, Height, Int) where 60 | tupleFieldNames = NameTuple ["sTreeId", "sTreeWidth", "sTreeHeight"] 61 | 62 | rawData :: [(Int, Int, Int)] 63 | rawData = [(1, 3, 2)] 64 | 65 | spec :: Spec 66 | spec = do 67 | let ds = dataset [Tree 1 3 2] 68 | -- The untyped elements 69 | let dt = structType [structField (T.pack "treeId") intType, structField (T.pack "treeWidth") intType, structField (T.pack "treeHeight") intType] 70 | let fun (id', height, width) = RowArray $ V.fromList [IntElement id', IntElement height, IntElement width] 71 | let df1 = traceHint (T.pack "df1=") $ dataframe dt (fun <$> rawData) 72 | let ds1 = traceHint (T.pack "ds1=") $ forceRight (asDS df1) :: Dataset Tree 73 | describe "Simple projection demos" $ do 74 | it "should get a node" $ do 75 | ds `shouldBe` ds1 76 | it "Failing dynamic projection on dataframe" $ do 77 | df1/-"xx" `shouldSatisfy` isLeft 78 | it "Failing dynamic projection on dataset" $ do 79 | ds1/-"xx" `shouldSatisfy` isLeft 80 | it "Basic arithmetic on DS cols" $ do 81 | let c1 = ds1//treeWidth' 82 | let c2 = (c1 + c1) 83 | (show c2) `shouldSatisfy` ("treeWidth + treeWidth{int}" `isPrefixOf`) 84 | it "Basic arithmetic on DF cols" $ do 85 | let c1 = df1 // treeWidth' 86 | let c2 = c1 + c1 87 | (show c2) `shouldSatisfy` ("Right treeWidth + treeWidth{int}" `isPrefixOf`) 88 | it "Construction of ds2" $ do 89 | let str = struct' [ (df1/-"treeId") @@ "sTreeId", 90 | (df1/-"treeWidth") @@ "sTreeWidth", 91 | (df1/-"treeHeight") @@ "sTreeHeight"] 92 | let df2 = pack' str 93 | let ds2 = traceHint (T.pack "ds2=") $ asDS df2 :: Try (Dataset STree) 94 | ds2 `shouldSatisfy` isRight 95 | it "Static construction of ds2" $ do 96 | let ds2 = do 97 | idCol <- castCol' 
(buildType::SQLType MyId) (df1/-"treeId") 98 | widthCol <- castCol' (buildType::SQLType Height) (df1/-"treeWidth") 99 | heightCol <- castCol' (buildType::SQLType Int) (df1/-"treeWidth") 100 | let s = pack (idCol, widthCol, heightCol) :: Dataset STree 101 | return $ traceHint (T.pack "ds2=") s 102 | ds2 `shouldSatisfy` isRight 103 | it "Basic arithmetic on DS cols 1" $ do 104 | let ds2' = do 105 | idCol <- castCol' (buildType::SQLType MyId) (df1/-"treeId") 106 | widthCol <- castCol' (buildType::SQLType Height) (df1/-"treeWidth") 107 | heightCol <- castCol' (buildType::SQLType Int) (df1/-"treeWidth") 108 | let s = pack (idCol, widthCol, heightCol) :: Dataset STree 109 | return $ traceHint (T.pack "ds2=") s 110 | let ds2 = forceRight ds2' 111 | let c1 = ds2//sTreeWidth' 112 | let c2 = c1 + c1 113 | (show c2) `shouldSatisfy` ("sTreeWidth + sTreeWidth{int}" `isPrefixOf`) 114 | -------------------------------------------------------------------------------- /test/Spark/Core/RowToSQLSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveAnyClass #-} 2 | {-# LANGUAGE DeriveGeneric #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | {-# LANGUAGE BangPatterns #-} 5 | 6 | module Spark.Core.RowToSQLSpec where 7 | 8 | import qualified Data.Vector as V 9 | import GHC.Generics (Generic) 10 | import Test.Hspec 11 | 12 | import Spark.Core.Types 13 | import Spark.Core.Row 14 | 15 | data TestStruct1 = TestStruct1 { 16 | ts1f1 :: Int, 17 | ts1f2 :: Maybe Int } deriving (Show, Eq, Generic, ToSQL, FromSQL) 18 | 19 | data TestStruct2 = TestStruct2 { ts2f1 :: [Int] } deriving (Show, Generic, SQLTypeable) 20 | 21 | data TestStruct3 = TestStruct3 { ts3f1 :: Int } deriving (Show, Eq, Generic, ToSQL, FromSQL) 22 | data TestStruct4 = TestStruct4 { ts4f1 :: TestStruct3 } deriving (Show, Eq, Generic, ToSQL, FromSQL) 23 | 24 | data TestStruct5 = TestStruct5 { 25 | ts5f1 :: Int, 26 | ts5f2 :: Int, 27 | ts5f3 :: TestStruct3 28 | } deriving (Show, Eq, Generic, ToSQL, FromSQL) 29 | 30 | newtype TestT1 = TestT1 { unTestT1 :: Int } deriving (Eq, Show, Generic, ToSQL, FromSQL) 31 | 32 | 33 | v2c :: (Show a, ToSQL a, FromSQL a, Eq a) => a -> Cell -> IO () 34 | v2c !x !y = do 35 | _ <- shouldBe (valueToCell x) y 36 | _ <- shouldBe (cellToValue y) (Right x) 37 | return () 38 | 39 | spec :: Spec 40 | spec = do 41 | describe "Simple type tests" $ do 42 | it "int" $ 43 | v2c (3 :: Int) (IntElement 3) 44 | it "int?" $ 45 | v2c (Just 3 :: Maybe Int) (IntElement 3) 46 | it "int? 
2" $ 47 | v2c (Nothing :: Maybe Int) Empty 48 | it "TestStruct3" $ 49 | v2c (TestStruct3 2) (RowArray $ V.fromList [IntElement 2]) 50 | it "TestStruct4" $ 51 | v2c (TestStruct4 (TestStruct3 3)) $ 52 | (RowArray $ V.fromList [ 53 | RowArray $ V.fromList [IntElement 3] 54 | ]) 55 | it "TestStruct1 - empty" $ 56 | v2c (TestStruct1 2 Nothing) (RowArray $ V.fromList [IntElement 2, Empty]) 57 | it "TestStruct1 - full" $ 58 | v2c (TestStruct1 2 (Just 4)) (RowArray $ V.fromList [IntElement 2, IntElement 4]) 59 | it "TestStruct5" $ 60 | v2c (TestStruct5 1 2 (TestStruct3 3)) $ 61 | (RowArray $ V.fromList [ 62 | IntElement 1, 63 | IntElement 2, 64 | RowArray $ V.fromList [IntElement 3] 65 | ]) 66 | -- describe "Simple type tests" $ do 67 | -- it "newtype" $ 68 | -- v2c (TestT1 3) (IntElement 3) 69 | -------------------------------------------------------------------------------- /test/Spark/Core/SimpleExamplesSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveAnyClass #-} 2 | {-# LANGUAGE ScopedTypeVariables #-} 3 | {-# LANGUAGE QuasiQuotes #-} 4 | 5 | -- Some small examples that get fully verified. 6 | module Spark.Core.SimpleExamplesSpec where 7 | 8 | import Data.Either(isRight) 9 | import Data.Maybe(isJust) 10 | import Test.Hspec 11 | import qualified Data.Text as T 12 | import Text.RawString.QQ 13 | 14 | import Spark.Core.Dataset 15 | import Spark.Core.Functions 16 | import Spark.Core.Column 17 | import Spark.Core.ColumnFunctions 18 | import Spark.Core.Internal.DatasetStructures 19 | import Spark.Core.Internal.Utilities(pretty) 20 | import Spark.Core.Internal.OpFunctions(extraNodeOpData) 21 | 22 | ds1 :: Dataset Int 23 | ds1 = dataset [1,2,3] 24 | 25 | ds2 :: Dataset Double 26 | ds2 = error "ds2" 27 | 28 | spec :: Spec 29 | spec = do 30 | describe "Simple examples" $ do 31 | it "Precdence of renaming" $ do 32 | let numbers = asCol ds1 33 | let s = sumCol numbers 34 | let numCount = count ds1 35 | let avg = s `div` numCount @@ "myaverage" 36 | _cnName avg `shouldSatisfy` isJust 37 | it "name for simple integers" $ do 38 | let numbers = asCol ds1 39 | let s = sumCol numbers 40 | let numCount = count ds1 41 | let avg = s `div` numCount @@ "myaverage" 42 | -- TODO: should it show "value: int" instead? 43 | -- I think it should show it for distributed nodes only. 44 | -- SQL is not allowed on observables 45 | (show avg) `shouldBe` "/myaverage@org.spark.LocalDiv!int" 46 | describe "pack1" $ do 47 | it "Extracting and packing one column" $ do 48 | let numbers = asCol ds1 49 | let ds1' = pack1 numbers 50 | (nodeType ds1) `shouldBe` (nodeType ds1') 51 | describe "pack" $ do 52 | it "Extracting and packing one column" $ do 53 | let ds1' = pack' . asCol $ ds1 54 | (nodeType <$> (asDF ds1)) `shouldBe` (nodeType <$> ds1') 55 | describe "simple json example" $ do 56 | it "packing and unpacking one column" $ do 57 | let ds1' = pack' . asCol $ ds1 58 | let d' = pretty . extraNodeOpData . 
nodeOp <$> ds1' 59 | d' `shouldBe` Right (T.pack "{\"cellType\":{\"dt\":\"integer\",\"nullable\":false},\"content\":[1,2,3]}") 60 | it "packing and unpacking 2 columns, one with a bad name" $ do 61 | let col1 = asCol ds1 62 | let col2 = col1 @@ "other" 63 | let ds1' = pack' (col1, col2) 64 | ds1' `shouldSatisfy` isRight -- NOT SURE WHY IT WOULD FAIL 65 | it "packing and unpacking 2 columns, one with a good name" $ do 66 | let col1 = asCol ds1 @@ "first" 67 | let col2 = col1 @@ "second" 68 | let ds1' = pack' (col1, col2) 69 | ds1' `shouldSatisfy` isRight 70 | 71 | 72 | -- it "example2" $ do 73 | -- let numbers = asCol ds2 74 | -- let avg = (colSum numbers) / (count ds2) 75 | -- 1 `shouldBe` 1 76 | -------------------------------------------------------------------------------- /test/Spark/Core/TypesSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveAnyClass #-} 2 | {-# LANGUAGE DeriveGeneric #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | 5 | module Spark.Core.TypesSpec where 6 | 7 | import GHC.Generics (Generic) 8 | import Test.Hspec 9 | import Test.Hspec.QuickCheck 10 | 11 | import Spark.Core.Types 12 | import Spark.Core.Internal.TypesFunctions 13 | import Spark.Core.Internal.TypesGenerics() 14 | 15 | data TestStruct1 = TestStruct1 { 16 | ts1f1 :: Int, 17 | ts1f2 :: Maybe Int } deriving (Show, Generic) 18 | 19 | instance SQLTypeable TestStruct1 20 | 21 | data TestStruct2 = TestStruct2 { ts2f1 :: [Int] } deriving (Show, Generic, SQLTypeable) 22 | 23 | data TestStruct3 = TestStruct3 { ts3f1 :: Int } deriving (Show, Generic, SQLTypeable) 24 | data TestStruct4 = TestStruct4 { ts4f1 :: TestStruct3 } deriving (Show, Generic, SQLTypeable) 25 | 26 | -- instance SQLTypeable TestStruct1 27 | -- instance Menu TestStruct1 28 | 29 | 30 | -- main :: IO () 31 | -- main = hspec spec 32 | 33 | spec :: Spec 34 | spec = do 35 | describe "Simple type tests" $ do 36 | it "show ints" $ 37 | show intType `shouldBe` "int" 38 | 39 | it "show arrays" $ 40 | show (arrayType' intType) `shouldBe` "[int]" 41 | 42 | it "show structures" $ 43 | show (arrayType' (canNull intType)) `shouldBe` "[int?]" 44 | 45 | describe "The basic tests for int types" $ do 46 | it "ints" $ 47 | let t = buildType :: (SQLType Int) 48 | dt = columnType t in 49 | dt `shouldBe` intType 50 | 51 | it "opt ints" $ 52 | let t = buildType :: (SQLType (Maybe Int)) 53 | dt = columnType t in 54 | dt `shouldBe` canNull intType 55 | 56 | -- The projection of all the product types 57 | it "opt opt ints" $ 58 | let t = buildType :: (SQLType (Maybe (Maybe Int))) in 59 | columnType t `shouldBe` canNull intType 60 | 61 | it "array ints" $ 62 | let t = buildType :: (SQLType [Int]) in 63 | columnType t `shouldBe` arrayType' intType 64 | 65 | it "array opt ints" $ 66 | let t = buildType :: (SQLType [Maybe Int]) in 67 | columnType t `shouldBe` arrayType' (canNull intType) 68 | 69 | it "opt array ints" $ 70 | let t = buildType :: (SQLType (Maybe [Int])) in 71 | columnType t `shouldBe` canNull (arrayType' intType) 72 | 73 | describe "The basic tests for records" $ do 74 | it "records with maybe" $ 75 | let t = buildType :: (SQLType TestStruct1) 76 | out = structType [structField "ts1f1" intType, structField "ts1f2" (canNull intType)] in 77 | columnType t `shouldBe` out 78 | 79 | it "records with arrays" $ 80 | let t = buildType :: (SQLType TestStruct2) 81 | out = structType [structField "ts2f1" (arrayType' intType)] in 82 | columnType t `shouldBe` out 83 | 84 | it "records within records" $ 85 | let t = 
buildType :: (SQLType TestStruct4) 86 | out0 = structType [structField "ts3f1" intType] 87 | out = structType [structField "ts4f1" out0] in 88 | columnType t `shouldBe` out 89 | 90 | describe "Construction of frame types" $ do 91 | prop "frameTypeFromCol should be invertible" $ 92 | \x -> 93 | let dt = colTypeFromFrame x 94 | y = frameTypeFromCol dt 95 | in x == y 96 | -- TODO this is not always working. Figure out the rules here. 97 | -- prop "colTypeFromFrame should be invertible" $ 98 | -- \x -> (colTypeFromFrame . frameTypeFromCol) x == x 99 | -------------------------------------------------------------------------------- /test/Spec.hs: -------------------------------------------------------------------------------- 1 | -- Not working??? 2 | {-# OPTIONS_GHC -F -pgmF hspec-discover #-} 3 | --------------------------------------------------------------------------------
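
A minimal end-to-end sketch of how the pieces exercised by the integration specs above fit together: open a session, build a distributed literal, aggregate it, and read the observable back on the driver. It assumes a reachable karps-server and uses only the entry points that appear in the specs (`createSparkSessionDef`, `exec1Def`, `closeSparkSessionDef`); the module name and the session name are illustrative.

```haskell
{-# LANGUAGE OverloadedStrings #-}
module Main where

import Spark.Core.Context
import Spark.Core.Functions
import Spark.Core.Column
import Spark.Core.ColumnFunctions

main :: IO ()
main = do
  -- Same configuration pattern as the integration specs; the session name is arbitrary.
  createSparkSessionDef $ defaultConf { confRequestedSessionName = "demo" }
  let ds = dataset ([1, 2, 3] :: [Int])
  -- Combine two observables: the sum of the column and the number of rows.
  let total = sumCol (asCol ds) + count ds
  res <- exec1Def total
  print res  -- expected: 9 (a sum of 6 plus a count of 3)
  closeSparkSessionDef
```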
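The record/projection pattern used by IntegrationUtilities, the grouping specs and ProjectionsSpec can be summarized in one small module: derive the SQL instances generically, declare one `StaticColProjection` per field with `unsafeStaticProjection`, then use `//`, `groupByKey` and `aggKey` to aggregate by key. A sketch under those assumptions; the `Sale` record and its field names are made up for illustration.

```haskell
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE OverloadedStrings #-}
module Demo where

import GHC.Generics (Generic)
import Data.Text (Text)

import Spark.Core.Dataset
import Spark.Core.Column
import Spark.Core.Row
import Spark.Core.Types
import Spark.Core.Functions
import Spark.Core.ColumnFunctions
import Spark.Core.Internal.Groups

-- Illustrative record; any product type with the generic instances works the same way.
data Sale = Sale { saleCity :: Text, saleAmount :: Int } deriving (Generic, Show)
instance SQLTypeable Sale
instance ToSQL Sale

-- One typed projection per field, keyed by the field name.
saleCity' :: StaticColProjection Sale Text
saleCity' = unsafeStaticProjection buildType "saleCity"
saleAmount' :: StaticColProjection Sale Int
saleAmount' = unsafeStaticProjection buildType "saleAmount"

-- Total amount per city, mirroring sumGroup in the integration GroupsSpec.
totalPerCity :: Dataset Sale -> Dataset (Text, Int)
totalPerCity ds =
  let keys = ds // saleCity'
      values = ds // saleAmount'
  in groupByKey keys values `aggKey` sumCol

-- A small input to feed to totalPerCity.
exampleSales :: Dataset Sale
exampleSales = dataset [Sale "Paris" 3, Sale "Tokyo" 5, Sale "Paris" 2]
```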