├── .gitignore ├── Dockerfile ├── INTRO.md ├── LICENSE ├── README.md ├── Setup.hs ├── karps.cabal ├── notebooks ├── 00_Intro.ipynb ├── 01_Datasets_Dataframes_Observable_DynObservable.ipynb ├── 02_Organizing_workflows.ipynb ├── 03_Caching_data.ipynb ├── 04_Reading_data.ipynb ├── 05_Displaying_data_with_Python.ipynb ├── 06_Column_operations.ipynb ├── 07_Spark_program_introspection.ipynb ├── KarpsDagDisplay.hs ├── KarpsDisplays.hs ├── ihaskell-tensorboard.png └── rendered │ ├── 00_Intro.html │ ├── 01_Datasets_Dataframes_Observable_DynObservable.html │ ├── 02_Organizing_workflows.html │ ├── 03_Caching_data.html │ └── 06_Column_operations.html ├── src └── Spark │ ├── Core.hs │ ├── Core │ ├── Column.hs │ ├── ColumnFunctions.hs │ ├── Context.hs │ ├── Dataset.hs │ ├── Functions.hs │ ├── Internal │ │ ├── AggregationFunctions.hs │ │ ├── AlgebraStructures.hs │ │ ├── Arithmetics.hs │ │ ├── ArithmeticsImpl.hs │ │ ├── BasicStructures.hs │ │ ├── Caching.hs │ │ ├── CachingUntyped.hs │ │ ├── CanRename.hs │ │ ├── Client.hs │ │ ├── ColumnFunctions.hs │ │ ├── ColumnStandard.hs │ │ ├── ColumnStructures.hs │ │ ├── ComputeDag.hs │ │ ├── ContextIOInternal.hs │ │ ├── ContextInteractive.hs │ │ ├── ContextInternal.hs │ │ ├── ContextStructures.hs │ │ ├── DAGFunctions.hs │ │ ├── DAGStructures.hs │ │ ├── DatasetFunctions.hs │ │ ├── DatasetStructures.hs │ │ ├── Filter.hs │ │ ├── FunctionsInternals.hs │ │ ├── Groups.hs │ │ ├── Joins.hs │ │ ├── LocalDataFunctions.hs │ │ ├── LocatedBase.hs │ │ ├── ObservableStandard.hs │ │ ├── OpFunctions.hs │ │ ├── OpStructures.hs │ │ ├── Paths.hs │ │ ├── PathsUntyped.hs │ │ ├── Projections.hs │ │ ├── Pruning.hs │ │ ├── RowGenerics.hs │ │ ├── RowGenericsFrom.hs │ │ ├── RowStructures.hs │ │ ├── RowUtils.hs │ │ ├── TypesFunctions.hs │ │ ├── TypesGenerics.hs │ │ ├── TypesStructures.hs │ │ ├── TypesStructuresRepr.hs │ │ └── Utilities.hs │ ├── Row.hs │ ├── StructuresInternal.hs │ ├── Try.hs │ └── Types.hs │ ├── IO │ ├── Inputs.hs │ └── Internal │ │ ├── InputGeneric.hs │ │ ├── Json.hs │ │ └── OutputCommon.hs │ └── Inputs │ └── Inputs.hs ├── stack-ihaskell.yaml ├── stack.yaml ├── test-integration ├── Spark │ ├── Core │ │ ├── CachingSpec.hs │ │ ├── CollectSpec.hs │ │ ├── ColumnSpec.hs │ │ ├── GroupsSpec.hs │ │ ├── IntegrationUtilities.hs │ │ ├── JoinsSpec.hs │ │ ├── PruningSpec.hs │ │ ├── SimpleAddSpec.hs │ │ └── Spec.hs │ └── IO │ │ ├── JsonSpec.hs │ │ └── StampSpec.hs └── Spec.hs └── test ├── Spark └── Core │ ├── ColumnSpec.hs │ ├── ContextSpec.hs │ ├── DatasetSpec.hs │ ├── Internal │ ├── CachingSpec.hs │ ├── DAGFunctionsSpec.hs │ ├── GroupsSpec.hs │ ├── LocalDataFunctionsSpec.hs │ ├── OpFunctionsSpec.hs │ ├── PathsSpec.hs │ └── RowUtilsSpec.hs │ ├── PathSpec.hs │ ├── ProjectionsSpec.hs │ ├── RowToSQLSpec.hs │ ├── SimpleExamplesSpec.hs │ └── TypesSpec.hs └── Spec.hs /.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | dist-* 3 | cabal-dev 4 | *.o 5 | *.hi 6 | *.chi 7 | *.chs.h 8 | *.dyn_o 9 | *.dyn_hi 10 | .hpc 11 | .hsenv 12 | .cabal-sandbox/ 13 | cabal.sandbox.config 14 | *.prof 15 | *.aux 16 | *.hp 17 | *.eventlog 18 | .stack-work/ 19 | cabal.project.local 20 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gibiansky/ihaskell:latest 2 | 3 | # Build: 4 | # docker build -t ihaskell-karps . 
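#
# Run the built image (a suggested invocation, not part of the original
# repository; adjust the host port and the mounted notebook directory as
# needed):
# docker run -it -p 8888:8888 -v $(pwd)/notebooks:/karps/notebooks ihaskell-karps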
5 | 6 | # Install pandas for nice visualizations in Python 7 | WORKDIR /tmp 8 | RUN pip install pandas 9 | RUN pip install requests 10 | 11 | RUN rm /ihaskell/.stack-work/install/x86_64-linux/nightly-2015-08-15/7.10.2/bin/ihaskell 12 | 13 | RUN mkdir /karps 14 | WORKDIR /karps 15 | 16 | COPY stack-ihaskell.yaml stack.yaml 17 | COPY karps.cabal karps.cabal 18 | COPY src src 19 | COPY test test 20 | COPY LICENSE LICENSE 21 | 22 | RUN stack setup 7.10.2 23 | RUN stack clean 24 | RUN stack update 25 | RUN stack install ipython-kernel-0.8.3.0 26 | RUN stack install ihaskell-0.8.3.0 27 | RUN stack install ihaskell-blaze-0.3.0.0 28 | RUN stack install ihaskell-basic-0.3.0.0 29 | RUN stack install 30 | 31 | 32 | # Run the notebook 33 | ENV PATH /karps/.stack-work/install/x86_64-linux/nightly-2015-08-15/7.10.2/bin:/root/.stack/snapshots/x86_64-linux/nightly-2015-08-15/7.10.2/bin:/root/.stack/programs/x86_64-linux/ghc-7.10.2/bin:/root/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin 34 | RUN ihaskell install --stack 35 | WORKDIR /karps 36 | ENTRYPOINT stack exec -- jupyter notebook --NotebookApp.port=8888 '--NotebookApp.ip=*' --NotebookApp.notebook_dir=/karps 37 | EXPOSE 8888 38 | -------------------------------------------------------------------------------- /INTRO.md: -------------------------------------------------------------------------------- 1 | # Karps-Haskell - Haskell bindings for Spark Datasets and Dataframes 2 | 3 | This project is an exploration vehicle for developing safe, robust and reliable 4 | data pipelines over Apache Spark. It consists of multiple sub-projects: 5 | - a specification to describe data pipelines in a language-agnostic manner, 6 | and a communication protocol to submit these pipelines to Spark. The 7 | specification is currently defined in [this repository](https://github.com/krapsh/karps-interface), using 8 | [Protocol Buffers 3](https://developers.google.com/protocol-buffers/docs/proto3) ( 9 | which is also compatible with JSON). 10 | - a serving library, called 11 | [karps-server](https://github.com/krapsh/kraps-server), that implements this specification on top of Spark. 12 | It is written in Scala and is loaded as a standard Spark package. 13 | - a client written in Haskell that sends pipelines to Spark for execution. In 14 | addition, this client serves as an experimental platform for whole-program optimization and verification, as well as compiler-enforced type checking. 15 | 16 | There is also a separate set of utilities to visualize such pipelines using 17 | Jupyter notebooks and IHaskell. 18 | 19 | This is a preview; the API may (and will) change in the future. 20 | 21 | The name is a play on a tasty fish of the family Cyprinidae, and an anagram of Spark. The programming model is strongly influenced by the 22 | [TensorFlow project](https://www.tensorflow.org/) and follows a similar design. 23 | 24 | ## Introduction 25 | 26 | This project explores an alternative API to run complex workflows on top of 27 | Apache Spark. Note that it is neither endorsed nor supported by the Apache 28 | Foundation or by Databricks, Inc. 29 | 30 | For the developers of Spark bindings: 31 | 32 | By using this API, rich transforms can be expressed on top of Spark using various programming languages, without having to implement elaborate socket protocols to communicate with Java objects.
Each programming language only needs to implement a relatively small interface that does not rely on features specific to the Java virtual machine and that can be implemented using standard REST technologies. Each language is then free to express computations in the most idiomatic way. 33 | 34 | As a reference, a set of bindings is being developed in the Haskell programming language and serves as the reference implementation. Despite Haskell's limited adoption in data science, it is a useful tool for designing strongly principled APIs that work across various programming languages. 35 | 36 | 37 | For the user: 38 | 39 | The user may be interested in a few features unique to this interface. 40 | 41 | 1. Lazy computations. 42 | No call to Spark is issued until a result is absolutely required. Unlike standard Spark interfaces, even aggregation operations such as `collect()` or `sum()` are lazy. This allows Karps to perform whole-program analysis of the computation and to make optimizations that are currently beyond the reach of Spark. 43 | 44 | 2. Strong checks. 45 | Thanks to lazy evaluation, a complete data science pipeline can be checked for correctness before it is evaluated. This is useful when composing multiple notebooks or commands together. For example, many interesting operations in Spark, such as machine learning algorithms, involve an aggregation step. In Spark, such a step would break the analysis of the program and prevent the Spark analyzer from checking further transforms. Karps does not suffer from this limitation and checks the whole pipeline at once. 46 | 47 | 3. Automatic resource management. 48 | Because Karps has a global view of the pipeline, it can determine when data needs to be cached or uncached. It is able to schedule caching and uncaching operations automatically, and it refuses to run programs that may be incorrect with respect to caching (for example, when uncaching happens before the data is accessed again). 49 | 50 | 4. Complex nested pipelines. 51 | Computations can be arbitrarily nested and composed, allowing complex sequences of operations to be conceptually condensed into a single high-level operation. This is useful for debugging and understanding a pipeline at a high level without being distracted by the implementation details of each step. This follows the same approach as TensorFlow. 52 | 53 | 5. Stable, language-agnostic format. 54 | A complex pipeline may be stored as a JSON file in one programming language and read/restored in a different programming language. If this program is run in the same session, the other language can access the cached data. 55 | 56 | ## Installation 57 | 58 | You need the `stack` tool to build and install Karps. Additionally, you will need the Karps server running in order to run queries against Spark. 59 | 60 | 61 | ## Development ideas 62 | 63 | These are notes for developers interested in understanding the general philosophy behind Karps. 64 | 65 | Doing data science at scale is hard and requires multiple iterations. Accessing the data can be a long operation (minutes or hours), even in Spark. Because of that, data should only be accessed if we are reasonably confident that the program is going to finish. 66 | 67 | As a result, Karps is designed to detect a number of common programming mistakes in Spark before even attempting to access the data. These checks are enforced either by the runtime or by Haskell's compiler when using the typed API. This is possible thanks to whole-program analysis and lazy evaluation.
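
As a rough illustration of the laziness and whole-pipeline checking described above, here is a minimal sketch written in the style of the notebooks under `notebooks/`. It is not taken from the project itself: the session name, the node names and the tiny integer dataset are invented for the example. Nothing is sent to Spark until `exec1Def` is called.

```haskell
{-# LANGUAGE OverloadedStrings #-}

import Spark.Core.Context
import Spark.Core.Functions

main :: IO ()
main = do
  -- Open a session against a locally running Karps server.
  createSparkSessionDef $ defaultConf { confRequestedSessionName = "intro_example" }
  let ds = dataset [1, 2, 3, 4 :: Int] @@ "small_ints"  -- lazy: only builds a node
  let n  = count ds @@ "num_elements"                   -- still lazy, even though it aggregates
  -- Only this call checks the whole pipeline and triggers a computation on Spark.
  r <- exec1Def n
  print r
```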
68 | 69 | The execution model is heavily inspired by TensorFlow: 70 | 71 | 1. Deterministic operations: 72 | All the operations are deterministic, and some non-deterministic operations such as `currentTime` in SQL are forbidden. While this may seem like a restriction, it provides multiple benefits such as aggressive caching and computation reuse. 73 | 74 | 2. Stateless operations on a graph. A lot of the transforms operate as simple graph transforms and will eventually be merged into the server, making them available to all languages. 75 | 76 | 3. Simple JSON-based format for encoding transforms. It may be switched to Protocol Buffers (v3) if I figure out how to use it with Haskell and Spray-Can. 77 | 78 | 4. Separation of the description of the transform from the operational details required by Spark. 79 | 80 | 5. Using the type system in the most effective way (Haskell interface): making sure that the type system can enforce simple rules such as not adding an integer to a string, while still giving understandable error messages to the user when something goes wrong. 81 | 82 | 83 | ## Differences from Spark 84 | 85 | As mentioned, Karps attempts to provide strong semantic guarantees at the expense of some flexibility. 86 | 87 | In order to guarantee determinism, some operations are/will be forbidden: 88 | - randn() and rand() without a seed 89 | - get_current_time 90 | 91 | Also, all the operations are meant to be expressed in a way that does not depend on the partitioning of the data. This is a significant departure from Spark, in which the PartitionID is available. It is possible in most cases to replace these operations at the expense of extra shuffles. Here, this is considered worth it because of the strong guarantees it offers with respect to reproducibility. In any case, it is a matter of debate. 92 | 93 | ## Current status 94 | 95 | Most of the underlying engine is working. Some important pieces are 96 | still incomplete or missing though: 97 | - simple JSON ingest with data specification 98 | - debug mode for the backend (check for correctness) 99 | - autogeneration of accessors with Template Haskell 100 | - better IHaskell interface (especially for reporting errors) 101 | - Python frontend 102 | - pandas backend 103 | 104 | Advanced features that require more thought will be considered after that: 105 | - SQL commands (interned strings) 106 | - user-defined functions (Scala only) 107 | - simple ML pipelines 108 | - metadata and user-defined types 109 | -------------------------------------------------------------------------------- /Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /notebooks/05_Displaying_data_with_Python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Displaying data with Python\n", 8 | "\n", 9 | "Haskell is a great language for complex processing, but it lacks the visualization libraries that R and Python users have come to enjoy. This tutorial shows how to integrate both together when doing interactive analysis."
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "collapsed": true 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | ":extension DeriveGeneric\n", 21 | ":extension FlexibleContexts\n", 22 | ":extension OverloadedStrings\n", 23 | ":extension GeneralizedNewtypeDeriving\n", 24 | ":extension FlexibleInstances\n", 25 | ":extension MultiParamTypeClasses" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "data": { 35 | "text/plain": [ 36 | "[Debug] Creating spark session at url: http://10.0.2.2:8081/sessions/session05_python @(: :0:0)" 37 | ] 38 | }, 39 | "metadata": {}, 40 | "output_type": "display_data" 41 | } 42 | ], 43 | "source": [ 44 | "import GHC.Generics (Generic)\n", 45 | "\n", 46 | "import Spark.Core.Dataset\n", 47 | "import Spark.Core.Context\n", 48 | "import Spark.Core.Functions\n", 49 | "import Spark.Core.Column\n", 50 | "import Spark.Core.Types\n", 51 | "import Spark.Core.Row\n", 52 | "import Spark.Core.ColumnFunctions\n", 53 | "\n", 54 | "conf = defaultConf {\n", 55 | " confEndPoint = \"http://10.0.2.2\",\n", 56 | " confRequestedSessionName = \"session05_python\" }\n", 57 | "createSparkSessionDef conf" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": { 64 | "collapsed": true 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "import Spark.Core.Types\n" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "metadata": { 75 | "collapsed": true 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "data MyData = MyData {\n", 80 | " aBigId :: Int,\n", 81 | " importantData :: Int } deriving (Show, Eq, Generic, Ord)\n", 82 | "\n", 83 | "instance SQLTypeable MyData\n", 84 | "instance FromSQL MyData\n", 85 | "instance ToSQL MyData" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 5, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "[Debug] executeCommand1: computing observable collected_data@org.spark.Collect![{aBigId:int importantData:int}] @(: :0:0)\n", 97 | "[Info] Sending computations at url: http://10.0.2.2:8081/computations/session05_python/0/create @(: :0:0)\n", 98 | "[Debug] executeCommand1: Tracked nodes are [(9c12a..,NPath(collected_data),[{aBigId:int importantData:int}],collected_data)] @(: :0:0)\n", 99 | "[Info] _computationMultiStatus: /collected_data finished @(: :0:0)" 100 | ] 101 | }, 102 | "metadata": {}, 103 | "output_type": "display_data" 104 | } 105 | ], 106 | "source": [ 107 | "let collection = [MyData 1 2, MyData 3 2, MyData 5 4]\n", 108 | "\n", 109 | "let ds = dataset collection @@ \"dataset\"\n", 110 | "let c = collect (asCol ds) @@ \"collected_data\"\n", 111 | "_ <- exec1Def c" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 2, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "ename": "ImportError", 121 | "evalue": "No module named kraps", 122 | "output_type": "error", 123 | "traceback": [ 124 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 125 | "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", 126 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mkraps\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mks\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mconnectSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"session05_python\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maddress\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'localhost'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 127 | "\u001b[0;31mImportError\u001b[0m: No module named kraps" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "from kraps import *\n", 133 | "ks = connectSession(\"session05_python\", address='localhost')\n", 134 | "ks" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "collapsed": true 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "ks.pandas(\"collected_data\")" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": { 152 | "collapsed": true 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "print ks.url('collected_data')" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": { 163 | "collapsed": true 164 | }, 165 | "outputs": [], 166 | "source": [] 167 | } 168 | ], 169 | "metadata": { 170 | "kernelspec": { 171 | "display_name": "Python 2", 172 | "language": "python", 173 | "name": "python2" 174 | }, 175 | "language_info": { 176 | "codemirror_mode": { 177 | "name": "ipython", 178 | "version": 2 179 | }, 180 | "file_extension": ".py", 181 | "mimetype": "text/x-python", 182 | "name": "python", 183 | "nbconvert_exporter": "python", 184 | "pygments_lexer": "ipython2", 185 | "version": "2.7.12+" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 1 190 | } 191 | -------------------------------------------------------------------------------- /notebooks/KarpsDisplays.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleContexts #-} 2 | {-# LANGUAGE TypeSynonymInstances #-} 3 | {-# LANGUAGE FlexibleInstances #-} 4 | {-# LANGUAGE OverloadedStrings #-} 5 | 6 | -- This module is meant to be loaded from the IHaskell REPL. 7 | module KarpsDisplays where 8 | 9 | import IHaskell.Display 10 | import Formatting 11 | import Data.Text(unpack) 12 | import qualified Data.Text 13 | import Data.Foldable(toList) 14 | 15 | import KarpsDagDisplay(nodeToDisplayGraph, tfIFrame, DisplayGraph, exportNodes) 16 | 17 | import Spark.Core.Internal.Utilities(forceRight) 18 | import Spark.Core.Internal.PathsUntyped 19 | import Spark.Core.Internal.ComputeDag 20 | import Spark.Core.Internal.DAGStructures 21 | import Spark.Core.Internal.DatasetFunctions(untyped) 22 | import Spark.Core.Internal.DatasetStructures(ComputeNode) 23 | import Spark.Core.Internal.Utilities(forceRight) 24 | import Spark.Core.Internal.Client 25 | import Spark.Core.StructuresInternal(ComputationID(..)) 26 | import Spark.Core.Try 27 | 28 | instance IHaskellDisplay DisplayGraph where 29 | display g = return $ Display [html code] 30 | where 31 | code = unpack . tfIFrame . exportNodes $ g 32 | 33 | 34 | showGraph = display . forceRight . nodeToDisplayGraph 35 | 36 | showGraph' = showGraph . forceRight 37 | 38 | showNameGraph node = showGraph . vertexData . last . 
forceRight $ x where 39 | x = do 40 | cg <- tryEither $ buildCGraph (untyped node) 41 | nameCg <- assignPathsUntyped cg 42 | let tiedCg = tieNodes nameCg 43 | return $ toList (cdVertices tiedCg) 44 | -------------------------------------------------------------------------------- /notebooks/ihaskell-tensorboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krapsh/kraps-haskell/c3a03afe9a4a5ddcedf994756d9bc5967818cec2/notebooks/ihaskell-tensorboard.png -------------------------------------------------------------------------------- /src/Spark/Core.hs: -------------------------------------------------------------------------------- 1 | {-| 2 | Module : Spark.Core 3 | Description : Core functions and data structures to communicate with the Karps 4 | server. 5 | Copyright : (c) Karps contributors, 2016 6 | License : Apache-2.0 7 | Maintainer : krapsh@yandex.com 8 | Stability : experimental 9 | Portability : POSIX 10 | 11 | -} 12 | module Spark.Core where 13 | -------------------------------------------------------------------------------- /src/Spark/Core/Column.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleContexts #-} 2 | 3 | {- | 4 | Module : Spark.Core.Column 5 | Description : Column types and basic operations. 6 | 7 | Operations on columns. 8 | -} 9 | module Spark.Core.Column( 10 | -- * Types 11 | Column, 12 | DynColumn, 13 | GenericColumn, 14 | -- * Extractions and collations 15 | asCol, 16 | asCol', 17 | pack1, 18 | pack, 19 | pack', 20 | struct, 21 | struct', 22 | castCol, 23 | castCol', 24 | colRef, 25 | (//), 26 | (/-), 27 | -- ToStaticProjectable, 28 | StaticColProjection, 29 | DynamicColProjection, 30 | unsafeStaticProjection, 31 | -- * Column type manipulations 32 | dropColType, 33 | -- * Column functions 34 | colType, 35 | untypedCol, 36 | colFromObs, 37 | colFromObs', 38 | applyCol1, 39 | ) where 40 | 41 | import Spark.Core.Internal.ColumnStructures 42 | import Spark.Core.Internal.ColumnFunctions 43 | import Spark.Core.Internal.FunctionsInternals 44 | import Spark.Core.Internal.Projections 45 | -------------------------------------------------------------------------------- /src/Spark/Core/ColumnFunctions.hs: -------------------------------------------------------------------------------- 1 | 2 | {-| 3 | Module : Spark.Core.ColumnFunctions 4 | Description : Column operations 5 | 6 | The standard library of functions that operate on 7 | data columns. 8 | -} 9 | module Spark.Core.ColumnFunctions( 10 | -- * Reductions 11 | sumCol, 12 | sumCol', 13 | countCol, 14 | countCol', 15 | -- * Casting 16 | asDoubleCol 17 | ) where 18 | 19 | import Spark.Core.Internal.ArithmeticsImpl() 20 | import Spark.Core.Internal.ColumnStandard 21 | import Spark.Core.Internal.AggregationFunctions 22 | import Spark.Core.Internal.Projections() 23 | -------------------------------------------------------------------------------- /src/Spark/Core/Context.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE ScopedTypeVariables #-} 3 | 4 | {- | This module defines session objects that act as entry points to spark. 5 | 6 | There are two ways to interact with Spark: using an explicit state object, 7 | or using the default state object (interactive session). 8 | 9 | While the interactive session is the most convenient, it should not be 10 | used for more than quick experimentations. 
Any complex code should use 11 | the SparkSession and SparkState objects. 12 | -} 13 | module Spark.Core.Context( 14 | SparkSessionConf(..), 15 | SparkSession, 16 | SparkState, 17 | SparkInteractiveException, 18 | FromSQL, 19 | defaultConf, 20 | executeCommand1, 21 | executeCommand1', 22 | computationStats, 23 | createSparkSessionDef, 24 | closeSparkSessionDef, 25 | currentSessionDef, 26 | computationStatsDef, 27 | exec1Def, 28 | exec1Def', 29 | execStateDef 30 | ) where 31 | 32 | import Data.Text(pack) 33 | 34 | import Spark.Core.Internal.ContextStructures 35 | import Spark.Core.Internal.ContextIOInternal 36 | import Spark.Core.Internal.ContextInteractive 37 | import Spark.Core.Internal.RowGenericsFrom(FromSQL) 38 | 39 | 40 | -- | The default configuration if the Karps server is being run locally. 41 | defaultConf :: SparkSessionConf 42 | defaultConf = 43 | SparkSessionConf { 44 | confEndPoint = pack "http://127.0.0.1", 45 | confPort = 8081, 46 | confPollingIntervalMillis = 500, 47 | confRequestedSessionName = "", 48 | confUseNodePrunning = False -- Disable graph pruning by default 49 | } 50 | -------------------------------------------------------------------------------- /src/Spark/Core/Dataset.hs: -------------------------------------------------------------------------------- 1 | 2 | {- | 3 | Module : Spark.Core.Dataset 4 | Description : Dataset types and basic operations. 5 | 6 | This module describes the core data types (Dataset, DataFrame, 7 | Observable and DynObservable), and some basic operations to relate them. 8 | -} 9 | module Spark.Core.Dataset( 10 | -- * Common data structures 11 | -- TODO Should it be hidden? 12 | ComputeNode, 13 | LocLocal, 14 | LocDistributed, 15 | LocUnknown, 16 | UntypedNode, 17 | -- * Distributed data structures 18 | Dataset, 19 | DataFrame, 20 | -- * Local data structures 21 | LocalData, 22 | LocalFrame, 23 | -- * Conversions 24 | asDF, 25 | asDS, 26 | asLocalObservable, 27 | castType, 28 | castType', 29 | -- * Relations 30 | parents, 31 | untyped, 32 | untyped', 33 | depends, 34 | logicalParents, 35 | logicalParents', 36 | -- * Attributes 37 | nodeLogicalParents, 38 | nodeLogicalDependencies, 39 | nodeParents, 40 | nodeOp, 41 | nodeId, 42 | nodeName, 43 | nodeType, 44 | ) where 45 | 46 | import Spark.Core.Internal.DatasetStructures 47 | import Spark.Core.Internal.DatasetFunctions 48 | import Spark.Core.Internal.Projections() 49 | -------------------------------------------------------------------------------- /src/Spark/Core/Functions.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | 4 | module Spark.Core.Functions( 5 | -- * Creation 6 | dataset, 7 | dataframe, 8 | constant, 9 | -- * Standard conversions 10 | asLocalObservable, 11 | asDouble, 12 | -- * Arithmetic operations 13 | (.+), 14 | (.-), 15 | (./), 16 | div', 17 | -- * Utilities 18 | (@@), 19 | _1, 20 | _2, 21 | -- * Standard library 22 | collect, 23 | collect', 24 | count, 25 | identity, 26 | autocache, 27 | cache, 28 | uncache, 29 | joinInner, 30 | joinInner', 31 | broadcastPair 32 | ) where 33 | 34 | 35 | import Data.Aeson(toJSON) 36 | import qualified Data.Vector as V 37 | 38 | import Spark.Core.Dataset 39 | import Spark.Core.Types 40 | import Spark.Core.Row 41 | import Spark.Core.Internal.ArithmeticsImpl 42 | import Spark.Core.Internal.DatasetFunctions 43 | import Spark.Core.Internal.Joins 44 | import Spark.Core.Internal.Utilities 45 | import 
Spark.Core.Internal.LocalDataFunctions 46 | import Spark.Core.Internal.ObservableStandard 47 | import Spark.Core.Internal.FunctionsInternals() 48 | import Spark.Core.Internal.OpStructures 49 | import Spark.Core.Internal.AggregationFunctions 50 | import Spark.Core.Internal.TypesStructures(SQLType(..)) 51 | import Spark.Core.Internal.Projections 52 | import Spark.Core.Internal.CanRename 53 | 54 | dataset :: (ToSQL a, SQLTypeable a, HasCallStack) => [a] -> Dataset a 55 | dataset l = emptyDataset op tp where 56 | tp = buildType 57 | op = NodeDistributedLit (unSQLType tp) (V.fromList ((toJSON . valueToCell) <$> l)) 58 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/AggregationFunctions.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ScopedTypeVariables #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | {-# LANGUAGE RankNTypes #-} 4 | {-# LANGUAGE FlexibleContexts #-} 5 | 6 | -- A number of standard aggregation functions. 7 | 8 | module Spark.Core.Internal.AggregationFunctions( 9 | -- Standard library 10 | collect, 11 | collect', 12 | count, 13 | count', 14 | countCol, 15 | countCol', 16 | sumCol, 17 | sumCol', 18 | -- Developer functions 19 | AggTry, 20 | UniversalAggregator(..), 21 | applyUAOUnsafe, 22 | applyUntypedUniAgg3 23 | ) where 24 | 25 | import Data.Aeson(Value(Null)) 26 | import qualified Data.Text as T 27 | import qualified Data.Vector as V 28 | 29 | import Spark.Core.Internal.DatasetStructures 30 | import Spark.Core.Internal.ColumnStructures 31 | import Spark.Core.Internal.ColumnFunctions(colType, untypedCol) 32 | import Spark.Core.Internal.DatasetFunctions 33 | import Spark.Core.Internal.RowGenerics(ToSQL) 34 | import Spark.Core.Internal.LocalDataFunctions() 35 | import Spark.Core.Internal.FunctionsInternals 36 | import Spark.Core.Internal.OpStructures 37 | import Spark.Core.Internal.TypesStructures 38 | import Spark.Core.Internal.Utilities 39 | import Spark.Core.Internal.TypesFunctions(arrayType') 40 | import Spark.Core.StructuresInternal(emptyFieldPath) 41 | import Spark.Core.Types 42 | import Spark.Core.Try 43 | 44 | {-| The sum of all the elements in a column. 45 | 46 | If the data type is too small to represent the sum, the value being returned is 47 | undefined. 48 | -} 49 | sumCol :: forall ref a. (Num a, SQLTypeable a, ToSQL a) => 50 | Column ref a -> LocalData a 51 | sumCol = applyUAOUnsafe _sumAgg' 52 | 53 | sumCol' :: DynColumn -> LocalFrame 54 | sumCol' = applyUntypedUniAgg3 _sumAgg' 55 | 56 | {-| The number of elements in a column. 57 | 58 | -} 59 | -- TODO use Long for the return data type. 60 | count :: forall a. Dataset a -> LocalData Int 61 | count = countCol . asCol 62 | 63 | count' :: DataFrame -> LocalFrame 64 | count' = countCol' . asCol' 65 | 66 | countCol :: Column ref a -> LocalData Int 67 | countCol = applyUAOUnsafe _countAgg' 68 | 69 | countCol' :: DynColumn -> LocalFrame 70 | countCol' = applyUntypedUniAgg3 _countAgg' 71 | 72 | 73 | {-| Collects all the elements of a column into a list. 74 | 75 | NOTE: 76 | This list is sorted in the canonical ordering of the data type: however the 77 | data may be stored by Spark, the result will always be in the same order. 78 | This is a departure from Spark, which does not guarantee an ordering on 79 | the returned data. 80 | -} 81 | collect :: forall ref a. (SQLTypeable a) => Column ref a -> LocalData [a] 82 | collect = applyUAOUnsafe _collectAgg' 83 | 84 | {-| See the documentation of collect. 
-} 85 | collect' :: DynColumn -> LocalFrame 86 | collect' = applyUntypedUniAgg3 _collectAgg' 87 | 88 | type AggTry a = Either T.Text a 89 | 90 | {-| 91 | This is the universal aggregator: the invariant aggregator and 92 | some extra laws to combine multiple outputs. 93 | It is useful for combining the results over multiple passes. 94 | A real implementation in Spark has also an inner pass. 95 | -} 96 | data UniversalAggregator a buff = UniversalAggregator { 97 | uaMergeType :: SQLType buff, 98 | -- The result is partioning invariant 99 | uaInitialOuter :: Dataset a -> LocalData buff, 100 | -- This operation is associative and commutative 101 | -- The logical parents of the final observable have to be the 2 inputs 102 | uaMergeBuffer :: LocalData buff -> LocalData buff -> LocalData buff 103 | } 104 | 105 | -- TODO(kps) check the coming type for non-summable types 106 | _sumAgg' :: DataType -> AggTry UniversalAggregatorOp 107 | _sumAgg' dt = pure UniversalAggregatorOp { 108 | uaoMergeType = dt, 109 | uaoInitialOuter = InnerAggOp $ AggFunction "SUM" (V.singleton emptyFieldPath), 110 | uaoMergeBuffer = ColumnSemiGroupLaw "SUM_SL" 111 | } 112 | 113 | _countAgg' :: DataType -> AggTry UniversalAggregatorOp 114 | -- Counting will always succeed. 115 | _countAgg' _ = pure UniversalAggregatorOp { 116 | -- TODO(kps) switch to BigInt 117 | uaoMergeType = StrictType IntType, 118 | uaoInitialOuter = InnerAggOp $ AggFunction "COUNT" (V.singleton emptyFieldPath), 119 | uaoMergeBuffer = ColumnSemiGroupLaw "SUM" 120 | } 121 | 122 | _collectAgg' :: DataType -> AggTry UniversalAggregatorOp 123 | -- Counting will always succeed. 124 | _collectAgg' dt = 125 | let ldt = arrayType' dt 126 | soMerge = StandardOperator { 127 | soName = "org.spark.Collect", 128 | soOutputType = ldt, 129 | soExtra = Null 130 | } 131 | soMono = StandardOperator { 132 | soName = "org.spark.CatSorted", 133 | soOutputType = ldt, 134 | soExtra = Null 135 | } 136 | in pure UniversalAggregatorOp { 137 | -- TODO(kps) switch to BigInt 138 | uaoMergeType = ldt, 139 | uaoInitialOuter = OpaqueAggTransform soMerge, 140 | uaoMergeBuffer = OpaqueSemiGroupLaw soMono 141 | } 142 | 143 | applyUntypedUniAgg3 :: (DataType -> AggTry UniversalAggregatorOp) -> DynColumn -> LocalFrame 144 | applyUntypedUniAgg3 f dc = do 145 | c <- dc 146 | let uaot = f . unSQLType . colType $ c 147 | uao <- tryEither uaot 148 | let no = NodeAggregatorReduction uao 149 | let ds = pack1 c 150 | return $ emptyLocalData no (SQLType (uaoMergeType uao)) `parents` [untyped ds] 151 | 152 | applyUAOUnsafe :: forall a b ref. 
(SQLTypeable b, HasCallStack) => (DataType -> AggTry UniversalAggregatorOp) -> Column ref a -> LocalData b 153 | applyUAOUnsafe f c = 154 | let lf = applyUntypedUniAgg3 f (untypedCol c) 155 | in forceRight (asObservable lf) 156 | 157 | -- _guardType :: DataType -> (UntypedDataset -> UntypedLocalData) -> (UntypedDataset -> LocalFrame) 158 | -- _guardType dt f ds = 159 | -- if unSQLType (nodeType ds) == dt 160 | -- then 161 | -- pure $ f ds 162 | -- else 163 | -- tryError $ sformat ("Expected type "%sh%" but got type "%sh) dt (nodeType ds) 164 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/AlgebraStructures.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE RankNTypes #-} 2 | {-# LANGUAGE FunctionalDependencies #-} 3 | {-# LANGUAGE ScopedTypeVariables #-} 4 | {-# LANGUAGE MultiParamTypeClasses #-} 5 | {-# LANGUAGE FlexibleInstances #-} 6 | {-# LANGUAGE TypeFamilies #-} 7 | {-# LANGUAGE OverloadedStrings #-} 8 | {-# LANGUAGE FlexibleContexts #-} 9 | {-# LANGUAGE FunctionalDependencies #-} 10 | {-# LANGUAGE RankNTypes #-} 11 | 12 | -- TODO remove this file 13 | module Spark.Core.Internal.AlgebraStructures where 14 | 15 | -- | Algebraic structures that are common to columns and observables. 16 | 17 | 18 | data BinaryOpFun in1 in2 to = BinaryOpFun { 19 | bodLift1 :: in1 -> to, 20 | bodLift2 :: in2 -> to, 21 | bodOp :: to -> to -> to 22 | } 23 | 24 | 25 | class HomoBinaryOp2 in1 in2 to | in1 in2 -> to where 26 | _liftFun :: (to -> to -> to) -> BinaryOpFun in1 in2 to 27 | 28 | _applyBinOp0 :: forall in1 in2 to. in1 -> in2 -> BinaryOpFun in1 in2 to -> to 29 | _applyBinOp0 i1 i2 (BinaryOpFun l1 l2 bo) = bo (l1 i1) (l2 i2) 30 | 31 | applyBinOp :: forall in1 in2 to. (HomoBinaryOp2 in1 in2 to) => (to -> to -> to) -> in1 -> in2 -> to 32 | applyBinOp f i1 i2 = 33 | _applyBinOp0 i1 i2 (_liftFun f) 34 | 35 | -- -- | Overloaded operator for operationts that are guaranteed to succeed. 
36 | -- (.+) :: (Num out, HomoBinaryOp2 a1 a2 out) => a1 -> a2 -> out 37 | -- (.+) = applyBinOp (+) 38 | 39 | (.-) :: (Num out, HomoBinaryOp2 a1 a2 out) => a1 -> a2 -> out 40 | (.-) = applyBinOp (-) 41 | 42 | (.*) :: (Num out, HomoBinaryOp2 a1 a2 out) => a1 -> a2 -> out 43 | (.*) = applyBinOp (*) 44 | 45 | -- TODO(kps) add here the rest of the Integral operations 46 | div' :: (Integral out, HomoBinaryOp2 a1 a2 out) => a1 -> a2 -> out 47 | div' = applyBinOp div 48 | 49 | -- **** Fractional **** 50 | 51 | (./) :: (Fractional out, HomoBinaryOp2 a1 a2 out) => a1 -> a2 -> out 52 | (./) = applyBinOp (/) 53 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/Arithmetics.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE FlexibleInstances #-} 3 | {-# LANGUAGE TypeFamilies #-} 4 | -- Required by old versions 5 | {-# LANGUAGE FlexibleContexts #-} 6 | 7 | module Spark.Core.Internal.Arithmetics( 8 | GeneralizedHomoReturn, 9 | GeneralizedHomo2, 10 | HomoColOp2, 11 | -- | Developer API 12 | performOp, 13 | ) where 14 | 15 | 16 | import Spark.Core.Internal.ColumnFunctions 17 | import Spark.Core.Internal.ColumnStructures 18 | import Spark.Core.Internal.DatasetStructures 19 | import Spark.Core.Internal.FunctionsInternals(projectColFunction2') 20 | import Spark.Core.Internal.Utilities 21 | 22 | {-| All the automatic conversions supported when lifting a -} 23 | type family GeneralizedHomoReturn x1 x2 where 24 | GeneralizedHomoReturn (Column ref x1) (Column ref x1) = Column ref x1 25 | GeneralizedHomoReturn (Column ref x1) DynColumn = DynColumn 26 | GeneralizedHomoReturn (Column ref x1) (LocalData x1) = Column ref x1 27 | GeneralizedHomoReturn (Column ref x1) LocalFrame = DynColumn 28 | GeneralizedHomoReturn DynColumn (Column ref x1) = DynColumn 29 | GeneralizedHomoReturn DynColumn DynColumn = DynColumn 30 | GeneralizedHomoReturn DynColumn (LocalData x1) = DynColumn 31 | GeneralizedHomoReturn DynColumn LocalFrame = DynColumn 32 | GeneralizedHomoReturn (LocalData x1) (Column ref x1) = Column ref x1 33 | GeneralizedHomoReturn (LocalData x1) DynColumn = DynColumn 34 | GeneralizedHomoReturn (LocalData x1) (LocalData x1) = LocalData x1 35 | GeneralizedHomoReturn (LocalData x1) LocalFrame = LocalFrame 36 | GeneralizedHomoReturn LocalFrame (Column ref x1) = DynColumn 37 | GeneralizedHomoReturn LocalFrame LocalFrame = LocalFrame 38 | 39 | -- The type of an homogeneous operation. 40 | -- TODO it would be nice to enforce this contstraint at the type level, 41 | -- but it is a bit more complex to do. 42 | type HomoColOp2 = UntypedColumnData -> UntypedColumnData -> UntypedColumnData 43 | 44 | {-| The class of types that can be lifted to operations onto Karps types. 45 | 46 | This is the class for operations on homogeneous types (the inputs and the 47 | output have the same underlying type). 48 | 49 | At its core, it takes a broadcasted operation that works on columns, and 50 | makes that operation available on other shapes. 51 | -} 52 | class GeneralizedHomo2 x1 x2 where 53 | _projectHomo :: x1 -> x2 -> HomoColOp2 -> GeneralizedHomoReturn x1 x2 54 | 55 | {-| Performs an operation, using a reference operation defined on columns. 
56 | -} 57 | performOp :: (GeneralizedHomo2 x1 x2) => 58 | HomoColOp2 -> 59 | x1 -> 60 | x2 -> 61 | GeneralizedHomoReturn x1 x2 62 | performOp f x1 x2 = _projectHomo x1 x2 f 63 | 64 | -- ******* INSTANCES ********* 65 | 66 | instance GeneralizedHomo2 DynColumn DynColumn where 67 | _projectHomo = _performDynDyn 68 | 69 | instance GeneralizedHomo2 (Column ref x) (Column ref x) where 70 | _projectHomo = _performCC 71 | 72 | instance GeneralizedHomo2 DynColumn (Column ref x) where 73 | _projectHomo dc1 c2 = _performDynDyn dc1 (untypedCol c2) 74 | 75 | instance GeneralizedHomo2 (Column ref x) DynColumn where 76 | _projectHomo c1 = _performDynDyn (untypedCol c1) 77 | 78 | instance GeneralizedHomo2 (Column ref x) (LocalData x) where 79 | _projectHomo c1 o2 = _projectHomo c1 (broadcast o2 c1) 80 | 81 | instance GeneralizedHomo2 (LocalData x) (Column ref x) where 82 | _projectHomo o1 c2 = _projectHomo (broadcast o1 c2) c2 83 | 84 | instance GeneralizedHomo2 (Column ref x) LocalFrame where 85 | _projectHomo c1 o2' = _projectHomo c1 (broadcast' o2' (untypedCol c1)) 86 | 87 | instance GeneralizedHomo2 LocalFrame (Column ref x) where 88 | _projectHomo o1' c2 = _projectHomo (broadcast' o1' (untypedCol c2)) c2 89 | 90 | instance GeneralizedHomo2 LocalFrame LocalFrame where 91 | _projectHomo o1' o2' f = 92 | let f' x y = f <$> x <*> y 93 | in projectColFunction2' f' o1' o2' 94 | 95 | 96 | _performDynDyn :: 97 | DynColumn -> DynColumn -> HomoColOp2 -> DynColumn 98 | _performDynDyn dc1 dc2 f = do 99 | c1 <- dc1 100 | c2 <- dc2 101 | -- TODO: add type guard 102 | let c = f c1 c2 103 | -- TODO: add dynamic check on the type of the return 104 | return (dropColType c) 105 | 106 | _performCC :: (HasCallStack) => 107 | Column ref x -> Column ref x -> HomoColOp2 -> Column ref x 108 | _performCC c1 c2 f = 109 | let sqlt = colType c1 110 | c = f (iUntypedColData c1) (iUntypedColData c2) 111 | c' = forceRight $ castCol (colRef c1) sqlt (pure c) 112 | in c' 113 | 114 | _performCO :: (HasCallStack) => 115 | Column ref x -> LocalData x -> HomoColOp2 -> Column ref x 116 | _performCO c1 o2 = _performCC c1 (broadcast o2 c1) 117 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/ArithmeticsImpl.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -fno-warn-orphans #-} 2 | -- Disabled for old versions 3 | -- {-# OPTIONS_GHC -fno-warn-redundant-constraints #-} 4 | {-# LANGUAGE RankNTypes #-} 5 | {-# LANGUAGE FlexibleInstances #-} 6 | {-# LANGUAGE OverloadedStrings #-} 7 | -- Required by old versions 8 | {-# LANGUAGE FlexibleContexts #-} 9 | 10 | {-| This module contains all the class instances and operators related 11 | to arithmetics with Datasets, Dataframes, Columns and Observables. 12 | -} 13 | module Spark.Core.Internal.ArithmeticsImpl( 14 | (.+), 15 | (.-), 16 | (./), 17 | div' 18 | ) where 19 | 20 | import Spark.Core.Internal.ColumnFunctions 21 | import Spark.Core.Internal.DatasetFunctions 22 | import Spark.Core.Internal.DatasetStructures 23 | import Spark.Core.Internal.LocalDataFunctions(constant) 24 | import Spark.Core.Internal.FunctionsInternals(projectColFunction2', projectColFunction') 25 | import Spark.Core.Internal.Arithmetics 26 | 27 | 28 | {-| A generalization of the addition for the Karps types. 29 | -} 30 | (.+) :: forall a1 a2. 
(Num a1, Num a2, GeneralizedHomo2 a1 a2) => 31 | a1 -> a2 -> GeneralizedHomoReturn a1 a2 32 | (.+) = performOp (homoColOp2 "+") 33 | 34 | {-| A generalization of the negation for the Karps types. 35 | -} 36 | (.-) :: forall a1 a2. (Num a1, Num a2, GeneralizedHomo2 a1 a2) => 37 | a1 -> a2 -> GeneralizedHomoReturn a1 a2 38 | (.-) = performOp (homoColOp2 "-") 39 | 40 | (./) :: (Fractional a1, Fractional a2, GeneralizedHomo2 a1 a2) => 41 | a1 -> a2 -> GeneralizedHomoReturn a1 a2 42 | (./) = performOp (homoColOp2 "/") 43 | 44 | div' :: forall a1 a2. (Num a1, Num a2, GeneralizedHomo2 a1 a2) => 45 | a1 -> a2 -> GeneralizedHomoReturn a1 a2 46 | div' = performOp (homoColOp2 "/") 47 | 48 | -- All the operations are defined from column operations 49 | -- This adds a little overhead, but it can be optimized by the backend. 50 | instance Num LocalFrame where 51 | (+) = projectColFunction2' (+) 52 | (-) = projectColFunction2' (-) 53 | (*) = projectColFunction2' (*) 54 | abs = projectColFunction' abs 55 | signum = projectColFunction' signum 56 | -- It will choose by default to use the Int type, which may not be 57 | -- what the user wants. 58 | -- In case there is some doubt, user should use typed operations. 59 | fromInteger x = asLocalObservable $ constant (fromInteger x :: Int) 60 | negate = projectColFunction' negate 61 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/BasicStructures.hs: -------------------------------------------------------------------------------- 1 | -- Some basic structures shared across most modules. 2 | 3 | -- TODO remove this file 4 | module Spark.Core.Internal.BasicStructures where 5 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/CachingUntyped.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ScopedTypeVariables #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | 5 | 6 | {-| Implementation of the caching interfaces for the compute data structures. 
7 | -} 8 | module Spark.Core.Internal.CachingUntyped( 9 | cachingType, 10 | autocacheGen 11 | ) where 12 | 13 | import Control.Monad.Except 14 | 15 | import Spark.Core.Internal.Caching 16 | import Spark.Core.Internal.DatasetStructures 17 | import Spark.Core.Internal.DatasetFunctions 18 | import Spark.Core.Internal.OpStructures 19 | import Spark.Core.Internal.PathsUntyped() 20 | import Spark.Core.Internal.DAGStructures 21 | import Spark.Core.StructuresInternal 22 | 23 | cachingType :: UntypedNode -> CacheTry NodeCachingType 24 | cachingType n = case nodeOp n of 25 | NodeLocalOp _ -> pure Stop 26 | NodeAggregatorReduction _ -> pure Stop 27 | NodeAggregatorLocalReduction _ -> pure Stop 28 | NodeOpaqueAggregator _ -> pure Stop 29 | NodeLocalLit _ _ -> pure Stop 30 | NodeStructuredTransform _ -> pure Through 31 | NodeDistributedLit _ _ -> pure Through 32 | NodeDistributedOp so | soName so == opnameCache -> 33 | pure $ CacheOp (vertexToId n) 34 | NodeDistributedOp so | soName so == opnameUnpersist -> 35 | case nodeParents n of 36 | [n'] -> pure $ UncacheOp (vertexToId n) (vertexToId n') 37 | _ -> throwError "Node is not valid uncache node" 38 | NodeDistributedOp so | soName so == opnameAutocache -> 39 | pure $ AutocacheOp (vertexToId n) 40 | NodeDistributedOp _ -> pure Through -- Nothing special for the other operations 41 | NodeBroadcastJoin -> pure Through 42 | NodeGroupedReduction _ -> pure Stop 43 | NodeReduction _ -> pure Stop 44 | NodePointer _ -> pure Stop -- It is supposed to be an observable 45 | 46 | autocacheGen :: AutocacheGen UntypedNode 47 | autocacheGen = AutocacheGen { 48 | deriveUncache = deriveUncache', 49 | deriveIdentity = deriveIdentity' 50 | } where 51 | -- TODO: use path-based identification in the future 52 | -- f :: String -> VertexId -> VertexId 53 | -- f s (VertexId bs) = VertexId . C8.pack . (++s) . C8.unpack $ bs 54 | deriveIdentity' (Vertex _ un) = 55 | let x = identity un 56 | vid' = VertexId . unNodeId . nodeId $ x -- f "_identity" vid 57 | in Vertex vid' x 58 | deriveUncache' (Vertex _ un) = 59 | let x = uncache un 60 | vid' = VertexId . unNodeId . nodeId $ x -- f "_uncache" vid 61 | in Vertex vid' x 62 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/CanRename.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -fno-warn-orphans #-} 2 | {-# LANGUAGE IncoherentInstances #-} 3 | {-# LANGUAGE ScopedTypeVariables #-} 4 | {-# LANGUAGE MultiParamTypeClasses #-} 5 | {-# LANGUAGE FlexibleInstances #-} 6 | {-# LANGUAGE TypeFamilies #-} 7 | {-# LANGUAGE OverloadedStrings #-} 8 | {-# LANGUAGE FlexibleContexts #-} 9 | {-# LANGUAGE RankNTypes #-} 10 | 11 | -- TODO(kps): this module stretches my understanding of Haskell. 12 | -- There is probably better than that. 13 | 14 | {-| Defines the notion of renaming something. 15 | 16 | This is closed over a few well-defined types. 17 | -} 18 | module Spark.Core.Internal.CanRename where 19 | 20 | import qualified Data.Text as T 21 | import Formatting 22 | 23 | import Spark.Core.Try 24 | import Spark.Core.StructuresInternal 25 | import Spark.Core.Internal.ColumnFunctions() 26 | import Spark.Core.Internal.ColumnStructures 27 | import Spark.Core.Internal.DatasetStructures 28 | import Spark.Core.Internal.Utilities 29 | 30 | -- | The class of types that can be renamed. 
31 | -- It is made generic because it covers 2 notions: 32 | -- - the name of a compute node that will eventually determine its compute path 33 | -- - the name of field (which may become an object path) 34 | -- This syntax tries to be convenient and will fail immediately 35 | -- for basic errors such as illegal characters. 36 | -- 37 | -- This could be revisited in the future, but it is a compromise 38 | -- on readability. 39 | class CanRename a txt where 40 | (@@) :: a -> txt -> a 41 | 42 | infixl 1 @@ 43 | 44 | 45 | instance forall ref a. CanRename (ColumnData ref a) FieldName where 46 | c @@ fn = c { _cReferingPath = Just fn } 47 | 48 | 49 | instance forall ref a s. (s ~ String) => CanRename (Column ref a) s where 50 | c @@ str = case fieldName (T.pack str) of 51 | Right fp -> c @@ fp 52 | Left msg -> 53 | -- The syntax check here is pretty lenient, so it fails, it has 54 | -- some good reasons. We stop here. 55 | failure $ sformat ("Could not make a field path out of string "%shown%" for column "%shown%":"%shown) str c msg 56 | 57 | instance CanRename DynColumn FieldName where 58 | (Right cd) @@ fn = Right (cd @@ fn) 59 | -- TODO better error handling 60 | x @@ _ = x 61 | 62 | instance forall s. (s ~ String) => CanRename DynColumn s where 63 | -- An error could happen when making a path out of a string. 64 | (Right cd) @@ str = case fieldName (T.pack str) of 65 | Right fp -> Right $ cd @@ fp 66 | Left msg -> 67 | -- The syntax check here is pretty lenient, so it fails, it has 68 | -- some good reasons. We stop here. 69 | tryError $ sformat ("Could not make a field path out of string "%shown%" for column "%shown%":"%shown) str cd msg 70 | -- TODO better error handling 71 | x @@ _ = x 72 | 73 | 74 | instance forall loc a s. (s ~ String) => CanRename (ComputeNode loc a) s where 75 | -- There is no need to update the id, as this field is not involved 76 | -- in the calculation of the id. 77 | -- TODO: make this fail immediately? If the name is wrong, it is 78 | -- harder to figure out what is happening. 79 | (@@) cn name = cn { _cnName = Just nn } where 80 | nn = NodeName . T.pack $ name 81 | 82 | instance forall loc a s. (s ~ String) => CanRename (Try (ComputeNode loc a)) s where 83 | (Right n) @@ str = Right (n @@ str) 84 | (Left n) @@ _ = Left n 85 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/Client.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | -- The communication protocol with the server 4 | 5 | module Spark.Core.Internal.Client where 6 | 7 | import Spark.Core.StructuresInternal 8 | import Spark.Core.Dataset(UntypedNode) 9 | import Spark.Core.Internal.Utilities 10 | import Spark.Core.Internal.TypesStructures(DataType) 11 | import Spark.Core.Internal.TypesFunctions() 12 | 13 | import Data.Text(Text, pack) 14 | import Data.Aeson 15 | import Data.Aeson.Types(Parser) 16 | import GHC.Generics 17 | 18 | 19 | -- Imports for the client 20 | 21 | {-| The ID of an RDD in Spark. 22 | -} 23 | data RDDId = RDDId { 24 | unRDDId :: !Int 25 | } deriving (Eq, Show, Ord) 26 | 27 | data LocalSessionId = LocalSessionId { 28 | unLocalSession :: !Text 29 | } deriving (Eq, Show) 30 | 31 | data Computation = Computation { 32 | cSessionId :: !LocalSessionId, 33 | cId :: !ComputationID, 34 | cNodes :: ![UntypedNode], 35 | -- Non-empty 36 | cTerminalNodes :: ![NodePath], 37 | -- The node at the top of the computation. 
38 | -- Must be part of the terminal nodes. 39 | cCollectingNode :: !NodePath, 40 | -- This redundant information is not serialized. 41 | -- It is used internally to track the resulting nodes. 42 | cTerminalNodeIds :: ![NodeId] 43 | } deriving (Show, Generic) 44 | 45 | data BatchComputationKV = BatchComputationKV { 46 | bckvLocalPath :: !NodePath, 47 | bckvDeps :: ![NodePath], 48 | bckvResult :: !PossibleNodeStatus 49 | } deriving (Show, Generic) 50 | 51 | data BatchComputationResult = BatchComputationResult { 52 | bcrTargetLocalPath :: !NodePath, 53 | bcrResults :: ![(NodePath, [NodePath], PossibleNodeStatus)] 54 | } deriving (Show, Generic) 55 | 56 | data RDDInfo = RDDInfo { 57 | rddiId :: !RDDId, 58 | rddiClassName :: !Text, 59 | rddiRepr :: !Text, 60 | rddiParents :: ![RDDId] 61 | } deriving (Show, Generic) 62 | 63 | data SparkComputationItemStats = SparkComputationItemStats { 64 | scisRddInfo :: ![RDDInfo] 65 | } deriving (Show, Generic) 66 | 67 | data PossibleNodeStatus = 68 | NodeQueued 69 | | NodeRunning 70 | | NodeFinishedSuccess !(Maybe NodeComputationSuccess) !(Maybe SparkComputationItemStats) 71 | | NodeFinishedFailure NodeComputationFailure deriving (Show, Generic) 72 | 73 | data NodeComputationSuccess = NodeComputationSuccess { 74 | -- Because Row requires additional information to be deserialized. 75 | ncsData :: Value, 76 | -- The data type is also available, but it is not going to be parsed for now. 77 | ncsDataType :: DataType 78 | } deriving (Show, Generic) 79 | 80 | data NodeComputationFailure = NodeComputationFailure { 81 | ncfMessage :: !Text 82 | } deriving (Show, Generic) 83 | 84 | 85 | -- **** AESON INSTANCES *** 86 | 87 | instance ToJSON LocalSessionId where 88 | toJSON = toJSON . unLocalSession 89 | 90 | instance FromJSON RDDId where 91 | parseJSON x = RDDId <$> parseJSON x 92 | 93 | instance FromJSON RDDInfo where 94 | parseJSON = withObject "RDDInfo" $ \o -> do 95 | _id <- o .: "id" 96 | className <- o .: "className" 97 | repr <- o .: "repr" 98 | parents <- o .: "parents" 99 | return $ RDDInfo _id className repr parents 100 | 101 | instance FromJSON SparkComputationItemStats where 102 | parseJSON = withObject "SparkComputationItemStats" $ \o -> do 103 | rddinfo <- o .: "rddInfo" 104 | return $ SparkComputationItemStats rddinfo 105 | 106 | instance FromJSON BatchComputationKV where 107 | parseJSON = withObject "BatchComputationKV" $ \o -> do 108 | np <- o .: "localPath" 109 | deps <- o .: "pathDependencies" 110 | res <- o .: "result" 111 | return $ BatchComputationKV np deps res 112 | 113 | instance FromJSON BatchComputationResult where 114 | parseJSON = withObject "BatchComputationResult" $ \o -> do 115 | kvs <- o .: "results" 116 | tlp <- o .: "targetLocalPath" 117 | let f (BatchComputationKV k d v) = (k, d, v) 118 | return $ BatchComputationResult tlp (f <$> kvs) 119 | 120 | instance FromJSON NodeComputationSuccess where 121 | parseJSON = withObject "NodeComputationSuccess" $ \o -> NodeComputationSuccess 122 | <$> o .: "content" 123 | <*> o .: "type" 124 | 125 | -- Because we get a row back, we need to supply a SQLType for deserialization. 126 | instance FromJSON PossibleNodeStatus where 127 | parseJSON = 128 | let parseSuccess :: Object -> Parser PossibleNodeStatus 129 | parseSuccess o = NodeFinishedSuccess 130 | <$> o .:? "finalResult" 131 | <*> o .:? "stats" 132 | parseFailure :: Object -> Parser PossibleNodeStatus 133 | parseFailure o = 134 | (NodeFinishedFailure . 
NodeComputationFailure) <$> o .: pack "finalError" 135 | in 136 | withObject "PossibleNodeStatus" $ \o -> do 137 | status <- o .: pack "status" 138 | case pack status of 139 | "running" -> return NodeRunning 140 | "finished_success" -> parseSuccess o 141 | "finished_failure" -> parseFailure o 142 | "scheduled" -> return NodeQueued 143 | _ -> failure $ pack ("FromJSON PossibleNodeStatus " ++ show status) 144 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/ColumnStandard.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -fno-warn-orphans #-} 2 | {-# LANGUAGE RankNTypes #-} 3 | {-# LANGUAGE ScopedTypeVariables #-} 4 | {-# LANGUAGE MultiParamTypeClasses #-} 5 | {-# LANGUAGE FlexibleInstances #-} 6 | {-# LANGUAGE TypeFamilies #-} 7 | {-# LANGUAGE OverloadedStrings #-} 8 | {-# LANGUAGE FlexibleContexts #-} 9 | 10 | {-| The standard library of functions operating on columns only. 11 | -} 12 | module Spark.Core.Internal.ColumnStandard( 13 | asDoubleCol 14 | ) where 15 | 16 | 17 | import Spark.Core.Internal.ColumnStructures 18 | import Spark.Core.Internal.ColumnFunctions 19 | import Spark.Core.Internal.TypesGenerics(buildType) 20 | 21 | asDoubleCol :: (Num a) => Column ref a -> Column ref Double 22 | asDoubleCol = makeColOp1 "double" buildType 23 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/ColumnStructures.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE RankNTypes #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | {-# LANGUAGE FlexibleInstances #-} 5 | {-# LANGUAGE ScopedTypeVariables #-} 6 | 7 | module Spark.Core.Internal.ColumnStructures where 8 | 9 | import Control.Arrow ((&&&)) 10 | import Data.Function(on) 11 | import Data.Vector(Vector) 12 | 13 | import Spark.Core.Internal.DatasetStructures 14 | import Spark.Core.Internal.DatasetFunctions() 15 | import Spark.Core.Internal.RowStructures 16 | import Spark.Core.Internal.TypesStructures 17 | import Spark.Core.Internal.OpStructures 18 | import Spark.Core.StructuresInternal 19 | import Spark.Core.Try 20 | 21 | {-| The data structure that implements the notion of data columns. 22 | 23 | The type on this one may either be a Cell or a proper type. 24 | 25 | A column of data from a dataset 26 | The ref is a reference potentially to the originating 27 | dataset, but it may be more general than that to perform 28 | type-safe tricks. 29 | 30 | Unlike Spark, columns are always attached to a reference dataset or dataframe. 31 | One cannot materialize a column out of thin air. In order to broadcast a value 32 | along a given column, the `broadcast` function is provided. 33 | 34 | TODO: try something like this https://www.vidarholen.net/contents/junk/catbag.html 35 | -} 36 | data ColumnData ref a = ColumnData { 37 | _cOrigin :: !UntypedDataset, 38 | _cType :: !DataType, 39 | _cOp :: !GeneralizedColOp, 40 | -- The name in the dataset. 41 | -- If not set, it will be deduced from the operation. 42 | _cReferingPath :: !(Maybe FieldName) 43 | } 44 | 45 | {-| A generalization of the column operation. 
46 | 47 | This structure is useful to perform some extra operations not supported by 48 | the Spark engine: 49 | - express joins with an observable 50 | - keep track of DAGs of column operations (not implemented yet) 51 | -} 52 | data GeneralizedColOp = 53 | GenColExtraction !FieldPath 54 | | GenColFunction !SqlFunctionName !(Vector GeneralizedColOp) 55 | | GenColLit !DataType !Cell 56 | -- This is the extra operation that needs to be flattened with a broadcast. 57 | | BroadcastColOp !UntypedLocalData 58 | | GenColStruct !(Vector GeneralizedTransField) 59 | deriving (Eq, Show) 60 | 61 | data GeneralizedTransField = GeneralizedTransField { 62 | gtfName :: !FieldName, 63 | gtfValue :: !GeneralizedColOp 64 | } deriving (Eq, Show) 65 | 66 | {-| A column of data from a dataset or a dataframe. 67 | 68 | This column is typed: the operations on this column will be 69 | validated by Haskell's type inference. 70 | -} 71 | type Column ref a = ColumnData ref a 72 | 73 | {-| An untyped column of data from a dataset or a dataframe. 74 | 75 | This column is untyped and may not be properly constructed. Any error 76 | will be found during the analysis phase at runtime. 77 | -} 78 | type DynColumn = Try (ColumnData UnknownReference Cell) 79 | 80 | 81 | -- | (dev) 82 | -- The type of untyped column data. 83 | type UntypedColumnData = ColumnData UnknownReference Cell 84 | 85 | {-| (dev) 86 | A column for which the type of the cells is unavailable (at the type level), 87 | but for which the origin is available at the type level. 88 | -} 89 | type GenericColumn ref = Column ref Cell 90 | 91 | {-| A dummy data type that indicates the data reference is missing. 92 | -} 93 | data UnknownReference 94 | 95 | {-| A tag that carries the reference information of a column at a 96 | type level. This is useful when creating columns. 97 | 98 | See ref and colRef. 99 | -} 100 | data ColumnReference a = ColumnReference 101 | 102 | instance forall ref a. Eq (ColumnData ref a) where 103 | (==) = (==) `on` (_cOrigin &&& _cType &&& _cOp &&& _cReferingPath) 104 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/ComputeDag.hs: -------------------------------------------------------------------------------- 1 | 2 | 3 | module Spark.Core.Internal.ComputeDag where 4 | 5 | import Data.Foldable(toList) 6 | import qualified Data.Map.Strict as M 7 | import qualified Data.Vector as V 8 | import Data.Vector(Vector) 9 | 10 | import Spark.Core.Internal.DAGStructures 11 | import Spark.Core.Internal.DAGFunctions 12 | 13 | {-| A DAG of computation nodes. 14 | 15 | At a high level, it is a total function with a number of inputs and a number 16 | of outputs. 17 | 18 | Note about the edges: the edges flow along the path of dependencies: 19 | the inputs are the start points, and the outputs are the end points of the 20 | graph. 21 | 22 | -} 23 | data ComputeDag v e = ComputeDag { 24 | -- The edges that make up the DAG 25 | cdEdges :: !(AdjacencyMap v e), 26 | -- All the vertices of the graph 27 | -- Sorted by lexicographic order + node id for uniqueness 28 | cdVertices :: !(Vector (Vertex v)), 29 | -- The inputs of the computation graph. These correspond to the 30 | -- sinks of the dependency graph. 31 | cdInputs :: !(Vector (Vertex v)), 32 | -- The outputs of the computation graph. These correspond to the 33 | -- sources of the dependency graph.
34 | cdOutputs :: !(Vector (Vertex v)) 35 | } deriving (Show) 36 | 37 | 38 | -- | Conversion 39 | computeGraphToGraph :: ComputeDag v e -> Graph v e 40 | computeGraphToGraph cg = 41 | Graph (cdEdges cg) (cdVertices cg) 42 | 43 | -- | Conversion 44 | graphToComputeGraph :: Graph v e -> ComputeDag v e 45 | graphToComputeGraph g = 46 | ComputeDag { 47 | cdEdges = gEdges g, 48 | cdVertices = gVertices g, 49 | -- We work on the graph of dependencies (not flows) 50 | -- The sources correspond to the outputs. 51 | cdInputs = V.fromList $ graphSinks g, 52 | cdOutputs = V.fromList $ graphSources g 53 | } 54 | 55 | _mapVerticesAdj :: (Vertex v -> v') -> AdjacencyMap v e -> AdjacencyMap v' e 56 | _mapVerticesAdj f m = 57 | let f1 ve = 58 | let vx = veEndVertex ve 59 | d' = f vx in 60 | ve { veEndVertex = vx { vertexData = d' } } 61 | f' v = f1 <$> v 62 | in M.map f' m 63 | 64 | mapVertices :: (Vertex v -> v') -> ComputeDag v e -> ComputeDag v' e 65 | mapVertices f cd = 66 | let f' vx = vx { vertexData = f vx } 67 | in ComputeDag { 68 | cdEdges = _mapVerticesAdj f (cdEdges cd), 69 | cdVertices = f' <$> cdVertices cd, 70 | cdInputs = f' <$> cdInputs cd, 71 | cdOutputs = f' <$> cdOutputs cd 72 | } 73 | 74 | mapVertexData :: (v -> v') -> ComputeDag v e -> ComputeDag v' e 75 | mapVertexData f = mapVertices (f . vertexData) 76 | 77 | buildCGraph :: (GraphOperations v e, Show v) => 78 | v -> DagTry (ComputeDag v e) 79 | buildCGraph n = graphToComputeGraph <$> buildGraph n 80 | 81 | graphDataLexico :: ComputeDag v e -> [v] 82 | graphDataLexico cd = vertexData <$> toList (cdVertices cd) 83 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/ContextInteractive.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | 4 | -- | Functions to create and manipulate one default context. 5 | -- 6 | -- This is most appropriate when working in an interactive session, 7 | -- during which it is usually clear that there is a single 8 | -- Spark context in use. 9 | -- 10 | -- This module uses unsafe Haskell code that should not be used 11 | -- outside prototyping in an interactive REPL. In any good case, 12 | -- you should use the SparkState monad. 13 | module Spark.Core.Internal.ContextInteractive( 14 | SparkInteractiveException, 15 | createSparkSessionDef, 16 | exec1Def, 17 | exec1Def', 18 | closeSparkSessionDef, 19 | execStateDef, 20 | computationStatsDef, 21 | currentSessionDef 22 | ) where 23 | 24 | import qualified Data.Vector as V 25 | import Control.Exception 26 | import Control.Monad.Catch(throwM) 27 | import Data.IORef 28 | import Data.Typeable 29 | import Control.Monad.State(runStateT) 30 | import Data.Text 31 | import System.IO.Unsafe(unsafePerformIO) 32 | import Control.Monad.Logger(runStdoutLoggingT) 33 | 34 | 35 | import Spark.Core.Internal.Client(BatchComputationResult) 36 | import Spark.Core.Internal.ContextStructures 37 | import Spark.Core.Internal.DatasetStructures 38 | import Spark.Core.Internal.DatasetFunctions(untypedLocalData) 39 | import Spark.Core.Internal.ContextIOInternal 40 | import Spark.Core.Internal.RowGenericsFrom(FromSQL, cellToValue) 41 | import Spark.Core.Internal.RowStructures(Cell) 42 | import Spark.Core.StructuresInternal 43 | import Spark.Core.Try 44 | 45 | -- The global session reference. Should not be accessed outside 46 | -- this file. 
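-- The functions below all go through this single global reference. A typical
-- interactive (GHCi) session could look like the following sketch; the
-- configuration values are purely illustrative, and `constant` comes from
-- Spark.Core.Internal.LocalDataFunctions:
--
-- > let conf = SparkSessionConf {
-- >              confEndPoint = "http://localhost",
-- >              confPort = 8081,
-- >              confPollingIntervalMillis = 500,
-- >              confRequestedSessionName = "demo",
-- >              confUseNodePrunning = False }
-- > createSparkSessionDef conf
-- > x <- exec1Def (constant (2 :: Int) + constant (3 :: Int))
-- > -- x == 5
-- > closeSparkSessionDef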
47 | _globalSessionRef :: IORef (Maybe SparkSession) 48 | {-# NOINLINE _globalSessionRef #-} 49 | _globalSessionRef = unsafePerformIO (newIORef Nothing) 50 | 51 | -- | The exception thrown when a request cannot be completed 52 | -- in an interactive session. 53 | data SparkInteractiveException = SparkInteractiveException { 54 | _sieInner :: NodeError 55 | } deriving Typeable 56 | 57 | instance Show SparkInteractiveException where 58 | show (SparkInteractiveException inner) = 59 | show inner 60 | 61 | instance Exception SparkInteractiveException 62 | 63 | {- | Creates a spark session that will be used as the default session. 64 | 65 | If a session already exists, an exception will be thrown. 66 | -} 67 | createSparkSessionDef :: SparkSessionConf -> IO () 68 | createSparkSessionDef conf = do 69 | current <- _currentSession 70 | case current of 71 | Nothing -> 72 | return () 73 | Just _ -> 74 | -- TODO let users change the state 75 | _throw "A default context already exist. If you wish to modify the exsting context, you must use modifySparkConfDef" 76 | new <- createSparkSession' conf 77 | _setSession new 78 | return () 79 | 80 | {- | Executes a command using the default spark session. 81 | 82 | This is the most unsafe way of running a command: 83 | it executes a command using the default spark session, and 84 | throws an exception if any error happens. 85 | -} 86 | exec1Def :: (FromSQL a) => LocalData a -> IO a 87 | exec1Def ld = do 88 | c <- exec1Def' (pure (untypedLocalData ld)) 89 | _forceEither $ cellToValue c 90 | 91 | exec1Def' :: LocalFrame -> IO Cell 92 | exec1Def' lf = do 93 | ld <- _getOrThrow lf 94 | res <- execStateDef (executeCommand1' ld) 95 | _getOrThrow res 96 | 97 | {-| Runs the computation described in the state transform, using the default 98 | Spark session. 99 | 100 | Will throw an exception if no session currently exists. 101 | -} 102 | execStateDef :: SparkState a -> IO a 103 | execStateDef s = do 104 | ctx <- _currentSessionOrThrow 105 | (res, newSt) <- (runStateT . runStdoutLoggingT) s ctx 106 | _setSession newSt 107 | return res 108 | 109 | {-| Closes the default session. The default session is empty after this call 110 | completes. 111 | 112 | NOTE: This does not currently clear up the resources! It is a stub implementation 113 | used in testing. 114 | -} 115 | closeSparkSessionDef :: IO () 116 | closeSparkSessionDef = do 117 | _ <- _removeSession 118 | return () 119 | 120 | computationStatsDef :: ComputationID -> IO BatchComputationResult 121 | computationStatsDef compId = execStateDef (computationStats compId) 122 | 123 | currentSessionDef :: IO (Maybe SparkSession) 124 | currentSessionDef = _currentSession 125 | 126 | _currentSession :: IO (Maybe SparkSession) 127 | _currentSession = readIORef _globalSessionRef 128 | 129 | _setSession :: SparkSession -> IO () 130 | _setSession st = writeIORef _globalSessionRef (Just st) 131 | 132 | _removeSession :: IO (Maybe SparkSession) 133 | _removeSession = do 134 | current <- _currentSession 135 | _ <- writeIORef _globalSessionRef Nothing 136 | return current 137 | 138 | _currentSessionOrThrow :: IO SparkSession 139 | _currentSessionOrThrow = do 140 | mCtx <- _currentSession 141 | case mCtx of 142 | Nothing -> 143 | _throw "No default context found. 
You must first create a default spark context with createSparkSessionDef" 144 | Just ctx -> return ctx 145 | 146 | 147 | _getOrThrow :: Try a -> IO a 148 | _getOrThrow (Right x) = return x 149 | _getOrThrow (Left err) = throwM (SparkInteractiveException err) 150 | 151 | _forceEither :: Either Text a -> IO a 152 | _forceEither = _getOrThrow . tryEither 153 | 154 | _throw :: Text -> IO a 155 | _throw txt = throwM $ 156 | SparkInteractiveException Error { 157 | ePath = NodePath V.empty, 158 | eMessage = txt 159 | } 160 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/ContextStructures.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | module Spark.Core.Internal.ContextStructures( 4 | SparkSessionConf(..), 5 | SparkSession(..), 6 | SparkState, 7 | SparkStatePure, 8 | ComputeGraph, 9 | HdfsPath(..), 10 | NodeCacheInfo(..), 11 | NodeCacheStatus(..), 12 | SparkStateT, 13 | SparkStatePureT 14 | ) where 15 | 16 | import Data.Text(Text) 17 | import Control.Monad.State(StateT, State) 18 | import Control.Monad.Logger(LoggingT) 19 | 20 | import Spark.Core.Internal.Client(LocalSessionId) 21 | import Spark.Core.Internal.ComputeDag(ComputeDag) 22 | import Spark.Core.Internal.OpStructures(HdfsPath(..)) 23 | import Spark.Core.Internal.Pruning 24 | import Spark.Core.Internal.DatasetStructures(UntypedNode, StructureEdge) 25 | 26 | -- | The configuration of a remote spark session in Karps. 27 | data SparkSessionConf = SparkSessionConf { 28 | -- | The URL of the end point. 29 | confEndPoint :: !Text, 30 | -- | The port used to configure the end point. 31 | confPort :: !Int, 32 | -- | (internal) the polling interval 33 | confPollingIntervalMillis :: !Int, 34 | -- | (optional) the requested name of the session. 35 | -- This name must obey a number of rules: 36 | -- - it must consist in alphanumerical and -,_: [a-zA-Z0-9\-_] 37 | -- - if it already exists on the server, it will be reconnected to 38 | -- 39 | -- The default value is "" (a new random context name will be chosen). 40 | confRequestedSessionName :: !Text, 41 | {-| If enabled, attempts to prune the computation graph as much as possible. 42 | 43 | This option is useful in interactive sessions when long chains of computations 44 | are extracted. This forces the execution of only the missing parts. 45 | The algorithm is experimental, so disabling it is a safe option. 46 | 47 | Disabled by default. 48 | -} 49 | confUseNodePrunning :: !Bool 50 | } deriving (Show) 51 | 52 | -- | A session in Spark. 53 | -- Encapsualates all the state needed to communicate with Spark 54 | -- and to perfor some simple optimizations on the code. 55 | data SparkSession = SparkSession { 56 | ssConf :: !SparkSessionConf, 57 | ssId :: !LocalSessionId, 58 | ssCommandCounter :: !Integer, 59 | ssNodeCache :: !NodeCache 60 | } deriving (Show) 61 | 62 | 63 | 64 | -- | Represents the state of a session and accounts for the communication 65 | -- with the server. 66 | type SparkState a = SparkStateT IO a 67 | 68 | -- More minimalistic state transforms when doing pure evaluation. 69 | -- (internal type) 70 | -- TODO: use the transformer 71 | type SparkStatePure x = State SparkSession x 72 | 73 | type SparkStatePureT m = StateT SparkSession m 74 | type SparkStateT m = LoggingT (SparkStatePureT m) 75 | 76 | {-| internal 77 | 78 | A graph of computations. This graph is a direct acyclic graph. Each node is 79 | associated to a global path. 
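When a plain 'Graph' is needed (for example by the pruning pass in
Spark.Core.Internal.Pruning), the conversions from Spark.Core.Internal.ComputeDag
are used. A minimal sketch, assuming `cg` is a 'ComputeGraph' already in scope:

> g :: Graph UntypedNode StructureEdge
> g = computeGraphToGraph cg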
80 | -} 81 | type ComputeGraph = ComputeDag UntypedNode StructureEdge 82 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/DAGStructures.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE DeriveGeneric #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | 5 | {-| Data structures to represent Directed Acyclic Graphs (DAGs). 6 | 7 | -} 8 | module Spark.Core.Internal.DAGStructures where 9 | 10 | import qualified Data.Map.Strict as M 11 | import qualified Data.Text as T 12 | import Data.ByteString(ByteString) 13 | import Data.Vector(Vector) 14 | import Data.Foldable(toList) 15 | import Data.Hashable(Hashable) 16 | import GHC.Generics(Generic) 17 | import Formatting 18 | 19 | import Spark.Core.Internal.Utilities 20 | 21 | -- | The unique ID of a vertex. 22 | newtype VertexId = VertexId { unVertexId :: ByteString } deriving (Eq, Ord, Generic) 23 | 24 | 25 | -- | An edge in a graph, parametrized by some payload. 26 | data Edge e = Edge { 27 | edgeFrom :: !VertexId, 28 | edgeTo :: !VertexId, 29 | edgeData :: !e 30 | } 31 | 32 | -- | A vertex in a graph, parametrized by some payload. 33 | data Vertex v = Vertex { 34 | vertexId :: !VertexId, 35 | vertexData :: !v 36 | } 37 | 38 | {-| An edge, along with its end node. 39 | -} 40 | data VertexEdge e v = VertexEdge { 41 | veEndVertex :: !(Vertex v), 42 | veEdge :: !(Edge e) } 43 | 44 | {-| The adjacency map of a graph. 45 | 46 | The node Id corresponds to the start node, the pairs are the end node and 47 | and the edge to reach to the node. There may be multiple edges leading to the 48 | same node. 49 | -} 50 | type AdjacencyMap v e = M.Map VertexId (Vector (VertexEdge e v)) 51 | 52 | -- | The representation of a graph. 53 | -- 54 | -- In all the project, it is considered as a DAG. 55 | data Graph v e = Graph { 56 | gEdges :: !(AdjacencyMap v e), 57 | gVertices :: !(Vector (Vertex v)) 58 | } 59 | 60 | -- | Graph operations on types that are supposed to 61 | -- represent vertices. 62 | class GraphVertexOperations v where 63 | vertexToId :: v -> VertexId 64 | expandVertexAsVertices :: v -> [v] 65 | 66 | -- | Graph operations on types that are supposed to represent 67 | -- edges. 
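-- For instance, a toy vertex type that stores its own neighbors could be made
-- usable by the graph-building functions as follows (sketch; `MyNode` is a
-- hypothetical type, not part of this module):
--
-- > data MyNode = MyNode { myId :: ByteString, myChildren :: [MyNode] }
-- >
-- > instance GraphVertexOperations MyNode where
-- >   vertexToId = VertexId . myId
-- >   expandVertexAsVertices = myChildren
-- >
-- > instance GraphOperations MyNode () where
-- >   expandVertex n = [((), c) | c <- myChildren n]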
68 | class (GraphVertexOperations v) => GraphOperations v e where 69 | expandVertex :: v -> [(e,v)] 70 | 71 | instance Functor Vertex where 72 | fmap f vx = vx { vertexData = f (vertexData vx) } 73 | 74 | instance Functor Edge where 75 | fmap f ed = ed { edgeData = f (edgeData ed) } 76 | 77 | instance (Show v) => Show (Vertex v) where 78 | show vx = "Vertex(vId=" ++ show (vertexId vx) ++ " v=" ++ show (vertexData vx) ++ ")" 79 | 80 | instance (Show e) => Show (Edge e) where 81 | show ed = "Edge(from=" ++ show (edgeFrom ed) ++ " to=" ++ show (edgeTo ed) ++ " e=" ++ show (edgeData ed) ++ ")" 82 | 83 | instance (Show v, Show e) => Show (VertexEdge e v) where 84 | show (VertexEdge v e) = "(" ++ show v ++ ", " ++ show e ++ ")" 85 | 86 | instance (Show v, Show e) => Show (Graph v e) where 87 | show g = 88 | let vxs = toList $ gVertices g <&> \(Vertex vid x) -> 89 | sformat (sh%":"%sh) vid x 90 | vedges = foldMap toList (M.elems (gEdges g)) 91 | edges = (veEdge <$> vedges) <&> \(Edge efrom eto x) -> 92 | sformat (sh%"->"%sh%"->"%sh) efrom x eto 93 | -- eds = (M.elems (gEdges g)) `foldMap` \v -> 94 | -- (toList v) <&> 95 | vxs' = T.intercalate "," vxs 96 | eds' = T.intercalate " " edges 97 | str = T.concat ["Graph{", vxs', ", ", eds', "}"] 98 | in T.unpack str 99 | 100 | instance Hashable VertexId 101 | 102 | instance Show VertexId where 103 | show (VertexId bs) = let s = show bs in 104 | if length s > 9 then 105 | (drop 1 . take 6) s ++ ".." 106 | else 107 | s 108 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/DatasetStructures.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE RankNTypes #-} 3 | {-# LANGUAGE TypeSynonymInstances #-} 4 | {-# LANGUAGE FlexibleInstances #-} 5 | {-# LANGUAGE TypeFamilies #-} 6 | 7 | module Spark.Core.Internal.DatasetStructures where 8 | 9 | import Data.Vector(Vector) 10 | 11 | import Spark.Core.StructuresInternal 12 | import Spark.Core.Try 13 | import Spark.Core.Row 14 | import Spark.Core.Internal.OpStructures 15 | import Spark.Core.Internal.TypesStructures 16 | 17 | {-| (internal) The main data structure that represents a data node in the 18 | computation graph. 19 | 20 | This data structure forms the backbone of computation graphs expressed 21 | with spark operations. 22 | 23 | loc is a typed locality tag. 24 | a is the type of the data, as seen by the Haskell compiler. If erased, it 25 | will be a Cell type. 26 | -} 27 | -- TODO: separate the topology info from the node info. It will help when 28 | -- building the graphs. 29 | data ComputeNode loc a = ComputeNode { 30 | -- | The id of the node. 31 | -- 32 | -- Non strict because it may be expensive. 33 | _cnNodeId :: NodeId, 34 | -- The following fields are used to build a unique ID to 35 | -- a compute node: 36 | 37 | -- | The operation associated to this node. 38 | _cnOp :: !NodeOp, 39 | -- | The type of the node 40 | _cnType :: !DataType, 41 | -- | The direct parents of the node. The order of the parents is important 42 | -- for the semantics of the operation. 43 | _cnParents :: !(Vector UntypedNode), 44 | -- | A set of extra dependencies that can be added to force an order between 45 | -- the nodes. 46 | -- 47 | -- The order is not important, they are sorted by ID. 48 | -- 49 | -- TODO(kps) add this one to the id 50 | _cnLogicalDeps :: !(Vector UntypedNode), 51 | -- | The locality of this node. 
52 | -- 53 | -- TODO(kps) add this one to the id 54 | _cnLocality :: !Locality, 55 | -- Attributes that are not included in the id 56 | -- These attributes are mostly for the user to relate to the nodes. 57 | -- They are not necessary for the computation. 58 | -- 59 | -- | The name 60 | _cnName :: !(Maybe NodeName), 61 | -- | A set of nodes considered as the logical input for this node. 62 | -- This has no influence on the calculation of the id and is used 63 | -- for organization purposes only. 64 | _cnLogicalParents :: !(Maybe (Vector UntypedNode)), 65 | -- | The path of this node in a computation flow. 66 | -- 67 | -- This path includes the node name. 68 | -- Not strict because it may be expensive to compute. 69 | -- By default it only contains the name of the node (i.e. the node is 70 | -- attached to the root) 71 | _cnPath :: NodePath 72 | } deriving (Eq) 73 | 74 | -- (internal) Phantom type tags for the locality 75 | data TypedLocality loc = TypedLocality { unTypedLocality :: !Locality } deriving (Eq, Show) 76 | data LocLocal 77 | data LocDistributed 78 | data LocUnknown 79 | 80 | -- (developer) The type for which we drop all the information expressed in 81 | -- types. 82 | -- 83 | -- This is useful to express parent dependencies (pending a more type-safe 84 | -- interface) 85 | type UntypedNode = ComputeNode LocUnknown Cell 86 | 87 | -- (internal) A dataset for which we have dropped type information. 88 | -- Used internally by columns. 89 | type UntypedDataset = Dataset Cell 90 | 91 | type UntypedLocalData = LocalData Cell 92 | 93 | {-| A typed collection of distributed data. 94 | 95 | Most operations on datasets are type-checked by the Haskell 96 | compiler: the type tag associated to this dataset is guaranteed 97 | to be convertible to a proper Haskell type. In particular, building 98 | a Dataset of dynamic cells is guaranteed to never happen. 99 | 100 | If you want to do untyped operations and gain 101 | some flexibility, consider using UDataFrames instead. 102 | 103 | Computations with Datasets and observables are generally checked for 104 | correctness using the type system of Haskell. 105 | -} 106 | type Dataset a = ComputeNode LocDistributed a 107 | 108 | {-| 109 | A unit of data that can be accessed by the user. 110 | 111 | This is a typed unit of data. The type is guaranteed to be a proper 112 | type accessible by the Haskell compiler (instead of simply a Cell 113 | type, which represents types only accessible at runtime). 114 | 115 | TODO(kps) rename to Observable 116 | -} 117 | type LocalData a = ComputeNode LocLocal a 118 | 119 | 120 | {-| 121 | The dataframe type. Any dataset can be converted to a dataframe. 122 | 123 | For the Spark users: this is different from the definition of the 124 | dataframe in Spark, which is a dataset of rows. Because the support 125 | for single columns is more awkward in the case of rows, it is more 126 | natural to generalize datasets to contain cells. 127 | When communicating with Spark, though, single cells are wrapped 128 | into rows with a single field, as Spark does. 129 | -} 130 | type DataFrame = Try UntypedDataset 131 | 132 | {-| Observable, whose type can only be inferred at runtime and 133 | that can fail to be computed at runtime. 134 | 135 | Any observable can be converted to an untyped 136 | observable. 137 | 138 | Untyped observables are more flexible and can be combined in 139 | an arbitrary manner, but they will fail during the validation of 140 | the Spark computation graph.
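For example, any typed observable can be injected into this untyped world
(sketch; `obs` stands for some 'LocalData' value already in scope, and
'untypedLocalData' comes from Spark.Core.Internal.DatasetFunctions):

> lf :: LocalFrame
> lf = pure (untypedLocalData obs)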
141 | 142 | TODO(kps) rename to DynObservable 143 | -} 144 | type LocalFrame = Try UntypedLocalData 145 | 146 | type UntypedNode' = Try UntypedNode 147 | 148 | {-| The different paths of edges in the compute DAG of nodes, at the 149 | start of computations. 150 | 151 | - scope edges specify the scope of a node for naming. They are not included in 152 | the id. 153 | 154 | -} 155 | data NodeEdge = ScopeEdge | DataStructureEdge StructureEdge deriving (Show, Eq) 156 | 157 | {-| The edges in a compute DAG, after name resolution (which is where most of 158 | the checks and computations are being done) 159 | 160 | - parent edges are the direct parents of a node, the only ones required for 161 | defining computations. They are included in the id. 162 | - logical edges define logical dependencies between nodes to force a specific 163 | ordering of the nodes. They are included in the id. 164 | -} 165 | data StructureEdge = ParentEdge | LogicalEdge deriving (Show, Eq) 166 | 167 | 168 | class CheckedLocalityCast loc where 169 | _validLocalityValues :: [TypedLocality loc] 170 | 171 | -- Class to retrieve the locality associated to a type. 172 | -- Is it better to use type classes? 173 | class (CheckedLocalityCast loc) => IsLocality loc where 174 | _getTypedLocality :: TypedLocality loc 175 | 176 | instance CheckedLocalityCast LocLocal where 177 | _validLocalityValues = [TypedLocality Local] 178 | 179 | instance CheckedLocalityCast LocDistributed where 180 | _validLocalityValues = [TypedLocality Distributed] 181 | 182 | -- LocLocal is a locality associated to Local 183 | instance IsLocality LocLocal where 184 | _getTypedLocality = TypedLocality Local 185 | 186 | -- LocDistributed is a locality associated to Distributed 187 | instance IsLocality LocDistributed where 188 | _getTypedLocality = TypedLocality Distributed 189 | 190 | instance CheckedLocalityCast LocUnknown where 191 | _validLocalityValues = [TypedLocality Distributed, TypedLocality Local] 192 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/Filter.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | module Spark.Core.Internal.Filter where 4 | 5 | import Data.Aeson(Value(Null)) 6 | import qualified Data.Text as T 7 | import Formatting 8 | 9 | import Spark.Core.Internal.DatasetStructures 10 | import Spark.Core.Internal.ColumnStructures 11 | import Spark.Core.Internal.ColumnFunctions(colType, untypedCol, iUntypedColData, colOrigin) 12 | import Spark.Core.Internal.DatasetFunctions 13 | import Spark.Core.Internal.RowGenerics(ToSQL) 14 | import Spark.Core.Internal.LocalDataFunctions() 15 | import Spark.Core.Internal.FunctionsInternals 16 | import Spark.Core.Internal.OpStructures 17 | import Spark.Core.Internal.TypesStructures 18 | import Spark.Core.Internal.Utilities 19 | import Spark.Core.Internal.TypesFunctions(arrayType') 20 | import Spark.Core.Internal.RowStructures(Cell) 21 | import Spark.Core.Types 22 | import Spark.Core.Try 23 | 24 | 25 | {-| Performs a filtering operation on columns of a dataset. 26 | 27 | The first column is the reference column that is used to filter out values of 28 | the second column. 29 | -} 30 | filterCol :: Column ref Bool -> Column ref a -> Dataset a 31 | filterCol = missing "filterCol" 32 | 33 | {-| Filters a column depending to only keep the strict data. 34 | 35 | This function is useful to filter out some data within a structure, some of which 36 | may not be strict. 
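For example, the intent is to be able to write the following (illustrative
only, since the function is not implemented yet; `maybeKeys` and `payloads`
are placeholder columns over the same dataset):

> pairs :: Dataset (Int, Text)
> pairs = filterOpt maybeKeys payloads

with `maybeKeys :: Column ref (Maybe Int)` and `payloads :: Column ref Text`.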
37 | -} 38 | filterOpt :: Column ref (Maybe a) -> Column ref b -> Dataset (a, b) 39 | filterOpt = missing "filterOpt" 40 | 41 | {-| Only keeps the strict values of a column. 42 | -} 43 | filterMaybe :: Column ref (Maybe a) -> Dataset a 44 | filterMaybe = missing "filterMaybe" 45 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/Joins.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | {-| Exposes some of Spark's joining algorithms. 4 | -} 5 | module Spark.Core.Internal.Joins( 6 | join, 7 | join', 8 | joinInner, 9 | joinInner', 10 | joinObs, 11 | joinObs' 12 | ) where 13 | 14 | import qualified Data.Aeson as A 15 | import qualified Data.Vector as V 16 | import Control.Arrow 17 | 18 | import Spark.Core.Internal.ColumnStructures 19 | import Spark.Core.Internal.ColumnFunctions 20 | import Spark.Core.Internal.DatasetStructures 21 | import Spark.Core.Internal.DatasetFunctions 22 | import Spark.Core.Internal.FunctionsInternals 23 | import Spark.Core.Internal.OpStructures 24 | import Spark.Core.Internal.TypesStructures 25 | import Spark.Core.Internal.Utilities 26 | import Spark.Core.Internal.TypesFunctions(structTypeFromFields) 27 | import Spark.Core.Try 28 | import Spark.Core.StructuresInternal(unsafeFieldName) 29 | 30 | {-| Standard (inner) join on two sets of data. 31 | -} 32 | join :: Column ref1 key -> Column ref1 value1 -> Column ref2 key -> Column ref2 value2 -> Dataset (key, value1, value2) 33 | join = joinInner 34 | 35 | {-| Untyped version of the standard join. 36 | -} 37 | join' :: DynColumn -> DynColumn -> DynColumn -> DynColumn -> DataFrame 38 | join' = joinInner' 39 | 40 | {-| Explicit inner join. 41 | -} 42 | joinInner :: Column ref1 key -> Column ref1 value1 -> Column ref2 key -> Column ref2 value2 -> Dataset (key, value1, value2) 43 | joinInner key1 val1 key2 val2 = unsafeCastDataset (forceRight df) where 44 | df = joinInner' (untypedCol key1) (untypedCol val1) (untypedCol key2) (untypedCol val2) 45 | 46 | {-| Untyped version of the inner join. 47 | -} 48 | joinInner' :: DynColumn -> DynColumn -> DynColumn -> DynColumn -> DataFrame 49 | joinInner' key1 val1 key2 val2 = do 50 | df1 <- pack' (struct' [key1, val1]) 51 | df2 <- pack' (struct' [key2, val2]) 52 | dt <- _joinTypeInner key1 val1 val2 53 | let so = StandardOperator { soName = "org.spark.Join", soOutputType = dt, soExtra = A.String "inner" } 54 | let ds = emptyDataset (NodeDistributedOp so) (SQLType dt) 55 | let f ds' = ds' { _cnParents = V.fromList [untyped df1, untyped df2] } 56 | return $ updateNode ds f 57 | 58 | {-| Broadcasts an observable alongside a dataset to make it available as an 59 | extra column. 60 | -} 61 | -- This is the low-level operation that is used to implement the other 62 | -- broadcast operations. 63 | joinObs :: (HasCallStack) => Column ref val -> LocalData val' -> Dataset (val, val') 64 | joinObs c ld = 65 | -- TODO: has a forcing at the last moment so that we can at least 66 | -- have stronger guarantees in the type coercion. 67 | unsafeCastDataset $ forceRight $ joinObs' (untypedCol c) (pure (untypedLocalData ld)) 68 | 69 | {-| Broadcasts an observable along side a dataset to make it available as 70 | an extra column. 71 | 72 | The resulting dataframe has 2 columns: 73 | - one column called 'values' 74 | - one column called 'broadcast' 75 | 76 | Note: this is a low-level operation. Users may want to use broadcastObs instead. 
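A minimal sketch (where `someCol` and `someObs` are placeholders for a column
and an observable already in scope):

> df :: DataFrame
> df = joinObs' (untypedCol someCol) (pure (untypedLocalData someObs))

This is essentially how the typed wrapper 'joinObs' above is implemented.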
77 | -} 78 | -- TODO: what is the difference with broadcastPair??? 79 | joinObs' :: DynColumn -> LocalFrame -> DataFrame 80 | joinObs' dc lf = do 81 | let df = pack' dc 82 | dc' <- df 83 | c <- asCol' df 84 | o <- lf 85 | st <- structTypeFromFields [(unsafeFieldName "values", unSQLType (colType c)), (unsafeFieldName "broadcast", unSQLType (nodeType o))] 86 | let sqlt = SQLType (StrictType (Struct st)) 87 | return $ emptyDataset NodeBroadcastJoin sqlt `parents` [untyped dc', untyped o] 88 | 89 | _joinTypeInner :: DynColumn -> DynColumn -> DynColumn -> Try DataType 90 | _joinTypeInner kcol col1 col2 = do 91 | cs <- sequence [kcol, col1, col2] 92 | st <- structTypeFromFields $ (colFieldName &&& unSQLType . colType) <$> cs 93 | return $ StrictType (Struct st) 94 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/LocalDataFunctions.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -fno-warn-orphans #-} 2 | {-# LANGUAGE TypeSynonymInstances #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | {-# LANGUAGE FlexibleInstances #-} 5 | {-# LANGUAGE ScopedTypeVariables #-} 6 | 7 | -- A number of functions related to local data. 8 | 9 | module Spark.Core.Internal.LocalDataFunctions( 10 | constant, 11 | iPackTupleObs 12 | ) where 13 | 14 | import Data.Aeson(toJSON, Value(Null)) 15 | import qualified Data.Text as T 16 | import qualified Data.List.NonEmpty as N 17 | import Control.Exception.Base(assert) 18 | 19 | import Spark.Core.Internal.DatasetFunctions 20 | import Spark.Core.Internal.DatasetStructures 21 | import Spark.Core.Internal.TypesFunctions 22 | import Spark.Core.Internal.TypesStructures 23 | import Spark.Core.Internal.OpStructures 24 | import Spark.Core.Internal.Utilities 25 | import Spark.Core.Internal.TypesGenerics(SQLTypeable, buildType) 26 | import Spark.Core.Row 27 | 28 | constant :: (ToSQL a, SQLTypeable a) => a -> LocalData a 29 | constant cst = 30 | let 31 | sqlt = buildType 32 | dt = unSQLType sqlt 33 | in emptyLocalData (NodeLocalLit dt (toJSON (valueToCell cst))) sqlt 34 | 35 | {-| (developer API) 36 | 37 | This function takes a non-empty list of observables and puts them 38 | into a structure. The names of each element is _0 ... _(n-1) 39 | -} 40 | iPackTupleObs :: N.NonEmpty UntypedLocalData -> UntypedLocalData 41 | iPackTupleObs ulds = 42 | let dt = structTypeTuple' (unSQLType . nodeType <$> ulds) 43 | so = StandardOperator { 44 | soName = "org.spark.LocalPack", 45 | soOutputType = dt, 46 | soExtra = Null } 47 | op = NodeLocalOp so 48 | in emptyLocalData op (SQLType dt) 49 | `parents` (untyped <$> N.toList ulds) 50 | 51 | instance (Num a, ToSQL a, SQLTypeable a) => Num (LocalData a) where 52 | -- TODO: convert all that to use column operations 53 | (+) = _binOp "org.spark.LocalPlus" 54 | (-) = _binOp "org.spark.LocalMinus" 55 | (*) = _binOp "org.spark.LocalMult" 56 | abs = _unaryOp "org.spark.LocalAbs" 57 | signum = _unaryOp "org.spark.LocalSignum" 58 | fromInteger x = constant (fromInteger x :: a) 59 | negate = _unaryOp "org.spark.LocalNegate" 60 | 61 | instance forall a. (ToSQL a, Enum a, SQLTypeable a) => Enum (LocalData a) where 62 | toEnum x = constant (toEnum x :: a) 63 | fromEnum = failure "Cannot use fromEnum against a local data object" 64 | -- TODO(kps) some of the others are still available for implementation 65 | 66 | instance (Num a, Ord a) => Ord (LocalData a) where 67 | compare = failure "You cannot compare instances of LocalData. (yet)." 
68 | min = _binOp "org.spark.LocalMin" 69 | max = _binOp "org.spark.LocalMax" 70 | 71 | instance forall a. (Real a, ToSQL a, SQLTypeable a) => Real (LocalData a) where 72 | toRational = failure "Cannot convert LocalData to rational" 73 | 74 | instance (ToSQL a, Integral a, SQLTypeable a) => Integral (LocalData a) where 75 | quot = _binOp "org.spark.LocalQuotient" 76 | rem = _binOp "org.spark.LocalReminder" 77 | div = _binOp "org.spark.LocalDiv" 78 | mod = _binOp "org.spark.LocalMod" 79 | quotRem = failure "quotRem is not implemented (yet). Use quot and rem." 80 | divMod = failure "divMod is not implemented (yet). Use div and mod." 81 | toInteger = failure "Cannot convert LocalData to integer" 82 | 83 | instance (ToSQL a, SQLTypeable a, Fractional a) => Fractional (LocalData a) where 84 | fromRational x = constant (fromRational x :: a) 85 | (/) = _binOp "org.spark.LocalDiv" 86 | 87 | 88 | _unaryOp :: T.Text -> LocalData a -> LocalData a 89 | _unaryOp optxt ld = 90 | let so = StandardOperator { 91 | soName = optxt, 92 | soOutputType = unSQLType $ nodeType ld, 93 | soExtra = Null } 94 | op = NodeLocalOp so in 95 | emptyLocalData op (nodeType ld) 96 | `parents` [untyped ld] 97 | 98 | _binOp :: T.Text -> LocalData a -> LocalData a -> LocalData a 99 | _binOp optxt ld1 ld2 = assert (nodeType ld1 == nodeType ld2) $ 100 | let so = StandardOperator { 101 | soName = optxt, 102 | soOutputType = unSQLType $ nodeType ld1, 103 | soExtra = Null } 104 | op = NodeLocalOp so in 105 | emptyLocalData op (nodeType ld1) 106 | `parents` [untyped ld1, untyped ld2] 107 | 108 | -- TODO(kps) more input tests 109 | _binOp' :: StandardOperator -> LocalData a -> LocalData a -> LocalData a 110 | _binOp' so ld1 ld2 = assert (nodeType ld1 == nodeType ld2) $ 111 | let op = NodeLocalOp so in 112 | emptyLocalData op (nodeType ld1) 113 | `parents` [untyped ld1, untyped ld2] 114 | 115 | _intOperator :: T.Text -> StandardOperator 116 | _intOperator optxt = StandardOperator { 117 | soName = optxt, 118 | soOutputType = intType, 119 | soExtra = Null 120 | } 121 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/LocatedBase.hs: -------------------------------------------------------------------------------- 1 | -- Taken from https://hackage.haskell.org/package/located-base-0.1.1.0/docs/src/GHC-Err-Located.html 2 | 3 | {-# LANGUAGE CPP #-} 4 | {-# LANGUAGE ConstraintKinds #-} 5 | {-# LANGUAGE ImplicitParams #-} 6 | {-# LANGUAGE OverloadedStrings #-} 7 | module Spark.Core.Internal.LocatedBase (error, undefined, HasCallStack, showCallStack) where 8 | 9 | #if __GLASGOW_HASKELL__ < 800 10 | 11 | import GHC.SrcLoc 12 | import GHC.Stack (CallStack, getCallStack) 13 | import Prelude hiding (error, undefined) 14 | import qualified Prelude 15 | import Text.Printf 16 | import Data.Text(Text, unpack) 17 | 18 | type HasCallStack = (?callStack :: CallStack) 19 | 20 | error :: HasCallStack => Text -> a 21 | error msg = Prelude.error (unpack msg ++ "\n" ++ showCallStack ?callStack) 22 | 23 | undefined :: HasCallStack => a 24 | undefined = error "Prelude.undefined" 25 | 26 | showCallStack :: CallStack -> String 27 | showCallStack stk = case getCallStack stk of 28 | _:locs -> unlines $ "Callstack:" : map format locs 29 | _ -> Prelude.error "showCallStack: empty call-stack" 30 | where 31 | format (fn, loc) = printf " %s, called at %s" fn (showSrcLoc loc) 32 | 33 | #else 34 | 35 | import GHC.Stack(HasCallStack, CallStack, prettyCallStack) 36 | import qualified GHC.Stack() 37 | import 
Data.Text(Text, unpack) 38 | import qualified Prelude 39 | import Prelude((.)) 40 | 41 | {-# DEPRECATED showCallStack "use GHC.Stack.prettyCallStack instead" #-} 42 | showCallStack :: CallStack -> Prelude.String 43 | showCallStack = prettyCallStack 44 | 45 | error :: HasCallStack => Text -> a 46 | error = Prelude.error . unpack 47 | 48 | undefined :: HasCallStack => a 49 | undefined = error "Prelude.undefined" 50 | 51 | #endif 52 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/ObservableStandard.hs: -------------------------------------------------------------------------------- 1 | 2 | module Spark.Core.Internal.ObservableStandard( 3 | asDouble) where 4 | 5 | import Spark.Core.Internal.ColumnStandard 6 | import Spark.Core.Internal.DatasetStructures 7 | import Spark.Core.Internal.FunctionsInternals 8 | import Spark.Core.Internal.TypesGenerics(SQLTypeable) 9 | 10 | {-| Casts a local data as a double. 11 | -} 12 | asDouble :: (Num a, SQLTypeable a) => LocalData a -> LocalData Double 13 | asDouble = projectColFunction asDoubleCol 14 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/Paths.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -fno-warn-orphans #-} 2 | {-# LANGUAGE TypeSynonymInstances #-} 3 | {-# LANGUAGE FlexibleInstances #-} 4 | {-# LANGUAGE MultiParamTypeClasses #-} 5 | {-# LANGUAGE ScopedTypeVariables #-} 6 | {-# LANGUAGE OverloadedStrings #-} 7 | {-# LANGUAGE FlexibleContexts #-} 8 | 9 | module Spark.Core.Internal.Paths( 10 | HasNodeName(..), 11 | PathEdge(..), 12 | computePaths, 13 | assignPaths', 14 | -- For testing: 15 | Scopes, 16 | ParentSplit(..), 17 | mergeScopes, 18 | gatherPaths, 19 | iGetScopes0, 20 | ) where 21 | 22 | import qualified Data.Map.Strict as M 23 | import qualified Data.Set as S 24 | import qualified Data.Vector as V 25 | import Data.List(sort) 26 | import Data.Maybe(fromMaybe, catMaybes) 27 | import Data.Foldable(foldr', foldl', toList) 28 | import Formatting 29 | 30 | import Spark.Core.Try 31 | import Spark.Core.Internal.DAGStructures 32 | import Spark.Core.Internal.ComputeDag 33 | import Spark.Core.StructuresInternal 34 | 35 | class HasNodeName v where 36 | -- Retrieves the name of the node 37 | getNodeName :: v -> NodeName 38 | -- Assigns a path to the node 39 | assignPath :: v -> NodePath -> v 40 | 41 | {-| The types of edges for the calculation of paths. 42 | - same level parent -> the node should have the same prefix as its parents 43 | - inner edge -> the parent defines the scope of this node 44 | -} 45 | data PathEdge = SameLevelEdge | InnerEdge deriving (Show, Eq) 46 | 47 | -- Assigns paths in a graph. 48 | -- 49 | computePaths :: (HasNodeName v) => 50 | ComputeDag v PathEdge -> Try (M.Map VertexId NodePath) 51 | computePaths cd = 52 | let nodecg = mapVertexData getNodeName cd 53 | in _computePaths nodecg 54 | 55 | assignPaths' :: (HasNodeName v) => 56 | M.Map VertexId NodePath -> ComputeDag v e -> ComputeDag v e 57 | assignPaths' m cd = 58 | let f vx = 59 | let old = NodePath . V.singleton $ getNodeName (vertexData vx) 60 | new = M.findWithDefault old (vertexId vx) m 61 | in assignPath (vertexData vx) new 62 | in mapVertices f cd 63 | 64 | -- The main function to perform the pass assignments. 65 | -- It starts from the graph of dependencies and from the local name info, 66 | -- and computes the complete paths (if possible), starting from the fringe. 
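-- From the outside, computePaths and assignPaths' are meant to be chained, as
-- done in Spark.Core.Internal.PathsUntyped. A sketch, assuming `cd` is a
-- ComputeDag over PathEdge edges whose vertices have a HasNodeName instance:
--
-- > do paths <- computePaths cd
-- >    return (assignPaths' paths cd)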
67 | _computePaths :: ComputeDag NodeName PathEdge -> Try (M.Map VertexId NodePath) 68 | _computePaths cg = 69 | let 70 | scopes = iGetScopes0 (toList . cdOutputs $ cg) (_splitParents' (cdEdges cg)) 71 | paths = gatherPaths scopes 72 | nodeNames = M.fromList [(vertexId vx, vertexData vx)| vx <- toList . cdVertices $ cg] 73 | lookup' nid = M.lookup nid nodeNames 74 | f :: VertexId -> [[VertexId]] -> Try NodePath 75 | f nid ls = case ls of 76 | [l] -> 77 | return . NodePath . V.fromList . catMaybes $ lookup' <$> (l ++ [nid]) 78 | x -> 79 | tryError $ sformat ("Node has too many paths: node="%shown%" discovered paths ="%shown) nid x 80 | nodePaths = M.traverseWithKey f paths 81 | in nodePaths 82 | 83 | -- (private) 84 | -- The top-level scope may not have an ID associated to it 85 | type Scopes = M.Map (Maybe VertexId) (S.Set VertexId) 86 | 87 | 88 | -- (internal) 89 | -- The separation of parents into logical and inner parents 90 | data ParentSplit a = ParentSplit { 91 | psLogical :: ![Vertex a], 92 | psInner :: ![Vertex a] 93 | } deriving (Show) 94 | 95 | _lookupOrEmpty :: Scopes -> Maybe VertexId -> [VertexId] 96 | _lookupOrEmpty scopes mnid = 97 | S.toList $ fromMaybe S.empty (M.lookup mnid scopes) 98 | 99 | mergeScopes :: Scopes -> Scopes -> Scopes 100 | mergeScopes = M.unionWith S.union 101 | 102 | _singleScope :: Maybe VertexId -> VertexId -> Scopes 103 | _singleScope mKey nid = M.singleton mKey (S.singleton nid) 104 | 105 | -- For each node, finds the one, or more than one if possible, path(s) 106 | -- from the root to the node (which is itself not included at the end) 107 | -- The gathering of paths may not be exaustive. 108 | gatherPaths :: Scopes -> M.Map VertexId [[VertexId]] 109 | gatherPaths scopes = M.map sort $ _gatherPaths0 scopes start where 110 | start = _lookupOrEmpty scopes Nothing 111 | 112 | _gatherPaths0 :: Scopes -> [VertexId] -> M.Map VertexId [[VertexId]] 113 | _gatherPaths0 _ [] = M.empty 114 | _gatherPaths0 scopes (nid : t) = 115 | let 116 | inner = _lookupOrEmpty scopes (Just nid) 117 | innerPaths = _gatherPaths0 scopes inner 118 | innerWithHead = M.map (\l -> (nid : ) <$> l) innerPaths 119 | thisPaths = M.singleton nid [[]] 120 | innerPaths2 = M.unionWith (++) innerWithHead thisPaths 121 | in M.unionWith (++) innerPaths2 (_gatherPaths0 scopes t) 122 | 123 | 124 | iGetScopes0 :: forall a. (Show a) => 125 | [Vertex a] -> 126 | (Vertex a -> ParentSplit a) -> 127 | Scopes 128 | iGetScopes0 [] _splitter = M.empty 129 | iGetScopes0 (h : t) splitter = 130 | let 131 | startScope = _singleScope Nothing (vertexId h) 132 | folder :: Scopes -> Vertex a -> Scopes 133 | folder current un = 134 | if M.member (Just (vertexId un)) current then 135 | current 136 | else 137 | let split = _getScopes' splitter Nothing S.empty un current 138 | in mergeScopes split current 139 | -- Important here to use a left folder, as we want to start with the head 140 | -- and move down the list. 141 | in foldl' folder startScope (h : t) 142 | 143 | _splitParents' :: AdjacencyMap v PathEdge -> Vertex v -> ParentSplit v 144 | _splitParents' m vx = 145 | let ves = V.toList $ M.findWithDefault V.empty (vertexId vx) m 146 | scope = [veEndVertex ve | ve <- ves, edgeData (veEdge ve) == SameLevelEdge] 147 | parents' = [veEndVertex ve | ve <- ves, edgeData (veEdge ve) == InnerEdge] 148 | in ParentSplit { psLogical = scope, psInner = parents' } 149 | 150 | 151 | -- TODO(kps) this recursive code is most probably going to explode for deep stacks 152 | _getScopes' :: forall a. 
(Show a) => 153 | (Vertex a -> ParentSplit a) -> -- The expansion of a node into logical and inner nodes 154 | Maybe VertexId -> -- the current parent (if any) 155 | S.Set VertexId -> -- the current boundary to respect 156 | Vertex a -> -- the current node to expand 157 | Scopes -> -- the scopes seen so far 158 | Scopes 159 | _getScopes' splitter mScopeId boundary un scopes = 160 | if S.member (vertexId un) boundary then 161 | scopes 162 | else 163 | let 164 | split = splitter un 165 | logParents = psLogical split 166 | innerParents = psInner split 167 | -- A fold on the parents 168 | parF :: Vertex a -> Scopes -> Scopes 169 | parF = 170 | -- Same boundary and parent, but update the scopes 171 | _getScopes' splitter mScopeId boundary 172 | scopesPar = foldr' parF scopes logParents 173 | -- Now work on the inner nodes: 174 | vid = vertexId un 175 | boundary' = S.fromList (vertexId <$> logParents) 176 | inF :: Vertex a -> Scopes -> Scopes 177 | inF = 178 | -- parent is current, boundary is current logical 179 | _getScopes' splitter (Just vid) boundary' 180 | scopesIn = foldr' inF scopesPar innerParents 181 | scopesFinal = scopesIn 182 | `mergeScopes` _singleScope mScopeId vid 183 | `mergeScopes` M.singleton (Just vid) S.empty 184 | in scopesFinal 185 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/PathsUntyped.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -fno-warn-orphans #-} 2 | {-# LANGUAGE MultiParamTypeClasses #-} 3 | {-# LANGUAGE TypeSynonymInstances #-} 4 | {-# LANGUAGE FlexibleInstances #-} 5 | {-# LANGUAGE FlexibleContexts #-} 6 | {-# LANGUAGE OverloadedStrings #-} 7 | 8 | module Spark.Core.Internal.PathsUntyped( 9 | assignPathsUntyped, 10 | tieNodes 11 | ) where 12 | 13 | import qualified Data.Vector as V 14 | import qualified Data.Map.Strict as M 15 | import Data.Maybe(fromMaybe) 16 | import Data.Foldable(toList) 17 | import Data.List(nub) 18 | import Control.Arrow((&&&)) 19 | import Formatting 20 | import Control.Monad.Identity 21 | 22 | import Spark.Core.Internal.DAGStructures 23 | import Spark.Core.Internal.DAGFunctions 24 | import Spark.Core.Internal.ComputeDag 25 | import Spark.Core.Internal.DatasetStructures 26 | import Spark.Core.Internal.DatasetFunctions 27 | import Spark.Core.Internal.Paths 28 | import Spark.Core.Internal.Utilities 29 | import Spark.Core.Try 30 | import Spark.Core.StructuresInternal(unNodeId) 31 | 32 | instance GraphVertexOperations UntypedNode where 33 | vertexToId = VertexId . unNodeId . nodeId 34 | expandVertexAsVertices n = 35 | nodeParents n 36 | ++ fromMaybe [] (nodeLogicalParents n) 37 | ++ nodeLogicalDependencies n 38 | 39 | instance GraphOperations UntypedNode NodeEdge where 40 | expandVertex n = 41 | -- The logical parents are more important than the parents 42 | let 43 | -- If the logical parents are not specified, the logical parents are the 44 | -- direct parents. 45 | scopeNodes = fromMaybe (nodeParents n) (nodeLogicalParents n) 46 | loParents = [(ScopeEdge, v) | v <- scopeNodes] 47 | -- The direct parents. They may overload with the scoping parents, but 48 | -- this will be checked during the name analysis. 
49 | parents' = (const (DataStructureEdge ParentEdge) &&& id) <$> nodeParents n 50 | loDeps = (const (DataStructureEdge LogicalEdge) &&& id) <$> nodeLogicalDependencies n 51 | in loParents ++ parents' ++ loDeps 52 | 53 | instance HasNodeName UntypedNode where 54 | getNodeName = nodeName 55 | assignPath n p = updateNode n $ \n' -> n' { _cnPath = p } 56 | 57 | 58 | -- Stitches the nodes together to make sure that the edges in the graph also 59 | -- correspond to the dependencies in the nodes themselves. 60 | -- This does not update the nodeIds 61 | -- This must happen before the pruning is performed, otherwise the node IDs will 62 | -- not match. 63 | tieNodes :: ComputeDag UntypedNode StructureEdge -> ComputeDag UntypedNode StructureEdge 64 | tieNodes cd = 65 | let g = computeGraphToGraph cd 66 | f :: UntypedNode -> [(UntypedNode, StructureEdge)] -> Identity UntypedNode 67 | f v l = 68 | let parents' = V.fromList [n | (n, e) <- l, e == ParentEdge] 69 | logDeps = V.fromList [n | (n, e) <- l, e == LogicalEdge] 70 | res = updateNode v $ \n -> n { 71 | _cnParents = parents', 72 | _cnLogicalDeps = logDeps, 73 | _cnLogicalParents = Nothing } 74 | in return res 75 | g2 = runIdentity $ graphMapVertices g f 76 | in graphToComputeGraph g2 77 | 78 | -- Assigs the paths, and drops the scoping edges. 79 | assignPathsUntyped :: (HasCallStack) => 80 | ComputeDag UntypedNode NodeEdge -> Try (ComputeDag UntypedNode StructureEdge) 81 | assignPathsUntyped cd = do 82 | let pathCGraph = _getPathCDag cd 83 | paths <- computePaths pathCGraph 84 | let g = computeGraphToGraph $ assignPaths' paths cd 85 | let f ScopeEdge = [] 86 | f (DataStructureEdge x) = [x] 87 | let g' = graphFlatMapEdges g f 88 | return $ graphToComputeGraph g' 89 | 90 | 91 | -- transforms node edges into path edges 92 | _cleanEdges :: (HasCallStack) => [VertexEdge NodeEdge v] -> [VertexEdge PathEdge v] 93 | _cleanEdges [] = [] 94 | _cleanEdges (h : t) = 95 | let vid = vertexId (veEndVertex h) 96 | others = [ve | ve <- t, (vertexId . veEndVertex $ ve) /= vid] 97 | sames = [ve | ve <- t, (vertexId . veEndVertex $ ve) == vid] 98 | rest = _cleanEdges others 99 | e = veEdge h 100 | -- If there multiple edges between nodes, they are dropped. 101 | -- This distinction is not required for names. 102 | eData = nub $ edgeData . veEdge <$> (h : sames) 103 | eData' = case eData of 104 | [DataStructureEdge ParentEdge] -> Just InnerEdge 105 | [DataStructureEdge ParentEdge, ScopeEdge] -> Just SameLevelEdge 106 | [ScopeEdge, DataStructureEdge ParentEdge] -> Just SameLevelEdge 107 | [ScopeEdge] -> Just SameLevelEdge 108 | [DataStructureEdge LogicalEdge] -> Nothing 109 | l -> failure (sformat ("Could not understand combination "%shown) l) 110 | res = case eData' of 111 | Just v -> (h { veEdge = e { edgeData = v } }) : rest 112 | Nothing -> rest 113 | in res 114 | 115 | 116 | _getPathCDag :: (HasCallStack) => ComputeDag v NodeEdge -> ComputeDag v PathEdge 117 | _getPathCDag cd = 118 | let adj' = M.map (V.fromList . _cleanEdges . toList) (cdEdges cd) 119 | in cd { cdEdges = adj' } 120 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/Pruning.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ScopedTypeVariables #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | 5 | {-| Methods to prune the computation graph. 
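Nodes whose value is already known to the server (according to the node cache
kept in the session) are cut from the graph and replaced by lightweight pointer
nodes, so that only the missing parts of a computation are resubmitted. A
minimal sketch, assuming a cache `c` and a compute graph `g` are in scope:

> pruned = pruneGraphDefault c g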
6 | -} 7 | module Spark.Core.Internal.Pruning( 8 | NodeCacheStatus(..), 9 | NodeCacheInfo(..), 10 | NodeCache, 11 | pruneGraph, 12 | pruneGraphDefault, 13 | emptyNodeCache 14 | ) where 15 | 16 | import Data.HashMap.Strict as HM 17 | 18 | import Spark.Core.StructuresInternal(NodeId, NodePath, ComputationID) 19 | import Spark.Core.Internal.DatasetStructures(UntypedNode, StructureEdge) 20 | import Spark.Core.Internal.DAGFunctions 21 | import Spark.Core.Internal.DAGStructures 22 | import Spark.Core.Internal.DatasetFunctions 23 | import Spark.Core.Internal.OpStructures 24 | 25 | 26 | {-| The status of a node being computed. 27 | 28 | On purpose, it does not store data. This is meant to be 29 | only the control plane of the compuations. 30 | -} 31 | data NodeCacheStatus = 32 | NodeCacheRunning 33 | | NodeCacheError 34 | | NodeCacheSuccess 35 | deriving (Eq, Show) 36 | 37 | {-| This structure describes the last time a node was observed by the 38 | controller, and the state it was in. 39 | 40 | This information is used to do smart computation pruning, by assuming 41 | that the observables are kept by the Spark processes. 42 | -} 43 | data NodeCacheInfo = NodeCacheInfo { 44 | nciStatus :: !NodeCacheStatus, 45 | nciComputation :: !ComputationID, 46 | nciPath :: !NodePath 47 | } deriving (Eq, Show) 48 | 49 | type NodeCache = HM.HashMap NodeId NodeCacheInfo 50 | 51 | emptyNodeCache :: NodeCache 52 | emptyNodeCache = HM.empty 53 | 54 | {-| It assumes a compute graph, NOT a dependency dag. 55 | -} 56 | pruneGraph :: (Show v) => 57 | -- The current cache 58 | NodeCache -> 59 | (v -> NodeId) -> 60 | -- A function to create a node replacement 61 | (v -> NodeCacheInfo -> v) -> 62 | -- The graph 63 | Graph v StructureEdge -> 64 | Graph v StructureEdge 65 | pruneGraph c getNodeId f g = 66 | -- Prune the node that we do not want 67 | let depGraph = reverseGraph g 68 | fop v = if HM.member (getNodeId v) c 69 | then CutChildren 70 | else Keep 71 | filtered = graphFilterVertices fop depGraph 72 | -- Bring back to normal flow. 73 | comFiltered = reverseGraph filtered 74 | -- Replace the nodes in the cache by place holders. 75 | -- This is done on the compute graph. 76 | repOp v = case HM.lookup (getNodeId v) c of 77 | Just nci -> f v nci 78 | Nothing -> v 79 | g' = graphMapVertices' repOp comFiltered 80 | in g' 81 | 82 | pruneGraphDefault :: 83 | NodeCache -> Graph UntypedNode StructureEdge -> Graph UntypedNode StructureEdge 84 | pruneGraphDefault c = pruneGraph c nodeId _createNodeCache 85 | 86 | _createNodeCache :: UntypedNode -> NodeCacheInfo -> UntypedNode 87 | _createNodeCache n nci = 88 | let name = "org.spark.PlaceholderCache" 89 | no = NodePointer (Pointer (nciComputation nci) (nciPath nci)) 90 | n2 = emptyNodeStandard (nodeLocality n) (nodeType n) name 91 | `updateNodeOp` no 92 | in n2 93 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/RowGenerics.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE PolyKinds #-} 2 | {-# LANGUAGE TypeOperators #-} 3 | {-# LANGUAGE DeriveAnyClass #-} 4 | {-# LANGUAGE FlexibleContexts #-} 5 | {-# LANGUAGE FlexibleInstances #-} 6 | {-# LANGUAGE DefaultSignatures #-} 7 | {-# LANGUAGE ScopedTypeVariables #-} 8 | {-# LANGUAGE BangPatterns #-} 9 | {-# LANGUAGE MultiParamTypeClasses #-} 10 | 11 | 12 | -- The generic implementation for the protocol that converts to 13 | -- and from SQL cells. 
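-- For example, a record type only needs a GHC.Generics instance for the
-- conversion to work (sketch; `MyPair` is a hypothetical user type):
--
-- > data MyPair = MyPair { mpKey :: Int, mpValue :: Text } deriving (Generic)
-- > instance ToSQL MyPair
-- >
-- > c :: Cell
-- > c = valueToCell (MyPair 1 (pack "a"))
-- > -- c is a RowArray containing an IntElement and a StringElement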
14 | -- Going through JSON is not recommended because of precision loss 15 | -- for the numbers, and other issues related to numbers. 16 | module Spark.Core.Internal.RowGenerics( 17 | ToSQL, 18 | valueToCell, 19 | ) where 20 | 21 | import GHC.Generics 22 | import qualified Data.Vector as V 23 | import Data.Text(pack, Text) 24 | 25 | import Spark.Core.Internal.RowStructures 26 | import Spark.Core.Internal.Utilities 27 | 28 | -- We need to differentiate between the list built for the 29 | -- constructor and an inner object. 30 | data CurrentBuffer = 31 | ConsData ![Cell] 32 | | BuiltCell !Cell deriving (Show) 33 | 34 | _cellOrError :: CurrentBuffer -> Cell 35 | _cellOrError (BuiltCell cell) = cell 36 | _cellOrError x = let msg = "Expected built cell, received " ++ show x in 37 | failure (pack msg) 38 | 39 | -- All the types that can be converted to a SQL value. 40 | class ToSQL a where 41 | _valueToCell :: a -> Cell 42 | 43 | default _valueToCell :: (Generic a, GToSQL (Rep a)) => a -> Cell 44 | _valueToCell !x = _g2cell (from x) 45 | 46 | valueToCell :: (ToSQL a) => a -> Cell 47 | valueToCell = _valueToCell 48 | 49 | -- class FromSQL a where 50 | -- _cellToValue :: Cell -> Try a 51 | 52 | instance ToSQL a => ToSQL (Maybe a) where 53 | _valueToCell (Just x) = _valueToCell x 54 | _valueToCell Nothing = Empty 55 | 56 | instance (ToSQL a, ToSQL b) => ToSQL (a, b) where 57 | _valueToCell (x, y) = RowArray (V.fromList [valueToCell x, valueToCell y]) 58 | 59 | instance ToSQL Int where 60 | _valueToCell = IntElement 61 | 62 | instance ToSQL Double where 63 | _valueToCell = DoubleElement 64 | 65 | instance ToSQL Text where 66 | _valueToCell = StringElement 67 | 68 | 69 | class GToSQL r where 70 | _g2buffer :: r a -> CurrentBuffer 71 | _g2cell :: r a -> Cell 72 | _g2cell = _cellOrError . 
_g2buffer 73 | 74 | instance GToSQL U1 where 75 | _g2buffer U1 = failure $ pack "GToSQL UI called" 76 | 77 | -- | Constants, additional parameters and recursion of kind * 78 | instance (GToSQL a, GToSQL b) => GToSQL (a :*: b) where 79 | _g2buffer (a :*: b) = case (_g2buffer a, _g2buffer b) of 80 | (ConsData l1, ConsData l2) -> ConsData (l1 ++ l2) 81 | (y1, y2) -> failure $ pack $ "GToSQL (a :*: b): Expected buffers, received " ++ show y1 ++ " and " ++ show y2 82 | 83 | instance (GToSQL a, GToSQL b) => GToSQL (a :+: b) where 84 | _g2buffer (L1 x) = _g2buffer x 85 | _g2buffer (R1 x) = let !y = _g2buffer x in y 86 | 87 | -- -- | Sums: encode choice between constructors 88 | -- instance (GToSQL a) => GToSQL (M1 i c a) where 89 | -- _g2cell !(M1 x) = let !y = _g2cell x in 90 | -- trace ("GToSQL M1: y = " ++ show y) y 91 | 92 | instance (GToSQL a) => GToSQL (M1 C c a) where 93 | _g2buffer (M1 x) = let !y = _g2buffer x in y 94 | 95 | instance (GToSQL a) => GToSQL (M1 S c a) where 96 | _g2buffer (M1 x) = let !y = ConsData [_g2cell x] in y 97 | 98 | instance (GToSQL a) => GToSQL (M1 D c a) where 99 | _g2buffer (M1 x) = 100 | case _g2buffer x of 101 | ConsData cs -> BuiltCell $ RowArray (V.fromList cs) 102 | BuiltCell cell -> BuiltCell cell 103 | 104 | -- | Products: encode multiple arguments to constructors 105 | instance (ToSQL a) => GToSQL (K1 i a) where 106 | _g2buffer (K1 x) = let !y = _valueToCell x in BuiltCell y 107 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/RowGenericsFrom.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE PolyKinds #-} 2 | {-# LANGUAGE TypeOperators #-} 3 | {-# LANGUAGE DeriveAnyClass #-} 4 | {-# LANGUAGE FlexibleContexts #-} 5 | {-# LANGUAGE FlexibleInstances #-} 6 | {-# LANGUAGE DefaultSignatures #-} 7 | {-# LANGUAGE ScopedTypeVariables #-} 8 | {-# LANGUAGE MultiParamTypeClasses #-} 9 | {-# LANGUAGE OverloadedStrings #-} 10 | 11 | 12 | -- The generic implementation for the protocol that converts to 13 | -- and from SQL cells. 14 | -- Going through JSON is not recommended because of precision loss 15 | -- for the numbers, and other issues related to numbers. 16 | module Spark.Core.Internal.RowGenericsFrom( 17 | FromSQL(_cellToValue), 18 | TryS, 19 | cellToValue, 20 | ) where 21 | 22 | import GHC.Generics 23 | import Data.Text(Text, pack) 24 | import Control.Applicative(liftA2) 25 | import Control.Monad.Except 26 | import Formatting 27 | import qualified Data.Vector as V 28 | 29 | import Spark.Core.Internal.RowStructures 30 | import Spark.Core.Internal.Utilities 31 | import Spark.Core.Internal.TypesStructuresRepr(DataTypeRepr, DataTypeElementRepr) 32 | 33 | -- Convert a cell to a value (if possible) 34 | cellToValue :: (FromSQL a) => Cell -> Either Text a 35 | cellToValue = _cellToValue 36 | 37 | type TryS = Either Text 38 | 39 | -- Because of the way the generic decoders work, 40 | -- an array of cell needs special treatment when it is 41 | -- decoded as the constructor of an object. Then it should 42 | -- be interpreted as a stateful tape, for which we read a 43 | -- few cells (number unknown) and return some value from the 44 | -- cells that have been consumed. 45 | data Decode2 = 46 | -- A tape with some potentially remaining cells 47 | D2Cons ![Cell] 48 | -- Just a normal cell. 49 | | D2Normal !Cell 50 | deriving (Eq, Show) 51 | 52 | 53 | -- All the types that can be converted to a SQL value. 
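-- For example, decoding a pair of integers from a row cell (sketch):
--
-- > decoded :: Either Text (Int, Int)
-- > decoded = cellToValue (RowArray (V.fromList [IntElement 1, IntElement 2]))
-- > -- decoded == Right (1, 2)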
54 | class FromSQL a where 55 | _cellToValue :: Cell -> TryS a 56 | 57 | default _cellToValue :: (Generic a, GFromSQL (Rep a)) => Cell -> TryS a 58 | _cellToValue cell = let 59 | x = undefined :: a 60 | x1r = _gFcell (from x) (D2Normal cell) :: InterResult (Decode2, Rep a a) 61 | x2r = snd <$> x1r 62 | x1t = to <$> x2r 63 | in _toTry x1t 64 | 65 | -- ******** Basic instance ******** 66 | 67 | instance FromSQL a => FromSQL (Maybe a) where 68 | _cellToValue Empty = pure Nothing 69 | _cellToValue x = pure <$> _cellToValue x 70 | 71 | instance FromSQL Int where 72 | _cellToValue (IntElement x) = pure x 73 | _cellToValue x = throwError $ sformat ("FromSQL: Decoding an int from "%shown) x 74 | 75 | instance FromSQL Double where 76 | _cellToValue (DoubleElement x) = pure x 77 | _cellToValue x = throwError $ sformat ("FromSQL: Decoding a double from "%shown) x 78 | 79 | instance FromSQL Text where 80 | _cellToValue (StringElement txt) = pure txt 81 | _cellToValue x = throwError $ sformat ("FromSQL: Decoding a unicode text from "%shown) x 82 | 83 | instance FromSQL Cell where 84 | _cellToValue = pure 85 | 86 | instance FromSQL Bool where 87 | _cellToValue (BoolElement b) = pure b 88 | _cellToValue x = throwError $ sformat ("FromSQL: Decoding a boolean from "%shown) x 89 | 90 | instance FromSQL DataTypeRepr 91 | instance FromSQL DataTypeElementRepr 92 | 93 | instance FromSQL a => FromSQL [a] where 94 | _cellToValue (RowArray xs) = 95 | sequence (_cellToValue <$> V.toList xs) 96 | _cellToValue x = throwError $ sformat ("FromSQL[]: Decoding array from "%shown) x 97 | 98 | instance (FromSQL a1, FromSQL a2) => FromSQL (a1, a2) where 99 | _cellToValue (RowArray xs) = case V.toList xs of 100 | [x1, x2] -> 101 | liftA2 (,) (_cellToValue x1) (_cellToValue x2) 102 | l -> throwError $ sformat ("FromSQL: Expected 2 elements but got "%sh) l 103 | _cellToValue x = throwError $ sformat ("FromSQL(,): Decoding array from "%shown) x 104 | 105 | -- ******* GENERIC ******** 106 | 107 | -- A final message at the bottom 108 | -- A path in the elements to get there 109 | data FailureInfo = FailureInfo !Text ![Text] deriving (Eq, Show) 110 | 111 | type InterResult a = Either FailureInfo a 112 | 113 | 114 | class GFromSQL r where 115 | -- An evidence about the type (in order to have info about the field names) 116 | -- The current stuff that has been decoded 117 | _gFcell :: r a -> Decode2 -> InterResult (Decode2, r a) 118 | 119 | _toTry :: InterResult a -> TryS a 120 | _toTry (Right x) = pure x 121 | _toTry (Left (FailureInfo msg p)) = Left $ show' (reverse p) <> " : " <> msg 122 | 123 | _fromTry :: TryS a -> InterResult a 124 | _fromTry (Right x) = Right x 125 | _fromTry (Left x) = Left $ FailureInfo x [] 126 | 127 | instance GFromSQL U1 where 128 | _gFcell x = failure $ pack $ "GFromSQL UI called" ++ show x 129 | 130 | instance (GFromSQL a, GFromSQL b) => GFromSQL (a :*: b) where 131 | -- Switching to tape-reading mode 132 | _gFcell ev (D2Normal (RowArray arr)) = _gFcell ev (D2Cons (V.toList arr)) 133 | -- Advancing into the reader 134 | _gFcell ev (D2Cons l) = do 135 | let (ev1 :*: ev2) = ev 136 | (d1, x1) <- _gFcell ev1 (D2Cons l) 137 | (d2, x2) <- _gFcell ev2 d1 138 | return (d2, x1 :*: x2) 139 | _gFcell _ x = failure $ pack ("GFromSQL (a :*: b) " ++ show x) 140 | 141 | 142 | instance (GFromSQL a, GFromSQL b) => GFromSQL (a :+: b) where 143 | _gFcell _ x = failure $ pack $ "GFromSQL (a :+: b)" ++ show x 144 | 145 | instance (GFromSQL a, Constructor c) => GFromSQL (M1 C c a) where 146 | _gFcell _ (D2Cons x) = failure 
$ pack ("GFromSQL (M1 C c a)" ++ " FAILED CONS: " ++ show x) 147 | _gFcell ev (D2Normal cell) = do 148 | let ev' = unM1 ev 149 | (d, x) <- _withHint (pack (conName ev)) $ _gFcell ev' (D2Normal cell) 150 | return (d, M1 x) 151 | 152 | instance (GFromSQL a, Selector c) => GFromSQL (M1 S c a) where 153 | _gFcell ev (D2Normal (RowArray arr)) = do 154 | let ev' = unM1 ev 155 | let l = V.toList arr 156 | (d, x) <- _withHint ("(1)" <> pack (selName ev)) $ _gFcell ev' (D2Cons l) 157 | return (d, M1 x) 158 | _gFcell ev d = do 159 | let ev' = unM1 ev 160 | (d', x) <- _withHint ("(2)" <> pack (selName ev)) $ _gFcell ev' d 161 | return (d', M1 x) 162 | 163 | instance (GFromSQL a, Datatype c) => GFromSQL (M1 D c a) where 164 | _gFcell ev (z @ (D2Normal (RowArray _))) = do 165 | let ev' = unM1 ev 166 | (d, x) <- _gFcell ev' z 167 | return (d, M1 x) 168 | _gFcell _ x = failure $ pack $ "FAIL GFromSQL (M1 D c a)" ++ show x 169 | 170 | -- | Products: encode multiple arguments to constructors 171 | instance (FromSQL a) => GFromSQL (K1 i a) where 172 | -- It is just a normal cell. 173 | -- Read one element and move on. 174 | _gFcell _ (D2Cons (cell : r)) = do 175 | x <- _fromTry $ _cellToValue cell 176 | return (D2Cons r, K1 x) 177 | -- Just reading a normal cell, return no tape. 178 | _gFcell _ (D2Normal cell) = do 179 | x <- _fromTry $ _cellToValue cell 180 | return (D2Cons [], K1 x) 181 | _gFcell _ x = failure $ pack ("GFromSQLK FAIL " ++ show x) 182 | 183 | _withHint :: Text -> InterResult a -> InterResult a 184 | _withHint extra (Left (FailureInfo msg l)) = Left (FailureInfo msg (extra : l)) 185 | _withHint _ (Right x) = Right x 186 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/RowStructures.hs: -------------------------------------------------------------------------------- 1 | module Spark.Core.Internal.RowStructures where 2 | 3 | import Data.Aeson 4 | import Data.Vector(Vector) 5 | import qualified Data.Text as T 6 | 7 | -- | The basic representation of one row of data. This is a standard type that comes out of the 8 | -- SQL engine in Spark. 9 | 10 | -- | An element in a Row object. 11 | -- All objects manipulated by the Spark framework are assumed to 12 | -- be convertible to cells. 13 | -- 14 | -- This is usually handled by generic transforms. 15 | data Cell = 16 | Empty -- To represent maybe 17 | | IntElement !Int 18 | | DoubleElement !Double 19 | | StringElement !T.Text 20 | | BoolElement !Bool 21 | | RowArray !(Vector Cell) deriving (Show, Eq) 22 | 23 | -- | A Row of data: the basic data structure to transport information 24 | -- TODO rename to rowCells 25 | data Row = Row { 26 | cells :: !(Vector Cell) 27 | } deriving (Show, Eq) 28 | 29 | 30 | -- AESON INSTANCES 31 | 32 | -- TODO(kps) add some workaround to account for the restriction of 33 | -- JSON types: 34 | -- int32 -> int32 35 | -- double -> double 36 | -- weird double -> string? 37 | -- long/bigint -> string? 
38 | 39 | -- | Cell 40 | instance ToJSON Cell where 41 | toJSON Empty = Null 42 | toJSON (DoubleElement d) = toJSON d 43 | toJSON (IntElement i) = toJSON i 44 | toJSON (BoolElement b) = toJSON b 45 | toJSON (StringElement s) = toJSON s 46 | toJSON (RowArray arr) = toJSON arr 47 | 48 | -- | Row 49 | instance ToJSON Row where 50 | toJSON (Row x) = toJSON x 51 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/RowUtils.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | module Spark.Core.Internal.RowUtils( 4 | jsonToCell, 5 | checkCell, 6 | rowArray 7 | ) where 8 | 9 | import Data.Aeson 10 | import Data.Text(Text) 11 | import Data.Maybe(catMaybes, listToMaybe) 12 | import Formatting 13 | import qualified Data.Vector as V 14 | import qualified Data.HashMap.Strict as HM 15 | import Data.Scientific(floatingOrInteger, toRealFloat) 16 | import Control.Monad.Except 17 | 18 | import Spark.Core.Internal.TypesStructures 19 | import Spark.Core.Internal.TypesFunctions 20 | import Spark.Core.Internal.RowStructures 21 | import Spark.Core.StructuresInternal(FieldName(..)) 22 | import Spark.Core.Internal.Utilities 23 | 24 | type TryCell = Either Text Cell 25 | 26 | -- | Decodes a JSON into a row. 27 | -- This operation requires a SQL type that describes 28 | -- the schema. 29 | jsonToCell :: DataType -> Value -> Either Text Cell 30 | jsonToCell dt v = withContext ("jsonToCell: dt="<>show' dt<>" v="<>show' v) $ 31 | _j2Cell v dt 32 | 33 | {-| Given a datatype, ensures that the cell has the corresponding type. 34 | -} 35 | checkCell :: DataType -> Cell -> Either Text Cell 36 | checkCell dt c = case _checkCell dt c of 37 | Nothing -> pure c 38 | Just txt -> throwError txt 39 | 40 | {-| Convenience constructor for an array of cells. 41 | -} 42 | rowArray :: [Cell] -> Cell 43 | rowArray = RowArray . 
V.fromList 44 | 45 | 46 | -- Returns an error message if something wrong is found 47 | _checkCell :: DataType -> Cell -> Maybe Text 48 | _checkCell dt c = case (dt, c) of 49 | (NullableType _, Empty) -> Nothing 50 | (StrictType _, Empty) -> 51 | pure $ sformat ("Expected a strict value of type "%sh%" but no value") dt 52 | (StrictType sdt, x) -> _checkCell' sdt x 53 | (NullableType sdt, x) -> _checkCell' sdt x 54 | 55 | -- Returns an error message if something wrong is found 56 | _checkCell' :: StrictDataType -> Cell -> Maybe Text 57 | _checkCell' sdt c = case (sdt, c) of 58 | (_, Empty) -> 59 | pure $ sformat ("Expected a strict value of type "%sh%" but no value") sdt 60 | (IntType, IntElement _) -> Nothing 61 | (StringType, StringElement _) -> Nothing 62 | (Struct (StructType fields), RowArray cells') -> 63 | if V.length fields == V.length cells' 64 | then 65 | let types = V.toList $ structFieldType <$> fields 66 | res = uncurry _checkCell <$> (types `zip` V.toList cells') 67 | in listToMaybe (catMaybes res) 68 | else 69 | pure $ sformat ("Struct "%sh%" has "%sh%" fields, asked to be matched with "%sh%" cells") sdt (V.length fields) (V.length cells') 70 | (ArrayType dt, RowArray cells') -> 71 | let res = uncurry _checkCell <$> (repeat dt `zip` V.toList cells') 72 | in listToMaybe (catMaybes res) 73 | (_, _) -> 74 | pure $ sformat ("Type "%sh%" is incompatible with cell content "%sh) sdt c 75 | 76 | 77 | _j2Cell :: Value -> DataType -> TryCell 78 | _j2Cell Null (StrictType t) = 79 | throwError $ sformat ("_j2Cell: Expected "%shown%", got null") t 80 | _j2Cell Null (NullableType _) = pure Empty 81 | _j2Cell x (StrictType t) = _j2CellS x t 82 | -- We do not express optional types at cell level. They have to be 83 | -- encoded in the data type. 84 | _j2Cell x (NullableType t) = _j2CellS x t 85 | --_j2Cell x t = throwError $ sformat ("_j2Cell: Could not match value "%shown%" with type "%shown) x t 86 | 87 | _j2CellS :: Value -> StrictDataType -> TryCell 88 | _j2CellS (String t) StringType = pure . StringElement $ t 89 | _j2CellS (Bool t) BoolType = pure . BoolElement $ t 90 | _j2CellS (Array v) (ArrayType t) = 91 | let trys = flip _j2Cell t <$> v in 92 | RowArray <$> sequence trys 93 | _j2CellS (Number s) IntType = case floatingOrInteger s :: Either Double Int of 94 | Left _ -> throwError $ sformat ("_j2CellS: Could not cast as int "%shown) s 95 | Right i -> pure (IntElement i) 96 | _j2CellS (Number s) DoubleType = pure . DoubleElement . toRealFloat $ s 97 | -- Normal representation as object. 98 | _j2CellS (Object o) (Struct struct) = 99 | let 100 | o2f :: StructField -> TryCell 101 | o2f field = 102 | let nullable = isNullable $ structFieldType field 103 | val = HM.lookup (unFieldName $ structFieldName field) o in 104 | case val of 105 | Nothing -> 106 | if nullable then 107 | pure Empty 108 | else throwError $ sformat ("_j2CellS: Could not find key "%shown%" in object "%shown) field o 109 | Just x -> _j2Cell x (structFieldType field) 110 | fields = o2f <$> structFields struct 111 | in RowArray <$> sequence fields 112 | -- Compact array-based representation. 
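-- For example (illustrative values), given the struct type {xx:int yy:string},
-- the object form {"xx": 1, "yy": "a"} handled above and the compact form
-- [1, "a"] handled below both decode to
--   RowArray [IntElement 1, StringElement "a"]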
113 | _j2CellS (Array v) (Struct (StructType fields)) = 114 | if V.length v == V.length fields 115 | then 116 | let dts = structFieldType <$> fields 117 | inner = uncurry _j2Cell <$> V.zip v dts 118 | in RowArray <$> sequence inner 119 | else throwError $ sformat ("_j2CellS: Compact object format a different number of fields '"%shown%"' compared "%shown) v fields 120 | _j2CellS x t = throwError $ sformat ("_j2CellS: Could not match value '"%shown%"' with type "%shown) x t 121 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/TypesGenerics.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE PolyKinds #-} 2 | {-# LANGUAGE TypeOperators #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | {-# LANGUAGE DeriveAnyClass #-} 5 | {-# LANGUAGE FlexibleContexts #-} 6 | {-# LANGUAGE FlexibleInstances #-} 7 | {-# LANGUAGE DefaultSignatures #-} 8 | {-# LANGUAGE ScopedTypeVariables #-} 9 | {-# LANGUAGE BangPatterns #-} 10 | {-# LANGUAGE MultiParamTypeClasses #-} 11 | 12 | module Spark.Core.Internal.TypesGenerics where 13 | 14 | import qualified Data.Vector as V 15 | import qualified Data.Text as T 16 | import GHC.Generics 17 | import Formatting 18 | 19 | import Spark.Core.Internal.TypesStructures 20 | import Spark.Core.Internal.TypesFunctions 21 | import Spark.Core.Internal.Utilities 22 | import Spark.Core.StructuresInternal(FieldName(..), unsafeFieldName) 23 | import Spark.Core.Internal.TypesStructuresRepr(DataTypeRepr, DataTypeElementRepr) 24 | 25 | -- The 3rd attempt to get generics conversions. 26 | 27 | -- Given a tag on a type, returns the equivalent SQL type. 28 | -- This is the type for a cell, not for a row. 29 | -- TODO(kps) more documentation 30 | buildType :: (HasCallStack, SQLTypeable a) => SQLType a 31 | buildType = _buildType 32 | 33 | 34 | -- The class of all the types for which the SQL type can be inferred 35 | -- from the Haskell type only. 36 | -- Two notable exceptions are Row and Cell, which are the dynamic types 37 | -- used by Spark. 38 | -- See also buildType on how to use it. 39 | class SQLTypeable a where 40 | _genericTypeFromValue :: (HasCallStack) => a -> GenericType 41 | default _genericTypeFromValue :: (HasCallStack, Generic a, GenSQLTypeable (Rep a)) => a -> GenericType 42 | _genericTypeFromValue x = genTypeFromProxy (from x) 43 | 44 | -- Generic SQLTypeable 45 | class GenSQLTypeable f where 46 | genTypeFromProxy :: (HasCallStack) => f a -> GenericType 47 | 48 | 49 | -- | The only function that should matter for users in this file. 50 | -- Given a type, returns the SQL representation of this type. 51 | _buildType :: forall a. 
(HasCallStack, SQLTypeable a) => SQLType a 52 | _buildType = 53 | let dt = _genericTypeFromValue (undefined :: a) 54 | in SQLType dt 55 | 56 | type GenericType = DataType 57 | 58 | instance SQLTypeable Int where 59 | _genericTypeFromValue _ = StrictType IntType 60 | 61 | instance SQLTypeable Double where 62 | _genericTypeFromValue _ = StrictType DoubleType 63 | 64 | instance SQLTypeable T.Text where 65 | _genericTypeFromValue _ = StrictType StringType 66 | 67 | instance SQLTypeable Bool where 68 | _genericTypeFromValue _ = StrictType BoolType 69 | 70 | instance SQLTypeable DataTypeRepr 71 | instance SQLTypeable DataTypeElementRepr 72 | 73 | instance SQLTypeable DataType where 74 | _genericTypeFromValue _ = _genericTypeFromValue (undefined :: DataTypeRepr) 75 | 76 | 77 | -- instance {-# INCOHERENT #-} SQLTypeable String where 78 | -- _genericTypeFromValue _ = StrictType StringType 79 | 80 | instance SQLTypeable a => SQLTypeable (Maybe a) where 81 | _genericTypeFromValue _ = let SQLType dt = buildType :: (SQLType a) in 82 | (NullableType . iInnerStrictType) dt 83 | 84 | instance {-# OVERLAPPABLE #-} SQLTypeable a => SQLTypeable [a] where 85 | _genericTypeFromValue _ = 86 | let SQLType dt = buildType :: (SQLType a) in 87 | (StrictType . ArrayType) dt 88 | 89 | 90 | instance forall a1 a2. ( 91 | SQLTypeable a2, 92 | SQLTypeable a1) => SQLTypeable (a1, a2) where 93 | _genericTypeFromValue _ = 94 | let 95 | SQLType t1 = buildType :: SQLType a1 96 | SQLType t2 = buildType :: SQLType a2 97 | in _buildTupleStruct [t1, t2] 98 | 99 | _buildTupleStruct :: [GenericType] -> GenericType 100 | _buildTupleStruct dts = 101 | let fnames = unsafeFieldName . T.pack. ("_" ++) . show <$> ([1..] :: [Int]) 102 | fs = uncurry StructField <$> zip fnames dts 103 | in StrictType . Struct . 
StructType $ V.fromList fs 104 | 105 | -- instance (SQLTypeable a, SQLTypeable b) => SQLTypeable (a,b) where 106 | -- _genericTypeFromValue _ = _genericTypeFromValue (undefined :: a) ++ _genericTypeFromValue (undefined :: b) 107 | 108 | instance (GenSQLTypeable f) => GenSQLTypeable (M1 D c f) where 109 | genTypeFromProxy m = genTypeFromProxy (unM1 m) 110 | 111 | instance (GenSQLTypeable f, Constructor c) => GenSQLTypeable (M1 C c f) where 112 | genTypeFromProxy m 113 | | conIsRecord m = 114 | let x = unM1 m 115 | dt = genTypeFromProxy x in 116 | dt 117 | | otherwise = 118 | -- It is assumed to be a newtype and we are going to unwrap it 119 | let !dt1 = genTypeFromProxy (unM1 m) 120 | in case iSingleField dt1 of 121 | Just dt -> dt 122 | Nothing -> 123 | failure $ sformat ("M1 C "%sh%" dt1="%sh) n dt1 124 | where n = conName m 125 | 126 | -- Selector Metadata 127 | instance (GenSQLTypeable f, Selector c) => GenSQLTypeable (M1 S c f) where 128 | genTypeFromProxy m = 129 | let st = genTypeFromProxy (unM1 m) 130 | n = selName m 131 | field = StructField { structFieldName = FieldName $ T.pack n, structFieldType = st } 132 | st2 = StructType (V.singleton field) in 133 | StrictType $ Struct st2 134 | 135 | instance (SQLTypeable a) => GenSQLTypeable (K1 R a) where 136 | genTypeFromProxy m = _genericTypeFromValue (unK1 m) 137 | 138 | -- Sum branch 139 | instance (GenSQLTypeable a, GenSQLTypeable b) => GenSQLTypeable (a :+: b) where 140 | genTypeFromProxy (L1 x) = genTypeFromProxy x 141 | genTypeFromProxy (R1 x) = genTypeFromProxy x 142 | 143 | -- Product branch 144 | instance (GenSQLTypeable a, GenSQLTypeable b) => GenSQLTypeable (a :*: b) where 145 | genTypeFromProxy z = 146 | -- Due to optimizations that I do not understand, the decomposition has to 147 | -- be done inside the function. 148 | -- Otherwise, the value (which is undefined) gets to be evaluated, and breaks 149 | -- the code. 150 | let (x1 :*: x2) = z 151 | y1 = genTypeFromProxy x1 152 | y2 = genTypeFromProxy x2 in case (y1, y2) of 153 | (StrictType (Struct s1), StrictType (Struct s2)) -> 154 | (StrictType . Struct) s where 155 | fs = structFields s1 V.++ structFields s2 156 | s = StructType fs 157 | _ -> failure $ sformat ("should not happen: left="%sh%" right="%sh) y1 y2 158 | 159 | -- Void branch 160 | instance GenSQLTypeable U1 where 161 | genTypeFromProxy _ = failure "U1" 162 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/TypesStructures.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | {-| The structures of data types in Karps. 5 | 6 | For a detailed description of the supported types, see 7 | http://spark.apache.org/docs/latest/sql-programming-guide.html#data-types 8 | 9 | At a high-level, Spark DataFrames and Datasets are equivalent to lists of 10 | objects whose type can be mapped to the same StructType: 11 | Dataset a ~ ArrayType StructType (...) 
12 | Columns of a dataset are equivalent to lists of object whose type can be 13 | mapped to the same DataType (either Strict or Nullable) 14 | Local data (or "blobs") are single elements whose type can be mapped to a 15 | DataType (either strict or nullable) 16 | -} 17 | module Spark.Core.Internal.TypesStructures where 18 | 19 | import Data.Aeson 20 | import Data.Vector(Vector) 21 | import Control.Monad(guard) 22 | import qualified Data.Vector as V 23 | import qualified Data.Aeson as A 24 | import qualified Data.Text as T 25 | import GHC.Generics(Generic) 26 | import Test.QuickCheck 27 | 28 | import Spark.Core.StructuresInternal(FieldName(..)) 29 | import Spark.Core.Internal.Utilities 30 | 31 | -- The core type algebra 32 | 33 | -- | The data types that are guaranteed to not be null: evaluating them will return a value. 34 | data StrictDataType = 35 | IntType 36 | | DoubleType 37 | | StringType 38 | | BoolType 39 | | Struct !StructType 40 | | ArrayType !DataType 41 | deriving (Eq) 42 | 43 | -- | All the data types supported by the Spark engine. 44 | -- The data types can either be nullable (they may contain null values) or strict (all the values are present). 45 | -- There are a couple of differences with the algebraic data types in Haskell: 46 | -- Maybe (Maybe a) ~ Maybe a which implies that arbitrary nesting of values will be flattened to a top-level Nullable 47 | -- Similarly, [[]] ~ [] 48 | data DataType = 49 | StrictType !StrictDataType 50 | | NullableType !StrictDataType deriving (Eq) 51 | 52 | -- | A field in a structure 53 | data StructField = StructField { 54 | structFieldName :: !FieldName, 55 | structFieldType :: !DataType 56 | } deriving (Eq) 57 | 58 | -- | The main structure of a dataframe or a dataset 59 | data StructType = StructType { 60 | structFields :: !(Vector StructField) 61 | } deriving (Eq) 62 | 63 | 64 | -- Convenience types 65 | 66 | -- | Represents the choice between a strict and a nullable field 67 | data Nullable = CanNull | NoNull deriving (Show, Eq) 68 | 69 | -- | Encodes the type of all the nullable data types 70 | data NullableDataType = NullableDataType !StrictDataType deriving (Eq) 71 | 72 | -- | A tagged datatype that encodes the sql types 73 | -- This is the main type information that should be used by users. 74 | data SQLType a = SQLType { 75 | -- | The underlying data type. 76 | unSQLType :: !DataType 77 | } deriving (Eq, Generic) 78 | 79 | 80 | instance Show DataType where 81 | show (StrictType x) = show x 82 | show (NullableType x) = show x ++ "?" 83 | 84 | instance Show StrictDataType where 85 | show StringType = "string" 86 | show DoubleType = "double" 87 | show IntType = "int" 88 | show BoolType = "bool" 89 | show (Struct struct) = show struct 90 | show (ArrayType at) = "[" ++ show at ++ "]" 91 | 92 | instance Show StructField where 93 | show field = (T.unpack . unFieldName . structFieldName) field ++ ":" ++ s where 94 | s = show $ structFieldType field 95 | 96 | instance Show StructType where 97 | show struct = "{" ++ unwords (map show (V.toList . structFields $ struct)) ++ "}" 98 | 99 | instance Show (SQLType a) where 100 | show (SQLType dt) = show dt 101 | 102 | 103 | -- QUICKCHECK INSTANCES 104 | 105 | 106 | instance Arbitrary StructField where 107 | arbitrary = do 108 | name <- elements ["_1", "a", "b", "abc"] 109 | dt <- arbitrary :: Gen DataType 110 | return $ StructField (FieldName $ T.pack name) dt 111 | 112 | instance Arbitrary StructType where 113 | arbitrary = do 114 | fields <- listOf arbitrary 115 | return . StructType . 
V.fromList $ fields 116 | 117 | instance Arbitrary StrictDataType where 118 | arbitrary = do 119 | idx <- elements [1,2] :: Gen Int 120 | return $ case idx of 121 | 1 -> StringType 122 | 2 -> IntType 123 | _ -> failure "Arbitrary StrictDataType" 124 | 125 | instance Arbitrary DataType where 126 | arbitrary = do 127 | x <- arbitrary 128 | u <- arbitrary 129 | return $ if x then 130 | StrictType u 131 | else 132 | NullableType u 133 | 134 | -- AESON INSTANCES 135 | 136 | -- This follows the same structure as the JSON generated by Spark. 137 | instance ToJSON StrictDataType where 138 | toJSON IntType = "integer" 139 | toJSON DoubleType = "double" 140 | toJSON StringType = "string" 141 | toJSON BoolType = "bool" 142 | toJSON (Struct struct) = toJSON struct 143 | toJSON (ArrayType (StrictType dt)) = 144 | object [ "type" .= A.String "array" 145 | , "elementType" .= toJSON dt 146 | , "containsNull" .= A.Bool False ] 147 | toJSON (ArrayType (NullableType dt)) = 148 | object [ "type" .= A.String "array" 149 | , "elementType" .= toJSON dt 150 | , "containsNull" .= A.Bool True ] 151 | 152 | instance ToJSON StructType where 153 | toJSON (StructType fields) = 154 | let 155 | fs = (snd . _fieldToJson) <$> V.toList fields 156 | in object [ "type" .= A.String "struct" 157 | , "fields" .= fs ] 158 | 159 | -- Spark drops the info at the highest level. 160 | instance ToJSON DataType where 161 | toJSON (StrictType dt) = object [ 162 | "nullable" .= A.Bool False, 163 | "dt" .= toJSON dt] 164 | toJSON (NullableType dt) = object [ 165 | "nullable" .= A.Bool True, 166 | "dt" .= toJSON dt] 167 | 168 | instance FromJSON DataType where 169 | parseJSON = withObject "DataType" $ \o -> do 170 | nullable <- o .: "nullable" 171 | dt <- o .: "dt" 172 | let c = if nullable then NullableType else StrictType 173 | return (c dt) 174 | 175 | instance FromJSON StructField where 176 | parseJSON = withObject "StructField" $ \o -> do 177 | n <- o .: "name" 178 | dt <- o .: "type" 179 | nullable <- o .: "nullable" 180 | let c = if nullable then NullableType else StrictType 181 | return $ StructField (FieldName n) (c dt) 182 | 183 | instance FromJSON StructType where 184 | parseJSON = withObject "StructType" $ \o -> do 185 | tp <- o .: "type" 186 | guard (tp == T.pack "struct") 187 | fs <- o .: "fields" 188 | return (StructType fs) 189 | 190 | instance FromJSON StrictDataType where 191 | parseJSON (A.String s) = case s of 192 | "integer" -> return IntType 193 | "double" -> return DoubleType 194 | "string" -> return StringType 195 | "bool" -> return BoolType 196 | -- TODO: figure out which one is correct 197 | "boolean" -> return BoolType 198 | _ -> fail ("StrictDataType: unknown type " ++ T.unpack s) 199 | parseJSON (Object o) = do 200 | tp <- o .: "type" 201 | case T.pack tp of 202 | "struct" -> Struct <$> parseJSON (Object o) 203 | "array" -> do 204 | dt <- o .: "elementType" 205 | containsNull <- o .: "containsNull" 206 | let c = if containsNull then NullableType else StrictType 207 | return $ ArrayType (c dt) 208 | s -> fail ("StrictDataType: unknown type " ++ T.unpack s) 209 | 210 | parseJSON x = fail ("StrictDataType: cannot parse " ++ show x) 211 | 212 | 213 | _fieldToJson :: StructField -> (T.Text, A.Value) 214 | _fieldToJson (StructField (FieldName n) (StrictType dt)) = 215 | (n, object [ "name" .= A.String n 216 | , "type" .= toJSON dt 217 | , "nullable" .= A.Bool False]) 218 | _fieldToJson (StructField (FieldName n) (NullableType dt)) = 219 | (n, object [ "name" .= A.String n 220 | , "type" .= toJSON dt 221 | , 
"nullable" .= A.Bool True]) 222 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/TypesStructuresRepr.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | 5 | module Spark.Core.Internal.TypesStructuresRepr( 6 | DataTypeElementRepr(..), 7 | DataTypeRepr(..) 8 | ) where 9 | 10 | import qualified Data.Text as T 11 | import GHC.Generics(Generic) 12 | 13 | -- The inner representation of a dataype as a Row object. 14 | -- This representation is meant to be internal. 15 | -- Because the Spark data types do not support recursive types (trees), 16 | -- This is a flattened representation of types. 17 | data DataTypeElementRepr = DataTypeElementRepr { 18 | fieldPath :: ![T.Text], 19 | isNullable :: !Bool, 20 | typeId :: !Int, 21 | fieldIndex :: !Int 22 | } deriving (Eq, Show, Generic) 23 | 24 | data DataTypeRepr = DataTypeRepr { 25 | rows :: [DataTypeElementRepr] 26 | } deriving (Eq, Show, Generic) 27 | -------------------------------------------------------------------------------- /src/Spark/Core/Internal/Utilities.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | {-# LANGUAGE BangPatterns #-} 4 | {-# LANGUAGE OverloadedStrings #-} 5 | 6 | 7 | {-| A collection of small utility functions. 8 | -} 9 | module Spark.Core.Internal.Utilities( 10 | LB.HasCallStack, 11 | UnknownType, 12 | pretty, 13 | myGroupBy, 14 | myGroupBy', 15 | missing, 16 | failure, 17 | failure', 18 | forceRight, 19 | show', 20 | encodeDeterministicPretty, 21 | withContext, 22 | strictList, 23 | traceHint, 24 | SF.sh, 25 | (<&>), 26 | (<>) 27 | ) where 28 | 29 | import Data.Aeson 30 | import Data.Aeson.Encode.Pretty 31 | import qualified Data.ByteString.Lazy.Char8 as Char8 32 | import qualified Data.ByteString.Lazy as LBS 33 | import qualified Data.Text as T 34 | import qualified Formatting.ShortFormatters as SF 35 | import Control.Arrow ((&&&)) 36 | import Data.List 37 | import Data.Function 38 | import Data.Text(Text) 39 | import Formatting 40 | import Debug.Trace(trace) 41 | import qualified Data.Map.Strict as M 42 | import Data.Monoid((<>)) 43 | 44 | import qualified Spark.Core.Internal.LocatedBase as LB 45 | 46 | (<&>) :: Functor f => f a -> (a -> b) -> f b 47 | (<&>) = flip fmap 48 | 49 | -- | A type that is is not known and that is not meant to be exposed to the 50 | -- user. 51 | data UnknownType 52 | 53 | {-| Pretty printing for Aeson values (and deterministic output) 54 | -} 55 | pretty :: Value -> Text 56 | pretty = T.pack . Char8.unpack . encodeDeterministicPretty 57 | 58 | -- | Produces a bytestring output of a JSON value that is deterministic 59 | -- and that is invariant to the insertion order of the keys. 60 | -- (i.e the keys are stored in alphabetic order) 61 | -- This is to ensure that all id computations are stable and reproducible 62 | -- on the server part. 63 | -- TODO(kps) use everywhere JSON is converted 64 | encodeDeterministicPretty :: Value -> LBS.ByteString 65 | encodeDeterministicPretty = 66 | encodePretty' (defConfig { confIndent = Spaces 0, confCompare = compare }) 67 | 68 | -- | group by 69 | -- TODO: have a non-empty list instead 70 | myGroupBy' :: (Ord b) => (a -> b) -> [a] -> [(b, [a])] 71 | myGroupBy' f = map (f . head &&& id) 72 | . groupBy ((==) `on` f) 73 | . 
sortBy (compare `on` f) 74 | 75 | -- | group by 76 | -- TODO: have a non-empty list instead 77 | myGroupBy :: (Ord a) => [(a, b)] -> M.Map a [b] 78 | myGroupBy l = let 79 | l2 = myGroupBy' fst l in 80 | M.map (snd <$>) $ M.fromList l2 81 | 82 | 83 | -- | Missing implementations in the code base. 84 | missing :: (LB.HasCallStack) => Text -> a 85 | missing msg = LB.error $ T.concat ["MISSING IMPLEMENTATION: ", msg] 86 | 87 | {-| The function that is used to trigger exception due to internal programming 88 | errors. 89 | 90 | Currently, all programming errors simply trigger an exception. All these 91 | impure functions are tagged with an implicit call stack argument. 92 | -} 93 | failure :: (LB.HasCallStack) => Text -> a 94 | failure msg = LB.error (T.concat ["FAILURE in Spark. Hint: ", msg]) 95 | 96 | failure' :: (LB.HasCallStack) => Format Text (a -> Text) -> a -> c 97 | failure' x = failure . sformat x 98 | 99 | 100 | {-| Given a DataFrame or a LocalFrame, attempts to get the value, 101 | or throws the error. 102 | 103 | This function is not total. 104 | -} 105 | forceRight :: (LB.HasCallStack, Show a) => Either a b -> b 106 | forceRight (Right b) = b 107 | forceRight (Left a) = LB.error $ 108 | sformat ("Failure from either, got instead a left: "%shown) a 109 | 110 | -- | Force the complete evaluation of a list to WNF. 111 | strictList :: (Show a) => [a] -> [a] 112 | strictList [] = [] 113 | strictList (h : t) = let !t' = strictList t in (h : t') 114 | 115 | -- | (internal) prints a hint with a value 116 | traceHint :: (Show a) => Text -> a -> a 117 | traceHint hint x = trace (T.unpack hint ++ show x) x 118 | 119 | -- | show with Text 120 | show' :: (Show a) => a -> Text 121 | show' x = T.pack (show x) 122 | 123 | withContext :: Text -> Either Text a -> Either Text a 124 | withContext _ (Right x) = Right x 125 | withContext msg (Left other) = Left (msg <> "\n>>" <> other) 126 | -------------------------------------------------------------------------------- /src/Spark/Core/Row.hs: -------------------------------------------------------------------------------- 1 | module Spark.Core.Row( 2 | module Spark.Core.Internal.RowStructures, 3 | ToSQL, 4 | FromSQL, 5 | valueToCell, 6 | cellToValue, 7 | jsonToCell, 8 | rowArray 9 | ) where 10 | 11 | import Spark.Core.Internal.RowStructures 12 | import Spark.Core.Internal.RowGenerics 13 | import Spark.Core.Internal.RowGenericsFrom 14 | import Spark.Core.Internal.RowUtils 15 | -------------------------------------------------------------------------------- /src/Spark/Core/StructuresInternal.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | 5 | -- Some basic structures about nodes in a graph, etc. 
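-- For instance (illustrative, using the definitions below): a node stored under
-- the path parent/child is represented as
--   NodePath (V.fromList [NodeName "parent", NodeName "child"])
-- and is rendered by catNodePath as "parent/child" and by prettyNodePath as
-- "/parent/child".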
6 | 7 | module Spark.Core.StructuresInternal( 8 | NodeName(..), 9 | NodePath(..), 10 | NodeId(..), 11 | FieldName(..), 12 | FieldPath(..), 13 | ComputationID(..), 14 | catNodePath, 15 | fieldName, 16 | unsafeFieldName, 17 | emptyFieldPath, 18 | nullFieldPath, 19 | headFieldPath, 20 | fieldPath, 21 | prettyNodePath, 22 | ) where 23 | 24 | import qualified Data.Text as T 25 | import Data.ByteString(ByteString) 26 | import GHC.Generics (Generic) 27 | import Data.Hashable(Hashable) 28 | import Data.List(intercalate) 29 | import qualified Data.Aeson as A 30 | import Data.String(IsString(..)) 31 | import Data.Vector(Vector) 32 | import qualified Data.Vector as V 33 | 34 | import Spark.Core.Internal.Utilities 35 | 36 | -- | The name of a node (without path information) 37 | newtype NodeName = NodeName { unNodeName :: T.Text } deriving (Eq, Ord) 38 | 39 | -- | The user-defined path of the node in the hierarchical representation of the graph. 40 | newtype NodePath = NodePath { unNodePath :: Vector NodeName } deriving (Eq, Ord) 41 | 42 | -- | The unique ID of a node. It is based on the parents of the node 43 | -- and all the relevant intrinsic values of the node. 44 | newtype NodeId = NodeId { unNodeId :: ByteString } deriving (Eq, Ord, Generic) 45 | 46 | -- | The name of a field in a sql structure 47 | -- This structure ensures that proper escaping happens if required. 48 | -- TODO: prevent the constructor from being used, it should be checked first. 49 | newtype FieldName = FieldName { unFieldName :: T.Text } deriving (Eq) 50 | 51 | -- | A path to a nested field an a sql structure. 52 | -- This structure ensures that proper escaping happens if required. 53 | newtype FieldPath = FieldPath { unFieldPath :: Vector FieldName } deriving (Eq) 54 | 55 | {-| A unique identifier for a computation (a batch of nodes sent for execution 56 | to Spark). 57 | -} 58 | data ComputationID = ComputationID { 59 | unComputationID :: !T.Text 60 | } deriving (Eq, Show, Generic) 61 | 62 | 63 | 64 | -- | A safe constructor for field names that fixes all the issues relevant to 65 | -- SQL escaping 66 | -- TODO: proper implementation 67 | fieldName :: T.Text -> Either String FieldName 68 | fieldName = Right . FieldName 69 | 70 | -- | Constructs the field name, but will fail if the content is not correct. 71 | unsafeFieldName :: (HasCallStack) => T.Text -> FieldName 72 | unsafeFieldName = forceRight . fieldName 73 | 74 | -- | A safe constructor for field names that fixes all the issues relevant to SQL escaping 75 | -- TODO: proper implementation 76 | fieldPath :: T.Text -> Either String FieldPath 77 | fieldPath x = Right . FieldPath . V.singleton $ FieldName x 78 | 79 | emptyFieldPath :: FieldPath 80 | emptyFieldPath = FieldPath V.empty 81 | 82 | nullFieldPath :: FieldPath -> Bool 83 | nullFieldPath = V.null . unFieldPath 84 | 85 | headFieldPath :: FieldPath -> Maybe FieldName 86 | headFieldPath (FieldPath v) | V.null v = Nothing 87 | headFieldPath (FieldPath v) = Just $ V.head v 88 | 89 | -- | The concatenated path. This is the inverse function of fieldPath. 90 | -- | TODO: this one should be hidden? 
91 | catNodePath :: NodePath -> T.Text 92 | catNodePath (NodePath np) = 93 | T.intercalate "/" (unNodeName <$> V.toList np) 94 | 95 | prettyNodePath :: NodePath -> T.Text 96 | -- Only a single slash, double slashes are reserved for the case 97 | -- of global paths (including session and computation) 98 | prettyNodePath np = "/" <> catNodePath np 99 | 100 | instance Show NodeId where 101 | show (NodeId bs) = let s = show bs in 102 | if length s > 9 then 103 | (drop 1 . take 6) s ++ ".." 104 | else 105 | s 106 | 107 | instance Show NodeName where 108 | show (NodeName nn) = T.unpack nn 109 | 110 | instance Show NodePath where 111 | show np = T.unpack $ T.concat ["NPath(", catNodePath np, ")" ] 112 | 113 | instance Show FieldPath where 114 | show (FieldPath l) = 115 | intercalate "." (show <$> V.toList l) 116 | 117 | instance Show FieldName where 118 | -- TODO(kps) escape the '.' characters in the field name 119 | show (FieldName fn) = T.unpack fn 120 | 121 | instance Hashable NodeId 122 | 123 | instance IsString FieldName where 124 | fromString = FieldName . T.pack 125 | 126 | instance A.ToJSON NodeName where 127 | toJSON = A.toJSON . unNodeName 128 | 129 | instance A.FromJSON NodeName where 130 | -- TODO: more parse checks 131 | parseJSON x = NodeName <$> A.parseJSON x 132 | 133 | instance A.ToJSON NodePath where 134 | toJSON = A.toJSON . unNodePath 135 | 136 | instance A.FromJSON NodePath where 137 | parseJSON x = NodePath <$> A.parseJSON x 138 | 139 | instance A.ToJSON FieldName where 140 | toJSON = A.toJSON . unFieldName 141 | 142 | instance A.ToJSON FieldPath where 143 | toJSON = A.toJSON . unFieldPath 144 | 145 | instance Ord FieldName where 146 | compare f1 f2 = compare (unFieldName f1) (unFieldName f2) 147 | 148 | instance A.ToJSON ComputationID where 149 | toJSON = A.toJSON . unComputationID 150 | -------------------------------------------------------------------------------- /src/Spark/Core/Try.hs: -------------------------------------------------------------------------------- 1 | 2 | {-| 3 | Useful classes and functions to deal with failures 4 | within the Karps framework. 5 | 6 | This is a developer API. Users should not have to invoke functions 7 | from this module. 8 | -} 9 | module Spark.Core.Try( 10 | NodeError(..), 11 | Try, 12 | tryError, 13 | tryEither 14 | ) where 15 | 16 | import qualified Data.Text as T 17 | import qualified Data.Vector as V 18 | 19 | import Spark.Core.StructuresInternal 20 | 21 | -- | An error associated to a particular node (an observable or a dataset). 22 | data NodeError = Error { 23 | ePath :: NodePath, 24 | eMessage :: T.Text 25 | } deriving (Eq, Show) 26 | 27 | -- | The common result of attempting to build something. 28 | type Try = Either NodeError 29 | 30 | 31 | -- TODO: rename to tryError 32 | _error :: T.Text -> Try a 33 | _error txt = Left Error { 34 | ePath = NodePath V.empty, 35 | eMessage = txt 36 | } 37 | 38 | -- | Returns an error object given a text clue. 39 | tryError :: T.Text -> Try a 40 | tryError = _error 41 | 42 | -- | Returns an error object given a string clue. 43 | --Remove this method 44 | --tryError' :: String -> Try a 45 | --tryError' = _error . T.pack 46 | 47 | -- | (internal) 48 | -- Given a potentially errored object, converts it to a Try. 
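-- For example (illustrative only):
--   tryEither (Right (3 :: Int)) == Right 3
--   tryEither (Left "boom")      == Left (Error { ePath = NodePath V.empty,
--                                                 eMessage = "boom" })
-- i.e. errors coming from a plain 'Either Text' get attached to the empty node
-- path.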
49 | tryEither :: Either T.Text a -> Try a 50 | tryEither (Left msg) = tryError msg 51 | tryEither (Right x) = Right x 52 | -------------------------------------------------------------------------------- /src/Spark/Core/Types.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE ScopedTypeVariables #-} 3 | 4 | module Spark.Core.Types( 5 | DataType, 6 | Nullable(..), 7 | TupleEquivalence(..), 8 | NameTuple(..), 9 | -- intType, 10 | -- canNull, 11 | -- noNull, 12 | -- stringType, 13 | -- arrayType', 14 | -- cellType, 15 | -- structField, 16 | -- structType, 17 | -- arrayType, 18 | SQLType, 19 | columnType, 20 | SQLTypeable, 21 | buildType, 22 | StructField, 23 | StructType, 24 | -- castType, 25 | catNodePath, 26 | unSQLType 27 | ) where 28 | 29 | import Spark.Core.Internal.TypesStructures 30 | import Spark.Core.Internal.TypesGenerics 31 | import Spark.Core.Internal.TypesFunctions 32 | import Spark.Core.StructuresInternal 33 | import Spark.Core.Internal.FunctionsInternals(TupleEquivalence(..), NameTuple(..)) 34 | 35 | -- | Description of types supported in DataSets 36 | -- Karps supports a restrictive subset of Algebraic Datatypes that is amenable to SQL 37 | -- transformations. This file contains the description of all the supported types, and some 38 | -- conversion tools. 39 | -------------------------------------------------------------------------------- /src/Spark/IO/Inputs.hs: -------------------------------------------------------------------------------- 1 | 2 | module Spark.IO.Inputs( 3 | SparkPath, 4 | JsonMode, 5 | DataSchema, 6 | JsonOptions, 7 | SourceDescription, 8 | json', 9 | json, 10 | jsonInfer 11 | ) where 12 | 13 | import Spark.IO.Internal.Json 14 | import Spark.IO.Internal.InputGeneric 15 | -------------------------------------------------------------------------------- /src/Spark/IO/Internal/InputGeneric.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ScopedTypeVariables #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module Spark.IO.Internal.InputGeneric( 5 | SparkPath(..), 6 | DataSchema(..), 7 | InputOptionValue(..), 8 | InputOptionKey(..), 9 | DataFormat(..), 10 | SourceDescription(..), 11 | generic', 12 | genericWithSchema', 13 | genericWithSchema 14 | ) where 15 | 16 | import Data.Text(Text) 17 | import Data.String(IsString(..)) 18 | import qualified Data.Map.Strict as M 19 | import qualified Data.Aeson as A 20 | import qualified Data.Text as T 21 | import Data.Aeson(toJSON, (.=)) 22 | -- import Debug.Trace 23 | 24 | import Spark.Core.Types 25 | import Spark.Core.Context 26 | import Spark.Core.Try 27 | import Spark.Core.Dataset 28 | 29 | import Spark.Core.Internal.Utilities(forceRight) 30 | import Spark.Core.Internal.DatasetFunctions(asDF, emptyDataset, emptyLocalData) 31 | import Spark.Core.Internal.TypesStructures(SQLType(..)) 32 | import Spark.Core.Internal.OpStructures 33 | 34 | {-| A path to some data that can be read by Spark. 35 | -} 36 | newtype SparkPath = SparkPath Text deriving (Show, Eq) 37 | 38 | {-| The schema policty with respect to a data source. It should either 39 | request Spark to infer the schema from the source, or it should try to 40 | match the source against a schema provided by the user. 41 | -} 42 | data DataSchema = InferSchema | UseSchema DataType deriving (Show, Eq) 43 | 44 | {-| The low-level option values accepted by the Spark reader API. 
45 | -} 46 | data InputOptionValue = 47 | InputIntOption Int 48 | | InputDoubleOption Double 49 | | InputStringOption Text 50 | | InputBooleanOption Bool 51 | deriving (Eq, Show) 52 | 53 | instance A.ToJSON InputOptionValue where 54 | toJSON (InputIntOption i) = toJSON i 55 | toJSON (InputDoubleOption d) = toJSON d 56 | toJSON (InputStringOption s) = toJSON s 57 | toJSON (InputBooleanOption b) = toJSON b 58 | 59 | newtype InputOptionKey = InputOptionKey { unInputOptionKey :: Text } deriving (Eq, Show, Ord) 60 | 61 | {-| The type of the source. 62 | 63 | This enumeration contains all the data formats that are natively supported by 64 | Spark, either for input or for output, and allows the users to express their 65 | own format if requested. 66 | -} 67 | data DataFormat = 68 | JsonFormat 69 | | TextFormat 70 | | CsvFormat 71 | | CustomSourceFormat !Text 72 | deriving (Eq, Show) 73 | -- data InputSource = JsonSource | TextSource | CsvSource | InputSource SparkPath 74 | 75 | {-| A description of a data source, following Spark's reader API version 2. 76 | 77 | Each source consists of an input source (json, xml, etc.), an optional schema 78 | for this source, and a number of options specific to this source. 79 | 80 | Since this description is rather low-level, a number of wrappers are provided 81 | for each of the most popular sources that are already built into Spark. 82 | -} 83 | data SourceDescription = SourceDescription { 84 | inputPath :: !SparkPath, 85 | inputSource :: !DataFormat, 86 | inputSchema :: !DataSchema, 87 | sdOptions :: !(M.Map InputOptionKey InputOptionValue), 88 | inputStamp :: !(Maybe DataInputStamp) 89 | } deriving (Eq, Show) 90 | 91 | instance IsString SparkPath where 92 | fromString = SparkPath . T.pack 93 | 94 | {-| Generates a dataframe from a source description. 95 | 96 | This may trigger some calculations on the Spark side if schema inference is 97 | required. 98 | -} 99 | generic' :: SourceDescription -> SparkState DataFrame 100 | generic' sd = do 101 | dtt <- _inferSchema sd 102 | return $ dtt >>= \dt -> genericWithSchema' dt sd 103 | 104 | {-| Generates a dataframe from a source description, and assumes a given schema. 105 | 106 | This schema overrides whatever may have been given in the source description. If 107 | the source description specified that the schema must be checked or inferred, 108 | this instruction is overridden. 109 | 110 | While this is convenient, it may lead to runtime errors that are hard to 111 | understand if the data does not follow the given schema. 112 | -} 113 | genericWithSchema' :: DataType -> SourceDescription -> DataFrame 114 | genericWithSchema' dt sd = asDF $ emptyDataset no (SQLType dt) where 115 | sd' = sd { inputSchema = UseSchema dt } 116 | so = StandardOperator { 117 | soName = "org.spark.GenericDatasource", 118 | soOutputType = dt, 119 | soExtra = A.toJSON sd' 120 | } 121 | no = NodeDistributedOp so 122 | 123 | {-| Generates a dataframe from a source description, and assumes a certain 124 | schema on the source. 125 | -} 126 | genericWithSchema :: forall a. (SQLTypeable a) => SourceDescription -> Dataset a 127 | genericWithSchema sd = 128 | let sqlt = buildType :: SQLType a 129 | dt = unSQLType sqlt in 130 | forceRight $ castType sqlt =<< genericWithSchema' dt sd 131 | 132 | -- Wraps the action of inferring the schema. 133 | -- This is not particularly efficient here: it does a first pass to get the 134 | -- schema, and then will do a second pass in order to read the data. 
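-- For reference, an illustrative source description that would trigger such an
-- inference pass (the name and values below are hypothetical, not part of the
-- original module):
_exampleJsonSource :: SourceDescription
_exampleJsonSource = SourceDescription {
    inputPath = SparkPath "/data/events.json",
    inputSource = JsonFormat,
    inputSchema = InferSchema,
    sdOptions = M.fromList [(InputOptionKey "mode", InputStringOption "PERMISSIVE")],
    inputStamp = Nothing
  }
-- Passing it to generic' above first runs the schema inference below and then
-- builds the dataframe with the inferred schema.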
135 | _inferSchema :: SourceDescription -> SparkState (Try DataType) 136 | _inferSchema = executeCommand1 . _inferSchemaCmd 137 | 138 | -- TODO: this is a monoidal operation, it could be turned into a universal 139 | -- aggregator. 140 | _inferSchemaCmd :: SourceDescription -> LocalData DataType 141 | _inferSchemaCmd sd = emptyLocalData no sqlt where 142 | sqlt = buildType :: SQLType DataType 143 | dt = unSQLType sqlt 144 | so = StandardOperator { 145 | soName = "org.spark.InferSchema", 146 | soOutputType = dt, 147 | soExtra = A.toJSON sd 148 | } 149 | no = NodeOpaqueAggregator so 150 | 151 | instance A.ToJSON SparkPath where 152 | toJSON (SparkPath p) = toJSON p 153 | 154 | instance A.ToJSON DataSchema where 155 | toJSON InferSchema = "infer_schema" 156 | toJSON (UseSchema dt) = toJSON dt 157 | 158 | instance A.ToJSON DataFormat where 159 | toJSON JsonFormat = "json" 160 | toJSON TextFormat = "text" 161 | toJSON CsvFormat = "csv" 162 | toJSON (CustomSourceFormat s) = toJSON s 163 | 164 | instance A.ToJSON SourceDescription where 165 | toJSON sd = A.object [ 166 | "inputPath" .= toJSON (inputPath sd), 167 | "inputSource" .= toJSON (inputSource sd), 168 | "inputSchema" .= toJSON (inputSchema sd), 169 | "inputStamp" .= A.Null, 170 | "options" .= A.object (f <$> M.toList (sdOptions sd)) 171 | ] where 172 | f (k, v) = unInputOptionKey k .= toJSON v 173 | -------------------------------------------------------------------------------- /src/Spark/IO/Internal/Json.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE ScopedTypeVariables #-} 3 | 4 | module Spark.IO.Internal.Json( 5 | JsonMode, 6 | JsonOptions(..), 7 | json', 8 | json, 9 | jsonInfer, 10 | jsonOpt', 11 | jsonOpt, 12 | defaultJsonOptions 13 | ) where 14 | 15 | import qualified Data.Map.Strict as M 16 | import Data.Text(pack) 17 | 18 | 19 | import Spark.Core.Types 20 | import Spark.Core.Dataset(DataFrame, Dataset, castType') 21 | import Spark.Core.Context 22 | import Spark.Core.Try 23 | 24 | import Spark.IO.Internal.InputGeneric 25 | 26 | {-| 27 | -} 28 | data JsonMode = Permissive | DropMalformed | FailFast 29 | 30 | {-| The options for the json input. 31 | -} 32 | data JsonOptions = JsonOptions { 33 | mode :: !JsonMode, 34 | jsonSchema :: !DataSchema 35 | } 36 | 37 | 38 | {-| Declares a source of data of the given data type. 39 | 40 | The source is not read at this point, it is just declared. It may be found to be 41 | invalid in subsequent computations. 42 | -} 43 | json' :: DataType -> String -> DataFrame 44 | json' dt p = genericWithSchema' dt (_jsonSourceDescription (SparkPath (pack p)) defaultJsonOptions) 45 | 46 | {-| Declares a source of data of the given data type. 47 | 48 | The source is not read at this point, it is just declared. 49 | -} 50 | json :: (SQLTypeable a) => String -> Dataset a 51 | json p = genericWithSchema (_jsonSourceDescription (SparkPath (pack p)) defaultJsonOptions) 52 | 53 | {-| Reads a source of data expected to be in the JSON format. 54 | 55 | The schema is not required and Spark will infer the schema of the source. 56 | However, all the data contained in the source may end up being read in the 57 | process. 58 | -} 59 | jsonInfer :: SparkPath -> SparkState DataFrame 60 | jsonInfer = jsonOpt' defaultJsonOptions 61 | 62 | {-| Reads a source of data expected to be in the JSON format. 63 | 64 | The schema is not required and Spark will infer the schema of the source. 
65 | However, all the data contained in the source may end up being read in the 66 | process. 67 | -} 68 | jsonOpt' :: JsonOptions -> SparkPath -> SparkState DataFrame 69 | jsonOpt' jo sp = generic' (_jsonSourceDescription sp jo) 70 | 71 | {-| Reads a source of data expected to be in the JSON format. 72 | 73 | The schema is not required and Spark will infer the schema of the source. 74 | However, all the data contained in the source may end up being read in the 75 | process. 76 | -} 77 | jsonOpt :: forall a. (SQLTypeable a) => JsonOptions -> SparkPath -> SparkState (Try (Dataset a)) 78 | jsonOpt jo sp = 79 | let sqlt = buildType :: SQLType a 80 | dt = unSQLType sqlt 81 | jo' = jo { jsonSchema = UseSchema dt } 82 | in castType' sqlt <$> jsonOpt' jo' sp 83 | 84 | defaultJsonOptions :: JsonOptions 85 | defaultJsonOptions = JsonOptions { 86 | -- Fail fast by default, to be conservative about errors, 87 | -- and respect the strictness arguments. 88 | mode = FailFast, 89 | jsonSchema = InferSchema 90 | } 91 | 92 | _jsonSourceDescription :: SparkPath -> JsonOptions -> SourceDescription 93 | _jsonSourceDescription sp jo = SourceDescription { 94 | inputSource = JsonFormat, 95 | inputPath = sp, 96 | inputSchema = jsonSchema jo, 97 | sdOptions = _jsonOptions jo, 98 | inputStamp = Nothing 99 | } 100 | 101 | _jsonOptions :: JsonOptions -> M.Map InputOptionKey InputOptionValue 102 | _jsonOptions jo = M.fromList [(InputOptionKey "mode", _mode (mode jo))] 103 | 104 | _mode :: JsonMode -> InputOptionValue 105 | _mode Permissive = InputStringOption "PERMISSIVE" 106 | _mode DropMalformed = InputStringOption "DROPMALFORMED" 107 | _mode FailFast = InputStringOption "FAILFAST" 108 | -------------------------------------------------------------------------------- /src/Spark/IO/Internal/OutputCommon.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ScopedTypeVariables #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | 5 | module Spark.IO.Internal.OutputCommon( 6 | SaveMode(..), 7 | OutputBucket, 8 | DynOutputBucket, 9 | OutputPartition, 10 | DynOutputPartition, 11 | SavingDescription(..), 12 | partition, 13 | partition', 14 | bucket, 15 | bucket', 16 | saveDefaults, 17 | saveCol 18 | ) where 19 | 20 | -- import Data.Text(Text) 21 | -- import qualified Data.Map.Strict as M 22 | -- import qualified Data.Aeson as A 23 | -- import Data.Aeson(toJSON, (.=)) 24 | 25 | -- import Spark.Core.Types 26 | -- import Spark.Core.Context 27 | import Spark.Core.Try 28 | import Spark.Core.Column 29 | -- import Spark.Core.ColumnFunctions 30 | -- import Spark.Core.Row 31 | import Spark.Core.Dataset 32 | 33 | import Spark.Core.Internal.ColumnStructures(UnknownReference, UntypedColumnData) 34 | import Spark.Core.Internal.ColumnFunctions(dropColReference) 35 | import Spark.Core.Internal.Utilities 36 | import Spark.IO.Internal.InputGeneric 37 | 38 | {-| The mode when saving the data. 39 | -} 40 | data SaveMode = 41 | Overwrite 42 | | Append 43 | | Ignore 44 | | ErrorIfExists deriving(Eq, Show) 45 | 46 | data OutputPartition ref = OutputPartition UntypedColumnData 47 | 48 | type DynOutputPartition = Try (OutputPartition UnknownReference) 49 | 50 | data OutputBucket ref = OutputBucket UntypedColumnData 51 | 52 | type DynOutputBucket = Try (OutputBucket UnknownReference) 53 | 54 | partition :: Column ref a -> OutputPartition ref 55 | partition = OutputPartition . dropColType . 
dropColReference 56 | 57 | partition' :: DynColumn -> DynOutputPartition 58 | partition' = fmap partition 59 | 60 | bucket :: Column ref a -> OutputBucket ref 61 | bucket = OutputBucket . dropColType . dropColReference 62 | 63 | bucket' :: DynColumn -> DynOutputBucket 64 | bucket' = fmap bucket 65 | 66 | 67 | data SavingDescription ref a = SavingDescription { 68 | partitions :: ![OutputPartition ref], 69 | buckets :: ![OutputBucket ref], 70 | savedCol :: !(Column ref a), 71 | saveFormat :: !DataFormat, 72 | savePath :: !SparkPath 73 | } 74 | 75 | saveDefaults :: SparkPath -> DataFormat -> Column ref a -> SavingDescription ref a 76 | saveDefaults sp f c = SavingDescription { 77 | partitions = [], 78 | buckets = [], 79 | savedCol = c, 80 | saveFormat = f, 81 | savePath = sp 82 | } 83 | 84 | {-| Inserts an action to store the given dataframe in the graph of computations. 85 | 86 | NOTE: Because of some limitations in Spark, all the columns used when forming 87 | the buckets and the parttions must be present inside the column being written. 88 | These columns will be appended to the column being written if they happen to be 89 | missing. The consequence is that more data may be written than expected. 90 | 91 | It returns true if the update was successful. The return type is subject to 92 | change. 93 | -} 94 | saveCol :: SavingDescription ref a -> LocalData Bool 95 | saveCol _ = missing "saveCol" 96 | 97 | -- test :: Int 98 | -- test = 99 | -- let c = undefined :: Column Int Int 100 | -- ld = saveCol (saveDefaults undefined JsonFormat c) { partitions = [partition c, partition c] } 101 | -- in 3 102 | -- 103 | -- repeatDS :: Column ref Int -> Column ref a -> Dataset a 104 | -- 105 | -- repeatFast :: Column ref Int -> Column ref a -> Dataset a 106 | -- 107 | -- repeatScatter :: Int -> Column ref Int -> Column ref a -> Dataset a 108 | -------------------------------------------------------------------------------- /src/Spark/Inputs/Inputs.hs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krapsh/kraps-haskell/c3a03afe9a4a5ddcedf994756d9bc5967818cec2/src/Spark/Inputs/Inputs.hs -------------------------------------------------------------------------------- /stack-ihaskell.yaml: -------------------------------------------------------------------------------- 1 | flags: {} 2 | extra-package-dbs: [] 3 | packages: 4 | - '.' 5 | #- ./ihaskell-display/ihaskell-blaze 6 | #- ./ihaskell-display/ihaskell-basic 7 | extra-deps: 8 | - generator-0.5.5 9 | - cryptohash-sha256-0.11.100.1 10 | - aeson-pretty-0.8.0 11 | - system-argv0-0.1.1 # Necessary for LTS 2.22 (GHC 7.8) 12 | - ihaskell-0.8.3.0 13 | - ipython-kernel-0.8.3.0 14 | resolver: nightly-2015-08-15 #lts-6.2 15 | -------------------------------------------------------------------------------- /stack.yaml: -------------------------------------------------------------------------------- 1 | flags: {} 2 | extra-package-dbs: [] 3 | packages: 4 | - '.' 
5 | extra-deps: 6 | - generator-0.5.5 7 | - cryptohash-sha256-0.11.100.1 8 | - aeson-pretty-0.8.2 9 | resolver: lts-7.13 10 | pvp-bounds: both 11 | build: 12 | library-profiling: true 13 | executable-profiling: true 14 | -------------------------------------------------------------------------------- /test-integration/Spark/Core/CachingSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module Spark.Core.CachingSpec where 5 | 6 | import Test.Hspec 7 | import qualified Data.Text 8 | 9 | import Spark.Core.Context 10 | import Spark.Core.Functions 11 | import Spark.Core.Column 12 | import Spark.Core.ColumnFunctions 13 | import Spark.Core.StructuresInternal(ComputationID(..)) 14 | 15 | 16 | -- Collecting a dataset made from a list should yield the same list (modulo 17 | -- some reordering) 18 | collectIdempotent :: [Int] -> IO () 19 | collectIdempotent l = do 20 | -- stats <- computationStatsDef (ComputationID "0") 21 | -- print "STATS" 22 | -- print (show stats) 23 | let ds = dataset l 24 | let ds' = autocache ds 25 | let c1 = asCol ds' 26 | let s1 = sumCol c1 27 | let s2 = count ds' 28 | let x = s1 + s2 29 | l2 <- exec1Def x 30 | l2 `shouldBe` (sum l + length l) 31 | 32 | run :: String -> IO () -> SpecWith (Arg (IO ())) 33 | run s f = it s $ do 34 | createSparkSessionDef $ defaultConf { confRequestedSessionName = Data.Text.pack s } 35 | f 36 | -- This is horribly not robust to any sort of failure, but it will do for now 37 | -- TODO(kps) make more robust 38 | closeSparkSessionDef 39 | return () 40 | 41 | spec :: Spec 42 | spec = do 43 | describe "Integration test - caching on ints" $ do 44 | run "cache_sum1" $ 45 | collectIdempotent ([1,2,3] :: [Int]) 46 | -------------------------------------------------------------------------------- /test-integration/Spark/Core/CollectSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module Spark.Core.CollectSpec where 5 | 6 | import Test.Hspec 7 | import qualified Data.Text 8 | import Data.List(sort) 9 | 10 | import Spark.Core.Context 11 | import Spark.Core.Types 12 | import Spark.Core.Row 13 | import Spark.Core.Functions 14 | import Spark.Core.Column 15 | import Spark.Core.IntegrationUtilities 16 | import Spark.Core.Internal.Utilities 17 | 18 | 19 | -- Collecting a dataset made from a list should yield the same list (modulo 20 | -- some reordering) 21 | -- TODO: replace the ordering by the canonical ordering over the data 22 | collectIdempotent :: (Ord a, Eq a, Show a, SQLTypeable a, ToSQL a, FromSQL a) => [a] -> IO () 23 | collectIdempotent l = do 24 | let ds = dataset l 25 | l2 <- exec1Def $ collect (asCol ds) 26 | l2 `shouldBe` sort l 27 | 28 | run :: String -> IO () -> SpecWith (Arg (IO ())) 29 | run s f = it s $ do 30 | createSparkSessionDef $ defaultConf { confRequestedSessionName = Data.Text.pack s } 31 | f 32 | -- This is horribly not robust to any sort of failure, but it will do for now 33 | -- TODO(kps) make more robust 34 | closeSparkSessionDef 35 | return () 36 | 37 | spec :: Spec 38 | spec = do 39 | describe "Integration test - collect on ints" $ do 40 | run "running_twice" $ do 41 | let ds = dataset [1::Int,2] 42 | let c = traceHint "c=" $ collect (asCol ds) 43 | l2 <- exec1Def $ c 44 | l2' <- exec1Def $ collect (asCol ds) 45 | l2 `shouldBe` l2' 46 | run "empty_ints1" $ 47 | 
collectIdempotent ([] :: [Int]) 48 | run "ints1" $ 49 | collectIdempotent ([4,5,1,2,3] :: [Int]) 50 | run "ints1_opt" $ 51 | collectIdempotent ([Just 1, Nothing] :: [Maybe Int]) 52 | run "nothing_ints_opt" $ 53 | collectIdempotent ([Nothing] :: [Maybe Int]) 54 | run "ints1_opt" $ 55 | collectIdempotent ([Just 1, Just 2] :: [Maybe Int]) 56 | run "empty_ints_opt" $ 57 | collectIdempotent ([] :: [Maybe Int]) 58 | describe "Integration test - collect on TestStruct5" $ do 59 | run "empty_TestStruct5" $ 60 | collectIdempotent ([] :: [TestStruct5]) 61 | run "single_TestStruct5" $ 62 | collectIdempotent ([TestStruct5 1 2] :: [TestStruct5]) 63 | -------------------------------------------------------------------------------- /test-integration/Spark/Core/ColumnSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module Spark.Core.ColumnSpec where 5 | 6 | import Test.Hspec 7 | import Data.List.NonEmpty(NonEmpty( (:|) )) 8 | 9 | import Spark.Core.Context 10 | import Spark.Core.Dataset 11 | import Spark.Core.Column 12 | import Spark.Core.Row 13 | import Spark.Core.Functions 14 | import Spark.Core.ColumnFunctions 15 | import Spark.Core.SimpleAddSpec(run) 16 | import Spark.Core.Internal.LocalDataFunctions(iPackTupleObs) 17 | import Spark.Core.Internal.DatasetFunctions(untypedLocalData) 18 | 19 | myScaler :: Column ref Double -> Column ref Double 20 | myScaler col = 21 | let cnt = asDouble (countCol col) 22 | m = sumCol col / cnt 23 | centered = col .- m 24 | stdDev = sumCol (centered * centered) / cnt 25 | in centered ./ stdDev 26 | 27 | 28 | spec :: Spec 29 | spec = do 30 | describe "local data operations" $ do 31 | run "broadcastPair_struct" $ do 32 | let ds = dataset [1] :: Dataset Int 33 | let cnt = countCol (asCol ds) 34 | let c = collect (asCol ds .+ cnt) 35 | res <- exec1Def c 36 | res `shouldBe` [2] 37 | run "LocalPack (doubles)" $ do 38 | let x = untypedLocalData (1 :: LocalData Double) 39 | let x2 = iPackTupleObs (x :| [x]) 40 | res <- exec1Def x2 41 | res `shouldBe` rowArray [DoubleElement 1, DoubleElement 1] 42 | run "LocalPack" $ do 43 | let x = untypedLocalData (1 :: LocalData Int) 44 | let x2 = iPackTupleObs (x :| [x]) 45 | res <- exec1Def x2 46 | res `shouldBe` rowArray [IntElement 1, IntElement 1] 47 | run "BroadcastPair" $ do 48 | let x = 1 :: LocalData Int 49 | let ds = dataset [2, 3] :: Dataset Int 50 | let ds2 = broadcastPair ds x 51 | res <- exec1Def (collect (asCol ds2)) 52 | res `shouldBe` [(2, 1), (3, 1)] 53 | -- TODO: this combines a lot of elements together. 
54 | describe "columns - integration" $ do 55 | run "mean" $ do 56 | let ds = dataset [-1, 1] :: Dataset Double 57 | let c = myScaler (asCol ds) 58 | res <- exec1Def (collect c) 59 | res `shouldBe` [-1, 1] 60 | -------------------------------------------------------------------------------- /test-integration/Spark/Core/GroupsSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module Spark.Core.GroupsSpec where 5 | 6 | import Test.Hspec 7 | import Data.Text(Text) 8 | 9 | import Spark.Core.Context 10 | import Spark.Core.Functions 11 | import Spark.Core.ColumnFunctions 12 | import Spark.Core.Column 13 | import Spark.Core.IntegrationUtilities 14 | import Spark.Core.CollectSpec(run) 15 | import Spark.Core.Internal.Groups 16 | 17 | sumGroup :: [MyPair] -> [(Text, Int)] -> IO () 18 | sumGroup l lexp = do 19 | let ds = dataset l 20 | let keys = ds // myKey' 21 | let values = ds // myVal' 22 | let g = groupByKey keys values 23 | let ds2 = g `aggKey` sumCol 24 | l2 <- exec1Def $ collect (asCol ds2) 25 | l2 `shouldBe` lexp 26 | 27 | spec :: Spec 28 | spec = do 29 | describe "Integration test - groups on (text, int)" $ do 30 | run "empty" $ 31 | sumGroup [] [] 32 | run "one" $ 33 | sumGroup [MyPair "x" 1] [("x", 1)] 34 | run "two" $ 35 | sumGroup [MyPair "x" 1, MyPair "x" 2, MyPair "y" 1] [("x", 3), ("y", 1)] 36 | -------------------------------------------------------------------------------- /test-integration/Spark/Core/IntegrationUtilities.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | {-# LANGUAGE DeriveGeneric #-} 4 | {-# LANGUAGE GeneralizedNewtypeDeriving #-} 5 | 6 | module Spark.Core.IntegrationUtilities where 7 | 8 | import GHC.Generics (Generic) 9 | import Data.Text(Text) 10 | import Data.Aeson(ToJSON) 11 | 12 | import Spark.Core.Context 13 | import Spark.Core.Types 14 | import Spark.Core.Row 15 | import Spark.Core.Column 16 | 17 | data TestStruct1 = TestStruct1 { 18 | ts1f1 :: Int, 19 | ts1f2 :: Maybe Int } deriving (Show, Eq, Generic) 20 | instance ToSQL TestStruct1 21 | instance FromSQL TestStruct1 22 | instance SQLTypeable TestStruct1 23 | 24 | data TestStruct2 = TestStruct2 { 25 | ts2f1 :: [Int] 26 | } deriving (Show, Generic) 27 | instance SQLTypeable TestStruct2 28 | 29 | data TestStruct3 = TestStruct3 { 30 | ts3f1 :: Int 31 | } deriving (Show, Eq, Generic) 32 | instance ToSQL TestStruct3 33 | instance SQLTypeable TestStruct3 34 | 35 | data TestStruct4 = TestStruct4 { 36 | ts4f1 :: TestStruct3 37 | } deriving (Show, Eq, Generic) 38 | 39 | data TestStruct5 = TestStruct5 { 40 | ts5f1 :: Int, 41 | ts5f2 :: Int 42 | } deriving (Show, Eq, Generic, Ord) 43 | -- instance ToJSON TestStruct5 44 | instance SQLTypeable TestStruct5 45 | instance FromSQL TestStruct5 46 | instance ToSQL TestStruct5 47 | 48 | data TestStruct6 = TestStruct6 { 49 | ts6f1 :: Int, 50 | ts6f2 :: Int, 51 | ts6f3 :: TestStruct3 52 | } deriving (Show, Eq, Generic) 53 | 54 | data TestStruct7 = TestStruct7 { 55 | ts7f1 :: Text 56 | } deriving (Show, Eq, Generic) 57 | instance ToSQL TestStruct7 58 | instance SQLTypeable TestStruct7 59 | instance ToJSON TestStruct7 60 | 61 | newtype TestT1 = TestT1 { 62 | unTestT1 :: Int 63 | } deriving (Eq, Show, Generic, Num) 64 | 65 | data MyPair = MyPair { 66 | myKey :: Text, 67 | myVal :: Int } deriving (Generic, Show) 68 | 69 | myKey' :: 
StaticColProjection MyPair Text 70 | myKey' = unsafeStaticProjection buildType "myKey" 71 | myVal' :: StaticColProjection MyPair Int 72 | myVal' = unsafeStaticProjection buildType "myVal" 73 | instance SQLTypeable MyPair 74 | instance ToSQL MyPair 75 | -------------------------------------------------------------------------------- /test-integration/Spark/Core/JoinsSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module Spark.Core.JoinsSpec where 5 | 6 | import Test.Hspec 7 | 8 | import Spark.Core.Context 9 | import Spark.Core.Dataset 10 | import Spark.Core.Column 11 | import Spark.Core.ColumnFunctions 12 | import Spark.Core.Row 13 | import Spark.Core.Functions 14 | import Spark.Core.SimpleAddSpec(run) 15 | 16 | spec :: Spec 17 | spec = do 18 | describe "Path test" $ do 19 | run "test_path1" $ do 20 | let ds1 = dataset [1] :: Dataset Int 21 | let x1 = sumCol (asCol ds1) @@ "x1" 22 | let x2 = ((x1 + 1) @@ "x2") `logicalParents` [untyped ds1] 23 | res <- exec1Def x2 24 | res `shouldBe` 2 25 | -- describe "Join test - join on ints" $ do 26 | -- run "empty_ints1" $ do 27 | -- let ds1 = dataset [(1,2)] :: Dataset (Int, Int) 28 | -- let ds2 = dataset [(1,3)] :: Dataset (Int, Int) 29 | -- let df1 = asDF ds1 30 | -- let df2 = asDF ds2 31 | -- let df = joinInner' (df1/-"_1") (df1/-"_2") (df2/-"_1") (df2/-"_2" @@ "_3") 32 | -- res <- exec1Def' (collect' (asCol' df)) 33 | -- res `shouldBe` rowArray [rowArray [IntElement 1, IntElement 2, IntElement 3]] 34 | -------------------------------------------------------------------------------- /test-integration/Spark/Core/PruningSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module Spark.Core.PruningSpec where 5 | 6 | import Test.Hspec 7 | import qualified Data.Text as T 8 | import Data.List(sort) 9 | 10 | import Spark.Core.Context 11 | import Spark.Core.Types 12 | import Spark.Core.Row 13 | import Spark.Core.Functions 14 | import Spark.Core.Column 15 | import Spark.Core.IntegrationUtilities 16 | import Spark.Core.CollectSpec(run) 17 | 18 | run2 :: T.Text -> IO () -> SpecWith (Arg (IO ())) 19 | run2 s f = it (T.unpack s) $ do 20 | createSparkSessionDef $ defaultConf { 21 | confRequestedSessionName = s, 22 | confUseNodePrunning = True } 23 | f 24 | -- This is horribly not robust to any sort of failure, but it will do for now 25 | -- TODO(kps) make more robust 26 | closeSparkSessionDef 27 | return () 28 | 29 | 30 | spec :: Spec 31 | spec = do 32 | describe "Integration test - pruning" $ do 33 | run2 "running_twice" $ do 34 | let ds = dataset [1::Int,2] 35 | l2 <- exec1Def $ collect (asCol ds) 36 | l2' <- exec1Def $ collect (asCol ds) 37 | l2 `shouldBe` l2' 38 | -------------------------------------------------------------------------------- /test-integration/Spark/Core/SimpleAddSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module Spark.Core.SimpleAddSpec where 5 | 6 | import Test.Hspec 7 | import qualified Data.Text 8 | 9 | import Spark.Core.Context 10 | import Spark.Core.Types 11 | import Spark.Core.Row 12 | import Spark.Core.Functions 13 | 14 | 15 | smallSum :: (Eq a, Show a, SQLTypeable a, ToSQL a, FromSQL a, Num a) => a -> a -> IO () 16 | smallSum x y = do 17 | let 
x' = constant x 18 | let y' = constant y 19 | let z1' = x' + y' 20 | z1 <- exec1Def z1' 21 | z1 `shouldBe` (x + y) 22 | let z2' = y' + x' 23 | z2 <- exec1Def z2' 24 | z2 `shouldBe` (x + y) 25 | 26 | negation :: (Eq a, Show a, SQLTypeable a, ToSQL a, FromSQL a, Num a) => a -> a -> IO () 27 | negation x y = do 28 | let x' = constant x 29 | let y' = constant y 30 | let z1' = x' - y' 31 | z1 <- exec1Def z1' 32 | z1 `shouldBe` (x - y) 33 | let z2' = y' - x' 34 | z2 <- exec1Def z2' 35 | z2 `shouldBe` (y - x) 36 | 37 | checkNegate :: (Eq a, Show a, SQLTypeable a, ToSQL a, FromSQL a, Num a) => a -> IO () 38 | checkNegate x = do 39 | let x' = constant x 40 | let z1' = negate x' 41 | z1 <- exec1Def z1' 42 | z1 `shouldBe` negate x 43 | 44 | run :: String -> IO () -> SpecWith (Arg (IO ())) 45 | run s f = it s $ do 46 | createSparkSessionDef $ defaultConf { 47 | confRequestedSessionName = Data.Text.pack s, 48 | confPollingIntervalMillis = 100, 49 | confUseNodePrunning = False } -- Disabling caching for now, it causes issues. 50 | f 51 | -- This is horribly not robust to any sort of failure, but it will do for now 52 | -- TODO(kps) make more robust 53 | closeSparkSessionDef 54 | return () 55 | 56 | spec :: Spec 57 | spec = do 58 | describe "Integration test - sum on ints" $ do 59 | run "empty_ints1" $ 60 | smallSum (1 :: Int) (2 :: Int) 61 | run "zero_ints1" $ 62 | smallSum (0 :: Int) (2 :: Int) 63 | run "negation_ints1" $ 64 | negation (1 :: Int) (2 :: Int) 65 | run "negate_ints1" $ 66 | checkNegate (1 :: Int) 67 | -------------------------------------------------------------------------------- /test-integration/Spark/Core/Spec.hs: -------------------------------------------------------------------------------- 1 | -- Not working??? 2 | {-# OPTIONS_GHC -F -pgmF hspec-discover #-} 3 | -------------------------------------------------------------------------------- /test-integration/Spark/IO/JsonSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module Spark.IO.JsonSpec where 5 | 6 | import Test.Hspec 7 | import Data.Aeson(encode) 8 | import qualified Data.ByteString.Lazy 9 | -- import System.IO 10 | 11 | import Spark.Core.Context 12 | import Spark.Core.Types 13 | import Spark.Core.Row 14 | import Spark.Core.Functions 15 | import Spark.Core.Column 16 | import Spark.IO.Inputs 17 | import Spark.Core.IntegrationUtilities 18 | import Spark.Core.SimpleAddSpec(run) 19 | 20 | spec :: Spec 21 | spec = do 22 | describe "Read a json file" $ do 23 | run "simple read" $ do 24 | let xs = [TestStruct7 "x"] 25 | let js = encode xs 26 | _ <- Data.ByteString.Lazy.writeFile "/tmp/x.json" js 27 | let dt = unSQLType (buildType :: SQLType TestStruct7) 28 | let df = json' dt "/tmp/x.json" 29 | let c = collect' (asCol' df) 30 | c1 <- exec1Def' c 31 | c1 `shouldBe` rowArray [rowArray [StringElement "x"]] 32 | c2 <- exec1Def' c 33 | c2 `shouldBe` rowArray [rowArray [StringElement "x"]] 34 | run "simple inference" $ do 35 | let xs = [TestStruct7 "x"] 36 | let js = encode xs 37 | _ <- Data.ByteString.Lazy.writeFile "/tmp/x.json" js 38 | df <- execStateDef $ jsonInfer "/tmp/x.json" 39 | let c = collect' (asCol' df) 40 | c1 <- exec1Def' c 41 | c1 `shouldBe` rowArray [rowArray [StringElement "x"]] 42 | -------------------------------------------------------------------------------- /test-integration/Spark/IO/StampSpec.hs: -------------------------------------------------------------------------------- 1 | 
{- 2 | let s = "s" 3 | 4 | createSparkSessionDef $ defaultConf { confRequestedSessionName = Data.Text.pack s } 5 | 6 | execStateDef (checkDataStamps [HdfsPath (Data.Text.pack "/tmp/")]) 7 | 8 | -} 9 | 10 | module Spark.IO.StampSpec where 11 | 12 | import Test.Hspec 13 | 14 | -- import Spark.Core.Context 15 | -- import Spark.Core.Types 16 | -- import Spark.Core.Row 17 | -- import Spark.Core.Functions 18 | -- import Spark.Core.Column 19 | -- import Spark.IO.Inputs 20 | -- import Spark.Core.IntegrationUtilities 21 | import Spark.Core.SimpleAddSpec(run) 22 | 23 | spec :: Spec 24 | spec = do 25 | describe "Read a json file" $ do 26 | run "simple read" $ do 27 | let x = 1 :: Int 28 | x `shouldBe` x 29 | -------------------------------------------------------------------------------- /test-integration/Spec.hs: -------------------------------------------------------------------------------- 1 | -- Not working??? 2 | {-# OPTIONS_GHC -F -pgmF hspec-discover #-} 3 | -------------------------------------------------------------------------------- /test/Spark/Core/ColumnSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | module Spark.Core.ColumnSpec where 4 | 5 | import Test.Hspec 6 | 7 | import Spark.Core.Column 8 | import Spark.Core.Dataset 9 | import Spark.Core.Functions 10 | import Spark.Core.Types 11 | import Spark.Core.ColumnFunctions 12 | import Spark.Core.Internal.Utilities 13 | 14 | data Z 15 | data Y 16 | 17 | myScaler :: Column ref Double -> Column ref Double 18 | myScaler col = 19 | let cnt = asDouble (countCol col) 20 | m = sumCol col / cnt 21 | centered = col .- m 22 | stdDev = sumCol (centered * centered) / cnt 23 | in centered ./ stdDev 24 | 25 | 26 | spec :: Spec 27 | spec = do 28 | describe "ColumnSpec: ensure rules compile correctly" $ do 29 | let ds = dataset [(1,2)] :: Dataset (Int, Int) 30 | let c1 = ds // _1 31 | let c2 = ds // _2 32 | let c1' = untypedCol c1 33 | let c2' = untypedCol c2 34 | let i1 = 3 :: Int 35 | let o1 = constant 4 :: LocalData Int 36 | let o2 = 5 :: LocalData Int 37 | let o1' = asLocalObservable o1 38 | let o2' = asLocalObservable o2 39 | it "+ should not blow up" $ do 40 | let z1 = c1 + c2 41 | let z2 = c1' + c2' 42 | let z3 = c1 + 1 43 | let z4 = 1 + c1 44 | 'a' `shouldBe` 'a' 45 | it ".+ should not blow up with colums" $ do 46 | let z1 = c1 .+ c2 47 | let z2 = c1' .+ c2' 48 | let z3 = c1 .+ c2' 49 | let z4 = c1' .+ c2 50 | let z5 = c1 .+ o1 51 | let z6 = c1 .+ o1' 52 | 'a' `shouldBe` 'a' 53 | it "simple aggregations" $ do 54 | let c3 = c1 + (c2 .+ sumCol c2) 55 | let ds2 = pack1 c3 56 | nodeType ds2 `shouldBe` (buildType :: SQLType Int) 57 | it "mean" $ do 58 | let ds' = dataset [1, 2] :: Dataset Double 59 | let c = asCol ds' 60 | let cnt = asDouble (countCol c) 61 | let m = traceHint "m=" $ sumCol c / cnt 62 | let centered = c .- m 63 | let stdDev = sumCol (centered * centered) / cnt 64 | let scaled = traceHint "scaled=" $ centered ./ stdDev 65 | let ds2 = pack1 scaled 66 | nodeType ds2 `shouldBe` (buildType :: SQLType Double) 67 | -------------------------------------------------------------------------------- /test/Spark/Core/ContextSpec.hs: -------------------------------------------------------------------------------- 1 | 2 | 3 | module Spark.Core.ContextSpec where 4 | 5 | import Test.Hspec 6 | 7 | import Spark.Core.Functions 8 | 9 | spec :: Spec 10 | spec = do 11 | describe "Basic routines to get something out" $ do 12 | it "should print a node" $ do 13 | let x = 
dataset ([1 ,2, 3, 4]::[Int]) 14 | x `shouldBe` x 15 | -- b = nodeToBundle (untyped x) in 16 | -- trace (pretty b) $ 17 | -- 1 `shouldBe` 1 18 | -------------------------------------------------------------------------------- /test/Spark/Core/DatasetSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | module Spark.Core.DatasetSpec where 4 | 5 | import qualified Data.Text as T 6 | import Test.Hspec 7 | import qualified Data.Vector as V 8 | 9 | import Spark.Core.Dataset 10 | import Spark.Core.Functions 11 | import Spark.Core.Column 12 | import Spark.Core.StructuresInternal 13 | import Spark.Core.Internal.ContextInternal 14 | import Spark.Core.Internal.Utilities 15 | import Spark.Core.Internal.DAGStructures 16 | import Spark.Core.Internal.ComputeDag 17 | 18 | nName :: String -> NodeName 19 | nName = NodeName . T.pack 20 | 21 | spec :: Spec 22 | spec = do 23 | describe "create a dataframe" $ do 24 | it "should not explode" $ 25 | let x = dataset ([1 ,2, 3, 4]::[Int]) in 26 | nodeName x `shouldBe` nName "distributedliteral_c87697" 27 | 28 | it "renaming should work" $ 29 | let x = dataset ([1 ,2, 3]::[Int]) @@ "ds1" in 30 | nodeName x `shouldBe` nName "ds1" 31 | 32 | describe "check localset" $ do 33 | it "should not explode" $ 34 | let x = dataset ([1 ,2, 3]::[Int]) in 35 | nodeName x `shouldBe` nName "distributedliteral_1ba31e" 36 | 37 | it "renaming should work" $ 38 | let x = dataset ([1 ,2, 3]::[Int]) @@ "ds1" in 39 | nodeName x `shouldBe` nName "ds1" 40 | 41 | describe "column syntax" $ 42 | it "should not explode" $ do 43 | let ds = dataset ([1 ,2, 3]::[Int]) 44 | let c1 = ds/-"c1" 45 | c1 `shouldBe` c1 46 | 47 | describe "Logical dependencies" $ do 48 | it "should work" $ do 49 | let ds = dataset ([1 ,2, 3, 4]::[Int]) 50 | let ds1 = dataset ([1]::[Int]) `depends` [untyped ds] 51 | let g = traceHint (T.pack "g=") $ computeGraphToGraph $ forceRight $ buildComputationGraph ds1 52 | V.length (gVertices g) `shouldBe` 2 53 | 54 | 55 | -- describe "simple test" $ do 56 | -- it "the type should match" $ do 57 | -- let 58 | -- n1 = constant "xxx" 59 | -- n = NodeType $ T.pack "org.spark.Constant" in 60 | -- (nodeOp n1) `shouldBe` n 61 | 62 | -- it "no name" $ do 63 | -- let n1 = constant "xxx" 64 | -- t = NodeName $ T.pack "org.spark.Constant" in 65 | -- (nodeName n1) `shouldBe` t 66 | 67 | -- it "some name" $ do 68 | -- let n1 = constant "xxx" @@ "name" 69 | -- t = NodeName $ T.pack "name" in 70 | -- (nodeName n1) `shouldBe` t 71 | -------------------------------------------------------------------------------- /test/Spark/Core/Internal/CachingSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module Spark.Core.Internal.CachingSpec where 5 | 6 | import Test.Hspec 7 | 8 | import qualified Data.ByteString.Char8 as C8 9 | import Data.Either(isLeft, isRight) 10 | import Control.Arrow((&&&)) 11 | import Data.Text(Text) 12 | import Data.Foldable(toList) 13 | import Formatting 14 | 15 | import Spark.Core.Try 16 | import Spark.Core.Functions 17 | import Spark.Core.Column 18 | import Spark.Core.ColumnFunctions 19 | import Spark.Core.Internal.Caching 20 | -- Required for instance resolution 21 | import Spark.Core.StructuresInternal() 22 | import Spark.Core.Internal.Client(LocalSessionId(..)) 23 | import Spark.Core.Internal.DAGStructures 24 | import Spark.Core.Internal.DAGFunctions 25 | import 
Spark.Core.Internal.DatasetStructures 26 | import Spark.Core.Internal.Utilities 27 | import Spark.Core.Internal.ContextStructures 28 | import Spark.Core.Internal.ContextInternal 29 | import Spark.Core.Internal.Pruning(emptyNodeCache) 30 | 31 | data TestType = AutocacheNode | CacheNode | UncacheNode | Dataset | Row deriving (Eq, Show) 32 | 33 | data TestNode = TestNode { tnId :: VertexId, tnType :: TestType, tnParents :: [(StructureEdge, TestNode)] } deriving (Eq) 34 | 35 | instance Show TestNode where 36 | show v = "TestNode(" ++ (C8.unpack . unVertexId. tnId $ v) ++ ")" 37 | 38 | nid :: String -> VertexId 39 | nid = VertexId . C8.pack 40 | 41 | node :: String -> TestType -> [TestNode] -> TestNode 42 | node s tp p = TestNode (VertexId (C8.pack s)) tp ((const ParentEdge &&& id) <$> p) 43 | 44 | instance GraphVertexOperations TestNode where 45 | vertexToId = tnId 46 | expandVertexAsVertices x = snd <$> tnParents x 47 | 48 | instance GraphOperations TestNode StructureEdge where 49 | expandVertex tn = tnParents tn 50 | 51 | acGen :: AutocacheGen TestNode 52 | acGen = 53 | let deriveUncache' (Vertex (VertexId x) (TestNode _ AutocacheNode _)) = 54 | let vid' = VertexId $ C8.pack . (++ "_uncache") . C8.unpack $ x 55 | in Vertex vid' (TestNode vid' UncacheNode []) 56 | deriveUncache' x = error (show x) 57 | deriveIdentity' (Vertex (VertexId x) (TestNode _ r _)) = 58 | let vid' = VertexId $ C8.pack . (++ "_identity") . C8.unpack $ x 59 | in Vertex vid' (TestNode vid' r []) 60 | in AutocacheGen { 61 | deriveUncache = deriveUncache', 62 | deriveIdentity = deriveIdentity' 63 | } 64 | 65 | expandFun :: TestNode -> CacheTry NodeCachingType 66 | expandFun n = case (tnType n, tnParents n) of 67 | (AutocacheNode, [_]) -> pure $ AutocacheOp (tnId n) 68 | (AutocacheNode, x) -> Left $ sformat ("Node: "%shown%": expected one parent for autocaching, got "%shown) n x 69 | (CacheNode, [_]) -> pure $ CacheOp (tnId n) 70 | (CacheNode, x) -> Left $ sformat ("Node: "%shown%": expected one parent for caching, got "%shown) n x 71 | (UncacheNode, [(ParentEdge, x)]) -> pure $ UncacheOp (tnId n) (tnId x) 72 | (UncacheNode, x) -> Left $ sformat ("Node: "%shown%": Expected one parent for uncaching, got "%shown) n x 73 | (Dataset, _) -> Right Through 74 | (Row, _) -> Right Stop 75 | 76 | errors :: TestNode -> CacheTry [CachingFailure] 77 | errors tn = do 78 | g <- buildGraph tn :: Either Text (Graph TestNode StructureEdge) 79 | checkCaching (graphMapEdges g (const ParentEdge)) expandFun 80 | 81 | errors' :: TestNode -> CacheTry (Graph TestNode StructureEdge) 82 | errors' tn = do 83 | g <- buildGraph tn :: Either Text (Graph TestNode StructureEdge) 84 | fillAutoCache expandFun acGen g 85 | 86 | intErrors :: LocalData a -> Try ComputeGraph 87 | intErrors ld = 88 | let cg = buildComputationGraph ld 89 | in performGraphTransforms emptySession =<< cg 90 | 91 | emptySession :: SparkSession 92 | emptySession = SparkSession c (LocalSessionId "id") 3 emptyNodeCache 93 | where c = SparkSessionConf "end_point" (negate 1) 10 "session_name" True 94 | 95 | spec :: Spec 96 | spec = do 97 | describe "Caching operations" $ do 98 | it "missing parent node" $ do 99 | let n1 = node "1" CacheNode [] 100 | errors n1 `shouldSatisfy` isLeft 101 | it "caching: parent node" $ do 102 | let n0 = node "0" Dataset [] 103 | let n1 = node "1" CacheNode [n0] 104 | errors n1 `shouldBe` Right [] 105 | it "uncaching: missing parent node" $ do 106 | let n1 = node "1" UncacheNode [] 107 | errors n1 `shouldSatisfy` isLeft 108 | it "uncaching: parent node" $ do 
109 | let n0 = node "0" Dataset [] 110 | let n1 = node "1" CacheNode [n0] 111 | let n2 = node "2" CacheNode [n1] 112 | errors n2 `shouldBe` Right [] 113 | it "too many nodes for uncaching" $ do 114 | let n0 = node "0" Dataset [] 115 | let n1 = node "1" CacheNode [n0] 116 | let n2 = node "2" UncacheNode [n1, n2] 117 | errors n2 `shouldSatisfy` isLeft 118 | it "access after uncaching" $ do 119 | let n0 = node "0" Dataset [] 120 | let n1 = node "1" CacheNode [n0] 121 | let n2 = node "2" UncacheNode [n1] 122 | let n3 = node "3" Dataset [n1, n2] 123 | errors n3 `shouldBe` Right [CachingFailure (nid "1") (nid "2") (nid "3")] 124 | it "ambiguous access after uncaching" $ do 125 | let n0 = node "0" Dataset [] 126 | let n1 = node "1" CacheNode [n0] 127 | let n2 = node "2" UncacheNode [n1] 128 | let n3 = node "3" Dataset [n1] 129 | let n4 = node "4" Dataset [n3, n2] 130 | errors n4 `shouldBe` Right [CachingFailure (nid "1") (nid "2") (nid "3") 131 | ,CachingFailure (nid "1") (nid "2") (nid "4")] 132 | describe "Autocaching operations" $ do 133 | it "missing parent node" $ do 134 | let n1 = node "1" AutocacheNode [] 135 | let g = traceHint "g=" (errors' n1) 136 | g `shouldSatisfy` isLeft 137 | it "auto-uncaching with no child should not create uncaching" $ do 138 | let n0 = node "0" Dataset [] 139 | let n1 = node "1" AutocacheNode [n0] 140 | let g = traceHint "g=" (errors' n1) 141 | g `shouldSatisfy` isRight 142 | ((length . toList . gVertices) <$> g) `shouldBe` Right 2 143 | it "access after uncaching" $ do 144 | let n0 = node "0" Dataset [] 145 | let n1 = node "1" AutocacheNode [n0] 146 | let n2 = node "2" Row [n1] 147 | let g = traceHint "g=" (errors' n2) 148 | g `shouldSatisfy` isRight 149 | ((length . toList . gVertices) <$> g) `shouldBe` Right 5 150 | it "access after and scoping" $ do 151 | let n0 = node "0" Dataset [] 152 | let n1 = node "1" AutocacheNode [n0] 153 | let n2a = node "2a" Row [n1] 154 | let n2b = node "2b" Row [n1] 155 | let n3 = node "3" Row [n2a, n2b] 156 | let g = traceHint "g=" (errors' n3) 157 | g `shouldSatisfy` isRight 158 | ((length . toList . gVertices) <$> g) `shouldBe` Right 8 159 | describe "Autocaching integration tests" $ do 160 | it "test 1" $ do 161 | let l = [1,2,3] :: [Int] 162 | let ds = dataset l 163 | let ds' = autocache ds 164 | let c1 = asCol ds' 165 | let s1 = sumCol c1 166 | let s2 = count ds' 167 | let x = s1 + s2 168 | let g = traceHint "g=" (intErrors x) 169 | g `shouldSatisfy` isRight 170 | -------------------------------------------------------------------------------- /test/Spark/Core/Internal/DAGFunctionsSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveAnyClass #-} 2 | {-# LANGUAGE ScopedTypeVariables #-} 3 | {-# LANGUAGE MultiParamTypeClasses #-} 4 | 5 | -- :script test/Spark/Core/Internal/PathsSpec.hs 6 | module Spark.Core.Internal.DAGFunctionsSpec where 7 | 8 | import Test.Hspec 9 | import qualified Data.Map.Strict as M 10 | import qualified Data.Vector as V 11 | import qualified Data.ByteString.Char8 as C8 12 | import Control.Arrow((&&&)) 13 | import Data.Foldable(toList) 14 | 15 | import Spark.Core.Internal.DAGStructures 16 | import Spark.Core.Internal.DAGFunctions 17 | import Spark.Core.Internal.Utilities 18 | 19 | data MyV = MyV { 20 | mvId :: VertexId, 21 | mvParents :: [MyV] 22 | } deriving (Eq) 23 | 24 | id2Str :: VertexId -> String 25 | id2Str = C8.unpack . unVertexId 26 | 27 | instance Show MyV where 28 | show v = "MyV(" ++ (id2Str . 
mvId $ v) ++ ")" 29 | 30 | 31 | instance GraphVertexOperations MyV where 32 | vertexToId = mvId 33 | expandVertexAsVertices = mvParents 34 | 35 | instance GraphOperations MyV () where 36 | expandVertex = ((const () &&& id) <$>) . mvParents 37 | 38 | myv :: String -> [MyV] -> MyV 39 | myv s = MyV (VertexId (C8.pack s)) 40 | 41 | expandNodes :: MyV -> DagTry [String] 42 | expandNodes vx = 43 | let tg = buildGraph vx :: DagTry (Graph MyV ()) 44 | in (id2Str . mvId . vertexData <$>) . toList . gVertices <$> tg 45 | 46 | -- edges: from -> to 47 | expandEdges :: MyV -> DagTry [(String, String)] 48 | expandEdges vx = 49 | let tg = buildGraph vx :: DagTry (Graph MyV ()) 50 | in tg <&> \g -> 51 | concat $ M.assocs (gEdges g) <&> \(vid, v) -> 52 | (C8.unpack . unVertexId . vertexId . veEndVertex &&& 53 | C8.unpack . unVertexId . const vid) <$> V.toList v 54 | 55 | spec :: Spec 56 | spec = do 57 | describe "Tests on paths" $ do 58 | it "no parent" $ do 59 | let v0 = myv "v0" [] 60 | expandNodes v0 `shouldBe` Right ["v0"] 61 | it "common parent" $ do 62 | let v0 = myv "v0" [] 63 | let v0' = myv "v0" [] 64 | let v1 = myv "v1" [v0, v0'] 65 | expandEdges v1 `shouldBe` Right [("v0", "v1"), ("v0", "v1")] 66 | it "diamond" $ do 67 | let va = myv "va" [] 68 | let va' = myv "va" [] 69 | let v0 = myv "v0" [va] 70 | let v0' = myv "v0" [va'] 71 | let v1 = myv "v1" [v0, v0'] 72 | expandEdges v1 `shouldBe` Right [("va", "v0"), ("v0", "v1"), ("v0", "v1")] 73 | it "simple sources" $ do 74 | let v0 = myv "v0" [] 75 | let v1 = myv "v1" [v0] 76 | let tg = buildGraph v1 :: DagTry (Graph MyV ()) 77 | let g = forceRight tg 78 | mvId . vertexData <$> graphSources g `shouldBe` [mvId v1] 79 | it "simple sinks" $ do 80 | let v0 = myv "v0" [] 81 | let v1 = myv "v1" [v0] 82 | let tg = buildGraph v1 :: DagTry (Graph MyV ()) 83 | let g = forceRight tg 84 | mvId . vertexData <$> graphSinks g `shouldBe` [mvId v0] 85 | it "longer sources" $ do 86 | let v0 = myv "v0" [] 87 | let v1 = myv "v1" [v0] 88 | let v2 = myv "v2" [v1] 89 | let tg = buildGraph v2 :: DagTry (Graph MyV ()) 90 | let g = forceRight tg 91 | mvId . vertexData <$> graphSources g `shouldBe` [mvId v2] 92 | it "longer sinks" $ do 93 | let v0 = myv "v0" [] 94 | let v1 = myv "v1" [v0] 95 | let v2 = myv "v2" [v1] 96 | let tg = buildGraph v2 :: DagTry (Graph MyV ()) 97 | let g = forceRight tg 98 | mvId . vertexData <$> graphSinks g `shouldBe` [mvId v0] 99 | describe "building DAGs" $ do 100 | it "2 nodes" $ do 101 | let v0 = myv "v0" [] 102 | let v1 = myv "v1" [v0] 103 | let v2 = myv "v2" [v1] 104 | let l = forceRight $ buildVertexList v2 105 | id2Str . mvId <$> l `shouldBe` ["v0", "v1", "v2"] 106 | it "triangle" $ do 107 | let v0 = myv "v0" [] 108 | let v1 = myv "v1" [v0] 109 | let v2 = myv "v2" [v0, v1] 110 | let l = forceRight $ buildVertexList v2 111 | -- The return order should be in lexicographic order 112 | -- (which is unique in this case). 113 | id2Str . 
mvId <$> l `shouldBe` ["v0", "v1", "v2"] 114 | -------------------------------------------------------------------------------- /test/Spark/Core/Internal/GroupsSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE DeriveGeneric #-} 3 | {-# LANGUAGE MultiParamTypeClasses #-} 4 | 5 | module Spark.Core.Internal.GroupsSpec where 6 | 7 | import Data.Text(Text) 8 | import Test.Hspec 9 | import GHC.Generics 10 | import Data.Either(isRight) 11 | 12 | import Spark.Core.Functions 13 | import Spark.Core.ColumnFunctions 14 | import Spark.Core.Dataset 15 | import Spark.Core.Column 16 | import Spark.Core.Row 17 | import Spark.Core.Types 18 | import Spark.Core.Internal.Groups 19 | 20 | 21 | data MyPair = MyPair { 22 | myKey :: Text, 23 | myVal :: Int } deriving (Generic, Show) 24 | 25 | myKey' :: StaticColProjection MyPair Text 26 | myKey' = unsafeStaticProjection buildType "myKey" 27 | myVal' :: StaticColProjection MyPair Int 28 | myVal' = unsafeStaticProjection buildType "myVal" 29 | instance SQLTypeable MyPair 30 | instance ToSQL MyPair 31 | 32 | -- The tests are really light for now, and just check that the code passes the 33 | -- dynamic type checker. 34 | spec :: Spec 35 | spec = do 36 | describe "typed grouping tests" $ do 37 | let ds = dataset [MyPair "1" 1, MyPair "2" 2] 38 | let keys = ds // myKey' 39 | let values = ds // myVal' 40 | let g = groupByKey keys values 41 | let sqlt1 = buildType :: SQLType MyPair 42 | it "group" $ do 43 | let tds2 = castType sqlt1 (groupAsDS g) 44 | tds2 `shouldSatisfy` isRight 45 | it "map group" $ do 46 | let g2 = g `mapGroup` \c -> c + c 47 | let tds2 = castType sqlt1 (groupAsDS g2) 48 | tds2 `shouldSatisfy` isRight 49 | it "simple reduce" $ do 50 | let ds2 = g `aggKey` sumCol 51 | let tds3 = castType sqlt1 ds2 52 | tds3 `shouldSatisfy` isRight 53 | it "complex reduce" $ do 54 | let ds2 = g `aggKey` \c -> sumCol (c + c) 55 | let tds3 = castType sqlt1 ds2 56 | tds3 `shouldSatisfy` isRight 57 | -------------------------------------------------------------------------------- /test/Spark/Core/Internal/LocalDataFunctionsSpec.hs: -------------------------------------------------------------------------------- 1 | 2 | module Spark.Core.Internal.LocalDataFunctionsSpec where 3 | 4 | import Test.Hspec 5 | 6 | import Spark.Core.Dataset 7 | import Spark.Core.Functions() 8 | 9 | spec :: Spec 10 | spec = do 11 | describe "Arithmetic operations on local data (integers)" $ do 12 | it "ints" $ do 13 | let x1 = 1 :: LocalData Int 14 | let x2 = 2 :: LocalData Int 15 | let y1 = x1 + x2 16 | let y2 = x1 `div` x2 17 | (y2 `shouldBe` y2) 18 | (y1 `shouldBe` y1) 19 | -------------------------------------------------------------------------------- /test/Spark/Core/Internal/OpFunctionsSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveAnyClass #-} 2 | {-# LANGUAGE DeriveGeneric #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | {-# LANGUAGE ScopedTypeVariables #-} 5 | {-# LANGUAGE QuasiQuotes #-} 6 | 7 | module Spark.Core.Internal.OpFunctionsSpec where 8 | 9 | import Data.Aeson 10 | import Test.Hspec 11 | import Text.RawString.QQ 12 | 13 | import Spark.Core.Functions 14 | import Spark.Core.Internal.OpFunctions 15 | import Spark.Core.Internal.DatasetFunctions 16 | 17 | 18 | spec :: Spec 19 | spec = do 20 | describe "extraNodeOpData" $ do 21 | it "should have the content of a constant dataset" $ do 22 | let l = [1,2,3] :: [Int] 23 | let res :: 
Maybe Value 24 | res = decode 25 | ([r|{"content": [1,2,3], 26 | "cellType" : { 27 | "dt": "integer", 28 | "nullable": false 29 | } 30 | }|]) 31 | let ds = dataset l 32 | let d = extraNodeOpData . nodeOp $ ds 33 | Just d `shouldBe` res 34 | -------------------------------------------------------------------------------- /test/Spark/Core/Internal/PathsSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveAnyClass #-} 2 | {-# LANGUAGE DeriveGeneric #-} 3 | {-# LANGUAGE ScopedTypeVariables #-} 4 | 5 | -- :script test/Spark/Core/Internal/PathsSpec.hs 6 | module Spark.Core.Internal.PathsSpec where 7 | 8 | import Test.Hspec 9 | import qualified Data.Map.Strict as M 10 | import qualified Data.Set as S 11 | import qualified Data.ByteString.Char8 as C8 12 | import qualified Data.Text as T 13 | 14 | import Spark.Core.StructuresInternal 15 | import Spark.Core.Functions 16 | import Spark.Core.Dataset 17 | import Spark.Core.Internal.Paths 18 | import Spark.Core.Internal.DAGStructures 19 | import Spark.Core.Internal.DAGFunctions 20 | import Spark.Core.Internal.ComputeDag 21 | import Spark.Core.Internal.PathsUntyped 22 | import Spark.Core.Internal.Utilities 23 | import Spark.Core.Internal.DatasetFunctions 24 | import Spark.Core.Internal.DatasetStructures 25 | 26 | data MyV = MyV { 27 | mvId :: VertexId, 28 | mvLogical :: [MyV], 29 | mvParents :: [MyV] 30 | } deriving (Eq) 31 | 32 | instance Show MyV where 33 | show v = "MyV(" ++ (C8.unpack . unVertexId . mvId $ v) ++ ")" 34 | 35 | 36 | assignPaths :: UntypedNode -> [UntypedNode] 37 | assignPaths n = 38 | let cgt = buildCGraph n :: DagTry (ComputeDag UntypedNode NodeEdge) 39 | cg = forceRight cgt 40 | acgt = assignPathsUntyped cg 41 | ncg = forceRight acgt 42 | in graphDataLexico . tieNodes $ ncg 43 | 44 | 45 | instance GraphVertexOperations MyV where 46 | vertexToId = mvId 47 | expandVertexAsVertices = mvParents 48 | 49 | myv :: String -> [MyV] -> [MyV] -> MyV 50 | myv s logical inner = MyV (VertexId (C8.pack s)) logical inner 51 | 52 | myvToVertex :: MyV -> Vertex MyV 53 | myvToVertex x = Vertex (mvId x) x 54 | 55 | buildScopes :: [MyV] -> Scopes 56 | buildScopes l = iGetScopes0 l' fun where 57 | l' = myvToVertex <$> l 58 | fun vx = ParentSplit { 59 | psLogical = myvToVertex <$> (mvLogical . vertexData $ vx), 60 | psInner = myvToVertex <$> (mvParents . vertexData $ vx) } 61 | 62 | simple :: [(Maybe String, [String])] -> Scopes 63 | simple [] = M.empty 64 | simple ((ms, ss) : t) = 65 | let 66 | key = VertexId . C8.pack <$> ms 67 | vals = VertexId . C8.pack <$> ss 68 | new = M.singleton key (S.fromList vals) 69 | in mergeScopes new (simple t) 70 | 71 | gatherings :: [(String, [[String]])] -> M.Map VertexId [[VertexId]] 72 | gatherings [] = M.empty 73 | gatherings ((key, paths) : t) = 74 | let 75 | k = VertexId . C8.pack $ key 76 | ps = (VertexId . C8.pack <$>) <$> paths 77 | new = M.singleton k ps 78 | in M.unionWith (++) new (gatherings t) 79 | 80 | gatherPaths' :: [MyV] -> M.Map VertexId [[VertexId]] 81 | gatherPaths' = gatherPaths . 
buildScopes 82 | 83 | spec :: Spec 84 | spec = do 85 | describe "Tests on paths" $ do 86 | it "nothing" $ do 87 | buildScopes [] `shouldBe` simple [] 88 | it "no parent" $ do 89 | let v0 = myv "v0" [] [] 90 | let res = [ (Nothing, ["v0"]), (Just "v0", []) ] 91 | buildScopes [v0] `shouldBe` simple res 92 | it "one logical parent" $ do 93 | let v0 = myv "v0" [] [] 94 | let v1 = myv "v1" [v0] [] 95 | let res = [ (Nothing, ["v0", "v1"]) 96 | , (Just "v1", []) 97 | , (Just "v0", []) ] 98 | buildScopes [v1, v0] `shouldBe` simple res 99 | it "one inner parent" $ do 100 | let v0 = myv "v0" [] [] 101 | let v1 = myv "v1" [] [v0] 102 | let res = [ (Nothing, ["v1"]) 103 | , (Just "v1", ["v0"]) 104 | , (Just "v0", []) ] 105 | buildScopes [v1, v0] `shouldBe` simple res 106 | it "logical scoping over a parent" $ do 107 | let v0 = myv "v0" [] [] 108 | let v1 = myv "v1" [v0] [] 109 | let v2 = myv "v2" [v0] [v1] 110 | let res = [ (Nothing, ["v0", "v2"]) 111 | , (Just "v0", []) 112 | , (Just "v1", []) 113 | , (Just "v2", ["v1"]) ] 114 | buildScopes [v2] `shouldBe` simple res 115 | it "common ancestor" $ do 116 | let top = myv "top" [] [] 117 | let inner = myv "inner" [top] [] 118 | let v1 = myv "v1" [top] [inner] 119 | let v2 = myv "v2" [top] [inner] 120 | let res = [ (Nothing, ["top", "v1", "v2"]) 121 | , (Just "inner", []) 122 | , (Just "top", []) 123 | , (Just "v1", ["inner"]) 124 | , (Just "v2", ["inner"]) ] 125 | buildScopes [v1, v2] `shouldBe` simple res 126 | it "common ancestor, unbalanced" $ do 127 | let top = myv "top" [] [] 128 | let inner = myv "inner" [top] [] 129 | let v1 = myv "v1" [top] [inner] 130 | let v2 = myv "v2" [] [inner] 131 | let res = [ (Nothing, ["top", "v1", "v2"]) 132 | , (Just "inner", []) 133 | , (Just "top", []) 134 | , (Just "v1", ["inner"]) 135 | , (Just "v2", ["inner", "top"]) ] 136 | buildScopes [v1, v2] `shouldBe` simple res 137 | describe "Path gatherings" $ do 138 | it "nothing" $ do 139 | gatherPaths' [] `shouldBe` gatherings [] 140 | it "no parent" $ do 141 | let v0 = myv "v0" [] [] 142 | let res = [("v0", [[]])] 143 | gatherPaths' [v0] `shouldBe` gatherings res 144 | it "one logical parent" $ do 145 | let v0 = myv "v0" [] [] 146 | let v1 = myv "v1" [v0] [] 147 | let res = [ ("v1", [[]]) 148 | , ("v0", [[]])] 149 | gatherPaths' [v1] `shouldBe` gatherings res 150 | it "one inner parent" $ do 151 | let v0 = myv "v0" [] [] 152 | let v1 = myv "v1" [] [v0] 153 | let res = [ ("v1", [[]]) 154 | , ("v0", [["v1"]])] 155 | gatherPaths' [v1] `shouldBe` gatherings res 156 | it "logical scoping over a parent" $ do 157 | let v0 = myv "v0" [] [] 158 | let v1 = myv "v1" [v0] [] 159 | let v2 = myv "v2" [v0] [v1] 160 | let res = [ ("v0", [[]]) 161 | , ("v1", [["v2"]]) 162 | , ("v2", [[]]) ] 163 | gatherPaths' [v2] `shouldBe` gatherings res 164 | it "common ancestor" $ do 165 | let top = myv "top" [] [] 166 | let inner = myv "inner" [top] [] 167 | let v1 = myv "v1" [top] [inner] 168 | let v2 = myv "v2" [top] [inner] 169 | let res = [ ("inner", [["v1"], ["v2"]]) 170 | , ("top", [[]]) 171 | , ("v1", [[]]) 172 | , ("v2", [[]]) ] 173 | gatherPaths' [v1, v2] `shouldBe` gatherings res 174 | describe "Real paths" $ do 175 | it "simple test" $ do 176 | let c0 = constant (1 :: Int) @@ "c0" 177 | let c1 = identity c0 @@ "c1" 178 | let c2 = identity c1 `logicalParents` [untyped c0] @@ "c2" 179 | nodeId <$> nodeParents c1 `shouldBe` [nodeId c0] 180 | nodeId <$> nodeParents c2 `shouldBe` [nodeId c1] 181 | let withParents = T.unpack . catNodePath . 
nodePath <$> assignPaths (untyped c2) 182 | withParents `shouldBe` ["c0", "c2/c1", "c2"] 183 | it "simple test 2" $ do 184 | let ds = dataset ([1 ,2, 3, 4]::[Int]) @@ "ds" 185 | let c = count ds @@ "c" 186 | let c2 = (c + (identity c @@ "id")) `logicalParents` [untyped ds] @@ "c2" 187 | let withParents = T.unpack . catNodePath . nodePath <$> assignPaths (untyped c2) 188 | withParents `shouldBe` ["ds", "c2/c","c2/id","c2"] 189 | -------------------------------------------------------------------------------- /test/Spark/Core/Internal/RowUtilsSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveAnyClass #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | {-# LANGUAGE ScopedTypeVariables #-} 4 | 5 | module Spark.Core.Internal.RowUtilsSpec where 6 | 7 | import Data.Aeson 8 | import Data.Maybe(fromJust) 9 | import Test.Hspec 10 | import Data.ByteString.Lazy(ByteString) 11 | import qualified Data.Vector as V 12 | import Data.Either(isRight) 13 | 14 | import Spark.Core.Types 15 | import Spark.Core.Row 16 | import Spark.Core.Internal.TypesFunctions 17 | import Spark.Core.Internal.RowGenericsFrom 18 | import Spark.Core.Internal.TypesStructuresRepr(DataTypeElementRepr) 19 | 20 | fun :: ByteString -> DataType -> Cell -> IO () 21 | fun js dt cell2 = 22 | let 23 | mval = decode js :: Maybe Value 24 | val = fromJust mval 25 | cellt = jsonToCell dt val 26 | in cellt `shouldBe` (Right cell2) 27 | 28 | 29 | spec :: Spec 30 | spec = do 31 | describe "JSON -> Row" $ do 32 | it "ints" $ do 33 | fun "2" intType (IntElement 2) 34 | it "[ints]" $ do 35 | fun "[2]" (arrayType' intType) (RowArray (V.singleton (IntElement 2))) 36 | describe "Decoding data types" $ do 37 | it "should decode DataTypeElementRepr" $ do 38 | let x = rowArray [rowArray [StringElement "ts3f1"],BoolElement True,IntElement 1,IntElement 0] 39 | let elt = cellToValue x :: TryS DataTypeElementRepr 40 | elt `shouldSatisfy` isRight 41 | -------------------------------------------------------------------------------- /test/Spark/Core/PathSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ExistentialQuantification #-} 2 | {-# LANGUAGE FlexibleInstances #-} 3 | 4 | module Spark.Core.PathSpec where 5 | 6 | import Data.Maybe(fromJust) 7 | import Test.Hspec 8 | 9 | import Spark.Core.Functions 10 | import Spark.Core.Dataset 11 | 12 | fun1 :: LocalData Int 13 | fun1 = 14 | let m1 = constant (1::Int) 15 | m2 = constant (2::Int) in 16 | constant (3::Int) 17 | `logicalParents` [untyped m1, untyped m2] 18 | `parents` [] 19 | 20 | 21 | fun2 :: LocalData Int -> LocalData Int 22 | fun2 ld1 = let 23 | m1 = constant (1 :: Int) `parents` [untyped ld1] @@ "m1" 24 | in 25 | constant (3 :: Int) 26 | `logicalParents` [untyped m1] 27 | `parents` [untyped ld1] 28 | @@ "c2" 29 | 30 | -- fun3 :: LocalData Int -> LocalData Int 31 | -- fun3 ld = ld + 3 32 | 33 | spec :: Spec 34 | spec = do 35 | describe "Tests with nodes" $ do 36 | it "should get a node" $ do 37 | let n1 = fun1 38 | let l = fromJust $ nodeLogicalParents n1 39 | (length l) `shouldBe` 2 40 | -- it "should work with ints" $ do 41 | -- let n2 = (fun3 4) @@ "" in 42 | -- (length $ nodeDependencies n2) `shouldBe` 2 43 | -------------------------------------------------------------------------------- /test/Spark/Core/ProjectionsSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ExistentialQuantification #-} 2 | {-# LANGUAGE FlexibleInstances #-} 3 | 
{-# LANGUAGE FlexibleContexts #-} 4 | {-# LANGUAGE DeriveGeneric #-} 5 | {-# LANGUAGE GeneralizedNewtypeDeriving #-} 6 | {-# LANGUAGE MultiParamTypeClasses #-} 7 | {-# LANGUAGE OverloadedStrings #-} 8 | 9 | module Spark.Core.ProjectionsSpec where 10 | 11 | import Test.Hspec 12 | import GHC.Generics 13 | import Data.List(isPrefixOf) 14 | import Data.Either(isRight, isLeft) 15 | import qualified Data.Vector as V 16 | import qualified Data.Text as T 17 | 18 | import Spark.Core.Functions 19 | import Spark.Core.Dataset 20 | import Spark.Core.Column 21 | import Spark.Core.Row 22 | import Spark.Core.Types 23 | import Spark.Core.Try 24 | import Spark.Core.Internal.Utilities 25 | import Spark.Core.Internal.TypesFunctions 26 | 27 | 28 | data Tree = Tree { 29 | treeId :: Int, 30 | treeWidth :: Int, 31 | treeHeight :: Int } deriving (Generic, Show) 32 | 33 | treeId' :: StaticColProjection Tree Int 34 | treeId' = unsafeStaticProjection buildType "treeId" 35 | treeWidth' :: StaticColProjection Tree Int 36 | treeWidth' = unsafeStaticProjection buildType "treeWidth" 37 | instance SQLTypeable Tree 38 | instance ToSQL Tree 39 | 40 | newtype MyId = MyId Int deriving (Generic, Show, Num) 41 | instance SQLTypeable MyId 42 | instance ToSQL MyId 43 | 44 | newtype Height = Height Int deriving (Generic, Num, Show) 45 | instance SQLTypeable Height 46 | instance ToSQL Height 47 | 48 | data STree = STree { 49 | sTreeId :: MyId, 50 | sTreeWidth :: Height, 51 | sTreeHeight :: Int } deriving (Generic, Show) 52 | 53 | instance SQLTypeable STree 54 | instance ToSQL STree 55 | sTreeId' :: StaticColProjection STree MyId 56 | sTreeId' = unsafeStaticProjection buildType "sTreeId" 57 | sTreeWidth' :: StaticColProjection STree Height 58 | sTreeWidth' = unsafeStaticProjection buildType "sTreeWidth" 59 | instance TupleEquivalence STree (MyId, Height, Int) where 60 | tupleFieldNames = NameTuple ["sTreeId", "sTreeWidth", "sTreeHeight"] 61 | 62 | rawData :: [(Int, Int, Int)] 63 | rawData = [(1, 3, 2)] 64 | 65 | spec :: Spec 66 | spec = do 67 | let ds = dataset [Tree 1 3 2] 68 | -- The untyped elements 69 | let dt = structType [structField (T.pack "treeId") intType, structField (T.pack "treeWidth") intType, structField (T.pack "treeHeight") intType] 70 | let fun (id', height, width) = RowArray $ V.fromList [IntElement id', IntElement height, IntElement width] 71 | let df1 = traceHint (T.pack "df1=") $ dataframe dt (fun <$> rawData) 72 | let ds1 = traceHint (T.pack "ds1=") $ forceRight (asDS df1) :: Dataset Tree 73 | describe "Simple projection demos" $ do 74 | it "should get a node" $ do 75 | ds `shouldBe` ds1 76 | it "Failing dynamic projection on dataframe" $ do 77 | df1/-"xx" `shouldSatisfy` isLeft 78 | it "Failing dynamic projection on dataset" $ do 79 | ds1/-"xx" `shouldSatisfy` isLeft 80 | it "Basic arithmetic on DS cols" $ do 81 | let c1 = ds1//treeWidth' 82 | let c2 = (c1 + c1) 83 | (show c2) `shouldSatisfy` ("treeWidth + treeWidth{int}" `isPrefixOf`) 84 | it "Basic arithmetic on DF cols" $ do 85 | let c1 = df1 // treeWidth' 86 | let c2 = c1 + c1 87 | (show c2) `shouldSatisfy` ("Right treeWidth + treeWidth{int}" `isPrefixOf`) 88 | it "Construction of ds2" $ do 89 | let str = struct' [ (df1/-"treeId") @@ "sTreeId", 90 | (df1/-"treeWidth") @@ "sTreeWidth", 91 | (df1/-"treeHeight") @@ "sTreeHeight"] 92 | let df2 = pack' str 93 | let ds2 = traceHint (T.pack "ds2=") $ asDS df2 :: Try (Dataset STree) 94 | ds2 `shouldSatisfy` isRight 95 | it "Static construction of ds2" $ do 96 | let ds2 = do 97 | idCol <- castCol' 
(buildType::SQLType MyId) (df1/-"treeId") 98 | widthCol <- castCol' (buildType::SQLType Height) (df1/-"treeWidth") 99 | heightCol <- castCol' (buildType::SQLType Int) (df1/-"treeWidth") 100 | let s = pack (idCol, widthCol, heightCol) :: Dataset STree 101 | return $ traceHint (T.pack "ds2=") s 102 | ds2 `shouldSatisfy` isRight 103 | it "Basic arithmetic on DS cols 1" $ do 104 | let ds2' = do 105 | idCol <- castCol' (buildType::SQLType MyId) (df1/-"treeId") 106 | widthCol <- castCol' (buildType::SQLType Height) (df1/-"treeWidth") 107 | heightCol <- castCol' (buildType::SQLType Int) (df1/-"treeWidth") 108 | let s = pack (idCol, widthCol, heightCol) :: Dataset STree 109 | return $ traceHint (T.pack "ds2=") s 110 | let ds2 = forceRight ds2' 111 | let c1 = ds2//sTreeWidth' 112 | let c2 = c1 + c1 113 | (show c2) `shouldSatisfy` ("sTreeWidth + sTreeWidth{int}" `isPrefixOf`) 114 | -------------------------------------------------------------------------------- /test/Spark/Core/RowToSQLSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveAnyClass #-} 2 | {-# LANGUAGE DeriveGeneric #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | {-# LANGUAGE BangPatterns #-} 5 | 6 | module Spark.Core.RowToSQLSpec where 7 | 8 | import qualified Data.Vector as V 9 | import GHC.Generics (Generic) 10 | import Test.Hspec 11 | 12 | import Spark.Core.Types 13 | import Spark.Core.Row 14 | 15 | data TestStruct1 = TestStruct1 { 16 | ts1f1 :: Int, 17 | ts1f2 :: Maybe Int } deriving (Show, Eq, Generic, ToSQL, FromSQL) 18 | 19 | data TestStruct2 = TestStruct2 { ts2f1 :: [Int] } deriving (Show, Generic, SQLTypeable) 20 | 21 | data TestStruct3 = TestStruct3 { ts3f1 :: Int } deriving (Show, Eq, Generic, ToSQL, FromSQL) 22 | data TestStruct4 = TestStruct4 { ts4f1 :: TestStruct3 } deriving (Show, Eq, Generic, ToSQL, FromSQL) 23 | 24 | data TestStruct5 = TestStruct5 { 25 | ts5f1 :: Int, 26 | ts5f2 :: Int, 27 | ts5f3 :: TestStruct3 28 | } deriving (Show, Eq, Generic, ToSQL, FromSQL) 29 | 30 | newtype TestT1 = TestT1 { unTestT1 :: Int } deriving (Eq, Show, Generic, ToSQL, FromSQL) 31 | 32 | 33 | v2c :: (Show a, ToSQL a, FromSQL a, Eq a) => a -> Cell -> IO () 34 | v2c !x !y = do 35 | _ <- shouldBe (valueToCell x) y 36 | _ <- shouldBe (cellToValue y) (Right x) 37 | return () 38 | 39 | spec :: Spec 40 | spec = do 41 | describe "Simple type tests" $ do 42 | it "int" $ 43 | v2c (3 :: Int) (IntElement 3) 44 | it "int?" $ 45 | v2c (Just 3 :: Maybe Int) (IntElement 3) 46 | it "int? 
2" $ 47 | v2c (Nothing :: Maybe Int) Empty 48 | it "TestStruct3" $ 49 | v2c (TestStruct3 2) (RowArray $ V.fromList [IntElement 2]) 50 | it "TestStruct4" $ 51 | v2c (TestStruct4 (TestStruct3 3)) $ 52 | (RowArray $ V.fromList [ 53 | RowArray $ V.fromList [IntElement 3] 54 | ]) 55 | it "TestStruct1 - empty" $ 56 | v2c (TestStruct1 2 Nothing) (RowArray $ V.fromList [IntElement 2, Empty]) 57 | it "TestStruct1 - full" $ 58 | v2c (TestStruct1 2 (Just 4)) (RowArray $ V.fromList [IntElement 2, IntElement 4]) 59 | it "TestStruct5" $ 60 | v2c (TestStruct5 1 2 (TestStruct3 3)) $ 61 | (RowArray $ V.fromList [ 62 | IntElement 1, 63 | IntElement 2, 64 | RowArray $ V.fromList [IntElement 3] 65 | ]) 66 | -- describe "Simple type tests" $ do 67 | -- it "newtype" $ 68 | -- v2c (TestT1 3) (IntElement 3) 69 | -------------------------------------------------------------------------------- /test/Spark/Core/SimpleExamplesSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveAnyClass #-} 2 | {-# LANGUAGE ScopedTypeVariables #-} 3 | {-# LANGUAGE QuasiQuotes #-} 4 | 5 | -- Some small examples that get fully verified. 6 | module Spark.Core.SimpleExamplesSpec where 7 | 8 | import Data.Either(isRight) 9 | import Data.Maybe(isJust) 10 | import Test.Hspec 11 | import qualified Data.Text as T 12 | import Text.RawString.QQ 13 | 14 | import Spark.Core.Dataset 15 | import Spark.Core.Functions 16 | import Spark.Core.Column 17 | import Spark.Core.ColumnFunctions 18 | import Spark.Core.Internal.DatasetStructures 19 | import Spark.Core.Internal.Utilities(pretty) 20 | import Spark.Core.Internal.OpFunctions(extraNodeOpData) 21 | 22 | ds1 :: Dataset Int 23 | ds1 = dataset [1,2,3] 24 | 25 | ds2 :: Dataset Double 26 | ds2 = error "ds2" 27 | 28 | spec :: Spec 29 | spec = do 30 | describe "Simple examples" $ do 31 | it "Precdence of renaming" $ do 32 | let numbers = asCol ds1 33 | let s = sumCol numbers 34 | let numCount = count ds1 35 | let avg = s `div` numCount @@ "myaverage" 36 | _cnName avg `shouldSatisfy` isJust 37 | it "name for simple integers" $ do 38 | let numbers = asCol ds1 39 | let s = sumCol numbers 40 | let numCount = count ds1 41 | let avg = s `div` numCount @@ "myaverage" 42 | -- TODO: should it show "value: int" instead? 43 | -- I think it should show it for distributed nodes only. 44 | -- SQL is not allowed on observables 45 | (show avg) `shouldBe` "/myaverage@org.spark.LocalDiv!int" 46 | describe "pack1" $ do 47 | it "Extracting and packing one column" $ do 48 | let numbers = asCol ds1 49 | let ds1' = pack1 numbers 50 | (nodeType ds1) `shouldBe` (nodeType ds1') 51 | describe "pack" $ do 52 | it "Extracting and packing one column" $ do 53 | let ds1' = pack' . asCol $ ds1 54 | (nodeType <$> (asDF ds1)) `shouldBe` (nodeType <$> ds1') 55 | describe "simple json example" $ do 56 | it "packing and unpacking one column" $ do 57 | let ds1' = pack' . asCol $ ds1 58 | let d' = pretty . extraNodeOpData . 
nodeOp <$> ds1' 59 | d' `shouldBe` Right (T.pack "{\"cellType\":{\"dt\":\"integer\",\"nullable\":false},\"content\":[1,2,3]}") 60 | it "packing and unpacking 2 columns, one with a bad name" $ do 61 | let col1 = asCol ds1 62 | let col2 = col1 @@ "other" 63 | let ds1' = pack' (col1, col2) 64 | ds1' `shouldSatisfy` isRight -- NOT SURE WHY IT WOULD FAIL 65 | it "packing and unpacking 2 columns, one with a good name" $ do 66 | let col1 = asCol ds1 @@ "first" 67 | let col2 = col1 @@ "second" 68 | let ds1' = pack' (col1, col2) 69 | ds1' `shouldSatisfy` isRight 70 | 71 | 72 | -- it "example2" $ do 73 | -- let numbers = asCol ds2 74 | -- let avg = (colSum numbers) / (count ds2) 75 | -- 1 `shouldBe` 1 76 | -------------------------------------------------------------------------------- /test/Spark/Core/TypesSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveAnyClass #-} 2 | {-# LANGUAGE DeriveGeneric #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | 5 | module Spark.Core.TypesSpec where 6 | 7 | import GHC.Generics (Generic) 8 | import Test.Hspec 9 | import Test.Hspec.QuickCheck 10 | 11 | import Spark.Core.Types 12 | import Spark.Core.Internal.TypesFunctions 13 | import Spark.Core.Internal.TypesGenerics() 14 | 15 | data TestStruct1 = TestStruct1 { 16 | ts1f1 :: Int, 17 | ts1f2 :: Maybe Int } deriving (Show, Generic) 18 | 19 | instance SQLTypeable TestStruct1 20 | 21 | data TestStruct2 = TestStruct2 { ts2f1 :: [Int] } deriving (Show, Generic, SQLTypeable) 22 | 23 | data TestStruct3 = TestStruct3 { ts3f1 :: Int } deriving (Show, Generic, SQLTypeable) 24 | data TestStruct4 = TestStruct4 { ts4f1 :: TestStruct3 } deriving (Show, Generic, SQLTypeable) 25 | 26 | -- instance SQLTypeable TestStruct1 27 | -- instance Menu TestStruct1 28 | 29 | 30 | -- main :: IO () 31 | -- main = hspec spec 32 | 33 | spec :: Spec 34 | spec = do 35 | describe "Simple type tests" $ do 36 | it "show ints" $ 37 | show intType `shouldBe` "int" 38 | 39 | it "show arrays" $ 40 | show (arrayType' intType) `shouldBe` "[int]" 41 | 42 | it "show structures" $ 43 | show (arrayType' (canNull intType)) `shouldBe` "[int?]" 44 | 45 | describe "The basic tests for int types" $ do 46 | it "ints" $ 47 | let t = buildType :: (SQLType Int) 48 | dt = columnType t in 49 | dt `shouldBe` intType 50 | 51 | it "opt ints" $ 52 | let t = buildType :: (SQLType (Maybe Int)) 53 | dt = columnType t in 54 | dt `shouldBe` canNull intType 55 | 56 | -- The projection of all the product types 57 | it "opt opt ints" $ 58 | let t = buildType :: (SQLType (Maybe (Maybe Int))) in 59 | columnType t `shouldBe` canNull intType 60 | 61 | it "array ints" $ 62 | let t = buildType :: (SQLType [Int]) in 63 | columnType t `shouldBe` arrayType' intType 64 | 65 | it "array opt ints" $ 66 | let t = buildType :: (SQLType [Maybe Int]) in 67 | columnType t `shouldBe` arrayType' (canNull intType) 68 | 69 | it "opt array ints" $ 70 | let t = buildType :: (SQLType (Maybe [Int])) in 71 | columnType t `shouldBe` canNull (arrayType' intType) 72 | 73 | describe "The basic tests for records" $ do 74 | it "records with maybe" $ 75 | let t = buildType :: (SQLType TestStruct1) 76 | out = structType [structField "ts1f1" intType, structField "ts1f2" (canNull intType)] in 77 | columnType t `shouldBe` out 78 | 79 | it "records with arrays" $ 80 | let t = buildType :: (SQLType TestStruct2) 81 | out = structType [structField "ts2f1" (arrayType' intType)] in 82 | columnType t `shouldBe` out 83 | 84 | it "records within records" $ 85 | let t = 
buildType :: (SQLType TestStruct4) 86 | out0 = structType [structField "ts3f1" intType] 87 | out = structType [structField "ts4f1" out0] in 88 | columnType t `shouldBe` out 89 | 90 | describe "Construction of frame types" $ do 91 | prop "frameTypeFromCol should be invertible" $ 92 | \x -> 93 | let dt = colTypeFromFrame x 94 | y = frameTypeFromCol dt 95 | in x == y 96 | -- TODO this is not always working. Figure out the rules here. 97 | -- prop "colTypeFromFrame should be invertible" $ 98 | -- \x -> (colTypeFromFrame . frameTypeFromCol) x == x 99 | -------------------------------------------------------------------------------- /test/Spec.hs: -------------------------------------------------------------------------------- 1 | -- Not working??? 2 | {-# OPTIONS_GHC -F -pgmF hspec-discover #-} 3 | --------------------------------------------------------------------------------
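
A minimal end-to-end sketch of how the pieces exercised by the integration specs above fit together: open a session, build a distributed literal, aggregate it, and read the observable back on the driver. It assumes a reachable karps-server and uses only the entry points that appear in the specs (`createSparkSessionDef`, `exec1Def`, `closeSparkSessionDef`); the module name and the session name are illustrative.

```haskell
{-# LANGUAGE OverloadedStrings #-}
module Main where

import Spark.Core.Context
import Spark.Core.Functions
import Spark.Core.Column
import Spark.Core.ColumnFunctions

main :: IO ()
main = do
  -- Same configuration pattern as the integration specs; the session name is arbitrary.
  createSparkSessionDef $ defaultConf { confRequestedSessionName = "demo" }
  let ds = dataset ([1, 2, 3] :: [Int])
  -- Combine two observables: the sum of the column and the number of rows.
  let total = sumCol (asCol ds) + count ds
  res <- exec1Def total
  print res  -- expected: 9 (a sum of 6 plus a count of 3)
  closeSparkSessionDef
```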
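The record/projection pattern used by IntegrationUtilities, the grouping specs and ProjectionsSpec can be summarized in one small module: derive the SQL instances generically, declare one `StaticColProjection` per field with `unsafeStaticProjection`, then use `//`, `groupByKey` and `aggKey` to aggregate by key. A sketch under those assumptions; the `Sale` record and its field names are made up for illustration.

```haskell
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE OverloadedStrings #-}
module Demo where

import GHC.Generics (Generic)
import Data.Text (Text)

import Spark.Core.Dataset
import Spark.Core.Column
import Spark.Core.Row
import Spark.Core.Types
import Spark.Core.Functions
import Spark.Core.ColumnFunctions
import Spark.Core.Internal.Groups

-- Illustrative record; any product type with the generic instances works the same way.
data Sale = Sale { saleCity :: Text, saleAmount :: Int } deriving (Generic, Show)
instance SQLTypeable Sale
instance ToSQL Sale

-- One typed projection per field, keyed by the field name.
saleCity' :: StaticColProjection Sale Text
saleCity' = unsafeStaticProjection buildType "saleCity"
saleAmount' :: StaticColProjection Sale Int
saleAmount' = unsafeStaticProjection buildType "saleAmount"

-- Total amount per city, mirroring sumGroup in the integration GroupsSpec.
totalPerCity :: Dataset Sale -> Dataset (Text, Int)
totalPerCity ds =
  let keys = ds // saleCity'
      values = ds // saleAmount'
  in groupByKey keys values `aggKey` sumCol

-- A small input to feed to totalPerCity.
exampleSales :: Dataset Sale
exampleSales = dataset [Sale "Paris" 3, Sale "Tokyo" 5, Sale "Paris" 2]
```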