(r.state.toUpperCase > "K" &&
55 | r.state.toUpperCase < "P")).map(r => (r.state,r.county))
56 | .distinctOn(r => (r._1,r._2))
57 | .sortBy(r => (r._1,r._2))
58 |
59 | //query with state names > "P"
60 | val qryCountiesP_Z = AQMRPTQuery.filter(r => r.state.toUpperCase > "P")
61 | .map(r => (r.state,r.county))
62 | .distinctOn(r => (r._1,r._2))
63 | .sortBy(r => (r._1,r._2))
64 |
65 | case class Counties(state: String, name: String) extends FDAROW
66 | implicit def toCounties(row: (String,String)) = Counties(row._1,row._2)
67 | val countyLoader = FDAStreamLoader(slick.jdbc.H2Profile)(toCounties _)
68 | //3 separate streams to extract county names from the same database table AQMRPT
69 | val countiesA_KStream = countyLoader.fda_typedStream(qryCountiesA_K.result)(db_b)(64,64)()()
70 | val countiesK_PStream = countyLoader.fda_typedStream(qryCountiesK_P.result)(db_b)(64,64)()()
71 | val countiesP_ZStream = countyLoader.fda_typedStream(qryCountiesP_Z.result)(db_b)(64,64)()()
72 |
73 | //obtain a combined stream by loading the 4 sources in parallel, with at most 4 concurrent computations
74 | val combinedStream = fda_par_load(statesStream,countiesA_KStream,countiesK_PStream,countiesP_ZStream)(4)
75 |
76 |
77 | //define separate rows for different actions
78 | case class StateActionRow(action: FDAAction) extends FDAROW
79 | case class CountyActionRow(action: FDAAction) extends FDAROW
80 | val actionRunner = FDAActionRunner(slick.jdbc.H2Profile)
81 |
82 | //user-task to catch rows of States type and transform them into db insert actions
83 | def processStates: FDAUserTask[FDAROW] = row => {
84 | row match {
85 | //catch states row and transform it into insert action
86 | case States(stateName) => //target row type
87 | println(s"State name: ${stateName}")
88 | val action = StateQuery += StateModel(0,stateName)
89 | fda_next(StateActionRow(action))
90 | case others@ _ => //pass other types to next user-defined-tasks
91 | fda_next(others)
92 | }
93 | }
94 | //user-task to catch rows of Counties type and transform them into db insert actions
95 | def processCounties: FDAUserTask[FDAROW] = row => {
96 | row match {
97 | //catch counties row and transform it into insert action
98 | case Counties(stateName,countyName) => //target row type
99 | println(s"County ${countyName} of ${stateName}")
100 | val action = CountyQuery += CountyModel(0,countyName+ " of "+stateName)
101 | fda_next(CountyActionRow(action))
102 | case others@ _ => //pass other types to next user-defined-tasks
103 | fda_next(others)
104 | }
105 | }
106 |
107 | //user-task to catch States insert action rows and run them
108 | def runStateAction: FDAUserTask[FDAROW] = row => {
109 | row match {
110 | case StateActionRow(action) => //this is a state action row type
111 | println(s"runstate: ${action}")
112 | actionRunner.fda_execAction(action)(db_a) //run this query with db_a context
113 | fda_skip
114 | case others@ _ => //otherwise pass along to the next user-defined tasks
115 | fda_next(others)
116 | }
117 | }
118 |
119 | //user-task to catch Counties insert action rows and run them
120 | def runCountyAction: FDAUserTask[FDAROW] = row => {
121 | row match {
122 | case CountyActionRow(action) => //this is a county action row type
123 | actionRunner.fda_execAction(action)(db_b) //run this query with db_b context
124 | fda_skip
125 | case others@ _ => //otherwise pass along to the next user-defined tasks
126 | fda_next(others)
127 | }
128 | }
129 |
130 |
131 |
132 | def showRows: FDAUserTask[FDAROW] = row => {
133 | row match {
134 | case States(nm) =>
135 | println("")
136 | println(s"State: $nm")
137 | println("************")
138 | fda_skip
139 | case Counties(s,c) =>
140 | println("")
141 | println(s"County: $c")
142 | println(s"state of $s")
143 | println("------------")
144 | fda_skip
145 | case _ => fda_skip
146 | }
147 | }
148 |
149 | combinedStream.appendTask(processStates)
150 | .appendTask(processCounties)
151 | .appendTask(runStateAction)
152 | .appendTask(runCountyAction)
153 | .startRun
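    | //note: task order matters here - processStates/processCounties must come
    | //before runStateAction/runCountyAction, so the action rows they emit
    | //can be caught and executed further downstream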
154 |
155 | }
156 |
--------------------------------------------------------------------------------
/src/main/scala/examples/UserDefinedTask.scala:
--------------------------------------------------------------------------------
1 | package com.bayakala.funda.examples
2 | import slick.jdbc.meta._
3 | import scala.language.implicitConversions
4 | import scala.concurrent.ExecutionContext.Implicits.global
5 | import scala.concurrent.duration._
6 | import scala.concurrent.{Await, Future}
7 | import scala.util.{Failure, Success}
8 | import slick.jdbc.H2Profile.api._
9 | import com.bayakala.funda._
10 | import api._
11 | import com.bayakala.funda.samples.SlickModels._
12 |
13 | object UserDefinedTasks extends App {
14 |
15 |
16 | val db = Database.forConfig("h2db")
17 |
18 | //drop original table schema
19 | val futVectorTables = db.run(MTable.getTables)
20 |
21 | val futDropTable = futVectorTables.flatMap{ tables => {
22 | val tableNames = tables.map(t => t.name.name)
23 | if (tableNames.contains(AQMRPTQuery.baseTableRow.tableName))
24 | db.run(AQMRPTQuery.schema.drop)
25 | else Future(():Unit)
26 | }
27 | }.andThen {
28 | case Success(_) => println(s"Table ${AQMRPTQuery.baseTableRow.tableName} dropped successfully! ")
29 | case Failure(e) => println(s"Failed to drop Table ${AQMRPTQuery.baseTableRow.tableName}, it may not exist! Error: ${e.getMessage}")
30 | }
31 | Await.ready(futDropTable,Duration.Inf)
32 |
33 | //create new table to refine AQMRawTable
34 | val actionCreateTable = AQMRPTQuery.schema.create
35 | val futCreateTable = db.run(actionCreateTable).andThen {
36 | case Success(_) => println("Table created successfully!")
37 | case Failure(e) => println(s"Table may exist already! Error: ${e.getMessage}")
38 | }
39 | //carry on even if table creation fails
40 | Await.ready(futCreateTable,Duration.Inf)
41 |
42 |
43 | //truncate data (schema.truncate requires Slick 3.2.1+)
44 | val futTruncateTable = futVectorTables.flatMap{ tables => {
45 | val tableNames = tables.map(t => t.name.name)
46 | if (tableNames.contains(AQMRPTQuery.baseTableRow.tableName))
47 | db.run(AQMRPTQuery.schema.truncate)
48 | else Future(():Unit)
49 | }
50 | }.andThen {
51 | case Success(_) => println(s"Table ${AQMRPTQuery.baseTableRow.tableName} truncated successfully!")
52 | case Failure(e) => println(s"Failed to truncate Table ${AQMRPTQuery.baseTableRow.tableName}! Error: ${e.getMessage}")
53 | }
54 | Await.ready(futTruncateTable,Duration.Inf)
55 |
56 |
57 | //load original table content
58 | //original table strong-typed-row
59 | case class AQMRaw(mid: String, state: String,
60 | county: String, year: String, value: String) extends FDAROW
61 | implicit def toAQMRaw(row: (String,String,String,String,String)) =
62 | AQMRaw(row._1,row._2,row._3,row._4,row._5)
63 | val streamLoader = FDAStreamLoader(slick.jdbc.H2Profile)(toAQMRaw _)
64 | // val queryAQMRaw = for { r <- AQMRawQuery } yield (r.mid,r.state,r.county,r.year,r.value)
65 | val queryAQMRaw = sql"""
66 | SELECT MEASUREID,STATENAME,COUNTYNAME,REPORTYEAR,VALUE FROM AIRQM
67 | """.as[(String,String,String,String,String)]
68 |
69 | val streamAQMRaw: FDAPipeLine[FDAROW] = streamLoader.fda_typedStream(queryAQMRaw)(db)(512,512)()()
70 |
71 |
72 | //filter out rows with inconvertible value strings and out of ranged value and year
73 | def filterRows: FDAUserTask[FDAROW] = row => {
74 | row match {
75 | case r: AQMRaw => {
76 | try {
77 | val yr = r.year.toInt
78 | val v = r.value.toInt
79 | val vlu = if ( v > 10 ) 10 else v
80 | val data = AQMRPTModel(0,r.mid.toInt,r.state,r.county,yr,vlu,0,true)
81 | if (yr > 1960 && yr < 2018)
82 | fda_next(data) //this row ok. pass downstream
83 | else
84 | fda_skip //filter out this row
85 | } catch {
86 | case e: Exception =>
87 | fda_next(AQMRPTModel(0,r.mid.toInt,r.state,r.county,2000,0,0,false))
88 | //pass an invalid row downstream
89 | }
90 | }
91 | case _ => fda_skip //wrong type, skip
92 | }
93 | }
94 |
95 | //transform data to action for later execution
96 | def toAction: FDAUserTask[FDAROW] = row => {
97 | row match {
98 | case r: AQMRPTModel =>
99 | val queryAction = AQMRPTQuery += r
100 | fda_next(FDAActionRow(queryAction))
101 | case other @ _ => fda_next(other)
102 | }
103 | }
104 |
105 | //get a query runner and an action task
106 | val actionRunner = FDAActionRunner(slick.jdbc.H2Profile)
107 | def runActionRow: FDAUserTask[FDAROW] = action => {
108 | action match {
109 | case FDAActionRow(q) => actionRunner.fda_execAction(q)(db)
110 | fda_skip
111 | case _ => fda_skip
112 | }
113 | }
114 |
115 |
116 | //start the program
117 | val streamAllTasks = streamAQMRaw.appendTask(filterRows)
118 | .appendTask(toAction)
119 | .appendTask(runActionRow)
120 |
121 | val streamToRun = streamAllTasks.onError { case e: Exception => println("Error:"+e.getMessage); fda_appendRow(FDAErrorRow(new Exception(e))) }
122 |
123 | streamToRun.startRun
124 |
125 | //aggregate-task demo: get count and sum of value for each state and year
126 | val orderedAQMRPT = AQMRPTQuery.sortBy(r => (r.state,r.year))
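    | //the sort above matters: aggregateValue below detects a group change by
    | //comparing consecutive rows, so equal (state,year) rows must arrive together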
127 | //TableElementType conversion; must be declared implicit
128 | implicit def toAQMRPT(row: AQMRPTTable#TableElementType) =
129 | AQMRPTModel(row.rid,row.mid,row.state,row.county,row.year,row.value,row.total,row.valid)
130 | val aqmrStreamLoader = FDAStreamLoader(slick.jdbc.H2Profile)(toAQMRPT _)
131 | val aqmrStream: FDAPipeLine[FDAROW] = aqmrStreamLoader.fda_typedStream(orderedAQMRPT.result)(db)(512,512)()()
132 | //user defined aggregator type.
133 | case class Accu(state: String, county: String, year: Int, count: Int, sumOfValue: Int)
134 | //user defined aggregation task
135 | def aggregateValue: FDAAggrTask[Accu,FDAROW] = (accu,row) => {
136 | row match {
137 | case aqmr: AQMRPTModel =>
138 | if (accu.state == "" || (aqmr.state == accu.state && aqmr.year == accu.year))
139 | //same group: increment count and add to sum, pass no row downstream
140 | (Accu(aqmr.state,aqmr.county,aqmr.year,accu.count+1, accu.sumOfValue+aqmr.value),fda_skip)
141 | else
142 | //reset accumulator, create a new aggregated row and pass downstream
143 | (Accu(aqmr.state,aqmr.county,aqmr.year,1, aqmr.value)
144 | ,fda_next(AQMRPTModel(0,9999,accu.state,accu.county,accu.year
145 | ,accu.count,accu.sumOfValue/accu.count,true)))
146 | case FDANullRow =>
147 | //last row encountered. create and pass new aggregated row
148 | (Accu(accu.state,accu.county,accu.year,1, 0)
149 | ,fda_next(AQMRPTModel(0,9999,accu.state,accu.county,accu.year
150 | ,accu.count,accu.sumOfValue/accu.count,true)))
151 | //incorrect row type, do nothing
152 | case _ => (accu,fda_skip)
153 | }
154 | }
155 |
156 |
157 | aqmrStream.aggregateTask(Accu("","",0,0,0),aggregateValue)
158 | .appendTask(toAction)
159 | .appendTask(runActionRow)
160 | .startRun
161 |
162 |
163 | }
164 |
--------------------------------------------------------------------------------
/src/main/scala/examples/ParallelTasks.scala:
--------------------------------------------------------------------------------
1 | package examples
2 | import slick.jdbc.meta._
3 | import com.bayakala.funda._
4 | import api._
5 | import scala.language.implicitConversions
6 | import scala.concurrent.ExecutionContext.Implicits.global
7 | import scala.concurrent.duration._
8 | import scala.concurrent.{Await, Future}
9 | import scala.util.{Failure, Success}
10 | import slick.jdbc.H2Profile.api._
11 | import com.bayakala.funda.samples.SlickModels._
12 | import fs2.Strategy
13 |
14 | object ParallelTasks extends App {
15 |
16 | val db = Database.forConfig("h2db")
17 |
18 | //drop original table schema
19 | val futVectorTables = db.run(MTable.getTables)
20 |
21 | val futDropTable = futVectorTables.flatMap{ tables => {
22 | val tableNames = tables.map(t => t.name.name)
23 | if (tableNames.contains(NORMAQMQuery.baseTableRow.tableName))
24 | db.run(NORMAQMQuery.schema.drop)
25 | else Future(():Unit)
26 | }
27 | }.andThen {
28 | case Success(_) => println(s"Table ${NORMAQMQuery.baseTableRow.tableName} dropped successfully! ")
29 | case Failure(e) => println(s"Failed to drop Table ${NORMAQMQuery.baseTableRow.tableName}, it may not exist! Error: ${e.getMessage}")
30 | }
31 | Await.ready(futDropTable,Duration.Inf)
32 |
33 | //create new table to refine AQMRawTable
34 | val actionCreateTable = NORMAQMQuery.schema.create
35 | val futCreateTable = db.run(actionCreateTable).andThen {
36 | case Success(_) => println("Table created successfully!")
37 | case Failure(e) => println(s"Table may exist already! Error: ${e.getMessage}")
38 | }
39 | //carry on even if table creation fails
40 | Await.ready(futCreateTable,Duration.Inf)
41 |
42 |
43 | //truncate data (schema.truncate requires Slick 3.2.1+)
44 | val futTruncateTable = futVectorTables.flatMap{ tables => {
45 | val tableNames = tables.map(t => t.name.name)
46 | if (tableNames.contains(NORMAQMQuery.baseTableRow.tableName))
47 | db.run(NORMAQMQuery.schema.truncate)
48 | else Future(():Unit)
49 | }
50 | }.andThen {
51 | case Success(_) => println(s"Table ${NORMAQMQuery.baseTableRow.tableName} truncated successfully!")
52 | case Failure(e) => println(s"Failed to truncate Table ${NORMAQMQuery.baseTableRow.tableName}! Error: ${e.getMessage}")
53 | }
54 | Await.ready(futTruncateTable,Duration.Inf)
55 |
56 | //a contrived task designed to consume resources:
57 | //get the id for a given state name from the STATES table
58 | def getStateID(state: String): Int = {
59 | //create a stream for state id with state name
60 | implicit def toState(row: StateTable#TableElementType) = StateModel(row.id,row.name)
61 | val stateLoader = FDAViewLoader(slick.jdbc.H2Profile)(toState _)
62 | val stateSeq = stateLoader.fda_typedRows(StateQuery.result)(db).toSeq
63 | //construct a Stream[Task,FDAROW] from the static sequence
64 | val stateStream = fda_staticSource(stateSeq)()
65 | var id = -1
66 | def getid: FDAUserTask[FDAROW] = row => {
67 | row match {
68 | case StateModel(stid,stname) => //target row type
69 | if (stname.contains(state)) {
70 | id = stid
71 | fda_break //exit
72 | }
73 | else fda_skip //take next row
74 | case _ => fda_skip
75 | }
76 | }
77 | stateStream.appendTask(getid).startRun
78 | id
79 | }
80 | //another contrived task designed to consume resources:
81 | //get the id for given state and county names from the COUNTIES table
82 | def getCountyID(state: String, county: String): Int = {
83 | //create a stream for county id with state name and county name
84 | implicit def toCounty(row: CountyTable#TableElementType) = CountyModel(row.id,row.name)
85 | val countyLoader = FDAViewLoader(slick.jdbc.H2Profile)(toCounty _)
86 | val countySeq = countyLoader.fda_typedRows(CountyQuery.result)(db).toSeq
87 | //construct a Stream[Task,FDAROW] from the static sequence
88 | val countyStream = fda_staticSource(countySeq)()
89 | var id = -1
90 | def getid: FDAUserTask[FDAROW] = row => {
91 | row match {
92 | case CountyModel(cid,cname) => //target row type
93 | if (cname.contains(state) && cname.contains(county)) {
94 | id = cid
95 | fda_break //exit
96 | }
97 | else fda_skip //take next row
98 | case _ => fda_skip
99 | }
100 | }
101 | countyStream.appendTask(getid).startRun
102 | id
103 | }
104 |
105 | //original table listing
106 | implicit def toAQMRPT(row: AQMRPTTable#TableElementType) =
107 | AQMRPTModel(row.rid,row.mid,row.state,row.county,row.year,row.value,row.total,row.valid)
108 | val AQMRPTLoader = FDAStreamLoader(slick.jdbc.H2Profile)(toAQMRPT _)
109 | val AQMRPTStream = AQMRPTLoader.fda_typedStream(AQMRPTQuery.result)(db)(256,256)()()
110 |
111 | def getIdsThenInsertAction: FDAUserTask[FDAROW] = row => {
112 | row match {
113 | case aqm: AQMRPTModel =>
114 | if (aqm.valid) {
115 | val stateId = 0 //getStateID(aqm.state)
116 | val countyId = 0 //getCountyID(aqm.state,aqm.county)
117 | val action = NORMAQMQuery += NORMAQMModel(0,aqm.mid, stateId, countyId, aqm.year,aqm.value,aqm.total)
118 | fda_next(FDAActionRow(action))
119 | }
120 | else fda_skip
121 | case _ => fda_skip
122 | }
123 | }
124 | val runner = FDAActionRunner(slick.jdbc.H2Profile)
125 | def runInsertAction: FDAUserTask[FDAROW] = row =>
126 | row match {
127 | case FDAActionRow(action) =>
128 | runner.fda_execAction(action)(db)
129 | fda_skip
130 | case _ => fda_skip
131 | }
132 |
133 | val cnt_start = System.currentTimeMillis()
134 |
135 |
136 | /*
137 | AQMRPTStream.take(100000)
138 | .appendTask(getIdsThenInsertAction)
139 | .appendTask(runInsertAction)
140 | .startRun
141 | //println(s"processing 10000 rows in a single thread in ${(System.currentTimeMillis - cnt_start)/1000} seconds")
142 | //processing 10000 rows in a single thread in 570 seconds
143 | //println(s"processing 20000 rows in a single thread in ${(System.currentTimeMillis - cnt_start)/1000} seconds")
144 | //processing 20000 rows in a single thread in 1090 seconds
145 | //println(s"processing 100000 rows in a single thread in ${(System.currentTimeMillis - cnt_start)/1000} seconds")
146 | //processing 100000 rows in a single thread in 2+ hrs
147 | */
148 |
149 | implicit val strategy = Strategy.fromCachedDaemonPool("cachedPool")
150 | // implicit val strategy = Strategy.fromFixedDaemonPool(6)
151 | fda_runPar(AQMRPTStream.toPar(getIdsThenInsertAction))(4)
152 | .appendTask(runInsertAction)
153 | .startRun
154 |
155 | //println(s"processing 10000 rows parallelly in ${(System.currentTimeMillis - cnt_start)/1000} seconds")
156 | // processing 10000 rows parallelly in 316 seconds
157 | //println(s"processing 20000 rows parallelly in ${(System.currentTimeMillis - cnt_start)/1000} seconds")
158 | //processing 20000 rows parallelly in 614 seconds
159 | println(s"processing 100000 rows parallelly in ${(System.currentTimeMillis - cnt_start)/1000} seconds")
160 | //processing 100000 rows parallelly in 3885 seconds
161 |
162 | }
163 |
--------------------------------------------------------------------------------
/src/main/scala/examples/ParallelExecution.scala:
--------------------------------------------------------------------------------
1 | package examples
2 | import slick.jdbc.meta._
3 | import com.bayakala.funda._
4 | import api._
5 | import scala.language.implicitConversions
6 | import scala.concurrent.ExecutionContext.Implicits.global
7 | import scala.concurrent.duration._
8 | import scala.concurrent.{Await, Future}
9 | import scala.util.{Failure, Success}
10 | import slick.jdbc.H2Profile.api._
11 | import com.bayakala.funda.samples.SlickModels._
12 |
13 |
14 | object ParallelExecution extends App {
15 |
16 | val db = Database.forConfig("h2db")
17 |
18 | //drop original table schema
19 | val futVectorTables = db.run(MTable.getTables)
20 |
21 | val futDropTable = futVectorTables.flatMap{ tables => {
22 | val tableNames = tables.map(t => t.name.name)
23 | if (tableNames.contains(NORMAQMQuery.baseTableRow.tableName))
24 | db.run(NORMAQMQuery.schema.drop)
25 | else Future(():Unit)
26 | }
27 | }.andThen {
28 | case Success(_) => println(s"Table ${NORMAQMQuery.baseTableRow.tableName} dropped successfully! ")
29 | case Failure(e) => println(s"Failed to drop Table ${NORMAQMQuery.baseTableRow.tableName}, it may not exist! Error: ${e.getMessage}")
30 | }
31 | Await.ready(futDropTable,Duration.Inf)
32 |
33 | //create new table to refine AQMRawTable
34 | val actionCreateTable = NORMAQMQuery.schema.create
35 | val futCreateTable = db.run(actionCreateTable).andThen {
36 | case Success(_) => println("Table created successfully!")
37 | case Failure(e) => println(s"Table may exist already! Error: ${e.getMessage}")
38 | }
39 | //carry on even if table creation fails
40 | Await.ready(futCreateTable,Duration.Inf)
41 |
42 |
43 | //truncate data (schema.truncate requires Slick 3.2.1+)
44 | val futTruncateTable = futVectorTables.flatMap{ tables => {
45 | val tableNames = tables.map(t => t.name.name)
46 | if (tableNames.contains(NORMAQMQuery.baseTableRow.tableName))
47 | db.run(NORMAQMQuery.schema.truncate)
48 | else Future(():Unit)
49 | }
50 | }.andThen {
51 | case Success(_) => println(s"Table ${NORMAQMQuery.baseTableRow.tableName} truncated successfully!")
52 | case Failure(e) => println(s"Failed to truncate Table ${NORMAQMQuery.baseTableRow.tableName}! Error: ${e.getMessage}")
53 | }
54 | Await.ready(futTruncateTable,Duration.Inf)
55 |
56 | //a contrived task designed to consume resources:
57 | //get the id for a given state name from the STATES table
58 | def getStateID(state: String): Int = {
59 | //create a stream for state id with state name
60 | implicit def toState(row: StateTable#TableElementType) = StateModel(row.id,row.name)
61 | val stateLoader = FDAViewLoader(slick.jdbc.H2Profile)(toState _)
62 | val stateSeq = stateLoader.fda_typedRows(StateQuery.result)(db).toSeq
63 | //construct a Stream[Task,FDAROW] from the static sequence
64 | val stateStream = fda_staticSource(stateSeq)()
65 | var id = -1
66 | def getid: FDAUserTask[FDAROW] = row => {
67 | row match {
68 | case StateModel(stid,stname) => //target row type
69 | if (stname.contains(state)) {
70 | id = stid
71 | fda_break //exit
72 | }
73 | else fda_skip //take next row
74 | case _ => fda_skip
75 | }
76 | }
77 | stateStream.appendTask(getid).startRun
78 | id
79 | }
80 | //another contrived task designed to consume resources:
81 | //get the id for given state and county names from the COUNTIES table
82 | def getCountyID(state: String, county: String): Int = {
83 | //create a stream for county id with state name and county name
84 | implicit def toCounty(row: CountyTable#TableElementType) = CountyModel(row.id,row.name)
85 | val countyLoader = FDAViewLoader(slick.jdbc.H2Profile)(toCounty _)
86 | val countySeq = countyLoader.fda_typedRows(CountyQuery.result)(db).toSeq
87 | //construct a Stream[Task,FDAROW] from the static sequence
88 | val countyStream = fda_staticSource(countySeq)()
89 | var id = -1
90 | def getid: FDAUserTask[FDAROW] = row => {
91 | row match {
92 | case CountyModel(cid,cname) => //target row type
93 | if (cname.contains(state) && cname.contains(county)) {
94 | id = cid
95 | fda_break //exit
96 | }
97 | else fda_skip //take next row
98 | case _ => fda_skip
99 | }
100 | }
101 | countyStream.appendTask(getid).startRun
102 | id
103 | }
104 |
105 | //process input row and produce action row to insert into NORMAQM
106 | def getIdsThenInsertAction: FDAUserTask[FDAROW] = row => {
107 | row match {
108 | case aqm: AQMRPTModel =>
109 | if (aqm.valid) {
110 | val stateId = getStateID(aqm.state)
111 | val countyId = getCountyID(aqm.state,aqm.county)
112 | val action = NORMAQMQuery += NORMAQMModel(0,aqm.mid, stateId, countyId, aqm.year,aqm.value,aqm.total)
113 | fda_next(FDAActionRow(action))
114 | }
115 | else fda_skip
116 | case _ => fda_skip
117 | }
118 | }
119 | //runner for the action rows
120 | val runner = FDAActionRunner(slick.jdbc.H2Profile)
121 | def runInsertAction: FDAUserTask[FDAROW] = row =>
122 | row match {
123 | case FDAActionRow(action) =>
124 | runner.fda_execAction(action)(db)
125 | fda_skip
126 | case _ => fda_skip
127 | }
128 |
129 | //create parallel sources
130 | //get a stream of years
131 | val qryYears = AQMRPTQuery.map(_.year).distinct
132 | case class Years(year: Int) extends FDAROW
133 |
134 | implicit def toYears(y: Int) = Years(y)
135 |
136 | val yearViewLoader = FDAViewLoader(slick.jdbc.H2Profile)(toYears _)
137 | val yearSeq = yearViewLoader.fda_typedRows(qryYears.result)(db).toSeq
138 | val yearStream = fda_staticSource(yearSeq)()
139 |
140 | //strong row type
141 | implicit def toAQMRPT(row: AQMRPTTable#TableElementType) =
142 | AQMRPTModel(row.rid, row.mid, row.state, row.county, row.year, row.value, row.total, row.valid)
143 |
144 | //shared stream loader when operate in parallel mode
145 | val AQMRPTLoader = FDAStreamLoader(slick.jdbc.H2Profile)(toAQMRPT _)
146 |
147 | //loading rows with year yr
148 | def loadRowsInYear(yr: Int) = {
149 | //a new query
150 | val query = AQMRPTQuery.filter(row => row.year === yr)
151 | //reuse same loader
152 | AQMRPTLoader.fda_typedStream(query.result)(db)(256, 256)(println(s"End of stream ${yr}!!!!!!"))()
153 | }
154 |
155 | //loading rows by year
156 | def loadRowsByYear: FDASourceLoader = row => {
157 | row match {
158 | case Years(y) => loadRowsInYear(y) //produce stream of the year
159 | case _ => fda_appendRow(FDANullRow)
160 | }
161 | }
162 |
163 |
164 | //start counter
165 | val cnt_start = System.currentTimeMillis()
166 |
167 | def showRecord: FDAUserTask[FDAROW] = row => {
168 | row match {
169 | case Years(y) => println(y); fda_skip
170 | case aqm: AQMRPTModel =>
171 | println(s"${aqm.year} $aqm")
172 | fda_next(aqm)
173 | case FDAActionRow(action) =>
174 | println(s"${action}")
175 | fda_skip
176 | case _ => fda_skip
177 | }
178 | }
179 |
180 | //the following is a process of composition of stream combinators
181 | //get parallel source constructor
182 | val parSource = yearStream.toParSource(loadRowsByYear)
183 | //produce a stream from parallel sources
184 | val source = fda_par_source(parSource)(4)
185 | //turn getIdsThenInsertAction into parallel task
186 | val parTasks = source.toPar(getIdsThenInsertAction)
187 | //runPar to produce a new stream
188 | val actionStream = fda_runPar(parTasks)(4)
189 | //turn runInsertAction into parallel task
190 | val parRun = actionStream.toPar(runInsertAction)
191 | //runPar and carry out by startRun
192 | fda_runPar(parRun)(2).startRun
193 |
194 | println(s"processing 219400 rows parallelly in ${(System.currentTimeMillis - cnt_start)/1000} seconds")
195 |
196 |
197 |
198 | }
199 |
--------------------------------------------------------------------------------
/license.txt:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/src/main/scala/com/bayakala/funda/package.scala:
--------------------------------------------------------------------------------
1 | package com.bayakala
2 |
3 | /**
4 | * FunDA core types, global imports and fs2 stream method injection
5 | */
6 | package object funda {
7 | import fs2._
8 | import slick.dbio._
9 | import scala.concurrent.Future
10 | import akka.stream.stage._
11 |
12 | implicit val fda_strategy = Strategy.fromFixedDaemonPool(4)
13 | implicit val fda_scheduler = Scheduler.fromFixedDaemonPool(4)
14 |
15 | /** fs2 manual halt type
16 | * terminateNow signals FDADataStream.pushData to stop
17 | * at any point during the enqueue process
18 | */
19 | class Fs2Terminator {
20 | var terminateNow = false
21 | def reset = terminateNow = false
22 | def stopASAP = terminateNow = true
23 | }
24 | /** default killswitch for fs2
25 | * declare separate instances to control multiple concurrently running streams
26 | * by providing the killSwitch parameter explicitly
27 | */
28 | implicit object Fs2KillSwitch extends Fs2Terminator
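    | //usage sketch (hypothetical names): a dedicated kill switch for one stream,
    | //passed explicitly so other streams keep using the default Fs2KillSwitch:
    | //  object MyKillSwitch extends Fs2Terminator
    | //  val src = loader.fda_typedStream(qry.result)(db)(512,128)()(MyKillSwitch)
    | //  MyKillSwitch.stopASAP   //halt this stream at the next enqueue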
29 | /** akka manual halt type
30 | * terminateNow signals FDADataStream.Fs2Gate to stop
31 | * at any point during the enqueue process
32 | */
33 | class AkkaTerminator{
34 | var callback: AsyncCallback[Unit] = null
35 | def stopASAP = {
36 | if (callback != null) {
37 | callback.invoke(())
38 | }
39 | }
40 | }
41 | /** default killswitch for akka
42 | * declare separate instances to control multiple concurrently running streams
43 | * by providing the killSwitch parameter explicitly
44 | */
45 | implicit object AkkaKillSwitch extends AkkaTerminator
46 |
47 | /** data processing pipeline
48 | * a stream of data or action rows
49 | * @tparam ROW type of row
50 | */
51 | type FDAPipeLine[ROW] = Stream[Task, ROW]
52 |
53 | /** data work node
54 | * a work node appended to stream to perform user action
55 | * @tparam ROW type of row
56 | */
57 | type FDAWorkNode[ROW] = Pipe[Task, ROW, ROW]
58 |
59 | /** pipeline valve
60 | * a handle to get rows from upstream
61 | * @tparam ROW type of row
62 | */
63 | type FDAValve[ROW] = Handle[Task, ROW]
64 |
65 | /** pipe connector
66 | * gate to send rows downstream
67 | * @tparam ROW type of row
68 | */
69 | type FDAPipeJoint[ROW] = Pull[Task, ROW, Unit]
70 |
71 | /** user task type
72 | * user define function to be performed at a FDAWorkNode
73 | * given a row from upstream, return Option[List[ROW]] as follows:
74 | * fda_skip -> Some(Nil) : skip sending the current row
75 | * fda_next -> Some(List(r1,r2...)): send r1,r2... downstream
76 | * fda_break -> None : halt stream, end of process
77 | * @tparam ROW type of row
78 | */
79 | type FDAUserTask[ROW] = (ROW) => (Option[List[ROW]])
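    | //a minimal FDAUserTask sketch over a hypothetical row type MyRow, using the
    | //helpers that produce the three values listed above:
    | //  def passValidRows: FDAUserTask[FDAROW] = row => row match {
    | //    case r: MyRow if r.valid => fda_next(r)  //Some(List(r)): send downstream
    | //    case _: MyRow            => fda_skip     //Some(Nil): drop this row
    | //    case FDANullRow          => fda_break    //None: halt the stream
    | //    case other               => fda_next(other)
    | //  }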
80 |
81 | /** source loader type
82 | * a function type to produce a stream from an input row;
83 | * turned into an FDAParSource by toParSource
84 | */
85 | type FDASourceLoader = FDAROW => FDAPipeLine[FDAROW]
86 |
87 | /** aggregation task type
88 | * user define function with aggregation effect to be performed at a FDAWorkNode
89 | * given current aggregation value and row from upstream,
90 | * return updated aggregation value and Option[List[ROW]] as follows:
91 | * fda_skip -> Some(Nil) : skip sending the current row
92 | * fda_next -> Some(List(r1,r2...)): send r1,r2... downstream
93 | * fda_break -> None : halt stream, end of process
94 | * @tparam AGGR type of aggregation
95 | * @tparam ROW type of row
96 | */
97 | type FDAAggrTask[AGGR,ROW] = (AGGR,ROW) => (AGGR,Option[List[ROW]])
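    | //a minimal FDAAggrTask sketch: count rows in an Int accumulator and emit a
    | //single (hypothetical) CountRow when FDANullRow marks the end of the stream:
    | //  case class CountRow(n: Int) extends FDAROW
    | //  def countRows: FDAAggrTask[Int,FDAROW] = (n,row) => row match {
    | //    case FDANullRow => (n, fda_next(CountRow(n)))  //flush at end of stream
    | //    case _          => (n + 1, fda_skip)           //accumulate, emit nothing
    | //  }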
98 |
99 | /** parallel task type
100 | * stream of streams type for parallel running user action
101 | * use stream.toPar to convert from FDAUserTask
102 | */
103 | type FDAParTask = Stream[Task,Stream[Task,Option[List[FDAROW]]]]
104 |
105 | /** parallel source type
106 | * source of sources type for parallel loading data sources
107 | * use stream.toParSource to convert from FDASourceLoader
108 | */
109 | type FDAParSource = Stream[Task,Stream[Task,FDAROW]]
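    | //typical composition (sketch; someTask/someLoader are stand-ins): lift a task
    | //or source loader into its parallel form, then merge the results:
    | //  val parTasks: FDAParTask  = stream.toPar(someTask)
    | //  val merged = fda_runPar(parTasks)(4)         //up to 4 concurrent tasks
    | //  val parSrc: FDAParSource = stream.toParSource(someLoader)
    | //  val source = fda_par_source(parSrc)(4)       //up to 4 concurrent sources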
110 |
111 | /** data row type
112 | * topmost generic row type
113 | */
114 | trait FDAROW
115 |
116 | /**
117 | * an EOS object used to signify end of stream
118 | */
119 | case object FDANullRow extends FDAROW
120 |
121 | /**
122 | * captures an exception in a row
123 | * @param e the captured exception
124 | */
125 | case class FDAErrorRow(e: Exception) extends FDAROW
126 |
127 | /**
128 | * manually emit a row such as FDANullRow or FDAErrorRow
129 | * @example {{{
130 | * //loading rows by year
131 | * def loadRowsByYear: FDASourceLoader = row => {
132 | * row match {
133 | * case Years(y) => loadRowsInYear(y) //produce stream of the year
134 | * case _ => fda_appendRow(FDANullRow)
135 | * }
136 | * }
137 | * }}}
138 | * @param row row to emit
139 | * @return new stream
140 | */
141 | def fda_appendRow(row: FDAROW): FDAPipeLine[FDAROW] = Stream(row)
142 |
143 | /**
144 | * runnable action type
145 | */
146 | type FDAAction = DBIO[Int]
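    | //e.g. a Slick insert yields a typical FDAAction (sketch, with a hypothetical
    | //SomeQuery table): val ins: FDAAction = SomeQuery += someRow  //DBIO[Int]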
147 |
148 | /**
149 | * action row type. can have further distinct child types as follows:
150 | * @example {{{
151 | * scala> class MyActionRow(action: FDAAction) extends FDAActionRow(action)
152 | * }}}
153 | * @param action runnable action
154 | */
155 | case class FDAActionRow(action: FDAAction) extends FDAROW
156 |
157 | /**
158 | * methods injected to fs2Stream
159 | */
160 | implicit class toFDAOps(fs2Stream: FDAPipeLine[FDAROW]) {
161 | /**
162 | * append a user task t to stream
163 | * @example {{{
164 | * val streamAllTasks = streamAQMRaw.appendTask(filterRows)
165 | * .appendTask(toAction)
166 | * .appendTask(runActionRow)
167 | * }}}
168 | * @param t user defined function
169 | * @return new stream
170 | */
171 | def appendTask(t: FDAUserTask[FDAROW]): FDAPipeLine[FDAROW] =
172 | fs2Stream.through(FDATask.fda_execUserTask(t))
173 |
174 | /**
175 | * append a user defined aggregation task t
176 | * @example {{{
177 | * //user defined aggregator type.
178 | * case class Accu(state: String, county: String, year: Int, count: Int, sumOfValue: Int)
179 | *
180 | * aqmrStream.aggregateTask(Accu("","",0,0,0),aggregateValue)
181 | * .appendTask(toAction)
182 | * .appendTask(runActionRow)
183 | * .startRun
184 | * }}}
185 | * @param aggr initial value of aggregation
186 | * @param t user defined task
187 | * @tparam AGGR type of aggr
188 | * @return new stream
189 | */
190 | def aggregateTask[AGGR](aggr: AGGR, t: FDAAggrTask[AGGR, FDAROW]): FDAPipeLine[FDAROW] =
191 | fs2Stream.through(FDATask.fda_aggregate(aggr, t))
192 |
193 | /**
194 | * replaces Stream[Task,ROW].run.unsafeRun
195 | * @example {{{
196 | * streamAQMRaw.appendTask(filterRows)
197 | * .appendTask(toAction)
198 | * .appendTask(runActionRow)
199 | * .startRun
200 | * }}}
201 | */
202 | def startRun: Unit = fs2Stream.run.unsafeRun
203 |
204 | /**
205 | * replaces Stream[Task,ROW].run.unsafeRunAsyncFuture
206 | * returns immediately
207 | *
208 | * @return Future
209 | */
210 | def startFuture[A]: Future[Unit] = fs2Stream.run.unsafeRunAsyncFuture
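    | //usage sketch: start the stream without blocking, then wait elsewhere:
    | //  val done: Future[Unit] = stream.startFuture
    | //  Await.ready(done, Duration.Inf)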
211 |
212 |
213 | /**
214 | * turn user task into type for parallel computation
215 | * @example {{{
216 | * //runner for the action rows
217 | * val runner = FDAActionRunner(slick.jdbc.H2Profile)
218 | * def runInsertAction: FDAUserTask[FDAROW] = row =>
219 | * row match {
220 | * case FDAActionRow(action) =>
221 | * runner.fda_execAction(action)(db)
222 | * fda_skip
223 | * case _ => fda_skip
224 | * }
225 | *
226 | * //turn runInsertAction into parallel task
227 | * val parRun = actionStream.toPar(runInsertAction)
228 | * }}}
229 | * @param st user defined task
230 | * @return stream of streams
231 | */
232 | def toPar(st: FDAUserTask[FDAROW]): FDAParTask =
233 | fs2Stream.map { row =>
234 | Stream.eval(Task {
235 | st(row)
236 | })
237 | }
238 |
239 | /**
240 | * turn a single stream into parallel sources
241 | * @example {{{
242 | * //loading rows with year yr
243 | * def loadRowsInYear(yr: Int) = {
244 | * //a new query
245 | * val query = AQMRPTQuery.filter(row => row.year === yr)
246 | * //reuse same loader
247 | * AQMRPTLoader.fda_typedStream(query.result)(db)(256, 256)(println(s"End of stream ${yr}!!!!!!"))()
248 | * }
249 | *
250 | * //loading rows by year
251 | * def loadRowsByYear: FDASourceLoader = row => {
252 | * row match {
253 | * case Years(y) => loadRowsInYear(y) //produce stream of the year
254 | * case _ => fda_appendRow(FDANullRow)
255 | * }
256 | * }
257 | *
258 | * //produce a stream from parallel sources
259 | * val source = fda_par_source(parSource)(4)
260 | * }}}
261 | * @param load stream constructing function: FDAROW => FDAPipeLine[FDAROW]
262 | * @return stream of streams
263 | */
264 | def toParSource(load: FDASourceLoader): FDAParSource =
265 | fs2Stream.map(row => load(row))
266 |
267 | }
268 |
269 |
270 | /** methods to run a user defined function on FDAPipeLine */
271 | object FDATask { //work methods for the task nodes
272 | /**
273 | * returns the state of the next work node: using fs2 Handle and Pull,
274 | * take the next element, apply the task function and determine the new state of the stream
275 | * @param task user defined function: ROW => Option[List[ROW]]
276 | * returns an Option[List[ROW]]] value signifying movement downstream
277 | * as follows:
278 | * Some(Nil) : skip sending the current row
279 | * Some(List(r1,r2...)): send r1,r2... downstream
280 | * None : halt stream, end of process
281 | * @tparam ROW row type: FDAROW or FDAActionROW
282 | * @return new state of stream
283 | */
284 | private[funda] def fda_execUserTask[ROW](task: FDAUserTask[ROW]): FDAWorkNode[ROW] = {
285 | def go: FDAValve[ROW] => FDAPipeJoint[ROW] = h => {
286 | h.receive1Option {
287 | case Some((r, h)) => task(r) match {
288 | case Some(lx) => lx match {
289 | case Nil => go(h)
290 | case _ => Pull.output(Chunk.seq(lx)) >> go(h)
291 | }
292 | case None => task(FDANullRow.asInstanceOf[ROW]) match {
293 | case Some(lx) => lx match {
294 | case Nil => Pull.done
295 | case _ => Pull.output(Chunk.seq(lx)) >> Pull.done
296 | }
297 | case _ => Pull.done
298 | }
299 | }
300 | case None => task(FDANullRow.asInstanceOf[ROW]) match {
301 | case Some(lx) => lx match {
302 | case Nil => Pull.done
303 | case _ => Pull.output(Chunk.seq(lx)) >> Pull.done
304 | }
305 | case _ => Pull.done
306 | }
307 | }
308 | }
309 | in => in.pull(go)
310 | }
311 | /**
312 | * returns state of next worknode and some aggregation defined inside user function.
313 | * execute user defined function with internal aggregation mechanism by means of
314 | * functional state transition style of passing in state and return new state.
315 | * take in current aggregation and next row, apply user function on both
316 | * and determine new state of stream
317 | * @param aggr user selected type of aggregation such as Int, (Int,Int) ...
318 | * @param task user defined function: (AGGR,ROW) => (AGGR,Option[List[ROW]])
319 | * take in current aggregation and row,
320 | * and return new aggregation and Option[List[ROW]] with meaning of:
321 | * Some(Nil) : skip sending the current row
322 | * Some(List(r1,r2...)): send r1,r2... downstream
323 | * None : halt stream, end of process
324 | * @tparam AGGR type of aggr
325 | * @tparam ROW type of row
326 | * @return new state of stream
327 | */
328 | private[funda] def fda_aggregate[AGGR,ROW](aggr: AGGR, task: FDAAggrTask[AGGR,ROW]): FDAWorkNode[ROW] = {
329 | def go(acc: AGGR): FDAValve[ROW] => FDAPipeJoint[ROW] = h => {
330 | h.receive1Option {
331 | case Some((r, h)) => task(acc,r) match {
332 | case (a,Some(lx)) => lx match {
333 | case Nil => go(a)(h)
334 | case _ => Pull.output(Chunk.seq(lx)) >> go(a)(h)
335 | }
336 | case (a,None) => task(a,FDANullRow.asInstanceOf[ROW]) match {
337 | case (a,Some(lx)) => lx match {
338 | case Nil => Pull.done
339 | case _ => Pull.output(Chunk.seq(lx)) >> Pull.done
340 | }
341 | case _ => Pull.done
342 | }
343 | }
344 | case None => task(acc,FDANullRow.asInstanceOf[ROW]) match {
345 | case (a,Some(lx)) => lx match {
346 | case Nil => Pull.done
347 | case _ => Pull.output(Chunk.seq(lx)) >> Pull.done
348 | }
349 | case _ => Pull.done
350 | }
351 | }
352 | }
353 | in => in.pull(go(aggr))
354 | }
355 |
356 |
357 | }
358 |
359 |
360 | }
361 |
362 |
--------------------------------------------------------------------------------
/src/main/scala/com/bayakala/funda/fdasources/FDADataStream.scala:
--------------------------------------------------------------------------------
1 | package com.bayakala.funda.fdasources
2 |
3 | import fs2._
4 | import play.api.libs.iteratee._
5 | import com.bayakala.funda._
6 | import slick.jdbc.JdbcProfile
7 |
8 | import akka.actor._
9 | import akka.stream.scaladsl._
10 | import akka.stream._
11 | import akka.stream.stage._
12 | import akka.stream.stage.{GraphStage, GraphStageLogic}
13 |
14 | /** stream loader class wrapper */
15 | trait FDADataStream {
16 |
17 | /** running Slick DBIOAction to produce a data stream conforming to reactive-streams api.
18 | * provide strong typed result conversion if required
19 | * @param slickProfile Slick jdbc profile such as 'slick.jdbc.H2Profile'
20 | * @param convert a defined implicit type conversion function.
21 | * from SOURCE type to TARGET type, set to null if not required
22 | * @tparam SOURCE source type, result type of DBIOAction, most likely a tuple type
23 | * @tparam TARGET final converted type, most likely a case class type
24 | */
25 | class FDAStreamLoader[SOURCE, TARGET](slickProfile: JdbcProfile, convert: SOURCE => TARGET) {
26 |
27 | import slickProfile.api._
28 |
29 | /**
30 | * returns a reactive-stream from Slick DBIOAction result
31 | * using play-iteratees and an fs2 queue to connect to the slick data stream publisher
32 | * provide facade for error handler and finalizer to support exception and cleanup handling
33 | * also provide stream element conversion from SOURCE type to TARGET type
34 | * @example {{{
35 | * val streamLoader = FDAStreamLoader(slick.jdbc.H2Profile)(toTypedRow _)
36 | * val streamSource = streamLoader.fda_typedStream(aqmQuery.result)(db)(512,16,100)()()
37 | * val safeStreamSource = streamLoader.fda_typedStream(aqmQuery.result)(db)(512,16)(
38 | * println("the end finally!"))(killSwitch)
39 | * }}}
40 | * @param action a Slick DBIOAction to produce query results
41 | * @param slickDB Slick database object
42 | * @param fetchSize number of rows cached during database read
43 | * @param queSize size of the queue used by the iteratee as a cache to pass elements to the fs2 stream
44 | * @param take take only the first 'take' elements (0 means no limit)
45 | * @param finalizer cleanup callback
46 | * @param killSwitch use killSwitch.stopASAP to halt stream
47 | * @param convert just a measure to guarantee conversion function is defined
48 | * when this function is used there has to be a converter defined
49 | * implicitly in compile time
50 | * @return a reactive-stream of TARGET row type elements
51 | */
52 | def fda_typedStream(action: DBIOAction[Iterable[SOURCE],Streaming[SOURCE],Effect.Read])(
53 | slickDB: Database)(
54 | fetchSize: Int, queSize: Int, take: Int = 0)(
55 | finalizer: => Unit = ())(
56 | killSwitch: Fs2Terminator = Fs2KillSwitch)(
57 | implicit convert: SOURCE => TARGET)
58 | : FDAPipeLine[TARGET] = {
59 | val disableAutocommit = SimpleDBIO(_.connection.setAutoCommit(false))
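    | //turning autocommit off lets the jdbc driver stream results with a cursor
    | //instead of materializing the whole result set (required by e.g. PostgreSQL)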
60 | val action_ = action.withStatementParameters(fetchSize = fetchSize)
61 | val publisher = slickDB.stream(disableAutocommit andThen action_)
62 | val enumerator = streams.IterateeStreams.publisherToEnumerator(publisher)
63 |
64 | val s = Stream.eval(async.boundedQueue[Task,Option[SOURCE]](queSize)).flatMap { q =>
65 | Task { Iteratee.flatten(enumerator |>> pushData(killSwitch,take,q)).run }.unsafeRunAsyncFuture()
66 | pipe.unNoneTerminate(q.dequeue).map {row => convert(row)}
67 | }
68 | s.onFinalize(Task.delay(finalizer))
69 |
70 | }
71 | /**
72 | * returns a reactive-stream from Slick DBIOAction result
73 | * using akka-stream to connect to slick data stream publisher
74 | * provide facade for error handler and finalizer to support exception and cleanup handling
75 | * @example {{{
76 | * val streamLoader = FDAStreamLoader(slick.jdbc.H2Profile)(toTypedRow _)
77 | * val streamSource = streamLoader.fda_akkaTypedStream(aqmQuery.result)(db)(512,2,100)()()
78 | * val safeStreamSource = streamLoader.fda_akkaTypedStream(aqmQuery.result)(db)(512,2)(
79 | * println("the end finally!"))(killSwitch)
80 | * }}}
81 | * @param action a Slick DBIOAction to produce query results
82 | * @param slickDB Slick database object
83 | * @param fetchSize number of rows cached during database read
84 | * @param queSize size of the queue used by akka-stream as a cache to pass elements to the fs2 queue
85 | * @param take take only the first 'take' elements (0 means no limit)
86 | * @param finalizer cleanup callback
87 | * @param killSwitch use killSwitch.stopASAP to halt stream
88 | * @param convert just a measure to guarantee conversion function is defined
89 | * when this function is used there has to be a converter defined
90 | * implicitly in compile time
91 | * @return a reactive-stream of TARGET row type elements
92 | */
93 | def fda_akkaTypedStream(action: DBIOAction[Iterable[SOURCE],Streaming[SOURCE],Effect.Read])(
94 | slickDB: Database)(
95 | fetchSize: Int, queSize: Int, take: Int = 0)(
96 | finalizer: => Unit = ())(
97 | killSwitch: AkkaTerminator = AkkaKillSwitch)(
98 | implicit convert: SOURCE => TARGET)
99 | : FDAPipeLine[TARGET] = {
100 | val disableAutocommit = SimpleDBIO(_.connection.setAutoCommit(false))
101 | val action_ = action.withStatementParameters(fetchSize = fetchSize)
102 | val publisher = slickDB.stream(disableAutocommit andThen action_)
103 | implicit val actorSys = ActorSystem("actor-system")
104 | implicit val ec = actorSys.dispatcher
105 | implicit val mat = ActorMaterializer()
106 | // construct akka source
107 | val akkaSource = Source.fromPublisher[SOURCE](publisher)
108 |
109 | val s = Stream.eval(async.boundedQueue[Task,Option[SOURCE]](queSize))
110 | .flatMap { q =>
111 | Task(akkaSource.to(new FS2Gate[SOURCE](killSwitch, take, q)).run).unsafeRunAsyncFuture //enqueue Task(new thread)
112 | pipe.unNoneTerminate(q.dequeue).map {row => convert(row)} //dequeue in current thread
113 | }
114 | s.onFinalize{Task.delay{actorSys.terminate();finalizer}}
115 | }
116 | /**
117 | * returns a reactive-stream from Slick DBIOAction result
118 | * using play-iteratees and an fs2 queue to connect to the slick data stream publisher
119 | * provide facade for error handler and finalizer to support exception and cleanup handling
120 | * @example {{{
121 | * val streamLoader = FDAStreamLoader(slick.jdbc.H2Profile)()
122 | * val streamSource = streamLoader.fda_plainStream(aqmQuery.result)(db)(512,16, 100)()()
123 | * val safeStreamSource = streamLoader.fda_plainStream(aqmQuery.result)(db)(512,16)(
124 | * println("the end finally!"))(killSwitch)
125 | * }}}
126 | * @param action a Slick DBIOAction to produce query results
127 | * @param slickDB Slick database object
128 | * @param fetchSize number of rows cached during database read
129 | * @param queSize size of the queue used by the iteratee as a cache to pass elements to the fs2 stream
130 | * @param take take only the first 'take' elements (0 means no limit)
131 | * @param finalizer cleanup callback
132 | * @param killSwitch use killSwitch.stopASAP to halt stream
133 | * @return a reactive-stream of SOURCE row type elements
134 | */
135 | def fda_plainStream(action: DBIOAction[Iterable[SOURCE],Streaming[SOURCE],Effect.Read])(
136 | slickDB: Database)(
137 | fetchSize: Int, queSize: Int, take: Int = 0)(
138 | finalizer: => Unit = ())(
139 | implicit killSwitch: Fs2Terminator): FDAPipeLine[SOURCE] = {
140 | val disableAutocommit = SimpleDBIO(_.connection.setAutoCommit(false))
141 | val action_ = action.withStatementParameters(fetchSize = fetchSize)
142 | val publisher = slickDB.stream(disableAutocommit andThen action_)
143 | val enumerator = streams.IterateeStreams.publisherToEnumerator(publisher)
144 |
145 | val s = Stream.eval(async.boundedQueue[Task,Option[SOURCE]](queSize)).flatMap { q =>
146 | Task { Iteratee.flatten(enumerator |>> pushData(killSwitch,take,q)).run }.unsafeRunAsyncFuture()
147 | pipe.unNoneTerminate(q.dequeue)
148 | }
149 | s.onFinalize(Task.delay(finalizer))
150 | }
151 | /**
152 | * returns a reactive-stream from Slick DBIOAction result
153 | * using akka-stream to connect to slick data stream publisher
154 |    * provides a facade for error handling and finalization to support exception and cleanup handling
155 | * @example {{{
156 | * val streamLoader = FDAStreamLoader(slick.jdbc.H2Profile)()
157 | * val streamSource = streamLoader.fda_akkaPlainStream(aqmQuery.result)(db)(512,2,100)()()
158 |    *   val safeStreamSource = streamLoader.fda_akkaPlainStream(aqmQuery.result)(db)(512,2)(
159 | * println("the end finally!"))(killSwitch)
160 | * }}}
161 | * @param action a Slick DBIOAction to produce query results
162 | * @param slickDB Slick database object
163 | * @param fetchSize number of rows cached during database read
164 |    * @param queSize size of the queue used by akka-stream as cache to pass elements to the fs2 queue
165 | * @param take take first 'take' elements
166 | * @param finalizer cleanup callback
167 | * @param killSwitch use killSwitch.stopASAP to halt stream
168 | * @return a reactive-stream of SOURCE row type elements
169 | */
170 | def fda_akkaPlainStream(action: DBIOAction[Iterable[SOURCE],Streaming[SOURCE],Effect.Read])(
171 | slickDB: Database)(
172 | fetchSize: Int, queSize: Int, take: Int = 0)(
173 | finalizer: => Unit = ())(
174 | implicit killSwitch: AkkaTerminator): FDAPipeLine[SOURCE] = {
175 | val disableAutocommit = SimpleDBIO(_.connection.setAutoCommit(false))
176 | val action_ = action.withStatementParameters(fetchSize = fetchSize)
177 | val publisher = slickDB.stream(disableAutocommit andThen action_)
178 | implicit val actorSys = ActorSystem("actor-system")
179 | implicit val ec = actorSys.dispatcher
180 | implicit val mat = ActorMaterializer()
181 | // construct akka source
182 | val akkaSource = Source.fromPublisher[SOURCE](publisher)
183 |
184 | val s = Stream.eval(async.boundedQueue[Task,Option[SOURCE]](queSize))
185 | .flatMap { q =>
186 | Task(akkaSource.to(new FS2Gate[SOURCE](killSwitch, take, q)).run).unsafeRunAsyncFuture //enqueue Task(new thread)
187 | pipe.unNoneTerminate(q.dequeue) //dequeue in current thread
188 | }
189 | s.onFinalize{Task.delay{actorSys.terminate();finalizer}}
190 | }
191 |
192 | /**
193 |    * consume input from the enumerator by pushing each element into queue q
194 |    * ends and produces an error when an enqueue cannot be completed within the timeout
195 | * @tparam R stream element type
196 | * @param killSwitch object with killSwitch.stopASAP to halt stream
197 | * @param take emit the first 'take' elements
198 |    * @param q queue used as cache
199 | * @return iteratee in new state
200 | */
201 | private def pushData[R](killSwitch: Fs2Terminator, take: Int, q: async.mutable.Queue[Task,Option[R]]): Iteratee[R,Unit] = Cont {
202 | case Input.EOF =>
203 | q.enqueue1(None).unsafeRun
204 | Done((), Input.Empty)
205 | case Input.Empty => pushData(killSwitch,take,q)
206 | case Input.El(e) =>
207 | if (take >= 0 && !killSwitch.terminateNow) {
208 | q.enqueue1(Some(e)).unsafeRun
209 |         pushData(killSwitch, if(take == 0) 0 else {if (take == 1) -1 else take - 1}, q) //take == 0 means unlimited; a countdown reaching 1 flips to -1 so the stream ends on the next input
210 | }
211 | else {
212 | killSwitch.reset
213 | q.enqueue1(None).unsafeRun
214 | Done((), Input.Empty)
215 | }
216 | }
217 | /**
218 |    * an akka-stream graph stage that connects an akka-stream source to an fs2 stream through
219 |    * an fs2.async.mutable.Queue structure
220 |    * acts as a de-backpressurer to adjust the emit rate to the pull-model fs2 stream
221 |    * also takes care of manual halts and limiting the first batch of emitted elements
222 | * @tparam T stream element type
223 | * @param killSwitch object with killSwitch.stopASAP to halt stream
224 | * @param take emit the first 'take' elements
225 |    * @param q queue used as cache between the two streams
227 | */
228 | private class FS2Gate[T](killSwitch: AkkaTerminator, take: Int, q: fs2.async.mutable.Queue[Task,Option[T]]) extends GraphStage[SinkShape[T]] {
229 | val in = Inlet[T]("inport")
230 | val shape = SinkShape.of(in)
231 |
232 | override def createLogic(inheritedAttributes: Attributes): GraphStageLogic =
233 | new GraphStageLogic(shape) with InHandler {
234 | override def preStart(): Unit = {
235 | if (killSwitch != null) {
236 | val callback = getAsyncCallback[Unit] { (_) =>
237 | killStream = true
238 | }
239 | killSwitch.callback = callback
240 | }
241 | pull(in) //initiate stream elements movement
242 | super.preStart()
243 | }
244 | var take_ = take
245 | var killStream = false
246 |       override def onPush(): Unit = {
247 |         if (killStream) take_ = -1  //manual halt: force end-of-stream
248 |         q.enqueue1{
249 |           if ( take_ >= 0 )
250 |             Some(grab(in))  //normal element
251 |           else
252 |             None            //None terminates the fs2 dequeue side
253 |         }.unsafeRun()
254 |         pull(in)
255 |         if ( take_ < 0) completeStage()
256 |         if (take_ == 1)     //last requested element just emitted
257 |           take_ = -1
258 |         else
259 |           if (take_ != 0) take_ -= 1  //take_ == 0 means unlimited
260 |       }
261 |
262 | override def onUpstreamFinish(): Unit = {
263 | q.enqueue1(None).unsafeRun()
264 | completeStage()
265 | }
266 |
267 | override def onUpstreamFailure(ex: Throwable): Unit = {
268 | q.enqueue1(None).unsafeRun()
269 | completeStage()
270 | }
271 |
272 | setHandler(in,this)
273 |
274 | }
275 | }
276 |
277 | }
278 |
279 | /**
280 | * constructing FDAStreamLoader given slickProfile and converter
281 | */
282 | object FDAStreamLoader {
283 | /**
284 | * constructor for FDAStreamLoader
285 | * @example {{{
286 | * val streamLoader = FDAStreamLoader(slick.jdbc.H2Profile)(toTypedRow _)
287 | * val untypedLoader = FDAStreamLoader(slick.jdbc.H2Profile)()
288 | * }}}
289 |    * @param slickProfile Slick JdbcProfile such as 'slick.jdbc.H2Profile'
290 |    * @param converter a defined implicit type conversion function
291 |    *                  from SOURCE type to TARGET type; set to null if not required
292 | * @tparam SOURCE source type, result type of DBIOAction, most likely a tuple type
293 | * @tparam TARGET final converted type, most likely a case class type
294 | * @return a new FDAStreamLoader object
295 | */
296 | def apply[SOURCE, TARGET](slickProfile: JdbcProfile)(converter: SOURCE => TARGET = null): FDAStreamLoader[SOURCE, TARGET] =
297 | new FDAStreamLoader[SOURCE, TARGET](slickProfile, converter)
298 | }
299 | }
300 |
301 |
302 | /**
303 | * for global imports
304 | */
305 | object FDADataStream extends FDADataStream
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # FunDA (Functional Data Access)
2 |
3 | *FunDA* is a functional database access library designed to supplement *FRM* (*Functional-Relational-Mapper*) tools like *Slick*.
4 | While *FRM* tools bring type-safe *language-integrated-query*, flexible query composition and a powerful functional programming paradigm to their users, their main focus is on data object access, which leaves them comparatively weak in data persistence support such as the data row traversal operations that are so common in *ORM*. This shortfall, together with a brand-new functional programming style, also makes *FRM* feel quite uncomfortable or even awkward to many *OOP* programmers coming from the *ORM* world. On top of bringing back the familiar recordset operations to support data row processing, *FunDA* adds explicit parallel data processing capabilities as well as a simple user-defined programming model that lowers the functional programming skills required, so that with a little getting-used-to a traditional *OOP* programmer can handle *FRM*, making *FunDA* a practical and productive tool.
5 | The core of *FunDA* is implemented with *scalaz-streams-fs2*. *FunDA* can be depicted as a workflow pipe with a sequence of work-nodes where user-defined data processing tasks can be plugged in. *FunDA* is implemented as a forward-only stream of rows representing pure data or query actions. User-defined-tasks at a work-node can intercept rows and run some process within the context of each row, and these user-defined-tasks can be run in parallel through *FunDA*'s parallelism support.
6 | A typical *FunDA* program consists of a **source** and many **user-defined-tasks** as follows:
7 |
8 | ```
9 | val streamSource = streamLoader.fda_typedStream(albumsInfo.result)(db)(512, 128)()
10 |
11 | streamSource.appendTask(transformData).appendTask(runActionQuery).appendTask(showResults).startRun
12 | ```
13 | where "streamSource" is a *FunDA* stream **source** produced by loading data from the database, and "transformData", "runActionQuery" and "showResults" are all user-defined-tasks, each responsible for achieving some minimal, distinct effect. In the unique flavor of functional programming, these are functional combinators that can be composed in a specific order to perform a much bigger and more complex task. From the semantics of the *FunDA* program above we can make a good guess that "transformData" transforms each data row into query actions, and these query actions are executed by "runActionQuery" at the next work-node.
14 | #### how to use
15 |
16 | *FunDA* artifacts are currently published on Bintray. Add following in your build.sbt:
17 |
18 | ```
19 | resolvers += Resolver.bintrayRepo("bayakala","maven")
20 | libraryDependencies += "com.bayakala" %% "funda" % "1.0.0-RC-01" withSources() withJavadoc()
21 |
22 |
23 | ```
24 | For your information, *FunDA* already includes the following dependencies:
25 |
26 | ```
27 | libraryDependencies ++= Seq(
28 | "com.typesafe.slick" %% "slick" % "3.2.0",
29 | "com.h2database" % "h2" % "1.4.191",
30 | "com.typesafe.slick" %% "slick-hikaricp" % "3.2.0",
31 | "ch.qos.logback" % "logback-classic" % "1.1.7",
32 | "co.fs2" %% "fs2-core" % "0.9.4",
33 | "co.fs2" %% "fs2-io" % "0.9.4",
34 | "com.typesafe.play" % "play-iteratees-reactive-streams_2.11" % "2.6.0"
35 | )
36 |
37 | ```
38 | **Remarks:** users should set up their own *Slick* database configuration file application.conf in the resources directory.
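A minimal `application.conf` might look like the following sketch (the config path name "h2db" and the connection settings are illustrative assumptions; adjust them to your own database):

```
h2db {
  url = "jdbc:h2:tcp://localhost/~/slickdemo"
  driver = "org.h2.Driver"
  connectionPool = HikariCP
  keepAliveConnection = true
}
```
the database can then be obtained in code with `val db = Database.forConfig("h2db")`.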
39 | #### to run the examples
40 | There is a sample application "funda-demo" located on github here: [www.github.com/bayakala/funda-demo](http://www.github.com/bayakala/funda-demo/). It includes sample data located under resources/testdata/ as a bare CSV file. Import this file into your database before you run the examples. The examples should be run in the following order:
41 |
42 | ```
43 | 1. StrongTypedRows.scala
44 | 2. UserDefinedTasks.scala
45 | 3. ParallelLoading.scala
46 | 4. ParallelTasks.scala
47 | 5. ParallelExecution.scala
48 | 6. ExceptionsAndFinalizers.scala
49 | ```
50 | *download and try it. good luck and have fun!*
51 | ## The Principles
52 |
53 | *FunDA*'s workflow *FDAPipeLine* is a *scalaz-streams-fs2* stream and therefore a *free-monad*. It is highly composable:
54 |
55 | ```
56 | val streamLoader = FDAStreamLoader(slick.jdbc.H2Profile)(toTypedRow _)
57 | val source = streamLoader.fda_typedStream(aqmQuery.result)(db)(512,512)()
58 | val stream = source.filter{r => r.year > "1999"}.take(3).appendTask(showRecord)
59 |
60 | stream.startRun
61 | ```
62 | as demonstrated above, we can compose a stream any way we want before **startRun**
63 | ### FunDA stream (The program)
64 | ##### strong-typed rows
65 | As mentioned above, FunDA programs are just compositions of a **source** and a string of **user-defined-tasks**, forming a stream whose rows are the data produced by the **source**. To facilitate stream operations we must convert data loaded from the database into strong-typed rows. A practical case is that *Slick* usually returns query results as a collection of tuples, so we must take an extra step to convert them into user-defined strong-typed case classes.
66 | The following code snippet demonstrates such conversion:
67 |
68 | ```
69 | // aqmQuery.result returns Seq[(String,String,String,String)]
70 | val aqmQuery = aqmraw.map {r => (r.year,r.state,r.county,r.value)}
71 | // user designed strong typed resultset type. must extend FDAROW
72 | case class TypedRow(year: String, state: String, county: String, value: String) extends FDAROW
73 | // strong typed resultset conversion function. declared implicit to remind during compilation
74 | implicit def toTypedRow(row: (String,String,String,String)): TypedRow =
75 | TypedRow(row._1,row._2,row._3,row._4)
76 |
77 | // loader to read from database and convert result collection to strong typed collection
78 | val viewLoader = FDAViewLoader(slick.jdbc.H2Profile)(toTypedRow _)
79 | val dataSeq = viewLoader.fda_typedRows(aqmQuery.result)(db).toSeq
80 | // turn Seq collection into FunDA stream with strong-typed rows
81 | val aqmStream: FDAPipeLine[TypedRow] = fda_staticSource(dataSeq)()
82 |
83 | ```
84 | ##### static view and dynamic streaming sources
85 | Static sources, or views, are data structures completely loaded into memory after running a query. Stream sources are query results returned as data streams that are *reactive-streams* conformant; in other words, stream sources are cached at the backend and driven by back-pressure. *FunDA* provides functions to produce both kinds of sources. The following demonstrates producing a static view:
86 |
87 | ```
88 | // loader to read from database and convert result collection to strong typed collection
89 | val viewLoader = FDAViewLoader(slick.jdbc.H2Profile)(toTypedRow _)
90 | val dataSeq = viewLoader.fda_typedRows(aqmQuery.result)(db).toSeq
91 | // turn Seq collection into FunDA stream with strong-typed rows
92 | val aqmView: FDAPipeLine[TypedRow] = fda_staticSource(dataSeq)()
93 |
94 | ```
95 | a stream source can be constructed as follows:
96 |
97 | ```
98 | // strong typed source is also possible with Slick data streaming
99 | val streamLoader = FDAStreamLoader(slick.jdbc.H2Profile)(toTypedRow _)
100 | val aqmStream: FDAPipeLine[TypedRow] = streamLoader.fda_typedStream(aqmQuery.result)(db)(512,512)()
101 | ```
102 | as demonstrated above, both static collections and dynamic data streams can be transformed into strong-typed-row sources.
103 |
104 | ### Control data flow
105 |
106 | The flow of rows in *FunDA* streams is controlled inside user-defined-tasks, in which a row is received from upstream and zero, one or more rows may be passed downstream. This means additional new rows can be constructed on the fly and passed downstream inside these user-defined-tasks, which makes *FunDA* logic much more flexible and powerful. Passing no row in a receive-send loop is represented by a skip. Users can also halt the stream by passing an end-of-stream signal downstream inside these user-defined-tasks. The following are some code samples:
107 |
108 | ```
109 | //user-defined-task type is defined as follows:
110 | type FDAUserTask[ROW] = (ROW) => (Option[List[ROW]])
111 | /* user define function to be performed at a FDAWorkNode
112 | * given a row from upstream, return Option[List[ROW]] as follows:
113 | * fda_skip -> Some(Nil) : skip sending the current row
114 | * fda_next -> Some(List(r1,r2...)): send r1,r2... downstream
115 | * fda_break -> None : halt stream, end of process
116 | * @tparam ROW type of row
117 | */
118 |
119 | // an example of a user-defined-task
120 | def dancing: FDAUserTask[FDAROW] = row => {
121 | row match {
122 | case qmr: TypedRow =>
123 |       qmr.year.toInt match {
124 | case a if a < 1960 =>
125 | // pass downstream untouched
126 | fda_next(qmr)
127 | case b if b < 1970 =>
128 | // transform row
129 |           fda_next(qmr.copy(year = "1970"))
130 | case c if c < 1980 =>
131 |           // pass along with a new row. TypedRow is a case class
132 |           fda_next(List(qmr,TypedRow("1980",qmr.state,"countyQQ","0.3")))
133 | case d if d < 2000 =>
134 | // do not process this row
135 | fda_skip
136 | case _ =>
137 | // stop stream
138 | fda_break
139 | }
140 | // encounter unknown row type, break out
141 | case _ => fda_break
142 | }
143 | }
144 | ```
145 |
146 | ### defining user-defined-task
147 |
148 | As a functional stream, it seems that some of the data access and processing in *FunDA* could be achieved in pure functional ways like the following:
149 |
150 | ```
151 | fdaStream.map(row => transformData(row)).map(action => runQueryAction(action))
152 | ```
153 | unfortunately, pure stream combinators lack the powerful and flexible flow-control abilities that are so crucial for processing stream elements; therefore the **user-defined-task** is introduced as a programming model to deal with this situation.
154 | User-defined-tasks are functional combinators designed by users, each achieving a single minimal task; a much more complex final task can then be assembled by composing many of these tiny tasks in a specific order and then calling ***startRun***. The signature of *FDAUserTask[ROW]* is as follows:
155 |
156 | ```
157 | type FDAUserTask[ROW] = (ROW) => (Option[List[ROW]])
158 |
159 | ```
160 | the above reads: a **user-defined-task** takes a row as input, uses or transforms it, and, as a way of flow control, signifies the next step of the stream by returning **Option[List[ROW]]** as the result of fda_next, fda_skip or fda_break. With the strong-typed-row requirement in place, the involved row types must extend **FDAROW** and can be either data-rows or action-rows.
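for intuition, these three results can be pictured as simple constructors of the **Option[List[ROW]]** encoding. The following is a sketch only, not the library's actual definitions:

```
//sketch of the flow-control helpers, following the encoding described above
def fda_next[ROW](r: ROW): Option[List[ROW]] = Some(List(r))   //send a row downstream
def fda_next[ROW](rs: List[ROW]): Option[List[ROW]] = Some(rs) //send several rows downstream
def fda_skip[ROW]: Option[List[ROW]] = Some(Nil)               //send nothing, continue
def fda_break[ROW]: Option[List[ROW]] = None                   //halt the stream
```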
161 | ##### types of rows
162 | *FunDA* streams are strong-typed; all rows must extend **FDAROW**. There are several categories of rows:
163 |
164 | * data-row: any case class extending **FDAROW** with parameters representing fields:
165 | **case class TypedRow(year: Int, state: String, value: Int) extends FDAROW**
166 | * action-row: a case class extending **FDAROW** with a **Slick DBIOAction** wrapped inside the parameter as follows:
167 | **case class FDAActionRow(action: FDAAction) extends FDAROW**
168 | sometimes we need to target an action row to be run in a different database context. In that case we can just define any case class and extend **FDAROW**:
169 | **case class MyActionRow(action: FDAAction) extends FDAROW**
170 | * error-row: a case class extending **FDAROW** with a caught Exception object wrapped inside its parameter:
171 | **case class FDAErrorRow(e: Exception) extends FDAROW**
172 | users can define their own error rows for different exceptions as long as they extend **FDAROW**:
173 | **case class MyErrorRow(msg: String, e: Exception) extends FDAROW**
174 | * null-row: a signal object used to represent EOS (end-of-stream):
175 | **case object FDANullRow extends FDAROW**
176 |
177 | ##### standard-operation-procedures
178 | User-defined-tasks follow a standard operating procedure:
179 |
180 | 1. determine row type by pattern-matching
181 | 2. use row fields to perform data processing and transformation
182 | 3. control flow of rows downstream
183 |
184 | the following are samples of user-defined-tasks with a few different purposes:
185 |
186 | ```
187 | //strong typed row type. must extend FDAROW
188 | case class StateRow(state: String) extends FDAROW
189 |
190 | //a logging task. show name and pass row untouched downstream
191 | def showState: FDAUserTask[FDAROW] = row => {
192 | row match {
193 | case StateRow(sname) => //this is my row
194 | println(s"Name of state is:$sname")
195 | fda_next(row)
196 | case _ => fda_skip //not my row, do not pass it
197 | }
198 | }
199 | ```
200 |
201 | ```
202 | //a filter and processing task.
203 | //filter out rows with inconvertible value strings and out of ranged value and year
204 | def filterRows: FDAUserTask[FDAROW] = row => {
205 | row match {
206 | case r: AQMRaw => { //this is the correct row
207 | try { //process this row and catch exceptions
208 | val yr = r.year.toInt
209 | val v = r.value.toInt
210 | val vlu = if ( v > 10 ) 10 else v //max value allowed
211 | //construct a new row
212 | val data = AQMRPTModel(0,r.mid.toInt,r.state,r.county,yr,vlu,0,true)
213 | if ((yr > 1960 && yr < 2018)) //filtering
214 | fda_next(data) //this row ok. pass downstream
215 | else
216 | fda_skip //filter out this row
217 | } catch {
218 | case e: Exception =>
219 | fda_next(FDAErrorRow(e)) //pass the caught exception as a row downstream
220 | }
221 | }
222 | case _ => fda_skip //wrong type, skip
223 | }
224 | }
225 | ```
226 |
227 | ```
228 | //a row transformation task
229 | //transform data to action for later execution
230 | def toAction: FDAUserTask[FDAROW] = row => {
231 | row match {
232 | case r: AQMRPTModel => //this is my row
233 | val queryAction = AQMRPTQuery += r //slick action
234 | fda_next(FDAActionRow(queryAction))
235 | case _ => fda_skip
236 | }
237 | }
238 | ```
239 |
240 | ```
241 | //a query action runner task
242 | //get a query runner and an action task
243 | val actionRunner = FDAActionRunner(slick.jdbc.H2Profile)
244 | def runActionRow: FDAUserTask[FDAROW] = action => {
245 | action match {
246 | case FDAActionRow(q) => //this is a query action row
247 | actionRunner.fda_execAction(q)(db) //run it
248 | fda_skip
249 | case other@_ => fda_next(other) //don't touch it, just pass down
250 | //someone else downstream could process it
251 | }
252 | }
253 |
254 | ```
255 |
256 | to run many tasks as a whole, we compose them and **startRun**:
257 |
258 | ```
259 | //compose the program
260 | val streamAllTasks = streamAQMRaw.appendTask(filterRows)
261 | .appendTask(toAction)
262 | .appendTask(runActionRow)
263 | //run program
264 | streamAllTasks.startRun
265 |
266 | ```
267 | ##### aggregation
268 | In stream-style processing we often need to aggregate over rows; this is where the **user-aggregate-task** fits in. A **user-aggregate-task** has the following signature:
269 | ```
270 | type FDAAggrTask[AGGR,ROW] = (AGGR,ROW) => (AGGR,Option[List[ROW]])
271 | ```
272 | *AGGR* can be any user-defined type representing the state of aggregation. From the type signature above we can see it is a typical functional-style function: it takes a state as input and returns a new state. The following is an example of a **user-aggregate-task**:
273 |
274 | ```
275 | //define a structure to represent aggregator type
276 | case class Accu(state: String, county: String, year: Int, count: Int, sumOfValue: Int)
277 |
278 | //user defined aggregation task. only pass aggregated row downstream
279 | def countingAverage: FDAAggrTask[Accu,FDAROW] = (accu,row) => {
280 | row match {
281 | case aqmr: AQMRPTModel => //this is the target row type
282 | if (accu.state == "" || (aqmr.state == accu.state && aqmr.year == accu.year))
283 | //same condition: inc count and add sum, no need to pass row downstream
284 | (Accu(aqmr.state,aqmr.county,aqmr.year,accu.count+1, accu.sumOfValue+aqmr.value),fda_skip)
285 | else
286 | //reset accumulator, create a new aggregated row and pass downstream
287 | (Accu(aqmr.state,aqmr.county,aqmr.year,1, aqmr.value)
288 | ,fda_next(AQMRPTModel(0,9999,accu.state,accu.county,accu.year
289 | ,accu.count,accu.sumOfValue/accu.count,true)))
290 | case FDANullRow =>
291 | //last row encountered. create and pass new aggregated row
292 | (Accu(accu.state,accu.county,accu.year,1, 0)
293 | ,fda_next(AQMRPTModel(0,9999,accu.state,accu.county,accu.year
294 | ,accu.count,accu.sumOfValue/accu.count,true)))
295 | //incorrect row type, do nothing
296 | case _ => (accu,fda_skip)
297 | }
298 | }
299 |
300 | ```
301 |
302 | the following demonstrates how it is executed:
303 |
304 | ```
305 | aqmrStream.aggregateTask(Accu("","",0,0,0),countingAverage)
306 | .appendTask(toAction)
307 | .appendTask(runActionRow)
308 | .startRun
309 |
310 | ```
311 | "aqmrStream" is a **source** with rows to be aggregated.
312 | ### Running programs inside user-defined-task
313 | A *FunDA* program consists of a **source** and multiple **user-defined-tasks**. It is possible to execute a *FunDA* program inside these *user-defined-tasks*. This means we have to call **startRun** inside the *user-defined-task*, and some effect would inevitably be produced, rendering the calling *user-defined-task* impure. A complete example of a *FunDA* program inside a *user-defined-task* is given below:
314 |
315 | ```
316 | //getting id with corresponding name from STATES table
317 | def getStateID(state: String): Int = {
318 | //create a stream for state id with state name
319 | implicit def toState(row: StateTable#TableElementType) = StateModel(row.id,row.name)
320 | val stateLoader = FDAViewLoader(slick.jdbc.H2Profile)(toState _)
321 | val stateSeq = stateLoader.fda_typedRows(StateQuery.result)(db).toSeq
322 |   //construct a static source of StateModel rows
323 | val stateStream = fda_staticSource(stateSeq)()
324 | var id = -1
325 | def getid: FDAUserTask[FDAROW] = row => {
326 | row match {
327 | case StateModel(stid,stname) => //target row type
328 | if (stname.contains(state)) {
329 | id = stid
330 | fda_break //exit
331 | }
332 | else fda_skip //take next row
333 | case _ => fda_skip
334 | }
335 | }
336 | stateStream.appendTask(getid).startRun
337 | id
338 | }
339 |
340 | ```
341 | "getStateID" is a *user-defined-function* in which the function "getid" is physically executed. Because "getid" is local, we can still confidently use the calling *user-defined-function* in composition with other combinators, as follows:
342 |
343 | ```
344 | //process input row and produce action row to insert into NORMAQM
345 | def getIdsThenInsertAction: FDAUserTask[FDAROW] = row => {
346 | row match {
347 | case aqm: AQMRPTModel =>
348 | if (aqm.valid) {
349 | val stateId = getStateID(aqm.state)
350 | val countyId = getCountyID(aqm.state,aqm.county)
351 | val action = NORMAQMQuery += NORMAQMModel(0,aqm.mid, stateId, countyId, aqm.year,aqm.value,aqm.total)
352 | fda_next(FDAActionRow(action))
353 | }
354 | else fda_skip
355 | case _ => fda_skip
356 | }
357 | }
358 | ```
359 | in this case "getStateID" is called within another *user-defined-task*.
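composed into a complete program, it might be used like the following sketch, where "AQMRPTStream" is an assumed **source** of AQMRPTModel rows and "runActionRow" is the action runner task defined earlier:

```
AQMRPTStream.appendTask(getIdsThenInsertAction)
            .appendTask(runActionRow)
            .startRun
```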
360 | ### Parallel Processing
361 | *FunDA* borrows its parallelism capabilities from *scalaz-streams-fs2*. There are two areas of parallel data processing application:
362 |
363 |
364 | * parallel loading of multiple **sources**
365 | * parallel execution of a single **user-defined-task**
366 |
367 | ##### parallel loading
368 | Parallel loading of many sources is achieved by calling the function **fda_par_load** provided by *FunDA*. These sources could be constructed from tables on separate database servers, or by splitting huge data tables into smaller non-overlapping data chunks, like the following:
369 |
370 | ```
371 | //define query for extracting State names from AQMRPT
372 | val qryStates = AQMRPTQuery.map(_.state).distinct.sorted
373 | case class States(name: String) extends FDAROW
374 | implicit def toStates(row: String) = States(row)
375 | val stateLoader = FDAStreamLoader(slick.jdbc.H2Profile)(toStates _)
376 | val statesStream = stateLoader.fda_typedStream(qryStates.result)(db_a)(64,64)()
377 |
378 |
379 | //define query for extracting County names from AQMRPT in separate chunks
380 | //query with state name >A and <K
381 | val qryCountiesA_K = AQMRPTQuery.filter(r => (r.state.toUpperCase > "A" &&
382 | r.state.toUpperCase < "K")).map(r => (r.state,r.county))
383 | .distinctOn(r => (r._1,r._2))
384 | .sortBy(r => (r._1,r._2))
385 |
386 | //query with state name >K and <P
387 | val qryCountiesK_P = AQMRPTQuery.filter(r => (r.state.toUpperCase > "K" &&
388 | r.state.toUpperCase < "P")).map(r => (r.state,r.county))
389 | .distinctOn(r => (r._1,r._2))
390 | .sortBy(r => (r._1,r._2))
391 |
392 | //query with state name >P
393 | val qryCountiesP_Z = AQMRPTQuery.filter(r => r.state.toUpperCase > "P")
394 | .map(r => (r.state,r.county))
395 | .distinctOn(r => (r._1,r._2))
396 | .sortBy(r => (r._1,r._2))
397 |
398 | case class Counties(state: String, name: String) extends FDAROW
399 | implicit def toCounties(row: (String,String)) = Counties(row._1,row._2)
400 | val countyLoader = FDAStreamLoader(slick.jdbc.H2Profile)(toCounties _)
401 | //3 separate streams to extract county names from the same database table AQMRPT
402 | val countiesA_KStream: FDAPipeLine[Counties] = countyLoader.fda_typedStream(qryCountiesA_K.result)(db_b)(64,64)()
403 | val countiesK_PStream: FDAPipeLine[Counties] = countyLoader.fda_typedStream(qryCountiesK_P.result)(db_b)(64,64)()
404 | val countiesP_ZStream: FDAPipeLine[Counties] = countyLoader.fda_typedStream(qryCountiesP_Z.result)(db_b)(64,64)()
405 |
406 | ```
407 | once these **sources** are all constructed, we load them in parallel:
408 |
409 | ```
410 | //obtain a combined stream by parallel loading with a max of 4 open computations
411 | val combinedStream: FDAPipeLine[FDAROW] = fda_par_load(statesStream,countiesA_KStream,countiesK_PStream,countiesP_ZStream)(4)
412 |
413 | ```
414 | doing parallel loading will most likely produce a stream with multiple types of rows; in the above case **States** and **Counties** represent two different row types. Therefore *user-defined-tasks* are designed one per target row type, each handling rows of its own type, like the following:
415 |
416 | ```
417 | //user-task to catch rows of States type and transform them into db insert actions
418 | def processStates: FDAUserTask[FDAROW] = row => {
419 | row match {
420 | //catch states row and transform it into insert action
421 | case States(stateName) => //target row type
422 | println(s"State name: ${stateName}")
423 | val action = StateQuery += StateModel(0,stateName)
424 | fda_next(StateActionRow(action))
425 | case others@ _ => //pass other types to next user-defined-tasks
426 | fda_next(others)
427 | }
428 | }
429 | //user-task to catch rows of Counties type and transform them into db insert actions
430 | def processCounties: FDAUserTask[FDAROW] = row => {
431 | row match {
432 | //catch counties row and transform it into insert action
433 | case Counties(stateName,countyName) => //target row type
434 | println(s"County ${countyName} of ${stateName}")
435 | val action = CountyQuery += CountyModel(0,countyName+ " of "+stateName)
436 | fda_next(CountyActionRow(action))
437 | case others@ _ => //pass other types to next user-defined-tasks
438 | fda_next(others)
439 | }
440 | }
441 | ```
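these tasks can then be appended to the combined stream and run as one program. The following is a sketch, assuming follow-up runner tasks (hypothetically named "runStateAction" and "runCountyAction") are defined to execute the produced action rows against their respective databases:

```
combinedStream.appendTask(processStates)
              .appendTask(processCounties)
              .appendTask(runStateAction)   //hypothetical runner for StateActionRow
              .appendTask(runCountyAction)  //hypothetical runner for CountyActionRow
              .startRun
```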
442 | ###### parallel loading a stream of sources
443 | The above demonstration of parallel loading started with a known number of sources. This is especially convenient when users manually arrange sources of different row types in a parallel loading operation. But when the list of sources is itself a stream, to load that stream of sources in parallel we must first convert it into an **FDAParSource** by applying an **FDASourceLoader** function as follows:
444 |
445 | ```
446 | //loading rows with year yr
447 | def loadRowsInYear(yr: Int) = {
448 | //a new query
449 | val query = AQMRPTQuery.filter(row => row.year === yr)
450 | //reuse same loader
451 | AQMRPTLoader.fda_typedStream(query.result)(db)(256, 256)(println(s"End of stream ${yr}!!!!!!"))
452 | }
453 |
454 | //loading rows by year
455 | def loadRowsByYear: FDASourceLoader = row => {
456 | row match {
457 | case Years(y) => loadRowsInYear(y) //produce stream of the year
458 | case _ => fda_appendRow(FDANullRow)
459 | }
460 | }
461 |
462 | //get parallel source constructor
463 | val parSource: FDAParSource = yearStream.toParSource(loadRowsByYear)
464 |
465 | ```
466 | the following demonstrates loading this parallel source:
467 |
468 | ```
469 | //produce a stream from parallel source
470 | val stream: FDAPipeLine[FDAROW] = fda_par_source(parSource)(4)
471 | ```
472 | **fda_par_source** is actually a parallel execution function analogous to **fda_runPar** which is described in the following section.
473 | ##### parallel execution
474 | *FunDA* provides a function **fda_runPar** as a parallel task runner. A parallel task has the following signature:
475 |
476 | ```
477 | /** Parallel task type
478 | * stream of streams type for parallel running user action
479 | * use stream.toPar to convert from FDAUserTask
480 | */
481 | type FDAParTask = Stream[Task,Stream[Task,Option[List[FDAROW]]]]
482 |
483 | ```
484 | and an **FDAUserTask** can be converted to an **FDAParTask** as below:
485 |
486 | ```
487 | AQMRPTStream.toPar(getIdsThenInsertAction)
488 | ```
489 | where "AQMRPTStream" is a *FunDA* **source** and **toPar** is its method for turning "getIdsThenInsertAction" into a parallel task of many instances running in different threads. The principle of parallel execution is to scramble the rows of a single input stream into several un-ordered streams that serve as inputs to many instances of a single task running in parallel on different threads. An **FDAParTask** requires a special runner, as shown below:
490 |
491 | ```
492 | fda_runPar(AQMRPTStream.toPar(getIdsThenInsertAction))(8)
493 | ```
494 | **fda_runPar** has a signature as follows:
495 |
496 | ```
497 | def fda_runPar(parTask: FDAParTask)(maxOpen: Int): FDAPipeLine[FDAROW]
498 |
499 | ```
500 | **maxOpen** designates the maximum number of open computations, i.e. the degree of parallelism. The actual number of open computations depends on a number of factors, including the number of CPU cores and the size of the thread-pool, and will never exceed the user-suggested maximum. The thread-pool can be adjusted from its default by declaring an implicit instance of Strategy:
501 |
502 | ```
503 | implicit val strategy = Strategy.fromCachedDaemonPool("cachedPool")
504 | // implicit val strategy = Strategy.fromFixedDaemonPool(6)
505 |
506 | ```
507 | the actual performance of parallelism requires thorough tuning of thread-pool strategies with respect to the number of CPU cores. Whatever the configuration, the performance gain of parallelism over a single-threaded task has proven significant. A complete composition of parallel loading and parallel execution follows this model:
508 |
509 | ```
510 | //get parallel source constructor
511 | val parSource = yearStream.toParSource(loadRowsByYear)
512 | //implicit val strategy = Strategy.fromCachedDaemonPool("cachedPool")
513 | //produce a stream from parallel sources
514 | val source = fda_par_source(parSource)(4)
515 | //turn getIdsThenInsertAction into parallel task
516 | val parTasks = source.toPar(getIdsThenInsertAction)
517 | //runPar to produce a new stream
518 | val actionStream = fda_runPar(parTasks)(4)
519 | //turn runInsertAction into parallel task
520 | val parRun = actionStream.toPar(runInsertAction)
521 | //runPar and carry out by startRun
522 | fda_runPar(parRun)(2).startRun
523 |
524 | ```
525 | ##### remarks when writing parallel processing programs
526 | According to our experiments, **FunDA** concurrent combinators are quite sensitive to thread management. The **maxOpen** parameters of both **fda_runPar** and **fda_par_source** in the examples above had to be tuned to avoid thread contention and process hangs. The examples used **HikariCP** with the parameters set below:
527 |
528 | ```
529 | h2db {
530 | url = "jdbc:h2:tcp://localhost/~/slickdemo;mv_store=false"
531 | driver = "org.h2.Driver"
532 | connectionPool = HikariCP
533 | numThreads = 48
534 | maxConnections = 48
535 | minConnections = 12
536 | keepAliveConnection = true
537 | }
538 |
539 | ```
540 | ### Exceptions handling and Finalizers
541 | **FunDA** provides a mechanism that guarantees a **finalizer** is called upon termination of the stream, whether at a natural end-of-stream or on a break-out caused by interruption or exception. **Finalizers** are in fact call-back functions hooked up to a *FunDA* program during source construction, like the following:
542 |
543 | ```
544 | val view = fda_staticSource(stateSeq)(println("***Finally*** the end of view!!!"))
545 | val stream = streamLoader.fda_typedStream(StateQuery.result)(db)(64,64)(println("***Finally*** the end of stream!!!"))
546 |
547 | ```
548 | exceptions can be caught by **onError** call-backs that are hooked up at the **very end** of a **FunDA** stream in order to catch exceptions from all work-nodes, as follows:
549 |
550 | ```
551 | val v = viewState.appendTask(errorRow).appendTask(trackRows)
552 | val v1 = v.onError {case e: Exception => println(s"Caught Error in viewState!!![${e.getMessage}]"); fda_appendRow(FDANullRow)}
553 | v1.startRun
554 |
555 | val s = streamState.appendTask(errorRow).appendTask(trackRows)
556 | val s1 = s.onError {case e: Exception => println(s"Caught Error in streamState!!![${e.getMessage}]"); fda_appendRow(FDANullRow)}
557 | s1.startRun
558 |
559 | ```
560 | ##### user defined exceptions
561 | Sometimes we wish to watch for particular events and take corresponding actions when they occur. This can be achieved with user-defined-exceptions. User-defined-exceptions are special rows extending **FDAROW** that can be caught by pattern matching. The following is an example of a user-defined-exception and its handling:
562 |
563 | ```
564 | case class DivideZeroError(msg: String, e: Exception) extends FDAROW
565 | def catchError: FDAUserTask[FDAROW] = row => {
566 | row match {
567 | case StateModel(id,name) =>
568 | try {
569 | val idx = id / (id - 3)
570 | fda_next(StateModel(idx, name))
571 | } catch {
572 | case e: Exception => //pass an error row
573 | fda_next(DivideZeroError(s"Divide by zero exception at ${id}",e))
574 | }
575 | case m@_ => fda_next(m)
576 | }
577 | }
578 |
579 | def trackRows: FDAUserTask[FDAROW] = row => {
580 | row match {
581 | case m@StateModel(id,name) =>
582 | println(s"State: $id $name")
583 | println( "----------------")
584 | fda_next(m)
585 | case DivideZeroError(msg, e) => //error row
586 | println(s"***Error:$msg***")
587 | fda_skip
588 | case m@_ => fda_next(m)
589 | }
590 | }
591 |
592 | val s = streamState.take(5).appendTask(catchError).appendTask(trackRows)
593 | val s1 = s.onError {case e: Exception => println(s"Caught Error in streamState!!![${e.getMessage}]"); fda_appendRow(FDANullRow)}
594 | s1.startRun
595 |
596 | ```
--------------------------------------------------------------------------------