├── LICENSE ├── README.md ├── accumulator.go ├── aggregator.go ├── context.go ├── env.go ├── examples ├── demo.go ├── kmean_data.txt ├── kmeans.go └── points.txt ├── rdd.go ├── rdd_test.go ├── rddimpls.go ├── schedule.go ├── utils.go └── vector.go /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | 204 | 205 | ======================================================================= 206 | Apache Spark Subcomponents: 207 | 208 | The Apache Spark project contains subcomponents with separate copyright 209 | notices and license terms. Your use of the source code for the these 210 | subcomponents is subject to the terms and conditions of the following 211 | licenses. 
212 | 213 | 214 | ======================================================================= 215 | For the Boto EC2 library (ec2/third_party/boto*.zip): 216 | ======================================================================= 217 | 218 | Copyright (c) 2006-2008 Mitch Garnaat http://garnaat.org/ 219 | 220 | Permission is hereby granted, free of charge, to any person obtaining a 221 | copy of this software and associated documentation files (the 222 | "Software"), to deal in the Software without restriction, including 223 | without limitation the rights to use, copy, modify, merge, publish, dis- 224 | tribute, sublicense, and/or sell copies of the Software, and to permit 225 | persons to whom the Software is furnished to do so, subject to the fol- 226 | lowing conditions: 227 | 228 | The above copyright notice and this permission notice shall be included 229 | in all copies or substantial portions of the Software. 230 | 231 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 232 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL- 233 | ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 234 | SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 235 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 236 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 237 | IN THE SOFTWARE. 238 | 239 | 240 | ======================================================================== 241 | For CloudPickle (pyspark/cloudpickle.py): 242 | ======================================================================== 243 | 244 | Copyright (c) 2012, Regents of the University of California. 245 | Copyright (c) 2009 `PiCloud, Inc. `_. 246 | All rights reserved. 247 | 248 | Redistribution and use in source and binary forms, with or without 249 | modification, are permitted provided that the following conditions 250 | are met: 251 | * Redistributions of source code must retain the above copyright 252 | notice, this list of conditions and the following disclaimer. 253 | * Redistributions in binary form must reproduce the above copyright 254 | notice, this list of conditions and the following disclaimer in the 255 | documentation and/or other materials provided with the distribution. 256 | * Neither the name of the University of California, Berkeley nor the 257 | names of its contributors may be used to endorse or promote 258 | products derived from this software without specific prior written 259 | permission. 260 | 261 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 262 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 263 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 264 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 265 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 266 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 267 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 268 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 269 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 270 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 271 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
272 | 273 | 274 | ======================================================================== 275 | For Py4J (python/lib/py4j0.7.egg and files in assembly/lib/net/sf/py4j): 276 | ======================================================================== 277 | 278 | Copyright (c) 2009-2011, Barthelemy Dagenais All rights reserved. 279 | 280 | Redistribution and use in source and binary forms, with or without 281 | modification, are permitted provided that the following conditions are met: 282 | 283 | - Redistributions of source code must retain the above copyright notice, this 284 | list of conditions and the following disclaimer. 285 | 286 | - Redistributions in binary form must reproduce the above copyright notice, 287 | this list of conditions and the following disclaimer in the documentation 288 | and/or other materials provided with the distribution. 289 | 290 | - The name of the author may not be used to endorse or promote products 291 | derived from this software without specific prior written permission. 292 | 293 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 294 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 295 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 296 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 297 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 298 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 299 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 300 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 301 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 302 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 303 | POSSIBILITY OF SUCH DAMAGE. 304 | 305 | 306 | ======================================================================== 307 | For DPark join code (python/pyspark/join.py): 308 | ======================================================================== 309 | 310 | Copyright (c) 2011, Douban Inc. 311 | All rights reserved. 312 | 313 | Redistribution and use in source and binary forms, with or without 314 | modification, are permitted provided that the following conditions are 315 | met: 316 | 317 | * Redistributions of source code must retain the above copyright 318 | notice, this list of conditions and the following disclaimer. 319 | 320 | * Redistributions in binary form must reproduce the above 321 | copyright notice, this list of conditions and the following disclaimer 322 | in the documentation and/or other materials provided with the 323 | distribution. 324 | 325 | * Neither the name of the Douban Inc. nor the names of its 326 | contributors may be used to endorse or promote products derived from 327 | this software without specific prior written permission. 328 | 329 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 330 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 331 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 332 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 333 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 334 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 335 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 336 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 337 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 338 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 339 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 340 | 341 | 342 | ======================================================================== 343 | For sorttable (core/src/main/resources/org/apache/spark/ui/static/sorttable.js): 344 | ======================================================================== 345 | 346 | Copyright (c) 1997-2007 Stuart Langridge 347 | 348 | Permission is hereby granted, free of charge, to any person obtaining a copy 349 | of this software and associated documentation files (the "Software"), to deal 350 | in the Software without restriction, including without limitation the rights 351 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 352 | copies of the Software, and to permit persons to whom the Software is 353 | furnished to do so, subject to the following conditions: 354 | 355 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 356 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 357 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 358 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 359 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 360 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 361 | THE SOFTWARE. 362 | 363 | 364 | ======================================================================== 365 | For Scala Interpreter classes (all .scala files in repl/src/main/scala 366 | except for Main.Scala, SparkHelper.scala and ExecutorClassLoader.scala): 367 | ======================================================================== 368 | 369 | Copyright (c) 2002-2013 EPFL 370 | Copyright (c) 2011-2013 Typesafe, Inc. 371 | 372 | All rights reserved. 373 | 374 | Redistribution and use in source and binary forms, with or without 375 | modification, are permitted provided that the following conditions are met: 376 | 377 | - Redistributions of source code must retain the above copyright notice, 378 | this list of conditions and the following disclaimer. 379 | 380 | - Redistributions in binary form must reproduce the above copyright notice, 381 | this list of conditions and the following disclaimer in the documentation 382 | and/or other materials provided with the distribution. 383 | 384 | - Neither the name of the EPFL nor the names of its contributors may be 385 | used to endorse or promote products derived from this software without 386 | specific prior written permission. 387 | 388 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 389 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 390 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 391 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 392 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 393 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 394 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 395 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 396 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 397 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 398 | POSSIBILITY OF SUCH DAMAGE. 399 | --------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

GoPark
=============

GoPark is a naive, local-mode port of [Spark](http://spark.incubator.apache.org/)/[DPark](https://github.com/douban/dpark), a MapReduce(R)-like computing framework that supports iterative computation.

GoPark is implemented in the Go language and provides concurrent MapReduce(R) data operations using goroutines. It can only run in local mode, but you can specify the level of concurrency.

Examples
-------------

An example that computes PI:
```
package main

import (
    "fmt"
    "github.com/mijia/gopark"
    "math/rand"
)

func main() {
    gopark.ParseOptions()
    c := gopark.NewContext("ComputePI")
    defer c.Stop()

    N := 100000
    iters := c.Data(make([]interface{}, N))
    count := iters.Map(func(_ interface{}) interface{} {
        x := rand.Float32()
        y := rand.Float32()
        if x*x+y*y < 1 {
            return 1
        } else {
            return 0
        }
    }).Reduce(func(x, y interface{}) interface{} {
        return x.(int) + y.(int)
    }).(int)
    fmt.Println("Pi =", (4.0 * float64(count) / float64(N)))
}
```

The above code can be run as follows (using 4 goroutines concurrently):
```
$ go run hello.go -p=4
```
Check out the examples/ directory for more cases.

interface{}
-------------
As the example shows, Go only provides interface{} as the root type for everything, and its type checks/assertions are strict, so all the APIs take interface{} parameters and you have to do the type assertions inside the closure functions. The same applies to []interface{}.

The basic closure function types are (Yielder is defined in context.go as ```type Yielder chan interface{}```):
```
type MapperFunc func(interface{}) interface{}
type PartitionMapperFunc func(Yielder) Yielder
type FlatMapperFunc func(interface{}) []interface{}
type ReducerFunc func(interface{}, interface{}) interface{}
type FilterFunc func(interface{}) bool
type LoopFunc func(interface{})
```

Shuffle and Shuffle_N like funcs
-------------
Functions that do shuffle work, like ```GroupByKey()```, also come in an ```_N``` variant, like ```GroupByKey_N()```, in which you can specify the number of partitions the job should run on. Please check rdd.go for reference; a minimal sketch follows.
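For instance, here is a small sketch contrasting ```ReduceByKey``` with its ```_N``` variant; the job name, sample data, and partition count are made up for illustration:
```
package main

import (
    "fmt"
    "github.com/mijia/gopark"
)

func main() {
    gopark.ParseOptions()
    c := gopark.NewContext("shuffleDemo")
    defer c.Stop()

    data := c.Data([]interface{}{
        &gopark.KeyValue{"a", 1}, &gopark.KeyValue{"b", 2},
        &gopark.KeyValue{"a", 3}, &gopark.KeyValue{"b", 4},
    })
    sum := func(x, y interface{}) interface{} {
        return x.(int) + y.(int)
    }

    // Shuffle on the default number of partitions (the -p flag).
    fmt.Println(data.ReduceByKey(sum).CollectAsMap())
    // Shuffle on an explicitly chosen number of partitions.
    fmt.Println(data.ReduceByKey_N(sum, 4).CollectAsMap())
}
```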
Encode / Gob
-------------
For the shuffle jobs like ```GroupByKey()``` and ```Persist()```, GoPark uses encoding/gob as the encoder/decoder for the local files. Since GoPark passes everything around as interface{}, gob needs to know what the interface{} actually is when decoding. This can be done as the kmeans.go example shows,
```
type CenterCounter struct {
    X     gopark.Vector
    Count int
}

gob.Register(new(CenterCounter))
```
and you cannot use structs with unexported fields. Just be careful with this; if you get runtime panics, please check
* whether you have used a complicated struct
* whether you forgot to register a type with gob, even one as simple as ```type T int```
* whether you have used slices of slices of slices ...

Just make sure gob knows the objects behind the interface{} and []interface{}.

Things not included
-------------
So far, Broadcast is still not implemented. I am just using vars captured in closures.

And GoPark currently cannot run in distributed mode.

Have fun~

Originally, I had only two goals in writing this:
* Write some real stuff in Go, since I am learning the language.
* I do data mining jobs and need a better concurrent framework for performance, and running locally is OK for me.

Spark/DPark
-------------
These projects are really awesome, and the RDD is a fantastic data structure and design pattern. I learned a lot from them.

I really want to thank these two projects.
--------------------------------------------------------------------------------
/accumulator.go:
--------------------------------------------------------------------------------
package gopark

import (
    "fmt"
)

type AccumulateFunc func(x, y interface{}) interface{}

type AccumulatorParam interface {
    AddFunc() AccumulateFunc
}

type accumulatorParam struct {
    fn AccumulateFunc
}

func (ap accumulatorParam) AddFunc() AccumulateFunc {
    return ap.fn
}

var IntAccumulatorParam AccumulatorParam
var ListAccumulatorParam AccumulatorParam

func init() {
    IntAccumulatorParam = accumulatorParam{
        fn: func(x, y interface{}) interface{} {
            return x.(int) + y.(int)
        },
    }

    ListAccumulatorParam = accumulatorParam{
        fn: func(x, y interface{}) interface{} {
            return append(x.([]interface{}), y)
        },
    }
}

type Accumulator interface {
    Add(interface{})
    Value() interface{}
}

type _BaseAccumulator struct {
    id       int64
    param    AccumulatorParam
    value    interface{}
    accuChan chan interface{}
}

func (a *_BaseAccumulator) init(initValue interface{}, param AccumulatorParam) {
    a.id = newAccumulatorId()
    a.value = initValue
    a.param = param
    a.accuChan = make(chan interface{})
    // All updates are funneled through accuChan and applied by this single
    // goroutine, so the AddFunc itself needs no locking.
    go func() {
        for {
            localValue := <-a.accuChan
            a.value = a.param.AddFunc()(a.value, localValue)
        }
    }()
}

func (a *_BaseAccumulator) Add(x interface{}) {
    a.accuChan <- x
}

// Value reads the current merged value; read it only after the adds have
// finished, since there is no synchronization with the updater goroutine.
func (a *_BaseAccumulator) Value() interface{} {
    return a.value
}

func newIntAccumulator(initValue int) Accumulator {
    return newAccumulator(initValue, IntAccumulatorParam)
}

func newAccumulator(initValue interface{}, param AccumulatorParam) Accumulator {
    a := &_BaseAccumulator{}
    a.init(initValue, param)
    return a
}

var nextAccuId AtomicInt = 0

func newAccumulatorId() int64 {
    nextAccuId.Add(1)
    return nextAccuId.Get()
}

var _ = fmt.Println
--------------------------------------------------------------------------------
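As a quick orientation to the accumulator API above, a minimal usage sketch; the job name and data are made up, and `Context.Accumulator` is the public constructor defined in context.go:
```
package main

import (
    "fmt"
    "github.com/mijia/gopark"
)

func main() {
    gopark.ParseOptions()
    c := gopark.NewContext("accuDemo")
    defer c.Stop()

    // Count records on the side while Foreach walks the RDD.
    total := c.Accumulator(0)
    c.Data([]interface{}{10, 20, 30, 40}).Foreach(func(_ interface{}) {
        total.Add(1)
    })
    // Foreach drains every partition before returning, so the adds
    // have all been sent by the time Value() is read.
    fmt.Println("records seen:", total.Value())
}
```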
/aggregator.go: -------------------------------------------------------------------------------- 1 | package gopark 2 | 3 | import ( 4 | "sort" 5 | ) 6 | 7 | // We need the []interface{} type for the gob to work 8 | type CombinerCreator func(interface{}) []interface{} 9 | type CombinerMerger func([]interface{}, []interface{}) []interface{} 10 | type ValueMerger func([]interface{}, interface{}) []interface{} 11 | 12 | type _Aggregator struct { 13 | combinerCreator CombinerCreator 14 | combinerMerger CombinerMerger 15 | valueMerger ValueMerger 16 | } 17 | 18 | func newMergeAggregator() *_Aggregator { 19 | a := &_Aggregator{} 20 | a.combinerCreator = func(x interface{}) []interface{} { 21 | return []interface{}{x}[:] 22 | } 23 | a.combinerMerger = func(x, y []interface{}) []interface{} { 24 | return append(x, y...) 25 | } 26 | a.valueMerger = func(x []interface{}, y interface{}) []interface{} { 27 | return append(x, y) 28 | } 29 | return a 30 | } 31 | 32 | type Partitioner interface { 33 | numPartitions() int 34 | getPartition(key interface{}) int 35 | } 36 | 37 | type _HashPartitioner struct { 38 | partitions int 39 | } 40 | 41 | func (p *_HashPartitioner) numPartitions() int { 42 | return p.partitions 43 | } 44 | 45 | func (p *_HashPartitioner) getPartition(key interface{}) int { 46 | hashCode := hashCode(key) 47 | return int(hashCode % int64(p.partitions)) 48 | } 49 | 50 | func newHashPartitioner(partitions int) Partitioner { 51 | p := &_HashPartitioner{} 52 | if partitions < 1 { 53 | p.partitions = 1 54 | } else { 55 | p.partitions = partitions 56 | } 57 | return p 58 | } 59 | 60 | type _RangePartitioner struct { 61 | keys []interface{} 62 | reverse bool 63 | fn KeyLessFunc 64 | } 65 | 66 | func (p *_RangePartitioner) numPartitions() int { 67 | return len(p.keys) + 1 68 | } 69 | 70 | func (p *_RangePartitioner) getPartition(key interface{}) int { 71 | index := sort.Search(len(p.keys), func(i int) bool { 72 | return !p.fn(p.keys[i], key) 73 | }) 74 | if !p.reverse { 75 | return index 76 | } 77 | return len(p.keys) - index 78 | } 79 | 80 | func newRangePartitioner(fn KeyLessFunc, keys []interface{}, reverse bool) Partitioner { 81 | p := &_RangePartitioner{} 82 | p.fn = fn 83 | p.reverse = reverse 84 | sorter := NewParkSorter(keys, fn) 85 | sort.Sort(sorter) 86 | p.keys = keys 87 | return p 88 | } 89 | -------------------------------------------------------------------------------- /context.go: -------------------------------------------------------------------------------- 1 | package gopark 2 | 3 | import ( 4 | "encoding/gob" 5 | "fmt" 6 | "log" 7 | "os" 8 | "os/signal" 9 | "path/filepath" 10 | "runtime" 11 | "syscall" 12 | "time" 13 | ) 14 | 15 | type KeyValue struct { 16 | Key interface{} 17 | Value interface{} 18 | } 19 | 20 | func (kv *KeyValue) String() string { 21 | return fmt.Sprintf("%v:%v", kv.Key, kv.Value) 22 | } 23 | 24 | type KeyGroups struct { 25 | Key interface{} 26 | Groups [][]interface{} 27 | } 28 | 29 | type KeyLessFunc func(x, y interface{}) bool 30 | 31 | type ParkSorter struct { 32 | values []interface{} 33 | fn KeyLessFunc 34 | } 35 | 36 | func (s *ParkSorter) Len() int { 37 | return len(s.values) 38 | } 39 | 40 | func (s *ParkSorter) Swap(i, j int) { 41 | s.values[i], s.values[j] = s.values[j], s.values[i] 42 | } 43 | 44 | func (s *ParkSorter) Less(i, j int) bool { 45 | return s.fn(s.values[i], s.values[j]) 46 | } 47 | 48 | func NewParkSorter(values []interface{}, fn KeyLessFunc) *ParkSorter { 49 | return &ParkSorter{values, fn} 50 | } 51 | 52 | type Yielder chan 
interface{} 53 | type ReducerFn func(yield Yielder, partition int) interface{} 54 | 55 | type Context struct { 56 | jobName string 57 | scheduler Scheduler 58 | initialzed bool 59 | started bool 60 | startTime time.Time 61 | } 62 | 63 | func (c *Context) String() string { 64 | return fmt.Sprintf("Context-[%s]", c.jobName) 65 | } 66 | 67 | func (c *Context) init() { 68 | if c.initialzed { 69 | return 70 | } 71 | 72 | c.scheduler = newLocalScheduler() 73 | c.initialzed = true 74 | log.Printf("Gpark Context [%s] initialzed.", c.jobName) 75 | } 76 | 77 | func (c *Context) start() { 78 | if c.started { 79 | return 80 | } 81 | 82 | c.init() 83 | env.start() 84 | c.scheduler.start() 85 | 86 | signalChan := make(chan os.Signal, 1) 87 | signal.Notify(signalChan, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT, syscall.SIGABRT) 88 | go func() { 89 | s := <-signalChan 90 | parklog("Captured the signal %v\n", s) 91 | c.Stop() 92 | os.Exit(2) 93 | }() 94 | c.started = true 95 | c.startTime = time.Now() 96 | log.Printf("Context [%s] is started.", c.jobName) 97 | } 98 | 99 | func (c *Context) Stop() { 100 | if !c.started { 101 | return 102 | } 103 | 104 | env.stop() 105 | c.scheduler.stop() 106 | c.started = false 107 | log.Printf("Context [%s] is stopped, duration = %s.", c.jobName, (time.Since(c.startTime))) 108 | } 109 | 110 | func (c *Context) runRoutine(rdd RDD, partitions []int, rn ReducerFn) []Yielder { 111 | if partitions == nil { 112 | partitions = make([]int, rdd.len()) 113 | for i := 0; i < rdd.len(); i++ { 114 | partitions[i] = i 115 | } 116 | } 117 | if len(partitions) == 0 { 118 | return nil 119 | } 120 | 121 | c.start() 122 | return c.scheduler.runRoutine(rdd, partitions, rn) 123 | } 124 | 125 | func (c *Context) TextFile(pathname string) RDD { 126 | absPathname, err := filepath.Abs(pathname) 127 | if err != nil { 128 | panic(err) 129 | } 130 | if fStat, err := os.Stat(absPathname); err != nil { 131 | panic(err) 132 | } else { 133 | if !fStat.IsDir() { 134 | return newTextFileRDD(c, absPathname) 135 | } 136 | pathNames := make([]string, 0) 137 | err = filepath.Walk(absPathname, func(path string, info os.FileInfo, err error) error { 138 | if !info.IsDir() { 139 | pathNames = append(pathNames, path) 140 | } 141 | return nil 142 | }) 143 | if err != nil { 144 | panic(err) 145 | } 146 | 147 | rdds := make([]RDD, len(pathNames)) 148 | for i := range pathNames { 149 | rdds[i] = newTextFileRDD(c, pathNames[i]) 150 | } 151 | return c.Union(rdds) 152 | } 153 | } 154 | 155 | func (c *Context) Union(rdds []RDD) RDD { 156 | return newUnionRDD(c, rdds) 157 | } 158 | 159 | func (c *Context) Data(d []interface{}) RDD { 160 | return newDataRDD(c, d) 161 | } 162 | 163 | func (c *Context) Data_N(d []interface{}, numPartitions int) RDD { 164 | return newDataRDD_N(c, d, numPartitions) 165 | } 166 | 167 | func (c *Context) Accumulator(initValue int) Accumulator { 168 | return newIntAccumulator(initValue) 169 | } 170 | 171 | func (c *Context) AccumulatorWithParam(initValue interface{}, param AccumulatorParam) Accumulator { 172 | return newAccumulator(initValue, param) 173 | } 174 | 175 | func NewContext(jobName string) *Context { 176 | return &Context{ 177 | jobName: jobName, 178 | initialzed: false, 179 | started: false, 180 | } 181 | } 182 | 183 | func init() { 184 | log.SetFlags(log.LstdFlags) 185 | runtime.GOMAXPROCS(runtime.NumCPU()) 186 | 187 | gob.Register(new(KeyValue)) 188 | gob.Register(new(KeyGroups)) 189 | } 190 | 191 | var _ = fmt.Println 192 | 
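// A note on the Yielder type above: a Yielder is just a channel of values.
// The producer closes it when done and consumers simply range over it; a
// minimal sketch of a partition mapper in this style (double is a made-up
// example, not part of the package):
//
//	func double(iter Yielder) Yielder {
//		yield := make(chan interface{}, 1)
//		go func() {
//			for value := range iter {
//				yield <- value.(int) * 2
//			}
//			close(yield)
//		}()
//		return yield
//	}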
-------------------------------------------------------------------------------- /env.go: -------------------------------------------------------------------------------- 1 | package gopark 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "log" 7 | "os" 8 | "path" 9 | "time" 10 | ) 11 | 12 | type _Environment struct { 13 | master string 14 | parallel int 15 | goparkWorkDir string 16 | jobWorkDir string 17 | started bool 18 | verbose bool 19 | } 20 | 21 | func (e *_Environment) getLocalShufflePath(shuffleId int64, inputId, outputId int) string { 22 | e.start() 23 | pathName := path.Join(e.jobWorkDir, fmt.Sprintf("shuffle-%d", shuffleId)) 24 | if _, err := os.Stat(pathName); os.IsNotExist(err) { 25 | os.Mkdir(pathName, os.ModePerm) 26 | } 27 | return path.Join(pathName, fmt.Sprintf("%05d_%05d", inputId, outputId)) 28 | } 29 | 30 | func (e *_Environment) getLocalRDDPath(rddId int64, splitId int) string { 31 | e.start() 32 | pathName := path.Join(e.jobWorkDir, fmt.Sprintf("rdd-%d", rddId)) 33 | if _, err := os.Stat(pathName); os.IsNotExist(err) { 34 | os.Mkdir(pathName, os.ModePerm) 35 | } 36 | return path.Join(pathName, fmt.Sprintf("%05d", splitId)) 37 | } 38 | 39 | func (e *_Environment) start() { 40 | if e.started { 41 | return 42 | } 43 | 44 | if _, err := os.Stat(e.goparkWorkDir); os.IsNotExist(err) { 45 | os.Mkdir(e.goparkWorkDir, os.ModePerm) 46 | } 47 | 48 | // create sub job working dir 49 | pathName := fmt.Sprintf("gopark-%s-%s-%d", e.master, time.Now().Format("20060102150405"), os.Getpid()) 50 | e.jobWorkDir = path.Join(e.goparkWorkDir, pathName) 51 | if _, err := os.Stat(e.jobWorkDir); os.IsNotExist(err) { 52 | os.Mkdir(e.jobWorkDir, os.ModePerm) 53 | } 54 | 55 | // need to set up the basic tracker servers, etc. 56 | 57 | e.started = true 58 | } 59 | 60 | func (e *_Environment) stop() { 61 | if !e.started { 62 | return 63 | } 64 | 65 | // clear the working dir 66 | if !e.verbose { 67 | err := os.RemoveAll(e.jobWorkDir) 68 | if err != nil { 69 | parklog("Error when removing job work directory[%s]: %s", e.jobWorkDir, err) 70 | } else { 71 | parklog("Cleared the gopark job work directory: %s", e.jobWorkDir) 72 | } 73 | } 74 | e.started = false 75 | } 76 | 77 | var env *_Environment 78 | 79 | func init() { 80 | env = &_Environment{} 81 | } 82 | 83 | func parklog(fmt string, v ...interface{}) { 84 | if env.verbose { 85 | log.Printf(fmt, v...)
86 | } 87 | } 88 | 89 | func ParseOptions() { 90 | flag.StringVar(&env.master, "master", "local", "Master of Gpark: local") 91 | flag.IntVar(&env.parallel, "p", 2, "Number of parallelism level, must >= 0") 92 | flag.StringVar(&env.goparkWorkDir, "workdir", "/opt/tmp", "Working Directory of Gpark") 93 | flag.BoolVar(&env.verbose, "v", false, "Output verbose log information.") 94 | 95 | flag.Parse() 96 | 97 | if env.parallel < 0 { 98 | flag.Usage() 99 | os.Exit(1) 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /examples/demo.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "github.com/mijia/gopark" 6 | "math" 7 | "math/rand" 8 | "strconv" 9 | "strings" 10 | ) 11 | 12 | func main() { 13 | gopark.ParseOptions() 14 | c := gopark.NewContext("testflight") 15 | defer c.Stop() 16 | 17 | WordCount(c) 18 | ComputePi(c) 19 | LogisticRegression(c) 20 | } 21 | 22 | func WordCount(c *gopark.Context) { 23 | txt := c.TextFile("../") 24 | counts := txt.FlatMap(func(line interface{}) []interface{} { 25 | vs := strings.Fields(line.(string)) 26 | words := make([]interface{}, len(vs)) 27 | for i := range vs { 28 | words[i] = vs[i] 29 | } 30 | return words 31 | }).Map(func(x interface{}) interface{} { 32 | return &gopark.KeyValue{x, 1} 33 | }).ReduceByKey(func(x, y interface{}) interface{} { 34 | return x.(int) + y.(int) 35 | }).Cache() 36 | 37 | fmt.Println( 38 | counts.Filter(func(x interface{}) bool { 39 | return x.(*gopark.KeyValue).Value.(int) > 50 40 | }).CollectAsMap()) 41 | 42 | fmt.Println( 43 | counts.Filter(func(x interface{}) bool { 44 | return x.(*gopark.KeyValue).Value.(int) > 50 45 | }).Map(func(x interface{}) interface{} { 46 | keyValue := x.(*gopark.KeyValue) 47 | keyValue.Key, keyValue.Value = keyValue.Value, keyValue.Key 48 | return keyValue 49 | }).GroupByKey().Collect()) 50 | } 51 | 52 | func ComputePi(c *gopark.Context) { 53 | N := 100000 54 | iters := c.Data(make([]interface{}, N)) 55 | count := iters.Map(func(_ interface{}) interface{} { 56 | x := rand.Float32() 57 | y := rand.Float32() 58 | if x*x+y*y < 1 { 59 | return 1 60 | } else { 61 | return 0 62 | } 63 | }).Reduce(func(x, y interface{}) interface{} { 64 | return x.(int) + y.(int) 65 | }).(int) 66 | fmt.Println("Pi =", (4.0 * float64(count) / float64(N))) 67 | } 68 | 69 | func LogisticRegression(c *gopark.Context) { 70 | type dataPoint struct { 71 | x gopark.Vector 72 | y float64 73 | } 74 | 75 | points := c.TextFile("points.txt").Map(func(line interface{}) interface{} { 76 | vs := strings.Fields(line.(string)) 77 | vector := make(gopark.Vector, len(vs)-1) 78 | for i := 0; i < len(vs)-1; i++ { 79 | vector[i], _ = strconv.ParseFloat(vs[i], 64) 80 | } 81 | y, _ := strconv.ParseFloat(vs[len(vs)-1], 64) 82 | return &dataPoint{vector, y} 83 | }).Cache() 84 | 85 | hx := func(w, x gopark.Vector, y float64) float64 { 86 | return 1/(1+math.Exp(-1*w.Dot(x))) - y 87 | } 88 | var w gopark.Vector = []float64{1, -10}[:] 89 | for i := 0; i < 10; i++ { 90 | gradient := points.Map(func(x interface{}) interface{} { 91 | p := x.(*dataPoint) 92 | return p.x.Multiply(-1 * hx(w, p.x, p.y)) 93 | }).Reduce(func(x, y interface{}) interface{} { 94 | return x.(gopark.Vector).Plus(y.(gopark.Vector)) 95 | }).(gopark.Vector) 96 | w = w.Minus(gradient) 97 | } 98 | fmt.Println("Final Weights:", w) 99 | } 100 | -------------------------------------------------------------------------------- /examples/kmean_data.txt: 
-------------------------------------------------------------------------------- 1 | 0.1 0.2 0.0 0.2 2 | 0.2 0.2 0.3 0.2 3 | 0.3 0.0 0.0 0.1 4 | 0.1 0.2 0.3 0.2 5 | 1.1 0.2 0.0 0.2 6 | 1.2 0.2 0.3 0.2 7 | 1.3 0.0 0.0 0.1 8 | 1.1 0.2 0.3 0.2 9 | 0.1 1.2 1.0 0.2 10 | 0.2 1.2 1.3 0.2 11 | 0.3 1.0 1.0 0.1 12 | 0.1 1.2 1.3 0.2 13 | 0.1 0.2 0.0 1.2 14 | 0.2 0.2 0.3 1.2 15 | 0.3 0.0 0.0 1.1 16 | 0.1 0.2 0.3 1.2 17 | -------------------------------------------------------------------------------- /examples/kmeans.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/gob" 5 | "fmt" 6 | "github.com/mijia/gopark" 7 | "math/rand" 8 | "strconv" 9 | "strings" 10 | ) 11 | 12 | func CloseCenter(p gopark.Vector, centers []gopark.Vector) int { 13 | minDist := p.EulaDistance(centers[0]) 14 | minIndex := 0 15 | for i := 1; i < len(centers); i++ { 16 | dist := p.EulaDistance(centers[i]) 17 | if dist < minDist { 18 | minDist = dist 19 | minIndex = i 20 | } 21 | } 22 | return minIndex 23 | } 24 | 25 | type CenterCounter struct { 26 | X gopark.Vector 27 | Count int 28 | } 29 | 30 | func main() { 31 | gopark.ParseOptions() 32 | c := gopark.NewContext("kmeans") 33 | defer c.Stop() 34 | 35 | // This is important, have to register user types for gob to correctly encode. 36 | gob.Register(new(CenterCounter)) 37 | 38 | D := 4 39 | K := 3 40 | MIN_DIST := 0.01 41 | 42 | centers := make([]gopark.Vector, K) 43 | for i := range centers { 44 | center := make(gopark.Vector, D) 45 | for j := range center { 46 | center[j] = rand.Float64() 47 | } 48 | centers[i] = center 49 | } 50 | fmt.Println(centers) 51 | 52 | points := c.TextFile("kmean_data.txt").Map(func(line interface{}) interface{} { 53 | vs := strings.Fields(line.(string)) 54 | dims := make(gopark.Vector, len(vs)) 55 | for i := range vs { 56 | dims[i], _ = strconv.ParseFloat(vs[i], 64) 57 | } 58 | return dims 59 | }).Cache() 60 | 61 | for i := 0; i < 10; i++ { 62 | fmt.Println("Iter:", i) 63 | mappedPoints := points.Map(func(x interface{}) interface{} { 64 | p := x.(gopark.Vector) 65 | center := CloseCenter(p, centers) 66 | return &gopark.KeyValue{ 67 | Key: center, 68 | Value: &CenterCounter{p, 1}, 69 | } 70 | }) 71 | newCenters := mappedPoints.ReduceByKey(func(x, y interface{}) interface{} { 72 | cc1 := x.(*CenterCounter) 73 | cc2 := y.(*CenterCounter) 74 | return &CenterCounter{ 75 | X: cc1.X.Plus(cc2.X), 76 | Count: cc1.Count + cc2.Count, 77 | } 78 | }).Map(func(x interface{}) interface{} { 79 | keyValue := x.(*gopark.KeyValue) 80 | cc := keyValue.Value.(*CenterCounter) 81 | return &gopark.KeyValue{ 82 | Key: keyValue.Key, 83 | Value: cc.X.Divide(float64(cc.Count)), 84 | } 85 | }).CollectAsMap() 86 | 87 | updated := false 88 | for key, value := range newCenters { 89 | center := value.(gopark.Vector) 90 | cid := key.(int) 91 | if center.EulaDistance(centers[cid]) > MIN_DIST { 92 | centers[cid] = center 93 | updated = true 94 | } 95 | } 96 | if !updated { 97 | break 98 | } 99 | } 100 | 101 | fmt.Println("Final Centers:", centers) 102 | } 103 | -------------------------------------------------------------------------------- /examples/points.txt: -------------------------------------------------------------------------------- 1 | 150 1 0 2 | 155 1 0 3 | 140 1 0 4 | 180 1 1 5 | 170 1 1 6 | 200 1 1 -------------------------------------------------------------------------------- /rdd.go: -------------------------------------------------------------------------------- 1 | package gopark 2 | 3 | 
import ( 4 | "fmt" 5 | "io" 6 | "log" 7 | "os" 8 | "sort" 9 | ) 10 | 11 | type MapperFunc func(interface{}) interface{} 12 | type PartitionMapperFunc func(Yielder) Yielder 13 | type FlatMapperFunc func(interface{}) []interface{} 14 | type ReducerFunc func(interface{}, interface{}) interface{} 15 | type FilterFunc func(interface{}) bool 16 | type LoopFunc func(interface{}) 17 | 18 | type RDD interface { 19 | Map(f MapperFunc) RDD 20 | MapPartition(f PartitionMapperFunc) RDD 21 | FlatMap(f FlatMapperFunc) RDD 22 | Filter(f FilterFunc) RDD 23 | Sample(fraction float64, seed int64, withReplacement bool) RDD 24 | GroupByKey() RDD 25 | SortByKey(fn KeyLessFunc, reverse bool) RDD 26 | SortByValue(fn KeyLessFunc, reverse bool) RDD 27 | PartitionByKey() RDD 28 | ReduceByKey(fn ReducerFunc) RDD 29 | Distinct() RDD 30 | Union(other RDD) RDD 31 | Join(other RDD) RDD 32 | LeftOuterJoin(other RDD) RDD 33 | RightOuterJoin(other RDD) RDD 34 | GroupWith(other RDD) RDD 35 | Cartesian(other RDD) RDD 36 | 37 | GroupByKey_N(numPartitions int) RDD 38 | SortByKey_N(fn KeyLessFunc, reverse bool, numPartitions int) RDD 39 | SortByValue_N(fn KeyLessFunc, reverse bool, numPartitions int) RDD 40 | PartitionByKey_N(numPartitions int) RDD 41 | ReduceByKey_N(fn ReducerFunc, numPartitions int) RDD 42 | Distinct_N(numPartitions int) RDD 43 | Join_N(other RDD, numPartitions int) RDD 44 | LeftOuterJoin_N(other RDD, numPartitions int) RDD 45 | RightOuterJoin_N(other RDD, numPartitions int) RDD 46 | GroupWith_N(other RDD, numPartitions int) RDD 47 | 48 | Reduce(fn ReducerFunc) interface{} 49 | CountByKey() map[interface{}]int64 50 | CountByValue() map[interface{}]int64 51 | Take(n int64) []interface{} 52 | Collect() []interface{} 53 | CollectAsMap() map[interface{}]interface{} 54 | Count() int64 55 | Foreach(fn LoopFunc) 56 | Foreach0(fn LoopFunc) 57 | SaveAsTextFile(pathname string) 58 | 59 | Cache() RDD 60 | Persist() RDD 61 | 62 | getId() int64 63 | getContext() *Context 64 | getSplits() []Split 65 | getSplit(int) Split 66 | len() int 67 | traverse(split Split) Yielder 68 | compute(split Split) Yielder 69 | } 70 | 71 | type Split interface { 72 | getIndex() int 73 | } 74 | 75 | type _BaseRDD struct { 76 | ctx *Context 77 | prototype RDD 78 | id int64 79 | splits []Split 80 | shouldCache bool 81 | cache [][]interface{} 82 | shouldPersist bool 83 | persistLocation string 84 | length int 85 | } 86 | 87 | ////////////////////////////////////////////////////// 88 | // Base RDD operations implementation 89 | ////////////////////////////////////////////////////// 90 | func (r *_BaseRDD) Cache() RDD { 91 | if !r.shouldCache { 92 | r.shouldCache = true 93 | r.cache = make([][]interface{}, r.length) 94 | } 95 | return r.prototype 96 | } 97 | 98 | func (r *_BaseRDD) Persist() RDD { 99 | if !r.shouldPersist { 100 | r.shouldPersist = true 101 | } 102 | return r.prototype 103 | } 104 | 105 | func (r *_BaseRDD) Union(other RDD) RDD { 106 | rdds := []RDD{r.prototype, other}[:] 107 | return newUnionRDD(r.ctx, rdds) 108 | } 109 | 110 | func (r *_BaseRDD) Join(other RDD) RDD { 111 | return r.Join_N(other, 0) 112 | } 113 | 114 | func (r *_BaseRDD) Join_N(other RDD, numPartitions int) RDD { 115 | return r.join(other, numPartitions, true, true) 116 | } 117 | 118 | func (r *_BaseRDD) LeftOuterJoin(other RDD) RDD { 119 | return r.LeftOuterJoin_N(other, 0) 120 | } 121 | 122 | func (r *_BaseRDD) LeftOuterJoin_N(other RDD, numPartitions int) RDD { 123 | return r.join(other, numPartitions, true, false) 124 | } 125 | 126 | func (r *_BaseRDD) 
RightOuterJoin(other RDD) RDD { 127 | return r.RightOuterJoin_N(other, 0) 128 | } 129 | 130 | func (r *_BaseRDD) RightOuterJoin_N(other RDD, numPartitions int) RDD { 131 | return r.join(other, numPartitions, false, true) 132 | } 133 | 134 | func (r *_BaseRDD) join(other RDD, numPartitions int, needLeft, needRight bool) RDD { 135 | return r.GroupWith_N(other, numPartitions).FlatMap(func(x interface{}) []interface{} { 136 | keyGroups := x.(*KeyGroups) 137 | groups := keyGroups.Groups 138 | if needLeft && len(groups[0]) == 0 { 139 | return nil 140 | } 141 | if needRight && len(groups[1]) == 0 { 142 | return nil 143 | } 144 | results := make([]interface{}, 0) 145 | if len(groups[0]) == 0 { 146 | groups[0] = append(groups[0], nil) 147 | } 148 | if len(groups[1]) == 0 { 149 | groups[1] = append(groups[1], nil) 150 | } 151 | for _, leftValue := range groups[0] { 152 | for _, rightValue := range groups[1] { 153 | results = append(results, &KeyValue{ 154 | Key: keyGroups.Key, 155 | Value: []interface{}{leftValue, rightValue}[:], 156 | }) 157 | } 158 | } 159 | return results 160 | }) 161 | } 162 | 163 | func (r *_BaseRDD) GroupWith(other RDD) RDD { 164 | return r.GroupWith_N(other, 0) 165 | } 166 | 167 | func (r *_BaseRDD) GroupWith_N(other RDD, numPartitions int) RDD { 168 | if numPartitions <= 0 { 169 | switch { 170 | case env.parallel == 0: 171 | numPartitions = r.len() 172 | default: 173 | numPartitions = env.parallel 174 | } 175 | } 176 | rdds := []RDD{r.prototype, other}[:] 177 | return newCoGroupedRDD(r.ctx, rdds, numPartitions) 178 | } 179 | 180 | func (r *_BaseRDD) SortByKey(fn KeyLessFunc, reverse bool) RDD { 181 | return r.SortByKey_N(fn, reverse, 0) 182 | } 183 | 184 | func (r *_BaseRDD) SortByKey_N(fn KeyLessFunc, reverse bool, numPartitions int) RDD { 185 | if numPartitions <= 0 { 186 | switch { 187 | case env.parallel == 0: 188 | numPartitions = r.len() 189 | default: 190 | numPartitions = env.parallel 191 | } 192 | } 193 | sortMapper := func(list []interface{}) []interface{} { 194 | sorter := NewParkSorter(list, func(x, y interface{}) bool { 195 | return fn(x.(*KeyValue).Key, y.(*KeyValue).Key) 196 | }) 197 | if !reverse { 198 | sort.Sort(sorter) 199 | } else { 200 | sort.Sort(sort.Reverse(sorter)) 201 | } 202 | return list 203 | } 204 | goSortMapper := func(iter Yielder) Yielder { 205 | yield := make(chan interface{}, 1) 206 | go func() { 207 | values := make([]interface{}, 0) 208 | for value := range iter { 209 | values = append(values, value) 210 | } 211 | sorted := sortMapper(values) 212 | for _, value := range sorted { 213 | yield <- value 214 | } 215 | close(yield) 216 | }() 217 | return yield 218 | } 219 | 220 | if r.len() == 1 { 221 | return r.MapPartition(goSortMapper) 222 | } 223 | // we choose some sample records as the key to partition results 224 | n := numPartitions * 10 / r.len() 225 | samples := r.MapPartition(func(iter Yielder) Yielder { 226 | yield := make(chan interface{}, 1) 227 | go func() { 228 | index := 0 229 | for value := range iter { 230 | yield <- value 231 | index++ 232 | if index >= n { 233 | break 234 | } 235 | } 236 | close(yield) 237 | }() 238 | return yield 239 | }).Collect() 240 | samples = sortMapper(samples) 241 | keys := make([]interface{}, 0) 242 | for i := 0; i < numPartitions-1; i++ { 243 | if i*10+5 >= len(samples) { 244 | break 245 | } 246 | keys = append(keys, samples[i*10+5].(*KeyValue).Key) 247 | } 248 | partitioner := newRangePartitioner(fn, keys, reverse) 249 | aggregator := newMergeAggregator() 250 | shuffleRdd := 
newShuffledRDD(r.prototype, aggregator, partitioner) 251 | return shuffleRdd.FlatMap(func(x interface{}) []interface{} { 252 | keyValue := x.(*KeyValue) 253 | values := keyValue.Value.([]interface{}) 254 | results := make([]interface{}, len(values)) 255 | for i := range values { 256 | results[i] = &KeyValue{ 257 | Key: keyValue.Key, 258 | Value: values[i], 259 | } 260 | } 261 | return results 262 | }).MapPartition(goSortMapper) 263 | } 264 | 265 | func (r *_BaseRDD) SortByValue(fn KeyLessFunc, reverse bool) RDD { 266 | return r.SortByValue_N(fn, reverse, 0) 267 | } 268 | 269 | func (r *_BaseRDD) SortByValue_N(fn KeyLessFunc, reverse bool, numPartitions int) RDD { 270 | return r.Map(func(x interface{}) interface{} { 271 | kv := x.(*KeyValue) 272 | return &KeyValue{kv.Value, kv.Key} 273 | }).SortByKey_N(fn, reverse, numPartitions).Map(func(x interface{}) interface{} { 274 | kv := x.(*KeyValue) 275 | kv.Key, kv.Value = kv.Value, kv.Key 276 | return kv 277 | }) 278 | } 279 | 280 | func (r *_BaseRDD) Cartesian(other RDD) RDD { 281 | return newCartesianRDD(r.ctx, r.prototype, other) 282 | } 283 | 284 | func (r *_BaseRDD) Distinct() RDD { 285 | return r.Distinct_N(0) 286 | } 287 | 288 | func (r *_BaseRDD) Distinct_N(numPartitions int) RDD { 289 | d := r.Map(func(arg interface{}) interface{} { 290 | return &KeyValue{arg, nil} 291 | }).ReduceByKey_N(func(x, y interface{}) interface{} { 292 | return nil 293 | }, numPartitions).Map(func(arg interface{}) interface{} { 294 | return arg.(*KeyValue).Key 295 | }) 296 | return d 297 | } 298 | 299 | func (r *_BaseRDD) ReduceByKey(fn ReducerFunc) RDD { 300 | return r.ReduceByKey_N(fn, 0) 301 | } 302 | 303 | func (r *_BaseRDD) ReduceByKey_N(fn ReducerFunc, numPartitions int) RDD { 304 | combinerCreator := func(x interface{}) []interface{} { 305 | y := []interface{}{x}[:] 306 | return y 307 | } 308 | combinderMerger := func(x, y []interface{}) []interface{} { 309 | x[0] = fn(x[0], y[0]) 310 | return x 311 | } 312 | valueMerger := func(x []interface{}, y interface{}) []interface{} { 313 | x[0] = fn(x[0], y) 314 | return x 315 | } 316 | aggregator := &_Aggregator{combinerCreator, combinderMerger, valueMerger} 317 | return r.combineByKey(aggregator, numPartitions).Map(func(x interface{}) interface{} { 318 | keyValue := x.(*KeyValue) 319 | return &KeyValue{keyValue.Key, keyValue.Value.([]interface{})[0]} 320 | }) 321 | } 322 | 323 | func (r *_BaseRDD) PartitionByKey() RDD { 324 | return r.PartitionByKey_N(0) 325 | } 326 | 327 | func (r *_BaseRDD) PartitionByKey_N(numPartitions int) RDD { 328 | return r.GroupByKey_N(numPartitions).Map(func(arg interface{}) interface{} { 329 | keyValue := arg.(*KeyValue) 330 | values := keyValue.Value.([]interface{}) 331 | results := make([]interface{}, len(values)) 332 | for i := range values { 333 | results[i] = &KeyValue{keyValue.Key, values[i]} 334 | } 335 | return results 336 | }).FlatMap(func(arg interface{}) []interface{} { 337 | return arg.([]interface{}) 338 | }) 339 | } 340 | 341 | func (r *_BaseRDD) GroupByKey() RDD { 342 | return r.GroupByKey_N(0) 343 | } 344 | 345 | func (r *_BaseRDD) GroupByKey_N(numPartitions int) RDD { 346 | aggregator := newMergeAggregator() 347 | return r.combineByKey(aggregator, numPartitions) 348 | } 349 | 350 | func (r *_BaseRDD) Map(f MapperFunc) RDD { 351 | return newMappedRDD(r.prototype, f) 352 | } 353 | 354 | func (r *_BaseRDD) MapPartition(f PartitionMapperFunc) RDD { 355 | return newPartitionMappedRDD(r.prototype, f) 356 | } 357 | 358 | func (r *_BaseRDD) FlatMap(f FlatMapperFunc) RDD { 
359 | return newFlatMappedRDD(r.prototype, f) 360 | } 361 | 362 | func (r *_BaseRDD) Filter(f FilterFunc) RDD { 363 | return newFilteredRDD(r.prototype, f) 364 | } 365 | 366 | func (r *_BaseRDD) Sample(fraction float64, seed int64, withReplacement bool) RDD { 367 | return newSampledRDD(r.prototype, fraction, seed, withReplacement) 368 | } 369 | 370 | func (r *_BaseRDD) CountByKey() map[interface{}]int64 { 371 | return r.Map(func(arg interface{}) interface{} { 372 | return arg.(*KeyValue).Key 373 | }).CountByValue() 374 | } 375 | 376 | func (r *_BaseRDD) CountByValue() map[interface{}]int64 { 377 | parklog(" %s", r.prototype) 378 | iters := r.ctx.runRoutine(r.prototype, nil, func(yield Yielder, partition int) interface{} { 379 | cnts := make(map[interface{}]int64) 380 | for value := range yield { 381 | if _, ok := cnts[value]; ok { 382 | cnts[value]++ 383 | } else { 384 | cnts[value] = 1 385 | } 386 | } 387 | return cnts 388 | }) 389 | cnts := make(map[interface{}]int64) 390 | for _, iter := range iters { 391 | for value := range iter { 392 | for key, cnt := range value.(map[interface{}]int64) { 393 | if _, ok := cnts[key]; ok { 394 | cnts[key] += cnt 395 | } else { 396 | cnts[key] = cnt 397 | } 398 | } 399 | } 400 | } 401 | return cnts 402 | } 403 | 404 | func (r *_BaseRDD) Reduce(fn ReducerFunc) interface{} { 405 | parklog(" %s", r.prototype) 406 | iters := r.ctx.runRoutine(r.prototype, nil, func(yield Yielder, partition int) interface{} { 407 | var accu interface{} = nil 408 | for value := range yield { 409 | switch { 410 | case accu == nil: 411 | accu = value 412 | default: 413 | accu = fn(accu, value) 414 | } 415 | } 416 | return accu 417 | }) 418 | var accu interface{} = nil 419 | for _, iter := range iters { 420 | for value := range iter { 421 | if value != nil { 422 | switch { 423 | case accu == nil: 424 | accu = value 425 | default: 426 | accu = fn(accu, value) 427 | } 428 | } 429 | } 430 | } 431 | return accu 432 | } 433 | 434 | func (r *_BaseRDD) SaveAsTextFile(pathname string) { 435 | newOutputTextFileRDD(r.prototype, pathname).Collect() 436 | } 437 | 438 | func (r *_BaseRDD) Collect() []interface{} { 439 | parklog(" %s", r.prototype) 440 | iters := r.ctx.runRoutine(r.prototype, nil, func(yield Yielder, partition int) interface{} { 441 | subCollections := make([]interface{}, 0) 442 | for value := range yield { 443 | subCollections = append(subCollections, value) 444 | } 445 | return subCollections 446 | }) 447 | collections := make([]interface{}, 0) 448 | for _, iter := range iters { 449 | subCollections := (<-iter).([]interface{}) 450 | collections = append(collections, subCollections...) 451 | } 452 | return collections 453 | } 454 | 455 | func (r *_BaseRDD) CollectAsMap() map[interface{}]interface{} { 456 | parklog(" %s", r.prototype) 457 | collections := r.Collect() 458 | sets := make(map[interface{}]interface{}) 459 | for _, item := range collections { 460 | keyValue := item.(*KeyValue) 461 | sets[keyValue.Key] = keyValue.Value 462 | } 463 | return sets 464 | } 465 | 466 | func (r *_BaseRDD) Foreach0(fn LoopFunc) { 467 | parklog(" %s", r.prototype) 468 | for i := 0; i < r.prototype.len(); i++ { 469 | dumps := r.ctx.runRoutine(r.prototype, []int{i}[:], func(yield Yielder, partition int) interface{} { 470 | for value := range yield { 471 | fn(value) 472 | } 473 | return struct{}{} 474 | }) 475 | // we need to dump all the channels otherwise the function will not be executed. 
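// (computation is lazy: each partition's values are produced by a goroutine writing into its yield channel, so receiving from every returned channel is what forces the partitions to be computed and fn to run)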
476 | for _, dump := range dumps { 477 | for range dump { 478 | } 479 | } 480 | } 481 | } 482 | 483 | func (r *_BaseRDD) Foreach(fn LoopFunc) { 484 | parklog("<Foreach> %s", r.prototype) 485 | dumps := r.ctx.runRoutine(r.prototype, nil, func(yield Yielder, partition int) interface{} { 486 | for value := range yield { 487 | fn(value) 488 | } 489 | return struct{}{} 490 | }) 491 | // we need to drain all the channels, otherwise the function will never be executed. 492 | for _, dump := range dumps { 493 | for range dump { 494 | } 495 | } 496 | } 497 | 498 | func (r *_BaseRDD) Count() int64 { 499 | parklog("<Count> %s", r.prototype) 500 | var cnt int64 = 0 501 | iters := r.ctx.runRoutine(r.prototype, nil, func(yield Yielder, partition int) interface{} { 502 | var total int64 = 0 503 | for range yield { 504 | total++ 505 | } 506 | return total 507 | }) 508 | for _, iter := range iters { 509 | for subCount := range iter { 510 | cnt += subCount.(int64) 511 | } 512 | } 513 | return cnt 514 | } 515 | 516 | func (r *_BaseRDD) Take(n int64) []interface{} { 517 | if n < 0 { 518 | return nil 519 | } 520 | parklog("<Take>=%d %s", n, r.prototype) 521 | results := make([]interface{}, n) 522 | var index int64 = 0 523 | p := make([]int, 1) 524 | p[0] = 0 525 | for index < n && p[0] < r.prototype.len() { 526 | if y := r.ctx.runRoutine(r.prototype, p, func(yield Yielder, partition int) interface{} { 527 | s := make([]interface{}, n-index) 528 | var i int64 = 0 529 | for ; i < n-index; i++ { 530 | if value, ok := <-yield; ok { 531 | s[i] = value 532 | } else { 533 | break 534 | } 535 | } 536 | return s[:i] 537 | }); len(y) > 0 { 538 | for taken := range y[0] { 539 | takenSlice := taken.([]interface{}) 540 | copy(results[index:], takenSlice) 541 | index += int64(len(takenSlice)) 542 | } 543 | } 544 | p[0]++ 545 | } 546 | return results[:index] 547 | } 548 | 549 | func (r *_BaseRDD) combineByKey(aggregator *_Aggregator, numPartitions int) RDD { 550 | if numPartitions <= 0 { 551 | switch { 552 | case env.parallel == 0: 553 | numPartitions = r.len() 554 | default: 555 | numPartitions = env.parallel 556 | } 557 | } 558 | partitioner := newHashPartitioner(numPartitions) 559 | return newShuffledRDD(r.prototype, aggregator, partitioner) 560 | } 561 | 562 | func (r *_BaseRDD) getOrCompute(split Split) Yielder { 563 | rdd := r.prototype 564 | i := split.getIndex() 565 | yield := make(chan interface{}, 1) 566 | go func() { 567 | if r.cache[i] != nil { 568 | parklog("Cache hit <%s> on Split[%d]", rdd, i) 569 | for _, value := range r.cache[i] { 570 | yield <- value 571 | } 572 | } else { 573 | r.cache[i] = make([]interface{}, 0) 574 | for value := range rdd.compute(split) { 575 | r.cache[i] = append(r.cache[i], value) 576 | yield <- value 577 | } 578 | r.cache[i] = r.cache[i][:] 579 | } 580 | close(yield) 581 | }() 582 | return yield 583 | } 584 | 585 | func (r *_BaseRDD) persistOrCompute(split Split) Yielder { 586 | rdd := r.prototype 587 | i := split.getIndex() 588 | yield := make(chan interface{}, 1) 589 | go func() { 590 | if len(r.persistLocation) != 0 { 591 | pathname := env.getLocalRDDPath(r.id, i) 592 | parklog("Decoding rdd-%d/%d[GOB] from local file %s", r.id, i, pathname) 593 | input, err := os.Open(pathname) 594 | if err != nil { 595 | log.Panicf("Error when persist/decode rdd split[%d], %v", i, err) 596 | } 597 | defer input.Close() 598 | 599 | var buffer []interface{} 600 | encoder := NewBufferEncoder(ENCODE_BUFFER_SIZE) 601 | for err == nil { 602 | buffer, err = encoder.Decode(input) 603 | if err != nil { 604 | break 605 | } 606 | for _, value := range buffer {
607 | yield <- value 608 | } 609 | } 610 | if err != nil && err != io.EOF { 611 | log.Panicf("Error when persist/decode rdd split[%d], %v", i, err) 612 | } 613 | } else { 614 | pathname := env.getLocalRDDPath(r.id, i) 615 | output, err := os.Create(pathname) 616 | if err != nil { 617 | log.Panicf("Error when creating persist file [%s] for rdd split[%d], %v", pathname, i, err) 618 | } 619 | defer output.Close() 620 | encoder := NewBufferEncoder(ENCODE_BUFFER_SIZE) 621 | for value := range rdd.compute(split) { 622 | err = encoder.Encode(output, value) 623 | if err != nil { 624 | log.Panicf("Error when encoding object into file for rdd split[%d], %v", i, err) 625 | } 626 | yield <- value 627 | } 628 | err = encoder.Flush(output) 629 | if err != nil { 630 | log.Panicf("Error when encoding object into file for rdd split[%d], %v", i, err) 631 | } 632 | parklog("Encoding rdd-%d/%d[GOB] into local file %s", r.id, i, pathname) 633 | r.persistLocation = pathname 634 | } 635 | close(yield) 636 | }() 637 | return yield 638 | } 639 | 640 | func (r *_BaseRDD) traverse(split Split) Yielder { 641 | rdd := r.prototype 642 | if r.shouldCache { 643 | return r.getOrCompute(split) 644 | } 645 | if r.shouldPersist { 646 | return r.persistOrCompute(split) 647 | } 648 | return rdd.compute(split) 649 | } 650 | 651 | func (r *_BaseRDD) getId() int64 { 652 | return r.id 653 | } 654 | 655 | func (r *_BaseRDD) getContext() *Context { 656 | return r.ctx 657 | } 658 | 659 | func (r *_BaseRDD) getSplits() []Split { 660 | return r.splits 661 | } 662 | 663 | func (r *_BaseRDD) getSplit(index int) Split { 664 | return r.splits[index] 665 | } 666 | 667 | func (r *_BaseRDD) len() int { 668 | return r.length 669 | } 670 | 671 | func (r *_BaseRDD) init(ctx *Context, prototype RDD) { 672 | r.ctx = ctx 673 | r.prototype = prototype 674 | r.id = r.newRddId() 675 | r.splits = make([]Split, 0) 676 | 677 | r.ctx.init() 678 | } 679 | 680 | var nextRddId AtomicInt = 0 681 | 682 | func (r *_BaseRDD) newRddId() int64 { 683 | nextRddId.Add(1) 684 | return nextRddId.Get() 685 | } 686 | 687 | var nextShuffleId AtomicInt = 0 688 | 689 | func (r *_BaseRDD) newShuffleId() int64 { 690 | nextShuffleId.Add(1) 691 | return nextShuffleId.Get() 692 | } 693 | 694 | var _ = fmt.Println 695 | -------------------------------------------------------------------------------- /rdd_test.go: -------------------------------------------------------------------------------- 1 | package gopark 2 | 3 | import ( 4 | "fmt" 5 | "sort" 6 | "strings" 7 | "testing" 8 | ) 9 | 10 | func setupEnv() { 11 | env.master = "local" 12 | env.parallel = 3 13 | env.goparkWorkDir = "/opt/tmp" 14 | } 15 | 16 | func TestDataRDD(t *testing.T) { 17 | setupEnv() 18 | c := NewContext("TestDataRDD") 19 | fmt.Printf("\n\n%s\n", c) 20 | defer c.Stop() 21 | a := []interface{}{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}[:] 22 | data := c.Data(a) 23 | fmt.Println(data.Collect()) 24 | if data.Count() != 10 { 25 | t.Errorf("Data parallelized error, with %s", data) 26 | } 27 | } 28 | 29 | func TestTextFile(t *testing.T) { 30 | setupEnv() 31 | c := NewContext("TestTextFile") 32 | fmt.Printf("\n\n%s\n", c) 33 | defer c.Stop() 34 | txt := c.TextFile("rdd.go") 35 | samples := txt.FlatMap(func(line interface{}) []interface{} { 36 | vs := strings.Fields(line.(string)) 37 | results := make([]interface{}, len(vs)) 38 | for i := range vs { 39 | results[i] = vs[i] 40 | } 41 | return results 42 | }).Take(5) 43 | fmt.Println(samples) 44 | if len(samples) != 5 { 45 | t.Errorf("TextFile 
Samples length error, %d", len(samples)) 46 | } 47 | } 48 | 49 | func TestSimpleMappers(t *testing.T) { 50 | setupEnv() 51 | c := NewContext("TestSimpleMappers") 52 | fmt.Printf("\n\n%s\n", c) 53 | defer c.Stop() 54 | d := []interface{}{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}[:] 55 | data := c.Data(d) 56 | 57 | mData := data.Map(func(arg interface{}) interface{} { 58 | return arg 59 | }) 60 | fmt.Println(mData.Collect()) 61 | if int(mData.Count()) != len(d) { 62 | t.Errorf("Mapping data error, %v", mData.Collect()) 63 | } 64 | 65 | flatData := mData.FlatMap(func(arg interface{}) []interface{} { 66 | t := make([]interface{}, 2) 67 | t[0] = arg 68 | t[1] = arg.(int) * 10 69 | return t 70 | }) 71 | fmt.Println(flatData.Collect()) 72 | if int(flatData.Count()) != 2*len(d) { 73 | t.Errorf("FlatMap data error, %v", flatData.Collect()) 74 | } 75 | 76 | filterData := flatData.Filter(func(arg interface{}) bool { 77 | return arg.(int) < 10 78 | }) 79 | fmt.Println(filterData.Collect()) 80 | if int(filterData.Count()) <= 0 { 81 | t.Errorf("Filter data error, %v", filterData.Collect()) 82 | } 83 | 84 | samples := flatData.Sample(1, 42, true).Collect() 85 | fmt.Println("Samples:", samples) 86 | if len(samples) <= 0 { 87 | t.Errorf("Sample Data error, %v", samples) 88 | } 89 | 90 | mapPartition := data.MapPartition(func(iter Yielder) Yielder { 91 | yield := make(chan interface{}, 1) 92 | go func() { 93 | for value := range iter { 94 | yield <- value 95 | yield <- value 96 | } 97 | close(yield) 98 | }() 99 | return yield 100 | }).Collect() 101 | fmt.Println(mapPartition) 102 | if len(mapPartition) != 2*len(d) { 103 | t.Error("MapPartition data error.") 104 | } 105 | } 106 | 107 | func TestKeyValueMappers(t *testing.T) { 108 | setupEnv() 109 | c := NewContext("TestKeyValueMappers") 110 | fmt.Printf("\n\n%s\n", c) 111 | defer c.Stop() 112 | d := []interface{}{ 113 | &KeyValue{1, 10}, 114 | &KeyValue{1, 11}, 115 | &KeyValue{2, 12}, 116 | &KeyValue{3, 13}, 117 | &KeyValue{4, 14}, 118 | &KeyValue{4, 15}, 119 | &KeyValue{5, 16}, 120 | &KeyValue{6, 17}, 121 | &KeyValue{5, 18}, 122 | }[:] 123 | data := c.Data(d) 124 | 125 | group := data.GroupByKey().CollectAsMap() 126 | fmt.Println(group) 127 | for i := 1; i <= 6; i++ { 128 | if _, ok := group[i]; !ok { 129 | t.Errorf("%d key not in GroupByKey Map", i) 130 | } 131 | } 132 | 133 | pKey := data.PartitionByKey().Collect() 134 | fmt.Println(pKey) 135 | if len(pKey) != len(d) { 136 | t.Error("PartitionByKey data error") 137 | } 138 | 139 | distinct := data.Map(func(x interface{}) interface{} { 140 | return x.(*KeyValue).Key 141 | }).Distinct().Collect() 142 | fmt.Println(distinct) 143 | if len(distinct) != 6 { 144 | t.Errorf("Distinct data error, %v", distinct) 145 | } 146 | 147 | countKeys := data.CountByKey() 148 | fmt.Println(countKeys) 149 | for i := 1; i <= 6; i++ { 150 | if _, ok := countKeys[i]; !ok { 151 | t.Errorf("%d key not in CountByKey Map", i) 152 | } 153 | } 154 | } 155 | 156 | func TestSimpleReducer(t *testing.T) { 157 | setupEnv() 158 | c := NewContext("TestSimpleReducer") 159 | fmt.Printf("\n\n%s\n", c) 160 | defer c.Stop() 161 | d := []interface{}{1, 2, 4, 5, 7, 3, 3, 3, 1}[:] 162 | data := c.Data(d) 163 | 164 | sum := data.Reduce(func(x, y interface{}) interface{} { 165 | return x.(int) + y.(int) 166 | }) 167 | fmt.Println(sum) 168 | if sum != 29 { 169 | t.Error("Reduce function failed.") 170 | } 171 | 172 | countValues := data.CountByValue() 173 | fmt.Println(countValues) 174 | if len(countValues) <= 0 { 175 | t.Error("CountByValue data error.") 176 
| } 177 | 178 | data.Foreach(func(x interface{}) { 179 | fmt.Println("Happened to find", x.(int)) 180 | }) 181 | } 182 | 183 | func TestUnionRDD(t *testing.T) { 184 | setupEnv() 185 | c := NewContext("TestUnionRDD") 186 | fmt.Printf("\n\n%s\n", c) 187 | defer c.Stop() 188 | d1 := []interface{}{1, 2, 3, 4, 5}[:] 189 | d2 := []interface{}{6, 7, 8, 9, 10}[:] 190 | u := c.Data(d1).Union(c.Data(d2)).Collect() 191 | fmt.Println(u) 192 | if len(u) != len(d1)+len(d2) { 193 | t.Error("Union RDD failed.") 194 | } 195 | } 196 | 197 | func TestJoins(t *testing.T) { 198 | setupEnv() 199 | c := NewContext("TestJoins") 200 | fmt.Printf("\n\n%s\n", c) 201 | defer c.Stop() 202 | d1 := []interface{}{ 203 | &KeyValue{1, 10}, 204 | &KeyValue{2, 11}, 205 | &KeyValue{3, 12}, 206 | &KeyValue{5, 13}, 207 | &KeyValue{7, 15}, 208 | }[:] 209 | d2 := []interface{}{ 210 | &KeyValue{1, 100}, 211 | &KeyValue{2, 111}, 212 | &KeyValue{5, 131}, 213 | &KeyValue{6, 161}, 214 | }[:] 215 | 216 | data1 := c.Data(d1) 217 | data2 := c.Data(d2) 218 | 219 | join := data1.Join(data2).CollectAsMap() 220 | for key, value := range join { 221 | fmt.Println(key, value) 222 | } 223 | for _, key := range []int{1, 2, 5}[:] { 224 | if _, ok := join[key]; !ok { 225 | t.Errorf("%d key not in join map, that's wrong", key) 226 | } 227 | } 228 | for _, key := range []int{3, 7, 6}[:] { 229 | if _, ok := join[key]; ok { 230 | t.Errorf("%d key is in join map, that's wrong", key) 231 | } 232 | } 233 | 234 | leftJoin := data1.LeftOuterJoin(data2).CollectAsMap() 235 | for key, value := range leftJoin { 236 | fmt.Println(key, value) 237 | } 238 | for _, key := range []int{1, 2, 3, 5, 7}[:] { 239 | if _, ok := leftJoin[key]; !ok { 240 | t.Errorf("%d key not in left join map, that's wrong", key) 241 | } 242 | } 243 | 244 | rightJoin := data1.RightOuterJoin(data2).CollectAsMap() 245 | for key, value := range rightJoin { 246 | fmt.Println(key, value) 247 | } 248 | for _, key := range []int{1, 2, 5, 6}[:] { 249 | if _, ok := rightJoin[key]; !ok { 250 | t.Errorf("%d key not in right join map, that's wrong", key) 251 | } 252 | } 253 | } 254 | 255 | func TestCartesian(t *testing.T) { 256 | setupEnv() 257 | c := NewContext("TestCartesian") 258 | fmt.Printf("\n\n%s\n", c) 259 | defer c.Stop() 260 | d1 := []interface{}{1, 2, 3, 4, 5, 6}[:] 261 | d2 := []interface{}{"a", "b", "c", "d", "e"}[:] 262 | 263 | data1 := c.Data(d1) 264 | data2 := c.Data(d2) 265 | cart := data1.Cartesian(data2).Collect() 266 | fmt.Println(cart) 267 | if len(cart) != len(d1)*len(d2) { 268 | t.Error("Cartesian data error.") 269 | } 270 | } 271 | 272 | func TestSortByKey(t *testing.T) { 273 | setupEnv() 274 | c := NewContext("TestSortByKey") 275 | fmt.Printf("\n\n%s\n", c) 276 | defer c.Stop() 277 | d := []interface{}{ 278 | &KeyValue{1, "a"}, 279 | &KeyValue{4, "b"}, 280 | &KeyValue{6, "c"}, 281 | &KeyValue{3, "d"}, 282 | &KeyValue{7, "e"}, 283 | &KeyValue{8, "f"}, 284 | &KeyValue{1, "a"}, 285 | &KeyValue{4, "b"}, 286 | &KeyValue{6, "c"}, 287 | &KeyValue{3, "d"}, 288 | &KeyValue{7, "e"}, 289 | &KeyValue{8, "f"}, 290 | }[:] 291 | 292 | data := c.Data(d).SortByKey(func(x, y interface{}) bool { 293 | return x.(int) < y.(int) 294 | }, true).Collect() 295 | fmt.Println(data) 296 | sorter := NewParkSorter(data, func(x, y interface{}) bool { 297 | return x.(*KeyValue).Key.(int) < y.(*KeyValue).Key.(int) 298 | }) 299 | if !sort.IsSorted(sort.Reverse(sorter)) { 300 | t.Error("SortByKey failed, is not sorted.") 301 | } 302 | 303 | data = c.Data(d).SortByValue(func(x, y interface{}) bool { 304 | return 
x.(string) < y.(string) 305 | }, true).Collect() 306 | fmt.Println(data) 307 | sorter = NewParkSorter(data, func(x, y interface{}) bool { 308 | return x.(*KeyValue).Value.(string) < y.(*KeyValue).Value.(string) 309 | }) 310 | if !sort.IsSorted(sort.Reverse(sorter)) { 311 | t.Error("SortByValue failed, is not sorted.") 312 | } 313 | } 314 | 315 | func TestAccumulator(t *testing.T) { 316 | setupEnv() 317 | c := NewContext("TestAccumulator") 318 | fmt.Printf("\n\n%s\n", c) 319 | defer c.Stop() 320 | d := []interface{}{1, 2, 3, 5, 6, 7, 8}[:] 321 | accu := c.Accumulator(0) 322 | c.Data(d).Foreach(func(_ interface{}) { 323 | accu.Add(1) 324 | }) 325 | fmt.Println(accu.Value()) 326 | if accu.Value() != 7 { 327 | t.Error("Accumulator error") 328 | } 329 | } 330 | -------------------------------------------------------------------------------- /rddimpls.go: -------------------------------------------------------------------------------- 1 | package gopark 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "io" 7 | "log" 8 | "math" 9 | "math/rand" 10 | "os" 11 | "path/filepath" 12 | "sync" 13 | "time" 14 | ) 15 | 16 | ////////////////////////////////////////////////////// 17 | // Derived RDD operations implementation 18 | ////////////////////////////////////////////////////// 19 | type _DerivedRDD struct { 20 | _BaseRDD 21 | previous RDD 22 | } 23 | 24 | func (d *_DerivedRDD) init(prevRdd, prototype RDD) { 25 | d._BaseRDD.init(prevRdd.getContext(), prototype) 26 | d.previous = prevRdd 27 | d.length = prevRdd.len() 28 | d.splits = prevRdd.getSplits() 29 | } 30 | 31 | ////////////////////////////////////////////////////// 32 | // Mapped RDD Impl 33 | ////////////////////////////////////////////////////// 34 | type _MappedRDD struct { 35 | _DerivedRDD 36 | fn MapperFunc 37 | } 38 | 39 | func (m *_MappedRDD) compute(split Split) Yielder { 40 | yield := make(chan interface{}, 1) 41 | go func() { 42 | parklog("Computing <%s> on Split[%d]", m, split.getIndex()) 43 | for value := range m.previous.traverse(split) { 44 | yield <- m.fn(value) 45 | } 46 | close(yield) 47 | }() 48 | return yield 49 | } 50 | 51 | func (m *_MappedRDD) init(rdd RDD, f MapperFunc) { 52 | m._DerivedRDD.init(rdd, m) 53 | m.fn = f 54 | } 55 | 56 | func (m *_MappedRDD) String() string { 57 | return fmt.Sprintf("MappedRDD-%d <%s>", m.id, m._DerivedRDD.previous) 58 | } 59 | 60 | func newMappedRDD(rdd RDD, f MapperFunc) RDD { 61 | mRdd := &_MappedRDD{} 62 | mRdd.init(rdd, f) 63 | return mRdd 64 | } 65 | 66 | //////////////////////////////////////////////////////////////////////// 67 | // PartitionMappedRDD Impl 68 | //////////////////////////////////////////////////////////////////////// 69 | type _PartitionMappedRDD struct { 70 | _DerivedRDD 71 | fn PartitionMapperFunc 72 | } 73 | 74 | func (r *_PartitionMappedRDD) compute(split Split) Yielder { 75 | parklog("Computing <%s> on Split[%d]", r, split.getIndex()) 76 | return r.fn(r.previous.traverse(split)) 77 | } 78 | 79 | func (r *_PartitionMappedRDD) init(rdd RDD, f PartitionMapperFunc) { 80 | r._DerivedRDD.init(rdd, r) 81 | r.fn = f 82 | } 83 | 84 | func (r *_PartitionMappedRDD) String() string { 85 | return fmt.Sprintf("PartitionMappedRDD-%d <%s>", r.id, r._DerivedRDD.previous) 86 | } 87 | 88 | func newPartitionMappedRDD(rdd RDD, f PartitionMapperFunc) RDD { 89 | r := &_PartitionMappedRDD{} 90 | r.init(rdd, f) 91 | return r 92 | } 93 | 94 | //////////////////////////////////////////////////////////////////////// 95 | // FlatMappedRDD Impl 96 | 
//////////////////////////////////////////////////////////////////////// 97 | type _FlatMappedRDD struct { 98 | _DerivedRDD 99 | fn FlatMapperFunc 100 | } 101 | 102 | func (r *_FlatMappedRDD) compute(split Split) Yielder { 103 | yield := make(chan interface{}, 1) 104 | go func() { 105 | parklog("Computing <%s> on Split[%d]", r, split.getIndex()) 106 | for arg := range r.previous.traverse(split) { 107 | value := r.fn(arg) 108 | if value != nil && len(value) > 0 { 109 | for _, subValue := range value { 110 | yield <- subValue 111 | } 112 | } 113 | } 114 | close(yield) 115 | }() 116 | return yield 117 | } 118 | 119 | func (r *_FlatMappedRDD) init(rdd RDD, f FlatMapperFunc) { 120 | r._DerivedRDD.init(rdd, r) 121 | r.fn = f 122 | } 123 | 124 | func (r *_FlatMappedRDD) String() string { 125 | return fmt.Sprintf("FlatMappedRDD-%d <%s>", r.id, r._DerivedRDD.previous) 126 | } 127 | 128 | func newFlatMappedRDD(rdd RDD, f FlatMapperFunc) RDD { 129 | r := &_FlatMappedRDD{} 130 | r.init(rdd, f) 131 | return r 132 | } 133 | 134 | //////////////////////////////////////////////////////////////////////// 135 | // FilteredRDD Impl 136 | //////////////////////////////////////////////////////////////////////// 137 | type _FilteredRDD struct { 138 | _DerivedRDD 139 | fn FilterFunc 140 | } 141 | 142 | func (r *_FilteredRDD) compute(split Split) Yielder { 143 | yield := make(chan interface{}, 1) 144 | go func() { 145 | parklog("Computing <%s> on Split[%d]", r, split.getIndex()) 146 | for value := range r.previous.traverse(split) { 147 | if r.fn(value) { 148 | yield <- value 149 | } 150 | } 151 | close(yield) 152 | }() 153 | return yield 154 | } 155 | 156 | func (r *_FilteredRDD) init(rdd RDD, f FilterFunc) { 157 | r._DerivedRDD.init(rdd, r) 158 | r.fn = f 159 | } 160 | 161 | func (r *_FilteredRDD) String() string { 162 | return fmt.Sprintf("FilteredRDD-%d <%s>", r.id, r._DerivedRDD.previous) 163 | } 164 | 165 | func newFilteredRDD(rdd RDD, f FilterFunc) RDD { 166 | r := &_FilteredRDD{} 167 | r.init(rdd, f) 168 | return r 169 | } 170 | 171 | //////////////////////////////////////////////////////////////////////// 172 | // SampledRDD Impl 173 | //////////////////////////////////////////////////////////////////////// 174 | type _SampledRDD struct { 175 | _DerivedRDD 176 | fraction float64 177 | seed int64 178 | withReplacement bool 179 | } 180 | 181 | func (r *_SampledRDD) compute(split Split) Yielder { 182 | yield := make(chan interface{}, 1) 183 | go func() { 184 | parklog("Computing <%s> on Split[%d]", r, split.getIndex()) 185 | seed := r.seed + int64(split.getIndex()) 186 | rd := rand.New(rand.NewSource(seed)) 187 | if r.withReplacement { 188 | allData := make([]interface{}, 0) 189 | for value := range r.previous.traverse(split) { 190 | allData = append(allData, value) 191 | } 192 | dCount := len(allData) 193 | sampleSize := int(math.Ceil(float64(dCount) * r.fraction)) 194 | for i := 0; i < sampleSize; i++ { 195 | yield <- allData[rd.Intn(dCount)] 196 | } 197 | } else { 198 | for value := range r.previous.traverse(split) { 199 | if rd.Float64() <= r.fraction { 200 | yield <- value 201 | } 202 | } 203 | } 204 | close(yield) 205 | }() 206 | return yield 207 | } 208 | 209 | func (r *_SampledRDD) init(rdd RDD, fraction float64, seed int64, withReplacement bool) { 210 | r._DerivedRDD.init(rdd, r) 211 | r.fraction = fraction 212 | r.seed = seed 213 | r.withReplacement = withReplacement 214 | } 215 | 216 | func (r *_SampledRDD) String() string { 217 | return fmt.Sprintf("SampledRDD-%d <%s>", r.id, 
r._DerivedRDD.previous) 218 | } 219 | 220 | func newSampledRDD(rdd RDD, fraction float64, seed int64, withReplacement bool) RDD { 221 | r := &_SampledRDD{} 222 | r.init(rdd, fraction, seed, withReplacement) 223 | return r 224 | } 225 | 226 | //////////////////////////////////////////////////////////////////////// 227 | // TextFileRDD Impl 228 | //////////////////////////////////////////////////////////////////////// 229 | const DEFAULT_FILE_SPLIT_SIZE = 64 * 1024 * 1024 // 64MB Split Size 230 | 231 | type _PartialSplit struct { 232 | index int 233 | begin int64 234 | end int64 235 | } 236 | 237 | func (s *_PartialSplit) getIndex() int { 238 | return s.index 239 | } 240 | 241 | type _TextFileRDD struct { 242 | _BaseRDD 243 | path string 244 | size int64 245 | splitSize int64 246 | } 247 | 248 | func (t *_TextFileRDD) compute(split Split) Yielder { 249 | yield := make(chan interface{}, 100) 250 | go func() { 251 | parklog("Computing <%s> on Split[%d]", t, split.getIndex()) 252 | defer close(yield) 253 | 254 | f, err := os.Open(t.path) 255 | if err != nil { 256 | panic(err) 257 | } 258 | defer f.Close() 259 | 260 | pSplit := split.(*_PartialSplit) 261 | start := pSplit.begin 262 | end := pSplit.end 263 | if start > 0 { 264 | start, err = seekToNewLine(f, start-1) 265 | if err == io.EOF { 266 | return 267 | } 268 | if err != nil { 269 | panic(err) 270 | } 271 | } 272 | 273 | if start >= end { 274 | return 275 | } 276 | r := bufio.NewReader(f) 277 | line, err := readLine(r) 278 | for err == nil { 279 | yield <- line 280 | start += int64(len(line)) + 1 // here would be dargon 281 | if start >= end { 282 | break 283 | } 284 | line, err = readLine(r) 285 | } 286 | if err != nil && err != io.EOF { 287 | panic(err) 288 | } 289 | }() 290 | return yield 291 | } 292 | 293 | func (t *_TextFileRDD) init(ctx *Context, path string, numSplits int) { 294 | t._BaseRDD.init(ctx, t) 295 | t.path = path 296 | 297 | fi, err := os.Stat(path) 298 | if err != nil { 299 | panic(err) 300 | } 301 | t.size = fi.Size() 302 | t.splitSize = DEFAULT_FILE_SPLIT_SIZE 303 | if numSplits > 0 { 304 | t.splitSize = t.size / int64(numSplits) 305 | } 306 | if t.size > 0 { 307 | t.length = int(t.size / t.splitSize) 308 | } else { 309 | t.length = 0 310 | } 311 | t.splits = make([]Split, t.length) 312 | for i := 0; i < t.length; i++ { 313 | end := int64(i+1) * t.splitSize 314 | if i == t.length-1 { 315 | end = t.size 316 | } 317 | t.splits[i] = &_PartialSplit{ 318 | index: i, 319 | begin: int64(i) * t.splitSize, 320 | end: end, 321 | } 322 | } 323 | } 324 | 325 | func (t *_TextFileRDD) String() string { 326 | return fmt.Sprintf("TextFileRDD-%d <%s %d>", t.id, t.path, t.len()) 327 | } 328 | 329 | func newTextFileRDD(ctx *Context, path string) RDD { 330 | textRdd := &_TextFileRDD{} 331 | textRdd.init(ctx, path, env.parallel) 332 | return textRdd 333 | } 334 | 335 | //////////////////////////////////////////////////////////////////////// 336 | // ShuffledRDD Impl 337 | //////////////////////////////////////////////////////////////////////// 338 | const SHUFFLE_MAGIC_ID = 9999 339 | 340 | type _ShuffledSplit struct { 341 | index int 342 | } 343 | 344 | func (s *_ShuffledSplit) getIndex() int { 345 | return s.index 346 | } 347 | 348 | type _ShuffledRDD struct { 349 | _BaseRDD 350 | shuffleId int64 351 | parent RDD 352 | aggregator *_Aggregator 353 | partitioner Partitioner 354 | numParts int 355 | shuffleJob sync.Once 356 | } 357 | 358 | func (r *_ShuffledRDD) compute(split Split) Yielder { 359 | parklog("Computing <%s> on Split[%d]", r, 
split.getIndex()) 360 | r.shuffleJob.Do(func() { 361 | r.runShuffleJob() 362 | }) 363 | 364 | yield := make(chan interface{}, 1) 365 | go func() { 366 | outputId := split.getIndex() 367 | combinePath := env.getLocalShufflePath(r.shuffleId, SHUFFLE_MAGIC_ID, outputId) 368 | input, err := os.Open(combinePath) 369 | if err != nil { 370 | log.Panicf("Error when open/decode shuffle-%d split[%d] from file %s, %v", r.shuffleId, outputId, combinePath, err) 371 | } 372 | defer input.Close() 373 | parklog("Decoding shuffle-%d[GOB] from local file %s", r.shuffleId, combinePath) 374 | 375 | var buffer []interface{} 376 | encoder := NewBufferEncoder(ENCODE_BUFFER_SIZE) 377 | for err == nil { 378 | buffer, err = encoder.Decode(input) 379 | if err != nil { 380 | break 381 | } 382 | for _, value := range buffer { 383 | yield <- value 384 | } 385 | } 386 | if err != nil && err != io.EOF { 387 | log.Panicf("Error when open/decode shuffle split[%d] from file %s, %v", outputId, combinePath, err) 388 | } 389 | close(yield) 390 | }() 391 | return yield 392 | } 393 | 394 | func (r *_ShuffledRDD) runShuffleJob() { 395 | r.computeShuffleStage() 396 | 397 | splits := r.getSplits() 398 | var wg sync.WaitGroup 399 | for _, split := range splits { 400 | wg.Add(1) 401 | go func(s Split) { 402 | r.computeCombineStage(s) 403 | wg.Done() 404 | }(split) 405 | } 406 | wg.Wait() 407 | } 408 | 409 | func (r *_ShuffledRDD) computeShuffleStage() { 410 | start := time.Now() 411 | parklog("Computing shuffle stage for <%s>", r) 412 | iters := r.ctx.runRoutine(r.parent, nil, func(yield Yielder, partition int) interface{} { 413 | numSplits := r.partitioner.numPartitions() 414 | buckets := make([]map[interface{}][]interface{}, numSplits) 415 | for i := 0; i < numSplits; i++ { 416 | buckets[i] = make(map[interface{}][]interface{}) 417 | } 418 | for value := range yield { 419 | keyValue := value.(*KeyValue) 420 | bucketId := r.partitioner.getPartition(keyValue.Key) 421 | bucket := buckets[bucketId] 422 | if collection, ok := bucket[keyValue.Key]; ok { 423 | bucket[keyValue.Key] = r.aggregator.valueMerger(collection, keyValue.Value) 424 | } else { 425 | bucket[keyValue.Key] = r.aggregator.combinerCreator(keyValue.Value) 426 | } 427 | } 428 | var wg sync.WaitGroup 429 | for i := 0; i < numSplits; i++ { 430 | wg.Add(1) 431 | go func(inputId int) { 432 | pathName := env.getLocalShufflePath(r.shuffleId, partition, inputId) 433 | output, err := os.Create(pathName) 434 | if err != nil { 435 | log.Panicf("Error when shuffling data into file %s, %v", pathName, err) 436 | } 437 | defer output.Close() 438 | encoder := NewBufferEncoder(ENCODE_BUFFER_SIZE) 439 | for key, value := range buckets[inputId] { 440 | obj := &KeyValue{key, value} 441 | err = encoder.Encode(output, obj) 442 | if err != nil { 443 | log.Panicf("Error when shuffling data into file %s, %v", pathName, err) 444 | } 445 | } 446 | err = encoder.Flush(output) 447 | if err != nil { 448 | log.Panicf("Error when shuffling data into file %s, %v", pathName, err) 449 | } 450 | parklog("Encoding shuffle-%d[GOB] into local file %s", r.shuffleId, pathName) 451 | wg.Done() 452 | }(i) 453 | } 454 | wg.Wait() 455 | return struct{}{} 456 | }) 457 | for _, iter := range iters { 458 | for _ = range iter { 459 | // we need to dump the yielders that returns to finish up the routine 460 | } 461 | } 462 | parklog("Shuffling Stage DONE for <%s>, duration=%s", r, time.Since(start)) 463 | } 464 | 465 | func (r *_ShuffledRDD) computeCombineStage(split Split) { 466 | start := time.Now() 467 | 
parklog("Computing combine stage for <%s> on split[%d]", r, split.getIndex()) 468 | 469 | outputId := split.getIndex() 470 | numParentSplit := r.parent.len() 471 | combined := make(map[interface{}][]interface{}) 472 | var lock sync.Mutex 473 | var wg sync.WaitGroup 474 | for inputId := 0; inputId < numParentSplit; inputId++ { 475 | wg.Add(1) 476 | go func(inputId int) { 477 | pathName := env.getLocalShufflePath(r.shuffleId, inputId, outputId) 478 | parklog("Merging shuffle-%d[GOB] from local file %s", r.shuffleId, pathName) 479 | input, err := os.Open(pathName) 480 | if err != nil { 481 | log.Panicf("Error when open/decode shuffle-%d from local file %s, %v", r.shuffleId, pathName, err) 482 | } 483 | defer input.Close() 484 | 485 | var buffer []interface{} 486 | encoder := NewBufferEncoder(ENCODE_BUFFER_SIZE) 487 | for err == nil { 488 | buffer, err = encoder.Decode(input) 489 | if err != nil { 490 | break 491 | } 492 | for _, value := range buffer { 493 | kv := value.(*KeyValue) 494 | lock.Lock() 495 | if collection, ok := combined[kv.Key]; ok { 496 | combined[kv.Key] = r.aggregator.combinerMerger(collection, kv.Value.([]interface{})) 497 | } else { 498 | combined[kv.Key] = kv.Value.([]interface{}) 499 | } 500 | lock.Unlock() 501 | } 502 | } 503 | wg.Done() 504 | }(inputId) 505 | } 506 | wg.Wait() 507 | 508 | // dump to the merged file 509 | combinePath := env.getLocalShufflePath(r.shuffleId, SHUFFLE_MAGIC_ID, outputId) 510 | output, err := os.Create(combinePath) 511 | if err != nil { 512 | log.Panicf("Error when combine/encode shuffle data into local file %s, %v", combinePath, err) 513 | } 514 | defer output.Close() 515 | encoder := NewBufferEncoder(ENCODE_BUFFER_SIZE) 516 | for key, value := range combined { 517 | kv := &KeyValue{key, value} 518 | err = encoder.Encode(output, kv) 519 | if err != nil { 520 | log.Panicf("Error when combine/encode shuffle data into local file %s, %v", combinePath, err) 521 | } 522 | } 523 | err = encoder.Flush(output) 524 | if err != nil { 525 | log.Panicf("Error when combine/encode shuffle data into local file %s, %v", combinePath, err) 526 | } 527 | parklog("Merging/Combining shuffle data into local file %s, duration=%s", combinePath, time.Since(start)) 528 | } 529 | 530 | func (r *_ShuffledRDD) init(rdd RDD, aggregator *_Aggregator, partitioner Partitioner) { 531 | r._BaseRDD.init(rdd.getContext(), r) 532 | r.shuffleId = r._BaseRDD.newShuffleId() 533 | r.parent = rdd 534 | r.numParts = rdd.len() 535 | r.aggregator = aggregator 536 | r.partitioner = partitioner 537 | r.length = partitioner.numPartitions() 538 | 539 | r.splits = make([]Split, r.length) 540 | for i := 0; i < r.length; i++ { 541 | r.splits[i] = &_ShuffledSplit{index: i} 542 | } 543 | } 544 | 545 | func (r *_ShuffledRDD) String() string { 546 | return fmt.Sprintf("ShuffledRDD-%d <%s %d>", r.id, r.parent, r.length) 547 | } 548 | 549 | func newShuffledRDD(rdd RDD, aggregator *_Aggregator, partitioner Partitioner) RDD { 550 | r := &_ShuffledRDD{} 551 | r.init(rdd, aggregator, partitioner) 552 | return r 553 | } 554 | 555 | //////////////////////////////////////////////////////////////////////// 556 | // OutputTextFileRDD Impl 557 | //////////////////////////////////////////////////////////////////////// 558 | type _OutputTextFileRDD struct { 559 | _DerivedRDD 560 | pathname string 561 | } 562 | 563 | func (r *_OutputTextFileRDD) compute(split Split) Yielder { 564 | yield := make(chan interface{}, 1) 565 | go func() { 566 | parklog("Saving <%s> on Split[%d]", r, split.getIndex()) 567 | pathName := filepath.Join(r.pathname, fmt.Sprintf("%05d", split.getIndex()))
568 | outputFile, err := os.Create(pathName) 569 | if err != nil { 570 | panic(err) 571 | } 572 | defer outputFile.Close() 573 | 574 | write := func(v string) { 575 | if _, err := outputFile.Write([]byte(v)); err != nil { 576 | panic(err) 577 | } 578 | } 579 | for value := range r.previous.traverse(split) { 580 | sValue := fmt.Sprintln(value) 581 | write(sValue) 582 | } 583 | yield <- pathName 584 | close(yield) 585 | }() 586 | return yield 587 | } 588 | 589 | func (r *_OutputTextFileRDD) init(rdd RDD, pathname string) { 590 | absPathname, err := filepath.Abs(pathname) 591 | if err != nil { 592 | panic(err) 593 | } 594 | if fStat, err := os.Stat(absPathname); os.IsNotExist(err) { 595 | os.Mkdir(absPathname, os.ModePerm) 596 | } else { 597 | if !fStat.IsDir() { 598 | log.Panicf("%s must be a directory in file system.", pathname) 599 | } 600 | // delete all the files under the directory 601 | err2 := filepath.Walk(absPathname, func(path string, info os.FileInfo, err error) error { 602 | if !info.IsDir() { 603 | if e := os.Remove(path); e != nil { 604 | return e 605 | } 606 | } 607 | return nil 608 | }) 609 | if err2 != nil { 610 | panic(err2) 611 | } 612 | } 613 | 614 | r._DerivedRDD.init(rdd, r) 615 | r.pathname = absPathname 616 | } 617 | 618 | func (r *_OutputTextFileRDD) String() string { 619 | return fmt.Sprintf("OutputTextFileRDD-%d <%s>", r.id, r.pathname) 620 | } 621 | 622 | func newOutputTextFileRDD(rdd RDD, pathname string) RDD { 623 | r := &_OutputTextFileRDD{} 624 | r.init(rdd, pathname) 625 | return r 626 | } 627 | 628 | //////////////////////////////////////////////////////////////////////// 629 | // Parallelize DataRDD Impl 630 | //////////////////////////////////////////////////////////////////////// 631 | type _DataSplit struct { 632 | index int 633 | values []interface{} 634 | } 635 | 636 | func (s *_DataSplit) getIndex() int { 637 | return s.index 638 | } 639 | 640 | type _DataRDD struct { 641 | _BaseRDD 642 | size int 643 | } 644 | 645 | func (r *_DataRDD) compute(split Split) Yielder { 646 | yield := make(chan interface{}, 1) 647 | go func() { 648 | parklog("Computing <%s> on Split[%d]", r, split.getIndex()) 649 | dSplit := split.(*_DataSplit) 650 | for _, value := range dSplit.values { 651 | yield <- value 652 | } 653 | close(yield) 654 | }() 655 | return yield 656 | } 657 | 658 | func (r *_DataRDD) init(ctx *Context, data []interface{}, numPartitions int) { 659 | r._BaseRDD.init(ctx, r) 660 | r.size = len(data) 661 | if r.size <= 0 { 662 | log.Panicf("Please don't provide an empty data array.") 663 | } 664 | if numPartitions <= 0 { 665 | numPartitions = 1 666 | } 667 | if r.size < numPartitions { 668 | numPartitions = r.size 669 | } 670 | splitSize := r.size / numPartitions 671 | r.length = numPartitions 672 | r.splits = make([]Split, numPartitions) 673 | for i := 0; i < numPartitions; i++ { 674 | end := splitSize*i + splitSize 675 | if i == numPartitions-1 { 676 | end = r.size 677 | } 678 | r.splits[i] = &_DataSplit{ 679 | index: i, 680 | values: data[splitSize*i : end], 681 | } 682 | } 683 | } 684 | 685 | func (r *_DataRDD) String() string { 686 | return fmt.Sprintf("DataRDD-%d <%d>", r.id, r.size) 687 | } 688 | 689 | func newDataRDD(ctx *Context, data []interface{}) RDD { 690 | return newDataRDD_N(ctx, data, env.parallel) 691 | } 692 | 693 | func newDataRDD_N(ctx *Context, data []interface{}, numPartitions int) RDD { 694 | r := &_DataRDD{} 695 | r.init(ctx, data, numPartitions) 696 | return r 697 | } 698 | 
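// Illustrative sketch (added; not part of the original source): how the RDDs in
// this file chain together. Assumes a Context c obtained from NewContext; the
// names below all come from the constructors above.
//
//	data := newDataRDD_N(c, []interface{}{1, 2, 3, 4}, 2) // two _DataSplits
//	doubled := data.Map(func(x interface{}) interface{} {
//	    return x.(int) * 2 // lazily wrapped by _MappedRDD
//	})
//	fmt.Println(doubled.Collect()) // Collect drives compute() on every split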
699 | //////////////////////////////////////////////////////////////////////// 700 | // UnionRDD impl 701 | //////////////////////////////////////////////////////////////////////// 702 | type _UnionSplit struct { 703 | index int 704 | rdd RDD 705 | split Split 706 | } 707 | 708 | func (s *_UnionSplit) getIndex() int { 709 | return s.index 710 | } 711 | 712 | type _UnionRDD struct { 713 | _BaseRDD 714 | size int 715 | rdds []RDD 716 | } 717 | 718 | func (r *_UnionRDD) compute(split Split) Yielder { 719 | yield := make(chan interface{}, 1) 720 | go func() { 721 | parklog("Computing <%s> on Split[%d]", r, split.getIndex()) 722 | unionSplit := split.(*_UnionSplit) 723 | for value := range unionSplit.rdd.traverse(unionSplit.split) { 724 | yield <- value 725 | } 726 | close(yield) 727 | }() 728 | return yield 729 | } 730 | 731 | func (r *_UnionRDD) init(ctx *Context, rdds []RDD) { 732 | r._BaseRDD.init(ctx, r) 733 | r.size = len(rdds) 734 | r.rdds = rdds 735 | for _, rdd := range rdds { 736 | r.length += rdd.len() 737 | } 738 | r.splits = make([]Split, r.length) 739 | index := 0 740 | for _, rdd := range rdds { 741 | for _, split := range rdd.getSplits() { 742 | r.splits[index] = &_UnionSplit{ 743 | index: index, 744 | rdd: rdd, 745 | split: split, 746 | } 747 | index++ 748 | } 749 | } 750 | } 751 | 752 | func (r *_UnionRDD) String() string { 753 | return fmt.Sprintf("UnionRDD-%d <%d %s ...>", r.id, r.size, r.rdds[0]) 754 | } 755 | 756 | func newUnionRDD(ctx *Context, rdds []RDD) RDD { 757 | r := &_UnionRDD{} 758 | r.init(ctx, rdds) 759 | return r 760 | } 761 | 762 | //////////////////////////////////////////////////////////////////////// 763 | // CoGroupedRDD impl 764 | //////////////////////////////////////////////////////////////////////// 765 | type _CoGroupedRDD struct { 766 | _BaseRDD 767 | size int 768 | rdds []RDD 769 | } 770 | 771 | func (r *_CoGroupedRDD) compute(split Split) Yielder { 772 | yield := make(chan interface{}, 1) 773 | go func() { 774 | parklog("Computing <%s> on Split[%d]", r, split.getIndex()) 775 | keyGroups := make(map[interface{}][][]interface{}) 776 | for groupIndex, rdd := range r.rdds { 777 | for value := range rdd.traverse(split) { 778 | keyValue := value.(*KeyValue) 779 | if _, ok := keyGroups[keyValue.Key]; !ok { 780 | keyGroups[keyValue.Key] = make([][]interface{}, r.size) 781 | } 782 | keyGroups[keyValue.Key][groupIndex] = keyValue.Value.([]interface{}) 783 | } 784 | } 785 | for key, groups := range keyGroups { 786 | yield <- &KeyGroups{ 787 | Key: key, 788 | Groups: groups, 789 | } 790 | } 791 | close(yield) 792 | }() 793 | return yield 794 | } 795 | 796 | func (r *_CoGroupedRDD) init(ctx *Context, rdds []RDD, numPartitions int) { 797 | r._BaseRDD.init(ctx, r) 798 | r.size = len(rdds) 799 | r.rdds = make([]RDD, len(rdds)) 800 | for i, rdd := range rdds { 801 | r.rdds[i] = rdd.GroupByKey_N(numPartitions) 802 | } 803 | r.length = numPartitions 804 | r.splits = make([]Split, r.length) 805 | for i := 0; i < numPartitions; i++ { 806 | r.splits[i] = &_ShuffledSplit{index: i} 807 | } 808 | } 809 | 810 | func (r *_CoGroupedRDD) String() string { 811 | return fmt.Sprintf("CoGroupedRDD-%d <%d %s %s...>", r.id, r.size, r.rdds[0], r.rdds[1]) 812 | } 813 | 814 | func newCoGroupedRDD(ctx *Context, rdds []RDD, numPartitions int) RDD { 815 | r := &_CoGroupedRDD{} 816 | r.init(ctx, rdds, numPartitions) 817 | return r 818 | } 819 | 820 | //////////////////////////////////////////////////////////////////////// 821 | // CartesianRDD Impl 822 | 
//////////////////////////////////////////////////////////////////////// 823 | type _CartesianSplit struct { 824 | index int 825 | split1 Split 826 | split2 Split 827 | } 828 | 829 | func (s *_CartesianSplit) getIndex() int { 830 | return s.index 831 | } 832 | 833 | type _CartesianRDD struct { 834 | _BaseRDD 835 | rdd1 RDD 836 | rdd2 RDD 837 | } 838 | 839 | func (r *_CartesianRDD) compute(split Split) Yielder { 840 | yield := make(chan interface{}, 1) 841 | go func() { 842 | parklog("Computing <%s> on Split[%d]", r, split.getIndex()) 843 | cSplit := split.(*_CartesianSplit) 844 | rightYields := make([]interface{}, 0) 845 | for i := range r.rdd1.traverse(cSplit.split1) { 846 | if len(rightYields) == 0 { 847 | for j := range r.rdd2.traverse(cSplit.split2) { 848 | yield <- []interface{}{i, j}[:] 849 | rightYields = append(rightYields, j) 850 | } 851 | } else { 852 | for _, j := range rightYields { 853 | yield <- []interface{}{i, j}[:] 854 | } 855 | } 856 | } 857 | close(yield) 858 | }() 859 | return yield 860 | } 861 | 862 | func (r *_CartesianRDD) init(ctx *Context, rdd1, rdd2 RDD) { 863 | r._BaseRDD.init(ctx, r) 864 | r.rdd1 = rdd1 865 | r.rdd2 = rdd2 866 | r.length = rdd1.len() * rdd2.len() 867 | r.splits = make([]Split, r.length) 868 | n := rdd2.len() 869 | for i := 0; i < rdd1.len(); i++ { 870 | for j := 0; j < rdd2.len(); j++ { 871 | r.splits[i*n+j] = &_CartesianSplit{ 872 | index: i*n + j, 873 | split1: rdd1.getSplit(i), 874 | split2: rdd2.getSplit(j), 875 | } 876 | } 877 | } 878 | } 879 | 880 | func (r *_CartesianRDD) String() string { 881 | return fmt.Sprintf("CartesianRDD-%d <%s %s>", r.id, r.rdd1, r.rdd2) 882 | } 883 | 884 | func newCartesianRDD(ctx *Context, rdd1, rdd2 RDD) RDD { 885 | r := &_CartesianRDD{} 886 | r.init(ctx, rdd1, rdd2) 887 | return r 888 | } 889 | -------------------------------------------------------------------------------- /schedule.go: -------------------------------------------------------------------------------- 1 | package gopark 2 | 3 | import () 4 | 5 | /* 6 | * A really simple recursive scheduler: it just calls the registered 7 | * function along the flow of the RDD chain. 8 | * You may want to reconsider this for anything beyond local execution.
9 | */ 10 | 11 | ////////////////////////////////////////////////////// 12 | // DAGScheduler base impl 13 | ////////////////////////////////////////////////////// 14 | type Scheduler interface { 15 | start() 16 | clear() 17 | stop() 18 | runRoutine(rdd RDD, partitions []int, rn ReducerFn) []Yielder 19 | } 20 | 21 | type _DAGScheduler struct { 22 | } 23 | 24 | func (d *_DAGScheduler) init() { 25 | } 26 | 27 | func (d *_DAGScheduler) runRoutine(s Scheduler, rdd RDD, partitions []int, rn ReducerFn) []Yielder { 28 | numOutputParts := len(partitions) 29 | yields := make([]Yielder, numOutputParts) 30 | for i := 0; i < numOutputParts; i++ { 31 | split := rdd.getSplit(partitions[i]) 32 | yields[i] = make(chan interface{}, 1) 33 | go func(yield Yielder, partition int) { 34 | yield <- rn(rdd.traverse(split), partition) 35 | close(yield) 36 | }(yields[i], i) 37 | } 38 | return yields 39 | } 40 | 41 | ////////////////////////////////////////////////////// 42 | // LocalScheduler impl 43 | ////////////////////////////////////////////////////// 44 | type _LocalScheduler struct { 45 | _DAGScheduler 46 | } 47 | 48 | func (s *_LocalScheduler) init() { 49 | s._DAGScheduler.init() 50 | } 51 | 52 | func (s *_LocalScheduler) start() {} 53 | func (s *_LocalScheduler) stop() {} 54 | func (s *_LocalScheduler) clear() {} 55 | 56 | func (s *_LocalScheduler) runRoutine(rdd RDD, partitions []int, rn ReducerFn) []Yielder { 57 | return s._DAGScheduler.runRoutine(s, rdd, partitions, rn) 58 | } 59 | 60 | func newLocalScheduler() Scheduler { 61 | local := &_LocalScheduler{} 62 | local.init() 63 | return local 64 | } 65 | -------------------------------------------------------------------------------- /utils.go: -------------------------------------------------------------------------------- 1 | package gopark 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "encoding/binary" 7 | "encoding/gob" 8 | "fmt" 9 | "hash/fnv" 10 | "log" 11 | "os" 12 | "strconv" 13 | "sync/atomic" 14 | ) 15 | 16 | func Range(start, end int) []interface{} { 17 | if start > end { 18 | log.Panicf("Range start cannot be larger than end, [%d, %d)", start, end) 19 | } 20 | r := make([]interface{}, end-start) 21 | for i := start; i < end; i++ { 22 | r[i-start] = i 23 | } 24 | return r 25 | } 26 | 27 | type AtomicInt int64 28 | 29 | func (i *AtomicInt) Add(n int64) { 30 | atomic.AddInt64((*int64)(i), n) 31 | } 32 | 33 | func (i *AtomicInt) Get() int64 { 34 | return atomic.LoadInt64((*int64)(i)) 35 | } 36 | 37 | func (i *AtomicInt) String() string { 38 | return strconv.FormatInt(i.Get(), 10) 39 | } 40 | 41 | func MaxInt64(a, b int64) int64 { 42 | if a > b { 43 | return a 44 | } 45 | return b 46 | } 47 | 48 | func MinInt64(a, b int64) int64 { 49 | if a > b { 50 | return b 51 | } 52 | return a 53 | } 54 | 55 | func readLine(r *bufio.Reader) (string, error) { 56 | var ( 57 | isPrefix bool = true 58 | err error = nil 59 | line, ln []byte 60 | ) 61 | for isPrefix && err == nil { 62 | line, isPrefix, err = r.ReadLine() 63 | ln = append(ln, line...) 
64 | } 65 | return string(ln), err 66 | } 67 | 68 | var _ = fmt.Println 69 | 70 | func seekToNewLine(f *os.File, start int64) (int64, error) { 71 | _, err := f.Seek(start, 0) 72 | if err != nil { 73 | return start, err 74 | } 75 | b := make([]byte, 1) 76 | _, err = f.Read(b) 77 | start++ 78 | if err != nil { 79 | return start, err 80 | } 81 | for b[0] != '\n' { 82 | _, err := f.Read(b) 83 | if err != nil { 84 | return start, err 85 | } 86 | start++ 87 | } 88 | return start, nil 89 | } 90 | 91 | func hashCode(value interface{}) (hashCode int64) { 92 | if value == nil { 93 | hashCode = 0 94 | return 95 | } 96 | hash := fnv.New32() 97 | buffer := new(bytes.Buffer) 98 | encoder := gob.NewEncoder(buffer) 99 | encoder.Encode(value) 100 | hash.Write(buffer.Bytes()) 101 | hashCode = 0 102 | hashBytes := hash.Sum(nil) 103 | for _, hashByte := range hashBytes { 104 | hashCode = hashCode*256 + int64(hashByte) 105 | } 106 | return 107 | } 108 | 109 | func HashCode(value interface{}) int64 { 110 | return hashCode(value) 111 | } 112 | 113 | // Encode related funcs 114 | 115 | const ENCODE_BUFFER_SIZE = 10000 116 | 117 | type EncodeBox struct { 118 | Object interface{} 119 | } 120 | 121 | func init() { 122 | gob.Register(new(EncodeBox)) 123 | gob.Register(make([]interface{}, 0)) 124 | } 125 | 126 | type BufferEncoder struct { 127 | size int 128 | buffer []interface{} 129 | watermark int 130 | } 131 | 132 | func NewBufferEncoder(size int) *BufferEncoder { 133 | encoder := &BufferEncoder{} 134 | encoder.size = size 135 | encoder.buffer = make([]interface{}, size) 136 | return encoder 137 | } 138 | 139 | func (e *BufferEncoder) Encode(f *os.File, value interface{}) error { 140 | if len(e.buffer) == e.watermark { 141 | err := encodeObjectIntoFile(f, e.buffer) 142 | if err != nil { 143 | return err 144 | } 145 | e.watermark = 0 146 | } 147 | e.buffer[e.watermark] = value 148 | e.watermark++ 149 | return nil 150 | } 151 | 152 | func (e *BufferEncoder) Flush(f *os.File) error { 153 | if e.watermark > 0 { 154 | err := encodeObjectIntoFile(f, e.buffer[:e.watermark]) 155 | if err != nil { 156 | return err 157 | } 158 | e.watermark = 0 159 | } 160 | return nil 161 | } 162 | 163 | func (e *BufferEncoder) Decode(f *os.File) ([]interface{}, error) { 164 | buffer, err := decodeObjectFromFile(f) 165 | if err != nil { 166 | return nil, err 167 | } 168 | return buffer.([]interface{}), nil 169 | } 170 | 171 | func encodeObjectIntoFile(f *os.File, value interface{}) error { 172 | box := EncodeBox{value} 173 | objBuffer := new(bytes.Buffer) 174 | err := gob.NewEncoder(objBuffer).Encode(box) 175 | if err != nil { 176 | return err 177 | } 178 | 179 | size := int32(len(objBuffer.Bytes())) 180 | sizeBuffer := new(bytes.Buffer) 181 | err = binary.Write(sizeBuffer, binary.LittleEndian, size) 182 | if err != nil { 183 | return err 184 | } 185 | 186 | if _, err = f.Write(sizeBuffer.Bytes()); err != nil { 187 | return err 188 | } 189 | _, err = f.Write(objBuffer.Bytes()) 190 | return err 191 | } 192 | 193 | func decodeObjectFromFile(f *os.File) (interface{}, error) { 194 | var ( 195 | size int32 196 | box EncodeBox 197 | ) 198 | err := binary.Read(f, binary.LittleEndian, &size) 199 | if err != nil { 200 | return nil, err 201 | } 202 | 203 | boxBytes := make([]byte, size) 204 | // a single Read may return fewer than size bytes, so keep reading until the box is complete 205 | for offset := 0; offset < int(size); { 206 | n, rerr := f.Read(boxBytes[offset:]) 207 | if rerr != nil { 208 | return nil, rerr 209 | } 210 | offset += n 211 | } 212 | 213 | buffer := bytes.NewBuffer(boxBytes) 214 | decoder := gob.NewDecoder(buffer) 215 | if err = decoder.Decode(&box); err == nil { 216 | return box.Object, nil 217 | } 218 | return nil, err 219 | } 220 | 
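// Usage sketch (added for illustration; not in the original file): the persist
// and shuffle paths above round-trip values through a BufferEncoder like this.
// The file path here is hypothetical.
//
//	f, _ := os.Create("/tmp/gopark-example.gob")
//	enc := NewBufferEncoder(ENCODE_BUFFER_SIZE)
//	enc.Encode(f, "hello") // buffered; written out in ENCODE_BUFFER_SIZE batches
//	enc.Flush(f)           // force out the final partial batch
//	f.Seek(0, 0)
//	batch, _ := enc.Decode(f) // one batch back as []interface{}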
-------------------------------------------------------------------------------- /vector.go: -------------------------------------------------------------------------------- 1 | package gopark 2 | 3 | import ( 4 | "encoding/gob" 5 | "fmt" 6 | "math" 7 | "math/rand" 8 | "strings" 9 | "time" 10 | ) 11 | 12 | type Vector []float64 13 | type IndexedVector map[interface{}]float64 14 | 15 | func init() { 16 | gob.Register(new(Vector)) 17 | gob.Register(new(IndexedVector)) 18 | } 19 | 20 | // Vector methods 21 | func NewZeroVector(size int) Vector { 22 | v := make(Vector, size) 23 | return v 24 | } 25 | 26 | func NewSameValueVector(size int, value float64) Vector { 27 | v := make(Vector, size) 28 | for i := range v { 29 | v[i] = value 30 | } 31 | return v 32 | } 33 | 34 | func NewRandomVector(size int) Vector { 35 | r := rand.New(rand.NewSource(time.Now().UnixNano())) 36 | v := make(Vector, size) 37 | for i := range v { 38 | v[i] = r.Float64() 39 | } 40 | return v 41 | } 42 | 43 | func NewRandomNormVector(size int, dev, mean float64) Vector { 44 | r := rand.New(rand.NewSource(time.Now().UnixNano())) 45 | v := make(Vector, size) 46 | for i := range v { 47 | v[i] = r.NormFloat64()*dev + mean 48 | } 49 | return v 50 | } 51 | 52 | func NewRandomLimitedVector(size int, minValue, maxValue float64) Vector { 53 | if maxValue <= minValue { 54 | panic("NewRandomLimitedVector cannot use maxValue <= minValue") 55 | } 56 | r := rand.New(rand.NewSource(time.Now().UnixNano())) 57 | v := make(Vector, size) 58 | for i := range v { 59 | v[i] = r.Float64()*(maxValue-minValue) + minValue 60 | } 61 | return v 62 | } 63 | 64 | func (v Vector) String() string { 65 | fields := make([]string, len(v)) 66 | for i := range v { 67 | fields[i] = fmt.Sprintf("%v", v[i]) 68 | } 69 | return strings.Join(fields, "\t") 70 | } 71 | 72 | func (v Vector) Plus(o Vector) Vector { 73 | return biVectorsOp(v, o, func(x, y float64) float64 { 74 | return x + y 75 | }) 76 | } 77 | 78 | func (v Vector) Minus(o Vector) Vector { 79 | return biVectorsOp(v, o, func(x, y float64) float64 { 80 | return x - y 81 | }) 82 | } 83 | 84 | func (v Vector) Multiply(m float64) Vector { 85 | result := make(Vector, len(v)) 86 | for i := range v { 87 | result[i] = v[i] * m 88 | } 89 | return result 90 | } 91 | 92 | func (v Vector) Divide(d float64) Vector { 93 | if d == 0 { 94 | panic(fmt.Errorf("Vector divided by zero.")) 95 | } 96 | result := make(Vector, len(v)) 97 | for i := range v { 98 | result[i] = v[i] / d 99 | } 100 | return result 101 | } 102 | 103 | func (v Vector) Sum() float64 { 104 | sum := 0.0 105 | for i := range v { 106 | sum += v[i] 107 | } 108 | return sum 109 | } 110 | 111 | func (v Vector) Dot(o Vector) float64 { 112 | if len(v) != len(o) { 113 | panic(fmt.Errorf("Two vectors of different length")) 114 | } 115 | sum := 0.0 116 | for i := range v { 117 | sum += v[i] * o[i] 118 | } 119 | return sum 120 | } 121 | 122 | func (v Vector) NormL2() float64 { 123 | return v.Dot(v) 124 | } 125 | 126 | func (v Vector) Magnitude() float64 { 127 | return math.Sqrt(v.Dot(v)) 128 | } 129 | 130 | func (v Vector) Cosine(o Vector) float64 { 131 | return v.Dot(o) / v.Magnitude() / o.Magnitude() 132 | } 133 | 134 | func (v Vector) EulaDistance(o Vector) float64 { 135 | if len(v) != len(o) { 136 | panic(fmt.Errorf("Two vectors of different length")) 137 | } 138 | dist := 0.0 139 | for i := range v { 140 | dist += (v[i] - o[i]) * (v[i] - o[i]) 141 | } 142 | return math.Sqrt(dist) 143 | } 144 | 145 | type operand func(x, y float64) float64 146 | 147 | func 
biVectorsOp(v1, v2 Vector, fn operand) Vector { 148 | if len(v1) != len(v2) { 149 | panic(fmt.Errorf("Two vectors of different length")) 150 | } 151 | result := make(Vector, len(v1)) 152 | for i := range v1 { 153 | result[i] = fn(v1[i], v2[i]) 154 | } 155 | return result 156 | } 157 | 158 | // IndexedVector methods 159 | func NewIndexedVector() IndexedVector { 160 | return make(IndexedVector) 161 | } 162 | 163 | func (v IndexedVector) String() string { 164 | fields := make([]string, len(v)) 165 | i := 0 166 | for key, value := range v { 167 | fields[i] = fmt.Sprintf("%v:%v", key, value) 168 | i++ 169 | } 170 | return strings.Join(fields, "\t") 171 | } 172 | 173 | func (v IndexedVector) RandomFeature(feature interface{}) { 174 | r := rand.New(rand.NewSource(time.Now().UnixNano())) 175 | v[feature] = r.Float64() 176 | } 177 | 178 | func (v IndexedVector) RandomLimitedFeature(feature interface{}, minValue, maxValue float64) { 179 | if maxValue <= minValue { 180 | panic("RandomLimitedFeature cannot use maxValue <= minValue") 181 | } 182 | r := rand.New(rand.NewSource(time.Now().UnixNano())) 183 | v[feature] = r.Float64()*(maxValue-minValue) + minValue 184 | } 185 | 186 | func (v IndexedVector) RandomNormFeature(feature interface{}, dev, mean float64) { 187 | r := rand.New(rand.NewSource(time.Now().UnixNano())) 188 | v[feature] = r.NormFloat64()*dev + mean 189 | } 190 | 191 | func (v IndexedVector) Copy() IndexedVector { 192 | result := make(IndexedVector) 193 | for key, value := range v { 194 | result[key] = value 195 | } 196 | return result 197 | } 198 | 199 | func (v IndexedVector) Keys() []interface{} { 200 | keys := make([]interface{}, len(v)) 201 | i := 0 202 | for key := range v { 203 | keys[i] = key 204 | i++ 205 | } 206 | return keys 207 | } 208 | 209 | func (v IndexedVector) Plus(o IndexedVector) IndexedVector { 210 | result := v.Copy() 211 | for key, value := range o { 212 | result[key] += value 213 | } 214 | return result 215 | } 216 | 217 | func (v IndexedVector) Minus(o IndexedVector) IndexedVector { 218 | result := v.Copy() 219 | for key, value := range o { 220 | result[key] -= value 221 | } 222 | return result 223 | } 224 | 225 | func (v IndexedVector) Multiply(m float64) IndexedVector { 226 | result := v.Copy() 227 | for key := range result { 228 | result[key] *= m 229 | } 230 | return result 231 | } 232 | 233 | func (v IndexedVector) Divide(d float64) IndexedVector { 234 | if d == 0 { 235 | panic("IndexedVector divide by zero") 236 | } 237 | result := v.Copy() 238 | for key := range result { 239 | result[key] /= d 240 | } 241 | return result 242 | } 243 | 244 | func (v IndexedVector) Sum() float64 { 245 | var sum float64 = 0 246 | for _, value := range v { 247 | sum += value 248 | } 249 | return sum 250 | } 251 | 252 | func (v IndexedVector) Dot(o IndexedVector) float64 { 253 | var sum float64 = 0 254 | vx, vy := v, o 255 | if len(v) > len(o) { 256 | vx, vy = o, v 257 | } 258 | for key, value := range vx { 259 | sum += value * vy[key] 260 | } 261 | return sum 262 | } 263 | 264 | func (v IndexedVector) NormL2() float64 { 265 | var sum float64 = 0 266 | for _, value := range v { 267 | sum += value * value 268 | } 269 | return sum 270 | } 271 | 272 | func (v IndexedVector) Magnitude() float64 { 273 | return math.Sqrt(v.NormL2()) 274 | } 275 | 276 | func (v IndexedVector) Cosine(o IndexedVector) float64 { 277 | return v.Dot(o) / v.Magnitude() / o.Magnitude() 278 | } 279 | 280 | func (v IndexedVector) EulaDistance(o IndexedVector) float64 { 281 | var dist float64 = 0 282 | for key, 
value := range v { 283 | dist += (value - o[key]) * (value - o[key]) 284 | } 285 | for key, value := range o { 286 | if _, ok := v[key]; !ok { 287 | dist += value * value 288 | } 289 | } 290 | return math.Sqrt(dist) 291 | } 292 | --------------------------------------------------------------------------------