├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── build.sbt
├── project
│   ├── build.properties
│   └── plugins.sbt
├── sbt
│   ├── sbt
│   └── sbt-launch-lib.bash
└── src
    ├── main
    │   └── scala
    │       └── edu
    │           └── berkeley
    │               └── cs
    │                   └── amplab
    │                       └── spark
    │                           └── indexedrdd
    │                               ├── IndexedRDD.scala
    │                               ├── IndexedRDDPartition.scala
    │                               ├── KeySerializer.scala
    │                               └── impl
    │                                   ├── LazyPartition.scala
    │                                   └── PARTPartition.scala
    └── test
        ├── resources
        │   └── log4j.properties
        └── scala
            └── edu
                └── berkeley
                    └── cs
                        └── amplab
                            └── spark
                                └── indexedrdd
                                    ├── IndexedRDDSuite.scala
                                    ├── KeySerializerSuite.scala
                                    ├── SharedSparkContext.scala
                                    └── impl
                                        └── IndexedRDDPartitionSuite.scala
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 | *.log
3 |
4 | sbt/sbt-launch*.jar
5 | .idea/
6 | .idea_modules/
7 | .cache
8 | .history
9 | .lib/
10 | dist/*
11 | target/
12 | lib_managed/
13 | src_managed/
14 | project/boot/
15 | project/plugins/project/
16 |
17 | .scala_dependencies
18 | .worksheet
19 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: scala
2 | scala:
3 | - 2.11.6
4 | - 2.10.6
5 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # IndexedRDD for Apache Spark
2 |
3 | An efficient updatable key-value store for [Apache Spark](http://spark.apache.org).
4 |
 5 | IndexedRDD extends `RDD[(K, V)]` by enforcing key uniqueness and pre-indexing the entries for efficient joins and point lookups, updates, and deletions. It is implemented by (1) hash-partitioning the entries by key, (2) maintaining a radix tree ([PART](https://github.com/ankurdave/part)) index within each partition, and (3) using this immutable, efficiently updatable data structure to apply modifications and deletions without rebuilding the index.
6 |
7 | ## Usage
8 |
9 | Add the dependency to your SBT project by adding the following to `build.sbt` (see the [Spark Packages listing](http://spark-packages.org/package/amplab/spark-indexedrdd) for spark-submit and Maven instructions):
10 |
11 | ```scala
12 | resolvers += "Spark Packages Repo" at "http://dl.bintray.com/spark-packages/maven"
13 |
14 | libraryDependencies += "amplab" % "spark-indexedrdd" % "0.3"
15 | ```
16 |
17 | Then use IndexedRDD as follows:
18 |
19 | ```scala
20 | import edu.berkeley.cs.amplab.spark.indexedrdd.IndexedRDD
21 | import edu.berkeley.cs.amplab.spark.indexedrdd.IndexedRDD._
22 |
23 | // Create an RDD of key-value pairs with Long keys.
24 | val rdd = sc.parallelize((1 to 1000000).map(x => (x.toLong, 0)))
25 | // Construct an IndexedRDD from the pairs, hash-partitioning and indexing
26 | // the entries.
27 | val indexed = IndexedRDD(rdd).cache()
28 |
29 | // Perform a point update.
30 | val indexed2 = indexed.put(1234L, 10873).cache()
31 | // Perform a point lookup. Note that the original IndexedRDD remains
32 | // unmodified.
33 | indexed2.get(1234L) // => Some(10873)
34 | indexed.get(1234L) // => Some(0)
35 |
36 | // Efficiently join derived IndexedRDD with original.
37 | val indexed3 = indexed.innerJoin(indexed2) { (id, a, b) => b }.filter(_._2 != 0)
38 | indexed3.collect // => Array((1234L, 10873))
39 |
40 | // Perform insertions and deletions.
41 | val indexed4 = indexed2.put(-100L, 111).delete(Array(998L, 999L)).cache()
42 | indexed2.get(-100L) // => None
43 | indexed4.get(-100L) // => Some(111)
44 | indexed2.get(999L) // => Some(0)
45 | indexed4.get(999L) // => None
46 | ```
47 |
--------------------------------------------------------------------------------
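
The README usage example above only shows `Long` keys. The `IndexedRDD` companion object (see `IndexedRDD.scala` below) also provides implicit `KeySerializer`s for `Int`, `Short`, `Char`, `String`, `BigInt`, `UUID`, and tuples of these, so other key types work the same way. A minimal sketch, assuming a live `SparkContext` named `sc`:

```scala
import edu.berkeley.cs.amplab.spark.indexedrdd.IndexedRDD
import edu.berkeley.cs.amplab.spark.indexedrdd.IndexedRDD._

// A String-keyed store; the implicit StringSerializer comes from IndexedRDD._
val counts = IndexedRDD(sc.parallelize(Seq(("spark", 1), ("rdd", 2)))).cache()

// multiput with a merge function combines old and new values instead of overwriting;
// keys that are not yet present are simply inserted.
val counts2 = counts.multiput(Map("spark" -> 10, "part" -> 1), (k, oldV, newV) => oldV + newV)
counts2.get("spark") // => Some(11)
counts2.get("part")  // => Some(1)
```
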
/build.sbt:
--------------------------------------------------------------------------------
1 | name := "spark-indexedrdd"
2 | version := "0.4.0"
3 | organization := "edu.berkeley.cs.amplab"
4 |
5 | scalaVersion := "2.11.8"
6 | crossScalaVersions := Seq("2.10.6", "2.11.6")
7 |
8 | spName := "amplab/spark-indexedrdd"
9 | sparkVersion := "2.1.0"
10 | sparkComponents += "core"
11 |
12 | resolvers += "Repo at github.com/ankurdave/maven-repo" at "https://raw.githubusercontent.com/ankurdave/maven-repo/master"
13 |
14 | libraryDependencies ++= Seq(
15 | "com.ankurdave" % "part_2.10" % "0.1", // artifact is not published for 2.11, but it only contains Java code anyway
16 | "org.scalatest" %% "scalatest" % "2.2.4" % "test",
17 | "org.scalacheck" %% "scalacheck" % "1.12.2" % "test"
18 | )
19 |
20 | publishMavenStyle := true
21 |
22 | licenses += "Apache-2.0" -> url("http://www.apache.org/licenses/LICENSE-2.0.html")
23 |
 24 | pomExtra :=
 25 |   <url>https://github.com/amplab/spark-indexedrdd</url>
 26 |   <scm>
 27 |     <url>git@github.com:amplab/spark-indexedrdd.git</url>
 28 |     <connection>scm:git:git@github.com:amplab/spark-indexedrdd.git</connection>
 29 |   </scm>
 30 |   <developers>
 31 |     <developer>
 32 |       <id>ankurdave</id>
 33 |       <name>Ankur Dave</name>
 34 |       <url>https://github.com/ankurdave</url>
 35 |     </developer>
 36 |   </developers>
 37 |
38 |
39 | // Run tests with more memory
40 | javaOptions in test += "-Xmx2G"
41 |
42 | fork in test := true
43 |
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | sbt.version=0.13.13
18 |
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | scalaVersion := "2.10.4"
2 |
3 | // resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns)
4 |
5 | // resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/"
6 |
7 | // resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/"
8 |
9 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0")
10 |
11 | resolvers += "Spark Package Main Repo" at "https://dl.bintray.com/spark-packages/maven"
12 |
13 | addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.2")
14 |
--------------------------------------------------------------------------------
/sbt/sbt:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | realpath () {
4 | (
5 | TARGET_FILE="$1"
6 |
7 | cd "$(dirname "$TARGET_FILE")"
8 | TARGET_FILE="$(basename "$TARGET_FILE")"
9 |
10 | COUNT=0
11 | while [ -L "$TARGET_FILE" -a $COUNT -lt 100 ]
12 | do
13 | TARGET_FILE="$(readlink "$TARGET_FILE")"
14 | cd $(dirname "$TARGET_FILE")
15 | TARGET_FILE="$(basename $TARGET_FILE)"
16 | COUNT=$(($COUNT + 1))
17 | done
18 |
19 | echo "$(pwd -P)/"$TARGET_FILE""
20 | )
21 | }
22 |
23 | . "$(dirname "$(realpath "$0")")"/sbt-launch-lib.bash
24 |
25 |
26 | declare -r noshare_opts="-Dsbt.global.base=project/.sbtboot -Dsbt.boot.directory=project/.boot -Dsbt.ivy.home=project/.ivy"
27 | declare -r sbt_opts_file=".sbtopts"
28 | declare -r etc_sbt_opts_file="/etc/sbt/sbtopts"
29 |
30 | usage() {
 31 | cat <<EOM
 39 | -sbt-dir <path> path to global settings/plugins directory (default: ~/.sbt)
 40 | -sbt-boot <path> path to shared boot directory (default: ~/.sbt/boot in 0.11 series)
 41 | -ivy <path> path to local Ivy repository (default: ~/.ivy2)
 42 | -mem <integer> set memory options (default: $sbt_mem, which is $(get_mem_opts $sbt_mem))
43 | -no-share use all local caches; no sharing
44 | -no-global uses global caches, but does not use global ~/.sbt directory.
 45 | -jvm-debug <port> Turn on JVM debugging, open at the given port.
46 | -batch Disable interactive mode
47 |
48 | # sbt version (default: from project/build.properties if present, else latest release)
 49 | -sbt-version <version> use the specified version of sbt
 50 | -sbt-jar <path> use the specified jar as the sbt launcher
51 | -sbt-rc use an RC version of sbt
52 | -sbt-snapshot use a snapshot version of sbt
53 |
54 | # java version (default: java from PATH, currently $(java -version 2>&1 | grep version))
 55 | -java-home <path> alternate JAVA_HOME
56 |
57 | # jvm options and output control
58 | JAVA_OPTS environment variable, if unset uses "$java_opts"
59 | SBT_OPTS environment variable, if unset uses "$default_sbt_opts"
60 | .sbtopts if this file exists in the current directory, it is
61 | prepended to the runner args
62 | /etc/sbt/sbtopts if this file exists, it is prepended to the runner args
63 | -Dkey=val pass -Dkey=val directly to the java runtime
64 | -J-X pass option -X directly to the java runtime
65 | (-J is stripped)
 66 | -S-X add -X to sbt's scalacOptions (-S is stripped)
67 | -PmavenProfiles Enable a maven profile for the build.
68 |
69 | In the case of duplicated or conflicting options, the order above
70 | shows precedence: JAVA_OPTS lowest, command line options highest.
71 | EOM
72 | }
73 |
74 | process_my_args () {
75 | while [[ $# -gt 0 ]]; do
76 | case "$1" in
77 | -no-colors) addJava "-Dsbt.log.noformat=true" && shift ;;
78 | -no-share) addJava "$noshare_opts" && shift ;;
79 | -no-global) addJava "-Dsbt.global.base=$(pwd)/project/.sbtboot" && shift ;;
80 | -sbt-boot) require_arg path "$1" "$2" && addJava "-Dsbt.boot.directory=$2" && shift 2 ;;
81 | -sbt-dir) require_arg path "$1" "$2" && addJava "-Dsbt.global.base=$2" && shift 2 ;;
82 | -debug-inc) addJava "-Dxsbt.inc.debug=true" && shift ;;
 83 | -batch) exec
--------------------------------------------------------------------------------
/sbt/sbt-launch-lib.bash:
--------------------------------------------------------------------------------
 30 | echo 1>&2 "$@"
31 | }
32 | vlog () {
33 | [[ $verbose || $debug ]] && echoerr "$@"
34 | }
35 | dlog () {
36 | [[ $debug ]] && echoerr "$@"
37 | }
38 |
39 | acquire_sbt_jar () {
 40 | SBT_VERSION=`awk -F "=" '/sbt\.version/ {print $2}' ./project/build.properties`
41 | URL1=http://dl.bintray.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
42 | URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
43 | JAR=sbt/sbt-launch-${SBT_VERSION}.jar
44 |
45 | sbt_jar=$JAR
46 |
47 | if [[ ! -f "$sbt_jar" ]]; then
48 | # Download sbt launch jar if it hasn't been downloaded yet
49 | if [ ! -f "${JAR}" ]; then
50 | # Download
51 | printf "Attempting to fetch sbt\n"
52 | JAR_DL="${JAR}.part"
53 | if hash curl 2>/dev/null; then
54 | (curl -L --silent ${URL1} > "${JAR_DL}" || curl -L --silent ${URL2} > "${JAR_DL}") && mv "${JAR_DL}" "${JAR}"
55 | elif hash wget 2>/dev/null; then
56 | (wget --quiet ${URL1} -O "${JAR_DL}" || wget --quiet ${URL2} -O "${JAR_DL}") && mv "${JAR_DL}" "${JAR}"
57 | else
58 | printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n"
59 | exit -1
60 | fi
61 | fi
62 | if [ ! -f "${JAR}" ]; then
63 | # We failed to download
64 | printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n"
65 | exit -1
66 | fi
67 | printf "Launching sbt from ${JAR}\n"
68 | fi
69 | }
70 |
71 | execRunner () {
72 | # print the arguments one to a line, quoting any containing spaces
73 | [[ $verbose || $debug ]] && echo "# Executing command line:" && {
74 | for arg; do
75 | if printf "%s\n" "$arg" | grep -q ' '; then
76 | printf "\"%s\"\n" "$arg"
77 | else
78 | printf "%s\n" "$arg"
79 | fi
80 | done
81 | echo ""
82 | }
83 |
84 | exec "$@"
85 | }
86 |
87 | addJava () {
88 | dlog "[addJava] arg = '$1'"
89 | java_args=( "${java_args[@]}" "$1" )
90 | }
91 |
92 | enableProfile () {
93 | dlog "[enableProfile] arg = '$1'"
94 | maven_profiles=( "${maven_profiles[@]}" "$1" )
95 | export SBT_MAVEN_PROFILES="${maven_profiles[@]}"
96 | }
97 |
98 | addSbt () {
99 | dlog "[addSbt] arg = '$1'"
100 | sbt_commands=( "${sbt_commands[@]}" "$1" )
101 | }
102 | addResidual () {
103 | dlog "[residual] arg = '$1'"
104 | residual_args=( "${residual_args[@]}" "$1" )
105 | }
106 | addDebugger () {
107 | addJava "-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=$1"
108 | }
109 |
110 | # a ham-fisted attempt to move some memory settings in concert
111 | # so they need not be dicked around with individually.
112 | get_mem_opts () {
113 | local mem=${1:-2048}
114 | local perm=$(( $mem / 4 ))
115 | (( $perm > 256 )) || perm=256
116 | (( $perm < 4096 )) || perm=4096
117 | local codecache=$(( $perm / 2 ))
118 |
119 | echo "-Xms${mem}m -Xmx${mem}m -XX:MaxPermSize=${perm}m -XX:ReservedCodeCacheSize=${codecache}m"
120 | }
121 |
122 | require_arg () {
123 | local type="$1"
124 | local opt="$2"
125 | local arg="$3"
126 | if [[ -z "$arg" ]] || [[ "${arg:0:1}" == "-" ]]; then
127 | die "$opt requires <$type> argument"
128 | fi
129 | }
130 |
131 | is_function_defined() {
132 | declare -f "$1" > /dev/null
133 | }
134 |
135 | process_args () {
136 | while [[ $# -gt 0 ]]; do
137 | case "$1" in
138 | -h|-help) usage; exit 1 ;;
139 | -v|-verbose) verbose=1 && shift ;;
140 | -d|-debug) debug=1 && shift ;;
141 |
142 | -ivy) require_arg path "$1" "$2" && addJava "-Dsbt.ivy.home=$2" && shift 2 ;;
143 | -mem) require_arg integer "$1" "$2" && sbt_mem="$2" && shift 2 ;;
144 | -jvm-debug) require_arg port "$1" "$2" && addDebugger $2 && shift 2 ;;
 145 | -batch) exec
--------------------------------------------------------------------------------
/src/main/scala/edu/berkeley/cs/amplab/spark/indexedrdd/IndexedRDD.scala:
--------------------------------------------------------------------------------
 80 | val ksByPartition = ks.groupBy(k => partitioner.get.getPartition(k))
81 | val partitions = ksByPartition.keys.toSeq
82 | // TODO: avoid sending all keys to all partitions by creating and zipping an RDD of keys
83 | val results: Array[Array[(K, V)]] = context.runJob(partitionsRDD,
84 | (context: TaskContext, partIter: Iterator[IndexedRDDPartition[K, V]]) => {
85 | if (partIter.hasNext && ksByPartition.contains(context.partitionId)) {
86 | val part = partIter.next()
87 | val ksForPartition = ksByPartition.get(context.partitionId).get
88 | part.multiget(ksForPartition).toArray
89 | } else {
90 | Array.empty[(K, V)]
91 | }
92 | }, partitions)
93 | results.flatten.toMap
94 | }
95 |
96 | /**
97 | * Unconditionally updates the specified key to have the specified value. Returns a new IndexedRDD
98 | * that reflects the modification.
99 | *
100 | * Some implementations may not support this operation and will throw
101 | * `UnsupportedOperationException`.
102 | */
103 | def put(k: K, v: V): IndexedRDD[K, V] = multiput(Map(k -> v))
104 |
105 | /**
106 | * Unconditionally updates the keys in `kvs` to their corresponding values. Returns a new
107 | * IndexedRDD that reflects the modification.
108 | *
109 | * Some implementations may not support this operation and will throw
110 | * `UnsupportedOperationException`.
111 | */
112 | def multiput(kvs: Map[K, V]): IndexedRDD[K, V] =
113 | multiput(kvs, (id: K, a: V) => a, (id: K, a: V, b: V) => b)
114 |
115 | /**
116 | * Unconditionally updates the keys in `kvs` to their corresponding values. Returns a new
117 | * IndexedRDD that reflects the modification.
118 | *
119 | * Some implementations may not support this operation and will throw
120 | * `UnsupportedOperationException`.
121 | */
122 | def multiputRDD(kvs: RDD[(K, V)]): IndexedRDD[K, V] =
123 | multiputRDD(kvs, (id: K, a: V) => a, (id: K, a: V, b: V) => b)
124 |
125 | /**
126 | * Updates the keys in `kvs` to their corresponding values, running `merge` on old and new values
127 | * if necessary. Returns a new IndexedRDD that reflects the modification.
128 | *
129 | * Some implementations may not support this operation and will throw
130 | * `UnsupportedOperationException`.
131 | */
132 | def multiput(kvs: Map[K, V], merge: (K, V, V) => V): IndexedRDD[K, V] =
133 | multiput(kvs, (id: K, a: V) => a, merge)
134 |
135 | /**
136 | * Updates the keys in `kvs` to their corresponding values, running `merge` on old and new values
137 | * if necessary. Returns a new IndexedRDD that reflects the modification.
138 | *
139 | * Some implementations may not support this operation and will throw
140 | * `UnsupportedOperationException`.
141 | */
142 | def multiputRDD(kvs: RDD[(K, V)], merge: (K, V, V) => V): IndexedRDD[K, V] =
143 | multiputRDD(kvs, (id: K, a: V) => a, merge)
144 |
145 | /**
146 | * Updates the keys in `kvs` to their corresponding values, running `merge` on old and new values
147 | * if necessary. Returns a new IndexedRDD that reflects the modification.
148 | *
149 | * Some implementations may not support this operation and will throw
150 | * `UnsupportedOperationException`.
151 | */
152 | def multiput[U: ClassTag](kvs: Map[K, U], project: (K, U) => V, merge: (K, V, U) => V): IndexedRDD[K, V] =
153 | multiputRDD(context.parallelize(kvs.toSeq), project, merge)
154 |
155 | /**
156 | * Updates the keys in `kvs` to their corresponding values, running `merge` on old and new values
157 | * if necessary. Returns a new IndexedRDD that reflects the modification.
158 | *
159 | * Some implementations may not support this operation and will throw
160 | * `UnsupportedOperationException`.
161 | */
162 | def multiputRDD[U: ClassTag](updates: RDD[(K, U)], project: (K, U) => V, merge: (K, V, U) => V): IndexedRDD[K, V] = {
163 | zipPartitionsWithOther(updates.partitionBy(partitioner.get))(new MultiputZipper(project, merge))
164 | }
165 |
166 | /**
167 | * Deletes the specified keys. Returns a new IndexedRDD that reflects the deletions.
168 | *
169 | * Some implementations may not support this operation and will throw
170 | * `UnsupportedOperationException`.
171 | */
172 | def delete(ks: Array[K]): IndexedRDD[K, V] = {
173 | val deletions = context.parallelize(ks.map(k => (k, ()))).partitionBy(partitioner.get)
174 | zipPartitionsWithOther(deletions)(new DeleteZipper)
175 | }
176 |
177 | /** Applies a function to each partition of this IndexedRDD. */
178 | private def mapIndexedRDDPartitions[K2: ClassTag, V2: ClassTag](
179 | f: IndexedRDDPartition[K, V] => IndexedRDDPartition[K2, V2]): IndexedRDD[K2, V2] = {
180 | val newPartitionsRDD = partitionsRDD.mapPartitions(_.map(f), preservesPartitioning = true)
181 | new IndexedRDD(newPartitionsRDD)
182 | }
183 |
184 | /** Applies a function to corresponding partitions of `this` and another IndexedRDD. */
185 | private def zipIndexedRDDPartitions[V2: ClassTag, V3: ClassTag](other: IndexedRDD[K, V2])
186 | (f: ZipPartitionsFunction[V2, V3]): IndexedRDD[K, V3] = {
187 | assert(partitioner == other.partitioner)
188 | val newPartitionsRDD = partitionsRDD.zipPartitions(other.partitionsRDD, true)(f)
189 | new IndexedRDD(newPartitionsRDD)
190 | }
191 |
192 | /** Applies a function to corresponding partitions of `this` and a pair RDD. */
193 | private def zipPartitionsWithOther[V2: ClassTag, V3: ClassTag](other: RDD[(K, V2)])
194 | (f: OtherZipPartitionsFunction[V2, V3]): IndexedRDD[K, V3] = {
195 | val partitioned = other.partitionBy(partitioner.get)
196 | val newPartitionsRDD = partitionsRDD.zipPartitions(partitioned, true)(f)
197 | new IndexedRDD(newPartitionsRDD)
198 | }
199 |
200 | /**
201 | * Restricts the entries to those satisfying the given predicate. This operation preserves the
202 | * index for efficient joins with the original IndexedRDD and is implemented using soft deletions.
203 | *
204 | * @param pred the user defined predicate, which takes a tuple to conform to the `RDD[(K, V)]`
205 | * interface
206 | */
207 | override def filter(pred: Tuple2[K, V] => Boolean): IndexedRDD[K, V] =
208 | this.mapIndexedRDDPartitions(_.filter(Function.untupled(pred)))
209 |
210 | /** Maps each value, preserving the index. */
211 | def mapValues[V2: ClassTag](f: V => V2): IndexedRDD[K, V2] =
212 | this.mapIndexedRDDPartitions(_.mapValues((vid, attr) => f(attr)))
213 |
214 | /** Maps each value, supplying the corresponding key and preserving the index. */
215 | def mapValues[V2: ClassTag](f: (K, V) => V2): IndexedRDD[K, V2] =
216 | this.mapIndexedRDDPartitions(_.mapValues(f))
217 |
218 | /**
219 | * Intersects `this` and `other` and keeps only elements with differing values. For these
220 | * elements, keeps the values from `this`.
221 | */
222 | def diff(other: RDD[(K, V)]): IndexedRDD[K, V] = other match {
223 | case other: IndexedRDD[K, V] if partitioner == other.partitioner =>
224 | this.zipIndexedRDDPartitions(other)(new DiffZipper)
225 | case _ =>
226 | this.zipPartitionsWithOther(other)(new OtherDiffZipper)
227 | }
228 |
229 | /**
230 | * Joins `this` with `other`, running `f` on the values of all keys in both sets. Note that for
231 | * efficiency `other` must be an IndexedRDD, not just a pair RDD. Use [[aggregateUsingIndex]] to
232 | * construct an IndexedRDD co-partitioned with `this`.
233 | *
234 | * @param maybeLazy if true, a joined "view" of the input RDDs (that preserves the underlying
235 | * indices) may be returned
236 | */
237 | def fullOuterJoin[V2: ClassTag, W: ClassTag]
238 | (other: RDD[(K, V2)], maybeLazy: Boolean = false)
239 | (f: (K, Option[V], Option[V2]) => W): IndexedRDD[K, W] = other match {
240 | case other: IndexedRDD[K, V2] if partitioner == other.partitioner => {
241 | val castFn = implicitly[ClassTag[(K, Option[V], Option[V]) => V]]
242 | val castRDD = implicitly[ClassTag[IndexedRDD[K, V]]]
243 | (other, f) match {
244 | case (castRDD(other), castFn(f)) if maybeLazy =>
245 | this.zipIndexedRDDPartitions(other)(new LazyFullOuterJoinZipper(f)).asInstanceOf[IndexedRDD[K, W]]
246 | case (other, f) =>
247 | this.zipIndexedRDDPartitions(other)(new FullOuterJoinZipper(f))
248 | }
249 | }
250 | case _ =>
251 | this.zipPartitionsWithOther(other)(new OtherFullOuterJoinZipper(f))
252 | }
253 |
254 | /**
255 | * Left outer joins `this` with `other`, running `f` on the values of corresponding keys. Because
256 | * values in `this` with no corresponding entries in `other` are preserved, `f` cannot change the
257 | * value type.
258 | */
259 | def join[U: ClassTag]
260 | (other: RDD[(K, U)])(f: (K, V, U) => V): IndexedRDD[K, V] = other match {
261 | case other: IndexedRDD[K, U] if partitioner == other.partitioner =>
262 | this.zipIndexedRDDPartitions(other)(new JoinZipper(f))
263 | case _ =>
264 | this.zipPartitionsWithOther(other)(new OtherJoinZipper(f))
265 | }
266 |
267 | /** Left outer joins `this` with `other`, running `f` on all values of `this`. */
268 | def leftJoin[V2: ClassTag, V3: ClassTag]
269 | (other: RDD[(K, V2)])(f: (K, V, Option[V2]) => V3): IndexedRDD[K, V3] = other match {
270 | case other: IndexedRDD[K, V2] if partitioner == other.partitioner =>
271 | this.zipIndexedRDDPartitions(other)(new LeftJoinZipper(f))
272 | case _ =>
273 | this.zipPartitionsWithOther(other)(new OtherLeftJoinZipper(f))
274 | }
275 |
276 | /** Inner joins `this` with `other`, running `f` on the values of corresponding keys. */
277 | def innerJoin[V2: ClassTag, V3: ClassTag](other: RDD[(K, V2)])
278 | (f: (K, V, V2) => V3): IndexedRDD[K, V3] = other match {
279 | case other: IndexedRDD[K, V2] if partitioner == other.partitioner =>
280 | this.zipIndexedRDDPartitions(other)(new InnerJoinZipper(f))
281 | case _ =>
282 | this.zipPartitionsWithOther(other)(new OtherInnerJoinZipper(f))
283 | }
284 |
285 | /**
286 | * Creates a new IndexedRDD with values from `elems` that may share an index with `this`,
287 | * merging duplicate keys in `elems` arbitrarily.
288 | */
289 | def createUsingIndex[V2: ClassTag](elems: RDD[(K, V2)]): IndexedRDD[K, V2] = {
290 | this.zipPartitionsWithOther(elems)(new CreateUsingIndexZipper)
291 | }
292 |
293 | /** Creates a new IndexedRDD with values from `elems` that may share an index with `this`. */
294 | def aggregateUsingIndex[V2: ClassTag](
295 | elems: RDD[(K, V2)], reduceFunc: (V2, V2) => V2): IndexedRDD[K, V2] = {
296 | this.zipPartitionsWithOther(elems)(new AggregateUsingIndexZipper(reduceFunc))
297 | }
298 |
299 | /**
300 | * Optionally rebuilds the indexes of this IndexedRDD. Depending on the implementation, this may
301 | * remove tombstoned entries and the resulting IndexedRDD may not support efficient joins with the
302 | * original one.
303 | */
304 | def reindex(): IndexedRDD[K, V] = this.mapIndexedRDDPartitions(_.reindex())
305 |
306 | // The following functions could have been anonymous, but we name them to work around a Scala
307 | // compiler bug related to specialization.
308 |
309 | private type ZipPartitionsFunction[V2, V3] =
310 | Function2[Iterator[IndexedRDDPartition[K, V]], Iterator[IndexedRDDPartition[K, V2]],
311 | Iterator[IndexedRDDPartition[K, V3]]]
312 |
313 | private type OtherZipPartitionsFunction[V2, V3] =
314 | Function2[Iterator[IndexedRDDPartition[K, V]], Iterator[(K, V2)],
315 | Iterator[IndexedRDDPartition[K, V3]]]
316 |
317 | private class MultiputZipper[U](z: (K, U) => V, f: (K, V, U) => V)
318 | extends OtherZipPartitionsFunction[U, V] with Serializable {
319 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[(K, U)])
320 | : Iterator[IndexedRDDPartition[K, V]] = {
321 | val thisPart = thisIter.next()
322 | Iterator(thisPart.multiput(otherIter, z, f))
323 | }
324 | }
325 |
326 | private class DeleteZipper extends OtherZipPartitionsFunction[Unit, V] with Serializable {
327 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[(K, Unit)])
328 | : Iterator[IndexedRDDPartition[K, V]] = {
329 | val thisPart = thisIter.next()
330 | Iterator(thisPart.delete(otherIter.map(_._1)))
331 | }
332 | }
333 |
334 | private class DiffZipper extends ZipPartitionsFunction[V, V] with Serializable {
335 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[IndexedRDDPartition[K, V]]): Iterator[IndexedRDDPartition[K, V]] = {
336 | val thisPart = thisIter.next()
337 | val otherPart = otherIter.next()
338 | Iterator(thisPart.diff(otherPart))
339 | }
340 | }
341 |
342 | private class OtherDiffZipper extends OtherZipPartitionsFunction[V, V] with Serializable {
343 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[(K, V)]): Iterator[IndexedRDDPartition[K, V]] = {
344 | val thisPart = thisIter.next()
345 | Iterator(thisPart.diff(otherIter))
346 | }
347 | }
348 |
349 | private class FullOuterJoinZipper[V2: ClassTag, W: ClassTag](f: (K, Option[V], Option[V2]) => W)
350 | extends ZipPartitionsFunction[V2, W] with Serializable {
351 | def apply(
352 | thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[IndexedRDDPartition[K, V2]])
353 | : Iterator[IndexedRDDPartition[K, W]] = {
354 | val thisPart = thisIter.next()
355 | val otherPart = otherIter.next()
356 | Iterator(thisPart.fullOuterJoin(otherPart)(f))
357 | }
358 | }
359 |
360 | private class LazyFullOuterJoinZipper(f: (K, Option[V], Option[V]) => V)
361 | extends ZipPartitionsFunction[V, V] with Serializable {
362 | def apply(
363 | thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[IndexedRDDPartition[K, V]])
364 | : Iterator[IndexedRDDPartition[K, V]] = {
365 | val thisPart = thisIter.next()
366 | val otherPart = otherIter.next()
367 | (thisPart, otherPart) match {
368 | case (thisPart: LazyPartition[K, V], otherPart: LazyPartition[K, V]) if thisPart.reducer == f && otherPart.reducer == f =>
369 | Iterator(new LazyPartition(thisPart.partitions ++ otherPart.partitions, f))
370 | case (thisPart: LazyPartition[K, V], _) if thisPart.reducer == f =>
371 | Iterator(new LazyPartition(thisPart.partitions :+ otherPart, f))
372 | case (_, otherPart: LazyPartition[K, V]) if otherPart.reducer == f =>
373 | Iterator(new LazyPartition(thisPart +: otherPart.partitions, f))
374 | case _ =>
375 | Iterator(new LazyPartition(Seq(thisPart, otherPart), f))
376 | }
377 | }
378 | }
379 |
380 | private class OtherFullOuterJoinZipper[V2: ClassTag, W: ClassTag](f: (K, Option[V], Option[V2]) => W)
381 | extends OtherZipPartitionsFunction[V2, W] with Serializable {
382 | def apply(
383 | thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[(K, V2)])
384 | : Iterator[IndexedRDDPartition[K, W]] = {
385 | val thisPart = thisIter.next()
386 | Iterator(thisPart.fullOuterJoin(otherIter)(f))
387 | }
388 | }
389 |
390 | private class JoinZipper[U: ClassTag](f: (K, V, U) => V)
391 | extends ZipPartitionsFunction[U, V] with Serializable {
392 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[IndexedRDDPartition[K, U]]): Iterator[IndexedRDDPartition[K, V]] = {
393 | val thisPart = thisIter.next()
394 | val otherPart = otherIter.next()
395 | Iterator(thisPart.join(otherPart)(f))
396 | }
397 | }
398 |
399 | private class OtherJoinZipper[U: ClassTag](f: (K, V, U) => V)
400 | extends OtherZipPartitionsFunction[U, V] with Serializable {
401 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[(K, U)]): Iterator[IndexedRDDPartition[K, V]] = {
402 | val thisPart = thisIter.next()
403 | Iterator(thisPart.join(otherIter)(f))
404 | }
405 | }
406 |
407 | private class LeftJoinZipper[V2: ClassTag, V3: ClassTag](f: (K, V, Option[V2]) => V3)
408 | extends ZipPartitionsFunction[V2, V3] with Serializable {
409 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[IndexedRDDPartition[K, V2]]): Iterator[IndexedRDDPartition[K, V3]] = {
410 | val thisPart = thisIter.next()
411 | val otherPart = otherIter.next()
412 | Iterator(thisPart.leftJoin(otherPart)(f))
413 | }
414 | }
415 |
416 | private class OtherLeftJoinZipper[V2: ClassTag, V3: ClassTag](f: (K, V, Option[V2]) => V3)
417 | extends OtherZipPartitionsFunction[V2, V3] with Serializable {
418 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[(K, V2)]): Iterator[IndexedRDDPartition[K, V3]] = {
419 | val thisPart = thisIter.next()
420 | Iterator(thisPart.leftJoin(otherIter)(f))
421 | }
422 | }
423 |
424 | private class InnerJoinZipper[V2: ClassTag, V3: ClassTag](f: (K, V, V2) => V3)
425 | extends ZipPartitionsFunction[V2, V3] with Serializable {
426 | def apply(
427 | thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[IndexedRDDPartition[K, V2]])
428 | : Iterator[IndexedRDDPartition[K, V3]] = {
429 | val thisPart = thisIter.next()
430 | val otherPart = otherIter.next()
431 | Iterator(thisPart.innerJoin(otherPart)(f))
432 | }
433 | }
434 |
435 | private class OtherInnerJoinZipper[V2: ClassTag, V3: ClassTag](f: (K, V, V2) => V3)
436 | extends OtherZipPartitionsFunction[V2, V3] with Serializable {
437 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[(K, V2)])
438 | : Iterator[IndexedRDDPartition[K, V3]] = {
439 | val thisPart = thisIter.next()
440 | Iterator(thisPart.innerJoin(otherIter)(f))
441 | }
442 | }
443 |
444 | private class CreateUsingIndexZipper[V2: ClassTag]
445 | extends OtherZipPartitionsFunction[V2, V2] with Serializable {
446 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[(K, V2)]): Iterator[IndexedRDDPartition[K, V2]] = {
447 | val thisPart = thisIter.next()
448 | Iterator(thisPart.createUsingIndex(otherIter))
449 | }
450 | }
451 |
452 | private class AggregateUsingIndexZipper[V2: ClassTag](reduceFunc: (V2, V2) => V2)
453 | extends OtherZipPartitionsFunction[V2, V2] with Serializable {
454 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[(K, V2)]): Iterator[IndexedRDDPartition[K, V2]] = {
455 | val thisPart = thisIter.next()
456 | Iterator(thisPart.aggregateUsingIndex(otherIter, reduceFunc))
457 | }
458 | }
459 | }
460 |
461 | object IndexedRDD {
462 | /**
463 | * Constructs an updatable IndexedRDD from an RDD of pairs, merging duplicate keys arbitrarily.
464 | */
465 | def apply[K: ClassTag : KeySerializer, V: ClassTag]
466 | (elems: RDD[(K, V)]): IndexedRDD[K, V] = updatable(elems)
467 |
468 | /**
469 | * Constructs an updatable IndexedRDD from an RDD of pairs, merging duplicate keys arbitrarily.
470 | */
471 | def updatable[K: ClassTag : KeySerializer, V: ClassTag]
472 | (elems: RDD[(K, V)])
473 | : IndexedRDD[K, V] = updatable[K, V, V](elems, (id, a) => a, (id, a, b) => b)
474 |
475 | /** Constructs an IndexedRDD from an RDD of pairs. */
476 | def updatable[K: ClassTag : KeySerializer, U: ClassTag, V: ClassTag]
477 | (elems: RDD[(K, U)], z: (K, U) => V, f: (K, V, U) => V)
478 | : IndexedRDD[K, V] = {
479 | val elemsPartitioned =
480 | if (elems.partitioner.isDefined) elems
481 | else elems.partitionBy(new HashPartitioner(elems.partitions.size))
482 | val partitions = elemsPartitioned.mapPartitions[IndexedRDDPartition[K, V]](
483 | iter => Iterator(PARTPartition(iter, z, f)),
484 | preservesPartitioning = true)
485 | new IndexedRDD(partitions)
486 | }
487 |
488 | implicit val longSer = new LongSerializer
489 | implicit val stringSer = new StringSerializer
490 | implicit val shortSer = new ShortSerializer
491 | implicit val charSer = new CharSerializer
 492 | implicit val intSer = new IntSerializer
493 | implicit val bigintSer = new BigIntSerializer
494 | implicit val uuidSer = new UUIDSerializer
495 |
496 | implicit def tuple2Ser[A, B](
497 | implicit aSer: KeySerializer[A], bSer: KeySerializer[B]): Tuple2Serializer[A, B] =
498 | new Tuple2Serializer()(aSer, bSer)
499 | }
500 |
--------------------------------------------------------------------------------
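
To make the join variants defined above concrete: `join` is a left outer join that must preserve the left value type, `leftJoin` may change it, `innerJoin` keeps only keys present on both sides, and `fullOuterJoin` keeps keys from either side. A minimal usage sketch (illustrative only; it assumes a live `SparkContext` named `sc` and the implicits from the `IndexedRDD` companion object):

```scala
import edu.berkeley.cs.amplab.spark.indexedrdd.IndexedRDD
import edu.berkeley.cs.amplab.spark.indexedrdd.IndexedRDD._

val a = IndexedRDD(sc.parallelize(Seq(1L -> "a", 2L -> "b"))) // IndexedRDD[Long, String]
val b = IndexedRDD(sc.parallelize(Seq(2L -> 20, 3L -> 30)))   // IndexedRDD[Long, Int]

a.join(b) { (k, v, u) => v + u }.collect()           // keys 1 and 2; values stay Strings: "a", "b20"
a.leftJoin(b) { (k, v, u) => (v, u) }.collect()      // (1, ("a", None)), (2, ("b", Some(20)))
a.innerJoin(b) { (k, v, u) => u }.collect()          // (2, 20)
a.fullOuterJoin(b) { (k, v, u) => (v, u) }.collect() // keys 1, 2, and 3, with Option values on both sides
```
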
/src/main/scala/edu/berkeley/cs/amplab/spark/indexedrdd/IndexedRDDPartition.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package edu.berkeley.cs.amplab.spark.indexedrdd
19 |
20 | import scala.reflect.ClassTag
21 |
22 | /**
23 | * A map of key-value `(K, V)` pairs that enforces key uniqueness and pre-indexes the entries for
24 | * fast lookups, joins, and optionally updates. To construct an `IndexedRDDPartition`, use one of
25 | * the constructors in the [[edu.berkeley.cs.amplab.spark.indexedrdd.IndexedRDDPartition$
26 | * IndexedRDDPartition object]].
27 | *
28 | * @tparam K the key associated with each entry in the set.
29 | * @tparam V the value associated with each entry in the set.
30 | */
31 | private[indexedrdd] abstract class IndexedRDDPartition[K, V] extends Serializable {
32 |
33 | protected implicit def kTag: ClassTag[K]
34 | protected implicit def vTag: ClassTag[V]
35 |
36 | def size: Long
37 |
38 | /** Return the value for the given key. */
39 | def apply(k: K): Option[V]
40 |
41 | def isDefined(k: K): Boolean =
42 | apply(k).isDefined
43 |
44 | def iterator: Iterator[(K, V)]
45 |
46 | /**
47 | * Gets the values corresponding to the specified keys, if any.
48 | */
49 | def multiget(ks: Array[K]): Iterator[(K, V)]
50 |
51 | /**
52 | * Updates the keys in `kvs` to their corresponding values generated by running `f` on old and new
53 | * values, if an old value exists, or `z` otherwise. Returns a new IndexedRDDPartition that
54 | * reflects the modification.
55 | */
56 | def multiput[U](
57 | kvs: Iterator[(K, U)], z: (K, U) => V, f: (K, V, U) => V): IndexedRDDPartition[K, V] =
58 | throw new UnsupportedOperationException("modifications not supported")
59 |
60 | /** Deletes the specified keys. Returns a new IndexedRDDPartition that reflects the deletions. */
61 | def delete(ks: Iterator[K]): IndexedRDDPartition[K, V] =
62 | throw new UnsupportedOperationException("modifications not supported")
63 |
64 | /** Maps each value, supplying the corresponding key and preserving the index. */
65 | def mapValues[V2: ClassTag](f: (K, V) => V2): IndexedRDDPartition[K, V2]
66 |
67 | /**
68 | * Restricts the entries to those satisfying the given predicate.
69 | */
70 | def filter(pred: (K, V) => Boolean): IndexedRDDPartition[K, V]
71 |
72 | /**
73 | * Intersects `this` and `other` and keeps only elements with differing values. For these
74 | * elements, keeps the values from `this`.
75 | */
76 | def diff(other: IndexedRDDPartition[K, V]): IndexedRDDPartition[K, V]
77 |
78 | /**
79 | * Intersects `this` and `other` and keeps only elements with differing values. For these
80 | * elements, keeps the values from `this`.
81 | */
82 | def diff(other: Iterator[(K, V)]): IndexedRDDPartition[K, V]
83 |
84 | /** Joins `this` with `other`, running `f` on the values of all keys in both sets. */
85 | def fullOuterJoin[V2: ClassTag, W: ClassTag]
86 | (other: IndexedRDDPartition[K, V2])
87 | (f: (K, Option[V], Option[V2]) => W): IndexedRDDPartition[K, W]
88 |
89 | /** Joins `this` with `other`, running `f` on the values of all keys in both sets. */
90 | def fullOuterJoin[V2: ClassTag, W: ClassTag]
91 | (other: Iterator[(K, V2)])
92 | (f: (K, Option[V], Option[V2]) => W): IndexedRDDPartition[K, W]
93 |
94 | /**
95 | * Left outer joins `this` with `other`, running `f` on the values of corresponding keys. Because
96 | * values in `this` with no corresponding entries in `other` are preserved, `f` cannot change the
97 | * value type.
98 | */
99 | def join[U: ClassTag]
100 | (other: IndexedRDDPartition[K, U])
101 | (f: (K, V, U) => V): IndexedRDDPartition[K, V]
102 |
103 | /**
104 | * Left outer joins `this` with `other`, running `f` on the values of corresponding keys. Because
105 | * values in `this` with no corresponding entries in `other` are preserved, `f` cannot change the
106 | * value type.
107 | */
108 | def join[U: ClassTag]
109 | (other: Iterator[(K, U)])
110 | (f: (K, V, U) => V): IndexedRDDPartition[K, V]
111 |
112 | /** Left outer joins `this` with `other`, running `f` on all values of `this`. */
113 | def leftJoin[V2: ClassTag, V3: ClassTag]
114 | (other: IndexedRDDPartition[K, V2])
115 | (f: (K, V, Option[V2]) => V3): IndexedRDDPartition[K, V3]
116 |
117 | /** Left outer joins `this` with `other`, running `f` on all values of `this`. */
118 | def leftJoin[V2: ClassTag, V3: ClassTag]
119 | (other: Iterator[(K, V2)])
120 | (f: (K, V, Option[V2]) => V3): IndexedRDDPartition[K, V3]
121 |
122 | /** Inner joins `this` with `other`, running `f` on the values of corresponding keys. */
123 | def innerJoin[U: ClassTag, V2: ClassTag]
124 | (other: IndexedRDDPartition[K, U])
125 | (f: (K, V, U) => V2): IndexedRDDPartition[K, V2]
126 |
127 | /** Inner joins `this` with `other`, running `f` on the values of corresponding keys. */
128 | def innerJoin[U: ClassTag, V2: ClassTag]
129 | (other: Iterator[(K, U)])
130 | (f: (K, V, U) => V2): IndexedRDDPartition[K, V2]
131 |
132 | /**
133 | * Creates a new partition with values from `elems` that may share an index with `this`,
134 | * merging duplicate keys in `elems` arbitrarily.
135 | */
136 | def createUsingIndex[V2: ClassTag](elems: Iterator[(K, V2)]): IndexedRDDPartition[K, V2]
137 |
138 | /** Creates a new partition with values from `elems` that shares an index with `this`. */
139 | def aggregateUsingIndex[V2: ClassTag](
140 | elems: Iterator[(K, V2)], reduceFunc: (V2, V2) => V2): IndexedRDDPartition[K, V2]
141 |
142 | /**
143 | * Optionally rebuilds the indexes of this partition. Depending on the implementation, this may
 144 |  * remove tombstoned entries and the resulting partition may not support efficient joins with the
145 | * original one.
146 | */
147 | def reindex(): IndexedRDDPartition[K, V]
148 | }
149 |
--------------------------------------------------------------------------------
/src/main/scala/edu/berkeley/cs/amplab/spark/indexedrdd/KeySerializer.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package edu.berkeley.cs.amplab.spark.indexedrdd
19 |
20 | import java.util.UUID
21 |
22 | /**
23 | * Serializer for storing arbitrary key types as byte arrays for PART.
24 | *
25 | * If serialized keys may be of variable length, they should be terminated with a unique value,
26 | * because keys in PART cannot be prefixes of other keys.
27 | */
28 | trait KeySerializer[K] extends Serializable {
29 | def toBytes(k: K): Array[Byte]
30 | def fromBytes(b: Array[Byte]): K
31 | }
32 |
33 | class LongSerializer extends KeySerializer[Long] {
34 | override def toBytes(k: Long) = Array(
35 | ((k >> 56) & 0xFF).toByte,
36 | ((k >> 48) & 0xFF).toByte,
37 | ((k >> 40) & 0xFF).toByte,
38 | ((k >> 32) & 0xFF).toByte,
39 | ((k >> 24) & 0xFF).toByte,
40 | ((k >> 16) & 0xFF).toByte,
41 | ((k >> 8) & 0xFF).toByte,
42 | ( k & 0xFF).toByte)
43 |
44 | override def fromBytes(b: Array[Byte]): Long =
45 | ( (b(0).toLong << 56) & (0xFFL << 56) |
46 | (b(1).toLong << 48) & (0xFFL << 48) |
47 | (b(2).toLong << 40) & (0xFFL << 40) |
48 | (b(3).toLong << 32) & (0xFFL << 32) |
49 | (b(4).toLong << 24) & (0xFFL << 24) |
50 | (b(5).toLong << 16) & (0xFFL << 16) |
51 | (b(6).toLong << 8) & (0xFFL << 8) |
52 | b(7).toLong & 0xFFL)
53 | }
54 |
55 | class IntSerializer extends KeySerializer[Int] {
56 | override def toBytes(k: Int) = Array(
57 | ((k >> 24) & 0xFF).toByte,
58 | ((k >> 16) & 0xFF).toByte,
59 | ((k >> 8) & 0xFF).toByte,
60 | ( k & 0xFF).toByte)
61 |
62 | override def fromBytes(b: Array[Byte]): Int =
63 | (b(0).toInt << 24) & (0xFF << 24) |
64 | (b(1).toInt << 16) & (0xFF << 16) |
65 | (b(2).toInt << 8) & (0xFF << 8) |
66 | b(3).toInt & 0xFF
67 | }
68 |
69 | class BigIntSerializer extends KeySerializer[BigInt] {
70 | override def toBytes(k: BigInt) = {
71 | // Prepend the BigInt bit length to ensure no key is a prefix of any other
72 | val lengthBytes = Array(
73 | ((k.bitLength >> 24) & 0xFF).toByte,
74 | ((k.bitLength >> 16) & 0xFF).toByte,
75 | ((k.bitLength >> 8) & 0xFF).toByte,
76 | ( k.bitLength & 0xFF).toByte)
77 | lengthBytes ++ k.toByteArray
78 | }
79 | override def fromBytes(b: Array[Byte]): BigInt = BigInt.apply(b.drop(4))
80 | }
81 |
82 | class ShortSerializer extends KeySerializer[Short] {
83 | override def toBytes(k: Short) = Array(
84 | ((k >> 8) & 0xFF).toByte,
85 | ( k & 0xFF).toByte)
86 | override def fromBytes(b: Array[Byte]): Short =
87 | ((b(0).toInt << 8) & (0xFF << 8) |
88 | b(1).toInt & 0xFF).toShort
89 | }
90 |
91 | class CharSerializer extends KeySerializer[Char] {
92 | override def toBytes(k: Char) = Array(
93 | ((k >> 8) & 0xFF).toByte,
94 | ( k & 0xFF).toByte)
95 | override def fromBytes(b: Array[Byte]): Char =
96 | ((b(0).toInt << 8) & (0xFF << 8) |
97 | b(1).toInt & 0xFF).toChar
98 | }
99 |
100 | class UUIDSerializer(val longSer: LongSerializer = new LongSerializer) extends KeySerializer[UUID] {
101 | override def toBytes(k: UUID) =
102 | (longSer.toBytes(k.getMostSignificantBits) ++
103 | longSer.toBytes(k.getLeastSignificantBits))
104 | override def fromBytes(b: Array[Byte]): UUID =
105 | new UUID(
106 | longSer.fromBytes(b.take(8)),
107 | longSer.fromBytes(b.takeRight(8)))
108 | }
109 |
110 | class StringSerializer extends KeySerializer[String] {
111 | override def toBytes(k: String) = {
112 | val result = new Array[Byte](4 + k.length * 2)
113 |
114 | // Prepend the string length to ensure no key is a prefix of any other
115 | result(0) = ((k.length >> 24) & 0xFF).toByte
116 | result(1) = ((k.length >> 16) & 0xFF).toByte
117 | result(2) = ((k.length >> 8) & 0xFF).toByte
118 | result(3) = ( k.length & 0xFF).toByte
119 |
120 | var i = 0
121 | while (i < k.length) {
122 | result(4 + 2 * i) = ((k(i) >> 8) & 0xFF).toByte
123 | result(4 + 2 * i + 1) = ( k(i) & 0xFF).toByte
124 | i += 1
125 | }
126 |
127 | result
128 | }
129 |
130 | override def fromBytes(b: Array[Byte]): String = {
131 | val result = new Array[Char]((b.length - 4) / 2)
132 |
133 | var i = 0
134 | while (i < result.length) {
135 | result(i) =
136 | ((b(4 + 2 * i) << 8) & (0xFF << 8) |
137 | (b(4 + 2 * i + 1) & 0xFF)).toChar
138 | i += 1
139 | }
140 |
141 | new String(result)
142 | }
143 | }
144 |
145 | class Tuple2Serializer[A, B](
146 | implicit aSer: KeySerializer[A], bSer: KeySerializer[B])
147 | extends KeySerializer[(A, B)] {
148 |
149 | override def toBytes(k: (A, B)) = {
150 | val aBytes = aSer.toBytes(k._1)
151 | val bBytes = bSer.toBytes(k._2)
152 |
153 | val result = new Array[Byte](4 + aBytes.length + bBytes.length)
154 |
155 | // Prepend the length of aBytes so we know where the boundary is when reading
156 | result(0) = ((aBytes.length >> 24) & 0xFF).toByte
157 | result(1) = ((aBytes.length >> 16) & 0xFF).toByte
158 | result(2) = ((aBytes.length >> 8) & 0xFF).toByte
159 | result(3) = ( aBytes.length & 0xFF).toByte
160 |
161 | aBytes.copyToArray(result, 4)
162 | bBytes.copyToArray(result, 4 + aBytes.length)
163 |
164 | result
165 | }
166 |
167 | override def fromBytes(b: Array[Byte]): (A, B) = {
168 | val aLength =
169 | ( (b(0).toInt << 24) & (0xFF << 24) |
170 | (b(1).toInt << 16) & (0xFF << 16) |
171 | (b(2).toInt << 8) & (0xFF << 8) |
172 | b(3).toInt & 0xFF)
173 | (aSer.fromBytes(b.slice(4, 4 + aLength)),
174 | bSer.fromBytes(b.drop(4 + aLength)))
175 | }
176 | }
177 |
--------------------------------------------------------------------------------
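
The serializers above cover the built-in key types. For a user-defined key type they can be composed; below is a minimal sketch for a hypothetical fixed-width composite key (`UserId` and `UserIdSerializer` are illustrative names, not part of the library). Because both fields have a fixed length, the prefix-freedom requirement noted in the `KeySerializer` doc comment is satisfied without a terminator or length prefix:

```scala
import edu.berkeley.cs.amplab.spark.indexedrdd.{KeySerializer, LongSerializer, ShortSerializer}

// Hypothetical composite key: a 2-byte region code followed by an 8-byte id.
case class UserId(region: Short, id: Long)

class UserIdSerializer extends KeySerializer[UserId] {
  private val shortSer = new ShortSerializer
  private val longSer  = new LongSerializer

  override def toBytes(k: UserId): Array[Byte] =
    shortSer.toBytes(k.region) ++ longSer.toBytes(k.id)              // always 10 bytes

  override def fromBytes(b: Array[Byte]): UserId =
    UserId(shortSer.fromBytes(b.take(2)), longSer.fromBytes(b.drop(2)))
}

// Bringing an instance into implicit scope lets IndexedRDD(...) index RDD[(UserId, V)] pairs:
// implicit val userIdSer = new UserIdSerializer
```
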
/src/main/scala/edu/berkeley/cs/amplab/spark/indexedrdd/impl/LazyPartition.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package edu.berkeley.cs.amplab.spark.indexedrdd
19 |
20 | import scala.reflect.ClassTag
21 | import scala.collection.Traversable
22 |
23 | /**
24 |  * A wrapper around several IndexedRDDPartitions that avoids rebuilding
25 | * the index for the combined partitions. Instead, each operation probes
26 | * the nested partitions and merges the results.
27 | */
28 |
29 | private[indexedrdd] class LazyPartition[K, V]
30 | (val partitions: Seq[IndexedRDDPartition[K, V]],
31 | val reducer: (K, Option[V], Option[V]) => V)
32 | (override implicit val kTag: ClassTag[K],
33 | override implicit val vTag: ClassTag[V])
34 | extends IndexedRDDPartition[K, V] {
35 |
36 | @transient private lazy val cached: IndexedRDDPartition[K, V] =
37 | partitions.reduce((a, b) => a.fullOuterJoin(b)(reducer))
38 |
39 | def size: Long =
40 | cached.size
41 |
42 | /** Return the value for the given key. */
43 | def apply(k: K): Option[V] =
44 | partitions.
45 | map(_(k)).
46 | reduce((a, b) => Option(reducer(k, a, b)))
47 |
48 | override def isDefined(k: K): Boolean =
49 |     partitions.exists(_.isDefined(k))
50 |
51 | def iterator: Iterator[(K, V)] =
52 | cached.iterator
53 |
54 | /**
55 | * Query each partition independently, then merge the results by key. This
56 | * could be more efficient if multiget returned ordered results!
57 | */
58 | def multiget(ks: Array[K]): Iterator[(K, V)] =
59 | partitions.
60 | flatMap(_.multiget(ks)).
61 | groupBy(_._1).
62 | map {
63 | case (k, vs) =>
64 | val v = vs.map(_._2).reduce((v1, v2) => reducer(k, Some(v1), Some(v2)))
65 | (k, v)
66 | }.
67 | iterator
68 |
69 | /**
70 |    * We have to re-index, since the reducer cannot merge values of the mapped type.
71 | */
72 | def mapValues[V2: ClassTag](f: (K, V) => V2): IndexedRDDPartition[K, V2] =
73 | cached.mapValues(f)
74 |
75 | def filter(pred: (K, V) => Boolean): IndexedRDDPartition[K, V] =
76 | new LazyPartition(partitions.map(_.filter(pred)), reducer)
77 |
78 | def diff(other: IndexedRDDPartition[K, V]): IndexedRDDPartition[K, V] =
79 | cached.diff(other)
80 |
81 | def diff(other: Iterator[(K, V)]): IndexedRDDPartition[K, V] =
82 | cached.diff(other)
83 |
84 | def fullOuterJoin[V2: ClassTag, W: ClassTag]
85 | (other: IndexedRDDPartition[K, V2])
86 | (f: (K, Option[V], Option[V2]) => W): IndexedRDDPartition[K, W] =
87 | cached.fullOuterJoin(other)(f)
88 |
89 | def fullOuterJoin[V2: ClassTag, W: ClassTag]
90 | (other: Iterator[(K, V2)])
91 | (f: (K, Option[V], Option[V2]) => W): IndexedRDDPartition[K, W] =
92 | cached.fullOuterJoin(other)(f)
93 |
94 | def join[U: ClassTag]
95 | (other: IndexedRDDPartition[K, U])
96 | (f: (K, V, U) => V): IndexedRDDPartition[K, V] =
97 | cached.join(other)(f)
98 |
99 | def join[U: ClassTag]
100 | (other: Iterator[(K, U)])
101 | (f: (K, V, U) => V): IndexedRDDPartition[K, V] =
102 | cached.join(other)(f)
103 |
104 | def leftJoin[V2: ClassTag, V3: ClassTag]
105 | (other: IndexedRDDPartition[K, V2])
106 | (f: (K, V, Option[V2]) => V3): IndexedRDDPartition[K, V3] =
107 | cached.leftJoin(other)(f)
108 |
109 | def leftJoin[V2: ClassTag, V3: ClassTag]
110 | (other: Iterator[(K, V2)])
111 | (f: (K, V, Option[V2]) => V3): IndexedRDDPartition[K, V3] =
112 | cached.leftJoin(other)(f)
113 |
114 | def innerJoin[U: ClassTag, V2: ClassTag]
115 | (other: IndexedRDDPartition[K, U])
116 | (f: (K, V, U) => V2): IndexedRDDPartition[K, V2] =
117 | cached.innerJoin(other)(f)
118 |
119 | def innerJoin[U: ClassTag, V2: ClassTag]
120 | (other: Iterator[(K, U)])
121 | (f: (K, V, U) => V2): IndexedRDDPartition[K, V2] =
122 | cached.innerJoin(other)(f)
123 |
124 | def createUsingIndex[V2: ClassTag](elems: Iterator[(K, V2)]): IndexedRDDPartition[K, V2] =
125 | cached.createUsingIndex(elems)
126 |
127 | def aggregateUsingIndex[V2: ClassTag](
128 | elems: Iterator[(K, V2)], reduceFunc: (V2, V2) => V2): IndexedRDDPartition[K, V2] =
129 | cached.aggregateUsingIndex(elems, reduceFunc)
130 |
131 | /**
132 | * Forces the partitions to re-index, and rebuilds the combined index.
133 | */
134 | def reindex(): IndexedRDDPartition[K, V] =
135 | partitions.map(_.reindex).reduce((a, b) => a.fullOuterJoin(b)(reducer))
136 | }
137 |
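A minimal usage sketch (illustrative only, not part of the file; it mirrors LazyPartitionSuite and, like the test suites, assumes the implicit KeySerializer[Long] provided via `import IndexedRDD._`; `LazyPartitionExample` is a hypothetical name). Two PART-backed partitions are wrapped without building a combined index, and point lookups are merged through the reducer:

    package edu.berkeley.cs.amplab.spark.indexedrdd.impl

    import edu.berkeley.cs.amplab.spark.indexedrdd._
    import edu.berkeley.cs.amplab.spark.indexedrdd.IndexedRDD._

    object LazyPartitionExample {
      def main(args: Array[String]): Unit = {
        val left  = PARTPartition(Iterator((1L, 1), (2L, 1)))
        val right = PARTPartition(Iterator((2L, 10), (3L, 10)))

        // Sum the values from both sides; a missing side contributes 0
        val merged = new LazyPartition[Long, Int](
          Seq(left, right),
          (k, a, b) => a.getOrElse(0) + b.getOrElse(0))

        // apply probes each nested partition and merges through the reducer
        assert(merged(2L) == Some(11))
        assert(merged(3L) == Some(10))

        // size and iterator fall back to the cached full outer join of the parts
        assert(merged.size == 3L)
      }
    }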
--------------------------------------------------------------------------------
/src/main/scala/edu/berkeley/cs/amplab/spark/indexedrdd/impl/PARTPartition.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package edu.berkeley.cs.amplab.spark.indexedrdd.impl
19 |
20 | import scala.reflect.ClassTag
21 | import scala.collection.JavaConversions._
22 |
23 | import edu.berkeley.cs.amplab.spark.indexedrdd._
24 | import com.ankurdave.part.ArtTree
25 |
26 | private[indexedrdd] class PARTPartition[K, V]
27 | (protected val map: ArtTree)
28 | (override implicit val kTag: ClassTag[K],
29 | override implicit val vTag: ClassTag[V],
30 | implicit val kSer: KeySerializer[K])
31 | extends IndexedRDDPartition[K, V] {
32 |
33 | protected def withMap[V2: ClassTag]
34 | (map: ArtTree): PARTPartition[K, V2] = {
35 | new PARTPartition(map)
36 | }
37 |
38 | override def size: Long = map.size()
39 |
40 | override def apply(k: K): Option[V] = Option(map.search(kSer.toBytes(k)).asInstanceOf[V])
41 |
42 | override def iterator: Iterator[(K, V)] =
43 | map.iterator.map(kv => (kSer.fromBytes(kv._1), kv._2.asInstanceOf[V]))
44 |
45 | private def rawIterator: Iterator[(Array[Byte], V)] =
46 | map.iterator.map(kv => (kv._1, kv._2.asInstanceOf[V]))
47 |
48 | override def multiget(ks: Array[K]): Iterator[(K, V)] =
49 | ks.flatMap { k => this(k).map(v => (k, v)) }.iterator
50 |
51 | override def multiput[U](
52 | kvs: Iterator[(K, U)], z: (K, U) => V, f: (K, V, U) => V): IndexedRDDPartition[K, V] = {
53 | val newMap = map.snapshot()
54 | for (ku <- kvs) {
55 | val kBytes = kSer.toBytes(ku._1)
56 | val oldV = newMap.search(kBytes).asInstanceOf[V]
57 | val newV = if (oldV == null) z(ku._1, ku._2) else f(ku._1, oldV, ku._2)
58 | newMap.insert(kBytes, newV)
59 | }
60 | this.withMap[V](newMap)
61 | }
62 |
63 | override def delete(ks: Iterator[K]): IndexedRDDPartition[K, V] = {
64 | val newMap = map.snapshot()
65 | for (k <- ks) {
66 | newMap.delete(kSer.toBytes(k))
67 | }
68 | this.withMap[V](newMap)
69 | }
70 |
71 | override def mapValues[V2: ClassTag](f: (K, V) => V2): IndexedRDDPartition[K, V2] = {
72 | val newMap = new ArtTree
73 | for (kv <- rawIterator) newMap.insert(kv._1, f(kSer.fromBytes(kv._1), kv._2))
74 | this.withMap[V2](newMap)
75 | }
76 |
77 | override def filter(pred: (K, V) => Boolean): IndexedRDDPartition[K, V] = {
78 | val newMap = new ArtTree
79 | for (kv <- rawIterator if pred(kSer.fromBytes(kv._1), kv._2)) {
80 | newMap.insert(kv._1, kv._2)
81 | }
82 | this.withMap[V](newMap)
83 | }
84 |
85 | override def diff(other: IndexedRDDPartition[K, V]): IndexedRDDPartition[K, V] = other match {
86 | case other: PARTPartition[K, V] =>
87 | val newMap = new ArtTree
88 | for (kv <- rawIterator) {
89 | val otherV = other.map.search(kv._1).asInstanceOf[V]
90 | if (otherV != null && otherV != kv._2) {
91 | newMap.insert(kv._1, kv._2)
92 | }
93 | }
94 | this.withMap[V](newMap)
95 |
96 | case _ =>
97 | diff(other.iterator)
98 | }
99 |
100 | override def diff(other: Iterator[(K, V)]): IndexedRDDPartition[K, V] =
101 | diff(PARTPartition(other))
102 |
103 | override def fullOuterJoin[V2: ClassTag, W: ClassTag]
104 | (other: IndexedRDDPartition[K, V2])
105 | (f: (K, Option[V], Option[V2]) => W): IndexedRDDPartition[K, W] = other match {
106 | case other: PARTPartition[K, V2] =>
107 | val newMap = new ArtTree
108 | // Scan `this` and probe `other`, adding all elements in `this`
109 | for (kv <- rawIterator) {
110 | val newV = f(
111 | kSer.fromBytes(kv._1),
112 | Some(kv._2),
113 | Option(other.map.search(kv._1).asInstanceOf[V2]))
114 | newMap.insert(kv._1, newV)
115 | }
116 | // Scan `other` and probe `this`, adding only the elements present in `other` but not `this`
117 | for (kv <- other.rawIterator) {
118 | if (this.map.search(kv._1) == null) {
119 | val newV = f(
120 | kSer.fromBytes(kv._1),
121 | None,
122 | Some(kv._2))
123 | newMap.insert(kv._1, newV)
124 | }
125 | }
126 | this.withMap[W](newMap)
127 |
128 | case _ =>
129 | fullOuterJoin(other.iterator)(f)
130 | }
131 |
132 | override def fullOuterJoin[V2: ClassTag, W: ClassTag]
133 | (other: Iterator[(K, V2)])
134 | (f: (K, Option[V], Option[V2]) => W): IndexedRDDPartition[K, W] =
135 | fullOuterJoin(PARTPartition(other))(f)
136 |
137 | override def join[U: ClassTag]
138 | (other: IndexedRDDPartition[K, U])
139 | (f: (K, V, U) => V): IndexedRDDPartition[K, V] = join(other.iterator)(f)
140 |
141 | override def join[U: ClassTag]
142 | (other: Iterator[(K, U)])
143 | (f: (K, V, U) => V): IndexedRDDPartition[K, V] = {
144 | val newMap = map.snapshot()
145 | for (ku <- other) {
146 | val kBytes = kSer.toBytes(ku._1)
147 | val oldV = newMap.search(kBytes).asInstanceOf[V]
148 | if (oldV != null) {
149 | val newV = f(ku._1, oldV, ku._2)
150 | newMap.insert(kBytes, newV)
151 | }
152 | }
153 | this.withMap[V](newMap)
154 | }
155 |
156 | override def leftJoin[V2: ClassTag, V3: ClassTag]
157 | (other: IndexedRDDPartition[K, V2])
158 | (f: (K, V, Option[V2]) => V3): IndexedRDDPartition[K, V3] = other match {
159 | case other: PARTPartition[K, V2] =>
160 | // Scan `this` and probe `other`
161 | val newMap = new ArtTree
162 | for (kv <- rawIterator) {
163 | val newV = f(kSer.fromBytes(kv._1), kv._2, Option(other.map.search(kv._1).asInstanceOf[V2]))
164 | newMap.insert(kv._1, newV)
165 | }
166 | this.withMap[V3](newMap)
167 |
168 | case _ =>
169 | leftJoin(other.iterator)(f)
170 | }
171 |
172 | override def leftJoin[V2: ClassTag, V3: ClassTag]
173 | (other: Iterator[(K, V2)])
174 | (f: (K, V, Option[V2]) => V3): IndexedRDDPartition[K, V3] =
175 | leftJoin(PARTPartition(other))(f)
176 |
177 | override def innerJoin[U: ClassTag, V2: ClassTag]
178 | (other: IndexedRDDPartition[K, U])
179 | (f: (K, V, U) => V2): IndexedRDDPartition[K, V2] = other match {
180 | case other: PARTPartition[K, U] =>
181 | // Scan `this` and probe `other`
182 | val newMap = new ArtTree
183 | for (kv <- rawIterator) {
184 | val otherV = other.map.search(kv._1).asInstanceOf[U]
185 | if (otherV != null) newMap.insert(kv._1, f(kSer.fromBytes(kv._1), kv._2, otherV))
186 | }
187 | this.withMap[V2](newMap)
188 |
189 | case _ =>
190 | innerJoin(other.iterator)(f)
191 | }
192 |
193 | override def innerJoin[U: ClassTag, V2: ClassTag]
194 | (other: Iterator[(K, U)])
195 | (f: (K, V, U) => V2): IndexedRDDPartition[K, V2] =
196 | innerJoin(PARTPartition(other))(f)
197 |
198 | override def createUsingIndex[V2: ClassTag](elems: Iterator[(K, V2)]): IndexedRDDPartition[K, V2] =
199 | PARTPartition(elems)
200 |
201 | override def aggregateUsingIndex[V2: ClassTag](
202 | elems: Iterator[(K, V2)], reduceFunc: (V2, V2) => V2): IndexedRDDPartition[K, V2] =
203 | PARTPartition[K, V2, V2](elems, (id, a) => a, (id, a, b) => reduceFunc(a, b))
204 |
205 | override def reindex(): IndexedRDDPartition[K, V] = this
206 | }
207 |
208 | private[indexedrdd] object PARTPartition {
209 | def apply[K: ClassTag, V: ClassTag]
210 | (iter: Iterator[(K, V)])(implicit kSer: KeySerializer[K]) =
211 | apply[K, V, V](iter, (id, a) => a, (id, a, b) => b)
212 |
213 | def apply[K: ClassTag, U: ClassTag, V: ClassTag]
214 | (iter: Iterator[(K, U)], z: (K, U) => V, f: (K, V, U) => V)
215 | (implicit kSer: KeySerializer[K]): PARTPartition[K, V] = {
216 | val map = new ArtTree
217 | iter.foreach { ku =>
218 | val kBytes = kSer.toBytes(ku._1)
219 | val oldV = map.search(kBytes).asInstanceOf[V]
220 | val newV = if (oldV == null) z(ku._1, ku._2) else f(ku._1, oldV, ku._2)
221 | map.insert(kBytes, newV)
222 | }
223 | new PARTPartition(map)
224 | }
225 | }
226 |
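A small copy-on-write sketch (illustrative only, not part of the file; like the test suites it assumes the implicit KeySerializer[Long] from IndexedRDD's companion object, and `PARTPartitionExample` is a hypothetical name). Updates go through a snapshot of the ART, so the original partition is left untouched:

    package edu.berkeley.cs.amplab.spark.indexedrdd.impl

    import edu.berkeley.cs.amplab.spark.indexedrdd.IndexedRDD._

    object PARTPartitionExample {
      def main(args: Array[String]): Unit = {
        val base = PARTPartition(Iterator((0L, 1), (1L, 1), (2L, 1)))

        // multiput: new keys go through `z`, existing keys are merged with `f`
        val updated = base.multiput[Int](
          Iterator((1L, 10), (3L, 10)),
          (k, u) => u,          // key 3 is new: take the incoming value
          (k, v, u) => v + u)   // key 1 exists: combine old and new
        assert(updated(1L) == Some(11))
        assert(updated(3L) == Some(10))
        assert(base(3L) == None)     // the snapshot leaves `base` unchanged

        // delete also works on a snapshot of the tree
        val pruned = updated.delete(Iterator(0L))
        assert(pruned(0L) == None)
        assert(updated(0L) == Some(1))
      }
    }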
--------------------------------------------------------------------------------
/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | # Set everything to be logged to the file target/unit-tests.log
19 | log4j.rootCategory=INFO, file
20 | log4j.appender.file=org.apache.log4j.FileAppender
21 | log4j.appender.file.append=false
22 | log4j.appender.file.file=target/unit-tests.log
23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout
24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %p %c{1}: %m%n
25 |
26 | # Ignore messages below warning level from Jetty, because it's a bit verbose
27 | log4j.logger.org.eclipse.jetty=WARN
28 | org.eclipse.jetty.LEVEL=WARN
29 |
--------------------------------------------------------------------------------
/src/test/scala/edu/berkeley/cs/amplab/spark/indexedrdd/IndexedRDDSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package edu.berkeley.cs.amplab.spark.indexedrdd
19 |
20 | import scala.collection.immutable.LongMap
21 | import scala.reflect.ClassTag
22 | import org.apache.spark.HashPartitioner
23 |
24 | import org.apache.spark.SparkContext
25 | import org.apache.spark.rdd.RDD
26 | import org.scalatest.FunSuite
27 |
28 | abstract class IndexedRDDSuite extends FunSuite with SharedSparkContext {
29 |
30 | def create[V: ClassTag](elems: RDD[(Long, V)]): IndexedRDD[Long, V]
31 |
32 | def pairs(sc: SparkContext, n: Int) = {
33 | create(sc.parallelize((0 to n).map(x => (x.toLong, x)), 5))
34 | }
35 |
36 | test("get, multiget") {
37 | val n = 100
38 | val ps = pairs(sc, n).cache()
39 | assert(ps.multiget(Array(-1L, 0L, 1L, 98L)) === LongMap(0L -> 0, 1L -> 1, 98L -> 98))
40 | assert(ps.get(-1L) === None)
41 | assert(ps.get(97L) === Some(97))
42 | val evens = ps.filter(q => ((q._2 % 2) == 0)).cache()
43 | assert(evens.multiget(Array(-1L, 0L, 1L, 98L)) === LongMap(0L -> 0, 98L -> 98))
44 | assert(evens.get(97L) === None)
45 | }
46 |
47 | test("filter") {
48 | val n = 100
49 | val ps = pairs(sc, n)
50 | val evens = ps.filter(q => ((q._2 % 2) == 0))
51 | assert(evens.count === (0 to n).filter(_ % 2 == 0).size)
52 | }
53 |
54 | test("mapValues") {
55 | val n = 100
56 | val ps = pairs(sc, n)
57 |     val negatives = ps.mapValues(x => -x).cache()
58 | assert(negatives.count === n + 1)
59 | }
60 |
61 | test("diff") {
62 | val n = 100
63 | val ps = pairs(sc, n).cache()
64 | val flipEvens = ps.mapValues(x => if (x % 2 == 0) -x else x).cache()
65 | // diff should keep only the changed values
66 | assert(ps.diff(flipEvens).map(_._2).collect().toSet === (2 to n by 2).toSet)
67 | }
68 |
69 | test("diff with pair RDD") {
70 | val n = 100
71 | val ps = pairs(sc, n).cache()
72 | val flipEvens: RDD[(Long, Int)] =
73 | sc.parallelize(0L to 100L)
74 | .map(id => if (id % 2 == 0) (id, -id.toInt) else (id, id.toInt)).cache()
75 | // diff should keep only the changed values
76 | assert(ps.diff(flipEvens).map(_._2).collect().toSet === (2 to n by 2).toSet)
77 | }
78 |
79 | test("diff with non-equal number of partitions") {
80 | val a = create(sc.parallelize(0 until 24, 3).map(i => (i.toLong, 0)))
81 | val b = create(sc.parallelize(8 until 16, 2).map(i => (i.toLong, 1)))
82 | assert(a.partitions.size != b.partitions.size)
83 | val c = b.diff(a)
84 | assert(c.map(_._1).collect.toSet === (8 until 16).toSet)
85 | }
86 |
87 | test("fullOuterJoin") {
88 | Seq(true, false).foreach { maybeLazy =>
89 | val n = 200
90 | val bStart = 50
91 | val aEnd = 100
92 | val common = create(sc.parallelize((0 until n).map(x => (x.toLong, x)), 5)).cache()
93 | val a = common.filter(kv => kv._1 < aEnd).cache()
94 | val b = common.filter(kv => kv._1 >= bStart).cache()
95 | val sum = a.fullOuterJoin(b, maybeLazy) { (id, aOpt, bOpt) => aOpt.getOrElse(0) + bOpt.getOrElse(0) }
96 | val expected = ((0 until bStart).map(x => (x.toLong, x)) ++
97 | (bStart until aEnd).map(x => (x.toLong, x * 2)) ++
98 | (aEnd until n).map(x => (x.toLong, x))).toSet
99 |
100 | // fullOuterJoin with another IndexedRDD with the same index
101 | assert(sum.collect.toSet === expected)
102 |
103 | // fullOuterJoin with another IndexedRDD with a different index
104 | val b2 = create(b.map(identity))
105 | val sum2 = a.fullOuterJoin(b2, maybeLazy) { (id, aOpt, bOpt) => aOpt.getOrElse(0) + bOpt.getOrElse(0) }
106 | assert(sum2.collect.toSet === expected)
107 | }
108 | }
109 |
110 | test("leftJoin") {
111 | val n = 100
112 | val ps = pairs(sc, n).cache()
113 | val evens = ps.filter(q => ((q._2 % 2) == 0)).cache()
114 | // leftJoin with another IndexedRDD
115 | assert(ps.leftJoin(evens) { (id, a, bOpt) => a - bOpt.getOrElse(0) }.collect.toSet ===
116 | (0 to n by 2).map(x => (x.toLong, 0)).toSet ++ (1 to n by 2).map(x => (x.toLong, x)).toSet)
117 | // leftJoin with an RDD
118 | val evensRDD = evens.map(identity)
119 | assert(ps.leftJoin(evensRDD) { (id, a, bOpt) => a - bOpt.getOrElse(0) }.collect.toSet ===
120 | (0 to n by 2).map(x => (x.toLong, 0)).toSet ++ (1 to n by 2).map(x => (x.toLong, x)).toSet)
121 | }
122 |
123 | test("leftJoin vertices with non-equal number of partitions") {
124 | val a = create(sc.parallelize(0 until 100, 2).map(i => (i.toLong, 1)))
125 | val b = create(
126 | a.filter(v => v._1 % 2 == 0).partitionBy(new HashPartitioner(3)))
127 | assert(a.partitions.size != b.partitions.size)
128 | val c = a.leftJoin(b) { (vid, old, newOpt) =>
129 | old - newOpt.getOrElse(0)
130 | }
131 | assert(c.filter(v => v._2 != 0).map(_._1).collect.toSet == (1 to 99 by 2).toSet)
132 | }
133 |
134 | test("join") {
135 | val n = 100
136 | val ps = pairs(sc, n).cache()
137 | val evens = ps.filter(q => ((q._2 % 2) == 0)).cache()
138 | // join with another IndexedRDD
139 | assert(ps.join(evens) { (id, a, b) => a - b }.collect.toSet ===
140 | (0 to n by 2).map(x => (x.toLong, 0)).toSet ++ (1 to n by 2).map(x => (x.toLong, x)).toSet)
141 | // join with an RDD
142 | val evensRDD = evens.map(identity)
143 | assert(ps.join(evensRDD) { (id, a, b) => a - b }.collect.toSet ===
144 | (0 to n by 2).map(x => (x.toLong, 0)).toSet ++ (1 to n by 2).map(x => (x.toLong, x)).toSet)
145 | }
146 |
147 | test("innerJoin") {
148 | val n = 100
149 | val ps = pairs(sc, n).cache()
150 | val evens = ps.filter(q => ((q._2 % 2) == 0)).cache()
151 | // innerJoin with another IndexedRDD
152 | assert(ps.innerJoin(evens) { (id, a, b) => a - b }.collect.toSet ===
153 | (0 to n by 2).map(x => (x.toLong, 0)).toSet)
154 | // innerJoin with an RDD
155 | val evensRDD = evens.map(identity)
156 | assert(ps.innerJoin(evensRDD) { (id, a, b) => a - b }.collect.toSet ===
157 | (0 to n by 2).map(x => (x.toLong, 0)).toSet)
158 | }
159 |
160 | test("innerJoin with non-equal number of partitions") {
161 | val a = create(sc.parallelize(0 until 100, 2).map(i => (i.toLong, 1)))
162 | val b = create(
163 | a.filter(v => v._1 % 2 == 0).partitionBy(new HashPartitioner(3)))
164 | assert(a.partitions.size != b.partitions.size)
165 | val c = a.innerJoin(b) { (vid, old, newVal) =>
166 | old - newVal
167 | }
168 | assert(c.filter(v => v._2 == 0).map(_._1).collect.toSet == (0 to 98 by 2).toSet)
169 | }
170 |
171 | test("aggregateUsingIndex") {
172 | val n = 100
173 | val ps = pairs(sc, n)
174 | val messageTargets = (0 to n) ++ (0 to n by 2)
175 | val messages = sc.parallelize(messageTargets.map(x => (x.toLong, 1)))
176 | assert(ps.aggregateUsingIndex[Int](messages, _ + _).collect.toSet ===
177 | (0 to n).map(x => (x.toLong, if (x % 2 == 0) 2 else 1)).toSet)
178 |
179 | val messagesWithNew = List((0L, 1), (-1L, 1))
180 | assert(ps.aggregateUsingIndex[Int](sc.parallelize(messagesWithNew), _ + _).collect.toSet ===
181 | messagesWithNew.toSet)
182 | }
183 | }
184 |
185 | class UpdatableIndexedRDDSuite extends IndexedRDDSuite {
186 | override def create[V: ClassTag](elems: RDD[(Long, V)]): IndexedRDD[Long, V] = {
187 | import IndexedRDD._
188 | IndexedRDD.updatable(elems)
189 | }
190 |
191 | test("put, multiput") {
192 | val n = 100
193 | val ps = pairs(sc, n).cache()
194 | assert(ps.multiput[Int](Map(0L -> 1, 1L -> 1), (id, a) => a, SumFunction).collect.toSet ===
195 | Set(0L -> 1, 1L -> 2) ++ (2 to n).map(x => (x.toLong, x)).toSet)
196 | assert(ps.multiput[Int](Map(-1L -> -1, 0L -> 1), (id, a) => a, SumFunction).collect.toSet ===
197 | Set(-1L -> -1, 0L -> 1) ++ (1 to n).map(x => (x.toLong, x)).toSet)
198 | assert(ps.multiput(Map(-1L -> -1, 0L -> 1, 1L -> 1)).collect.toSet ===
199 | Set(-1L -> -1, 0L -> 1, 1L -> 1) ++ (2 to n).map(x => (x.toLong, x)).toSet)
200 | assert(ps.multiputRDD[Int](sc.parallelize(Seq(0L -> 1, 1L -> 1)), (id, a) => a, SumFunction).collect.toSet ===
201 | Set(0L -> 1, 1L -> 2) ++ (2 to n).map(x => (x.toLong, x)).toSet)
202 | assert(ps.multiputRDD[Int](sc.parallelize(Seq(-1L -> -1, 0L -> 1)), (id, a) => a, SumFunction).collect.toSet ===
203 | Set(-1L -> -1, 0L -> 1) ++ (1 to n).map(x => (x.toLong, x)).toSet)
204 | assert(ps.multiputRDD(sc.parallelize(Seq(-1L -> -1, 0L -> 1, 1L -> 1))).collect.toSet ===
205 | Set(-1L -> -1, 0L -> 1, 1L -> 1) ++ (2 to n).map(x => (x.toLong, x)).toSet)
206 | assert(ps.put(-1L, -1).collect.toSet ===
207 | Set(-1L -> -1) ++ (0 to n).map(x => (x.toLong, x)).toSet)
208 | assert(ps.put(0L, 1).collect.toSet ===
209 | Set(0L -> 1) ++ (1 to n).map(x => (x.toLong, x)).toSet)
210 | }
211 |
212 | test("delete") {
213 | val n = 100
214 | val ps = pairs(sc, n).cache()
215 | assert(ps.delete(Array(0L)).collect.toSet === (1 to n).map(x => (x.toLong, x)).toSet)
216 | assert(ps.delete(Array(-1L)).collect.toSet === (0 to n).map(x => (x.toLong, x)).toSet)
217 | }
218 | }
219 |
220 | // Declared outside of test suite to avoid closure capture
221 | private object SumFunction extends Function3[Long, Int, Int, Int] with Serializable {
222 | def apply(id: Long, a: Int, b: Int) = a + b
223 | }
224 |
--------------------------------------------------------------------------------
/src/test/scala/edu/berkeley/cs/amplab/spark/indexedrdd/KeySerializerSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package edu.berkeley.cs.amplab.spark.indexedrdd
19 |
20 | import java.util.UUID
21 |
22 | import org.scalacheck.Arbitrary
23 | import org.scalacheck.Gen
24 | import org.scalatest.FunSuite
25 | import org.scalatest.Matchers
26 | import org.scalatest.prop.GeneratorDrivenPropertyChecks
27 |
28 | class KeySerializerSuite extends FunSuite with GeneratorDrivenPropertyChecks with Matchers {
29 |
30 | test("long") {
31 | val ser = new LongSerializer
32 | forAll { (a: Long) =>
33 | ser.fromBytes(ser.toBytes(a)) should be === a
34 | }
35 | }
36 |
37 | test("string") {
38 | val ser = new StringSerializer
39 |
40 | forAll { (a: String) =>
41 | ser.fromBytes(ser.toBytes(a)) should be === a
42 | }
43 |
44 | forAll { (a: String, b: String) =>
45 | whenever (a != b) {
46 | val aSer = ser.toBytes(a)
47 | val bSer = ser.toBytes(b)
48 | assert(!aSer.startsWith(bSer))
49 | assert(!bSer.startsWith(aSer))
50 | }
51 | }
52 | }
53 |
54 | test("short") {
55 | val ser = new ShortSerializer
56 | forAll { (a: Short) =>
57 | ser.fromBytes(ser.toBytes(a)) should be === a
58 | }
59 | }
60 |
61 | test("int") {
62 | val ser = new IntSerializer
63 | forAll { (a: Int) =>
64 | ser.fromBytes(ser.toBytes(a)) should be === a
65 | }
66 | }
67 |
68 | implicit val arbUUID: Arbitrary[UUID] = Arbitrary(Gen.uuid)
69 |
70 | test("UUID") {
71 | val ser = new UUIDSerializer
72 | forAll { (a: UUID) =>
73 | ser.fromBytes(ser.toBytes(a)) should be === a
74 | }
75 | }
76 |
77 | test("bigint") {
78 | val ser = new BigIntSerializer
79 |
80 | forAll { (a: BigInt) =>
81 | ser.fromBytes(ser.toBytes(a)) should be === a
82 | }
83 |
84 | forAll { (a: BigInt, b: BigInt) =>
85 | whenever (a != b) {
86 | val aSer = ser.toBytes(a)
87 | val bSer = ser.toBytes(b)
88 | assert(!aSer.startsWith(bSer))
89 | assert(!bSer.startsWith(aSer))
90 | }
91 | }
92 | }
93 |
94 | def tuple2Test[A: Arbitrary, B: Arbitrary](
95 | aSer: KeySerializer[A], bSer: KeySerializer[B]): Unit = {
96 | val ser = new Tuple2Serializer[A, B]()(aSer, bSer)
97 |
98 | forAll { (a: A, b: B) =>
99 | ser.fromBytes(ser.toBytes(Tuple2(a, b))) should be === (a, b)
100 | }
101 |
102 | forAll { (a: (A, B), b: (A, B)) =>
103 | whenever (a != b) {
104 | val aSer = ser.toBytes(a)
105 | val bSer = ser.toBytes(b)
106 | assert(!aSer.startsWith(bSer))
107 | assert(!bSer.startsWith(aSer))
108 | }
109 | }
110 | }
111 |
112 | test("Tuple2") {
113 | val stringSer = new StringSerializer
114 | val longSer = new LongSerializer
115 | val intSer = new IntSerializer
116 | val shortSer = new ShortSerializer
117 | val bigintSer = new BigIntSerializer
118 | val uuidSer = new UUIDSerializer
119 |
120 | tuple2Test[Long, Long](longSer, longSer)
121 | tuple2Test[String, Long](stringSer, longSer)
122 | tuple2Test[Long, String](longSer, stringSer)
123 | tuple2Test[String, String](stringSer, stringSer)
124 | tuple2Test[Short, Short](shortSer, shortSer)
125 | tuple2Test[Short, Int](shortSer, intSer)
126 | tuple2Test[Int, Int](intSer, intSer)
127 | tuple2Test[Int, BigInt](intSer, bigintSer)
128 | tuple2Test[BigInt, BigInt](bigintSer, bigintSer)
129 | tuple2Test[Int, UUID](intSer, uuidSer)
130 | tuple2Test[UUID, UUID](uuidSer, uuidSer)
131 | }
132 | }
133 |
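The same property-based pattern extends to user-defined key types. A hypothetical sketch (not part of the file) of a one-byte serializer and its round-trip test, matching the toBytes/fromBytes contract exercised above:

    package edu.berkeley.cs.amplab.spark.indexedrdd

    // Hypothetical serializer for single-byte keys; shown only to illustrate
    // the KeySerializer contract that the suite above checks
    class ByteKeySerializer extends KeySerializer[Byte] {
      override def toBytes(k: Byte) = Array(k)
      override def fromBytes(b: Array[Byte]): Byte = b(0)
    }

and, inside the suite:

    test("byte") {
      val ser = new ByteKeySerializer
      forAll { (a: Byte) =>
        ser.fromBytes(ser.toBytes(a)) should be === a
      }
    }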
--------------------------------------------------------------------------------
/src/test/scala/edu/berkeley/cs/amplab/spark/indexedrdd/SharedSparkContext.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package edu.berkeley.cs.amplab.spark.indexedrdd
19 |
20 | import org.apache.spark.SparkConf
21 | import org.apache.spark.SparkContext
22 | import org.scalatest.BeforeAndAfterAll
23 | import org.scalatest.Suite
24 |
25 | /** Shares a local `SparkContext` between all tests in a suite and closes it at the end */
26 | trait SharedSparkContext extends BeforeAndAfterAll { self: Suite =>
27 |
28 | @transient private var _sc: SparkContext = _
29 |
30 | def sc: SparkContext = _sc
31 |
32 | var conf = new SparkConf(false)
33 |
34 | override def beforeAll() {
35 | _sc = new SparkContext("local", "test", conf)
36 | super.beforeAll()
37 | }
38 |
39 | override def afterAll() {
40 | if (_sc != null) {
41 | _sc.stop()
42 | }
43 | // To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown
44 | System.clearProperty("spark.driver.port")
45 | _sc = null
46 | super.afterAll()
47 | }
48 | }
49 |
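A minimal sketch of a suite that mixes in the trait (illustrative only, not part of the file; `SharedContextExampleSuite` is a hypothetical name). The shared `sc` is created once in beforeAll and stopped in afterAll, so individual tests simply use it:

    package edu.berkeley.cs.amplab.spark.indexedrdd

    import org.scalatest.FunSuite

    class SharedContextExampleSuite extends FunSuite with SharedSparkContext {
      test("reuses the suite-wide SparkContext") {
        assert(sc.parallelize(1 to 10).filter(_ % 2 == 0).count() === 5L)
      }
    }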
--------------------------------------------------------------------------------
/src/test/scala/edu/berkeley/cs/amplab/spark/indexedrdd/impl/IndexedRDDPartitionSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package edu.berkeley.cs.amplab.spark.indexedrdd.impl
19 |
20 | import scala.reflect.ClassTag
21 | import edu.berkeley.cs.amplab.spark.indexedrdd._
22 |
23 | import org.apache.spark.SparkConf
24 | import org.apache.spark.serializer.JavaSerializer
25 | import org.apache.spark.serializer.KryoSerializer
26 | import org.scalatest.FunSuite
27 |
28 | abstract class IndexedRDDPartitionSuite extends FunSuite {
29 |
30 | def create[V: ClassTag](iter: Iterator[(Long, V)]): IndexedRDDPartition[Long, V]
31 |
32 | test("serialization") {
33 | val elems = Set((0L, 1), (1L, 1), (2L, 1))
34 | val vp = create(elems.iterator)
35 | val javaSer = new JavaSerializer(new SparkConf())
36 | val kryoSer = new KryoSerializer(new SparkConf()
37 | .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"))
38 |
39 | for (ser <- List(javaSer, kryoSer); s = ser.newInstance()) {
40 | val vpSer: IndexedRDDPartition[Long, Int] = s.deserialize(s.serialize(vp))
41 | assert(vpSer.iterator.toSet === elems)
42 | }
43 | }
44 |
45 | test("get") {
46 | val elems = Set((0L, 1), (1L, 1), (2L, 1))
47 | val vp = create(elems.iterator)
48 | assert(vp(0L) == Some(1))
49 | assert(vp(1L) == Some(1))
50 | assert(vp(2L) == Some(1))
51 | assert(vp(3L) == None)
52 |
53 | assert(vp.multiget(Array(1L, 2L, 3L)).size == 2)
54 | }
55 | }
56 |
57 | class PARTPartitionSuite extends IndexedRDDPartitionSuite {
58 | override def create[V: ClassTag](iter: Iterator[(Long, V)]) = {
59 | import IndexedRDD._
60 | PARTPartition(iter)
61 | }
62 | }
63 |
64 | class LazyPartitionSuite extends IndexedRDDPartitionSuite {
65 | override def create[V: ClassTag](iter: Iterator[(Long, V)]) = {
66 | import IndexedRDD._
67 | val it = iter.toSeq
68 | new LazyPartition(
69 | Seq(PARTPartition(it.iterator), PARTPartition(it.iterator)),
70 | (id, a, b) => (a ++ b).headOption.getOrElse(null.asInstanceOf[V]))
71 | }
72 | }
73 |
--------------------------------------------------------------------------------