├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── build.sbt ├── project ├── build.properties └── plugins.sbt ├── sbt ├── sbt └── sbt-launch-lib.bash └── src ├── main └── scala │ └── edu │ └── berkeley │ └── cs │ └── amplab │ └── spark │ └── indexedrdd │ ├── IndexedRDD.scala │ ├── IndexedRDDPartition.scala │ ├── KeySerializer.scala │ └── impl │ ├── LazyPartition.scala │ └── PARTPartition.scala └── test ├── resources └── log4j.properties └── scala └── edu └── berkeley └── cs └── amplab └── spark └── indexedrdd ├── IndexedRDDSuite.scala ├── KeySerializerSuite.scala ├── SharedSparkContext.scala └── impl └── IndexedRDDPartitionSuite.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | sbt/sbt-launch*.jar 5 | .idea/ 6 | .idea_modules/ 7 | .cache 8 | .history 9 | .lib/ 10 | dist/* 11 | target/ 12 | lib_managed/ 13 | src_managed/ 14 | project/boot/ 15 | project/plugins/project/ 16 | 17 | .scala_dependencies 18 | .worksheet 19 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.11.6 4 | - 2.10.6 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. 
For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # IndexedRDD for Apache Spark 2 | 3 | An efficient updatable key-value store for [Apache Spark](http://spark.apache.org). 4 | 5 | IndexedRDD extends `RDD[(K, V)]` by enforcing key uniqueness and pre-indexing the entries for efficient joins and point lookups, updates, and deletions. It is implemented by (1) hash-partitioning the entries by key, (2) maintaining a radix tree ([PART](https://github.com/ankurdave/part)) index within each partition, and (3) using this immutable and efficiently updatable data structure to enable efficient modifications and deletions. 6 | 7 | ## Usage 8 | 9 | Add the dependency to your SBT project by adding the following to `build.sbt` (see the [Spark Packages listing](http://spark-packages.org/package/amplab/spark-indexedrdd) for spark-submit and Maven instructions): 10 | 11 | ```scala 12 | resolvers += "Spark Packages Repo" at "http://dl.bintray.com/spark-packages/maven" 13 | 14 | libraryDependencies += "amplab" % "spark-indexedrdd" % "0.3" 15 | ``` 16 | 17 | Then use IndexedRDD as follows: 18 | 19 | ```scala 20 | import edu.berkeley.cs.amplab.spark.indexedrdd.IndexedRDD 21 | import edu.berkeley.cs.amplab.spark.indexedrdd.IndexedRDD._ 22 | 23 | // Create an RDD of key-value pairs with Long keys. 24 | val rdd = sc.parallelize((1 to 1000000).map(x => (x.toLong, 0))) 25 | // Construct an IndexedRDD from the pairs, hash-partitioning and indexing 26 | // the entries. 27 | val indexed = IndexedRDD(rdd).cache() 28 | 29 | // Perform a point update. 30 | val indexed2 = indexed.put(1234L, 10873).cache() 31 | // Perform a point lookup. Note that the original IndexedRDD remains 32 | // unmodified. 
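// (get returns an Option: a present key yields Some(value); a missing key yields None.)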
33 | indexed2.get(1234L) // => Some(10873) 34 | indexed.get(1234L) // => Some(0) 35 | 36 | // Efficiently join derived IndexedRDD with original. 37 | val indexed3 = indexed.innerJoin(indexed2) { (id, a, b) => b }.filter(_._2 != 0) 38 | indexed3.collect // => Array((1234L, 10873)) 39 | 40 | // Perform insertions and deletions. 41 | val indexed4 = indexed2.put(-100L, 111).delete(Array(998L, 999L)).cache() 42 | indexed2.get(-100L) // => None 43 | indexed4.get(-100L) // => Some(111) 44 | indexed2.get(999L) // => Some(0) 45 | indexed4.get(999L) // => None 46 | ``` 47 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "spark-indexedrdd" 2 | version := "0.4.0" 3 | organization := "edu.berkeley.cs.amplab" 4 | 5 | scalaVersion := "2.11.8" 6 | crossScalaVersions := Seq("2.10.6", "2.11.6") 7 | 8 | spName := "amplab/spark-indexedrdd" 9 | sparkVersion := "2.1.0" 10 | sparkComponents += "core" 11 | 12 | resolvers += "Repo at github.com/ankurdave/maven-repo" at "https://raw.githubusercontent.com/ankurdave/maven-repo/master" 13 | 14 | libraryDependencies ++= Seq( 15 | "com.ankurdave" % "part_2.10" % "0.1", // artifact is not published for 2.11, but it only contains Java code anyway 16 | "org.scalatest" %% "scalatest" % "2.2.4" % "test", 17 | "org.scalacheck" %% "scalacheck" % "1.12.2" % "test" 18 | ) 19 | 20 | publishMavenStyle := true 21 | 22 | licenses += "Apache-2.0" -> url("http://www.apache.org/licenses/LICENSE-2.0.html") 23 | 24 | pomExtra := 25 | https://github.com/amplab/spark-indexedrdd 26 | 27 | git@github.com:amplab/spark-indexedrdd.git 28 | scm:git:git@github.com:amplab/spark-indexedrdd.git 29 | 30 | 31 | 32 | ankurdave 33 | Ankur Dave 34 | https://github.com/ankurdave 35 | 36 | 37 | 38 | 39 | // Run tests with more memory 40 | javaOptions in test += "-Xmx2G" 41 | 42 | fork in test := true 43 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
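# Note: the bundled sbt launcher script reads sbt.version below to download the matching sbt-launch.jar.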
16 | # 17 | sbt.version=0.13.13 18 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | scalaVersion := "2.10.4" 2 | 3 | // resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns) 4 | 5 | // resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/" 6 | 7 | // resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/" 8 | 9 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0") 10 | 11 | resolvers += "Spark Package Main Repo" at "https://dl.bintray.com/spark-packages/maven" 12 | 13 | addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.2") 14 | -------------------------------------------------------------------------------- /sbt/sbt: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | realpath () { 4 | ( 5 | TARGET_FILE="$1" 6 | 7 | cd "$(dirname "$TARGET_FILE")" 8 | TARGET_FILE="$(basename "$TARGET_FILE")" 9 | 10 | COUNT=0 11 | while [ -L "$TARGET_FILE" -a $COUNT -lt 100 ] 12 | do 13 | TARGET_FILE="$(readlink "$TARGET_FILE")" 14 | cd $(dirname "$TARGET_FILE") 15 | TARGET_FILE="$(basename $TARGET_FILE)" 16 | COUNT=$(($COUNT + 1)) 17 | done 18 | 19 | echo "$(pwd -P)/"$TARGET_FILE"" 20 | ) 21 | } 22 | 23 | . "$(dirname "$(realpath "$0")")"/sbt-launch-lib.bash 24 | 25 | 26 | declare -r noshare_opts="-Dsbt.global.base=project/.sbtboot -Dsbt.boot.directory=project/.boot -Dsbt.ivy.home=project/.ivy" 27 | declare -r sbt_opts_file=".sbtopts" 28 | declare -r etc_sbt_opts_file="/etc/sbt/sbtopts" 29 | 30 | usage() { 31 | cat < path to global settings/plugins directory (default: ~/.sbt) 40 | -sbt-boot path to shared boot directory (default: ~/.sbt/boot in 0.11 series) 41 | -ivy path to local Ivy repository (default: ~/.ivy2) 42 | -mem set memory options (default: $sbt_mem, which is $(get_mem_opts $sbt_mem)) 43 | -no-share use all local caches; no sharing 44 | -no-global uses global caches, but does not use global ~/.sbt directory. 45 | -jvm-debug Turn on JVM debugging, open at the given port. 46 | -batch Disable interactive mode 47 | 48 | # sbt version (default: from project/build.properties if present, else latest release) 49 | -sbt-version use the specified version of sbt 50 | -sbt-jar use the specified jar as the sbt launcher 51 | -sbt-rc use an RC version of sbt 52 | -sbt-snapshot use a snapshot version of sbt 53 | 54 | # java version (default: java from PATH, currently $(java -version 2>&1 | grep version)) 55 | -java-home alternate JAVA_HOME 56 | 57 | # jvm options and output control 58 | JAVA_OPTS environment variable, if unset uses "$java_opts" 59 | SBT_OPTS environment variable, if unset uses "$default_sbt_opts" 60 | .sbtopts if this file exists in the current directory, it is 61 | prepended to the runner args 62 | /etc/sbt/sbtopts if this file exists, it is prepended to the runner args 63 | -Dkey=val pass -Dkey=val directly to the java runtime 64 | -J-X pass option -X directly to the java runtime 65 | (-J is stripped) 66 | -S-X add -X to sbt's scalacOptions (-J is stripped) 67 | -PmavenProfiles Enable a maven profile for the build. 68 | 69 | In the case of duplicated or conflicting options, the order above 70 | shows precedence: JAVA_OPTS lowest, command line options highest. 
71 | EOM 72 | } 73 | 74 | process_my_args () { 75 | while [[ $# -gt 0 ]]; do 76 | case "$1" in 77 | -no-colors) addJava "-Dsbt.log.noformat=true" && shift ;; 78 | -no-share) addJava "$noshare_opts" && shift ;; 79 | -no-global) addJava "-Dsbt.global.base=$(pwd)/project/.sbtboot" && shift ;; 80 | -sbt-boot) require_arg path "$1" "$2" && addJava "-Dsbt.boot.directory=$2" && shift 2 ;; 81 | -sbt-dir) require_arg path "$1" "$2" && addJava "-Dsbt.global.base=$2" && shift 2 ;; 82 | -debug-inc) addJava "-Dxsbt.inc.debug=true" && shift ;; 83 | -batch) exec &2 "$@" 31 | } 32 | vlog () { 33 | [[ $verbose || $debug ]] && echoerr "$@" 34 | } 35 | dlog () { 36 | [[ $debug ]] && echoerr "$@" 37 | } 38 | 39 | acquire_sbt_jar () { 40 | SBT_VERSION=`awk -F "=" '/sbt\\.version/ {print $2}' ./project/build.properties` 41 | URL1=http://dl.bintray.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar 42 | URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar 43 | JAR=sbt/sbt-launch-${SBT_VERSION}.jar 44 | 45 | sbt_jar=$JAR 46 | 47 | if [[ ! -f "$sbt_jar" ]]; then 48 | # Download sbt launch jar if it hasn't been downloaded yet 49 | if [ ! -f "${JAR}" ]; then 50 | # Download 51 | printf "Attempting to fetch sbt\n" 52 | JAR_DL="${JAR}.part" 53 | if hash curl 2>/dev/null; then 54 | (curl -L --silent ${URL1} > "${JAR_DL}" || curl -L --silent ${URL2} > "${JAR_DL}") && mv "${JAR_DL}" "${JAR}" 55 | elif hash wget 2>/dev/null; then 56 | (wget --quiet ${URL1} -O "${JAR_DL}" || wget --quiet ${URL2} -O "${JAR_DL}") && mv "${JAR_DL}" "${JAR}" 57 | else 58 | printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n" 59 | exit -1 60 | fi 61 | fi 62 | if [ ! -f "${JAR}" ]; then 63 | # We failed to download 64 | printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n" 65 | exit -1 66 | fi 67 | printf "Launching sbt from ${JAR}\n" 68 | fi 69 | } 70 | 71 | execRunner () { 72 | # print the arguments one to a line, quoting any containing spaces 73 | [[ $verbose || $debug ]] && echo "# Executing command line:" && { 74 | for arg; do 75 | if printf "%s\n" "$arg" | grep -q ' '; then 76 | printf "\"%s\"\n" "$arg" 77 | else 78 | printf "%s\n" "$arg" 79 | fi 80 | done 81 | echo "" 82 | } 83 | 84 | exec "$@" 85 | } 86 | 87 | addJava () { 88 | dlog "[addJava] arg = '$1'" 89 | java_args=( "${java_args[@]}" "$1" ) 90 | } 91 | 92 | enableProfile () { 93 | dlog "[enableProfile] arg = '$1'" 94 | maven_profiles=( "${maven_profiles[@]}" "$1" ) 95 | export SBT_MAVEN_PROFILES="${maven_profiles[@]}" 96 | } 97 | 98 | addSbt () { 99 | dlog "[addSbt] arg = '$1'" 100 | sbt_commands=( "${sbt_commands[@]}" "$1" ) 101 | } 102 | addResidual () { 103 | dlog "[residual] arg = '$1'" 104 | residual_args=( "${residual_args[@]}" "$1" ) 105 | } 106 | addDebugger () { 107 | addJava "-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=$1" 108 | } 109 | 110 | # a ham-fisted attempt to move some memory settings in concert 111 | # so they need not be dicked around with individually. 
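# Illustrative expansion: with the default of 2048 MB, get_mem_opts yields
#   -Xms2048m -Xmx2048m -XX:MaxPermSize=512m -XX:ReservedCodeCacheSize=256m
# (perm is mem/4 clamped to the range 256-4096 MB; the code cache is half of perm).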
112 | get_mem_opts () { 113 | local mem=${1:-2048} 114 | local perm=$(( $mem / 4 )) 115 | (( $perm > 256 )) || perm=256 116 | (( $perm < 4096 )) || perm=4096 117 | local codecache=$(( $perm / 2 )) 118 | 119 | echo "-Xms${mem}m -Xmx${mem}m -XX:MaxPermSize=${perm}m -XX:ReservedCodeCacheSize=${codecache}m" 120 | } 121 | 122 | require_arg () { 123 | local type="$1" 124 | local opt="$2" 125 | local arg="$3" 126 | if [[ -z "$arg" ]] || [[ "${arg:0:1}" == "-" ]]; then 127 | die "$opt requires <$type> argument" 128 | fi 129 | } 130 | 131 | is_function_defined() { 132 | declare -f "$1" > /dev/null 133 | } 134 | 135 | process_args () { 136 | while [[ $# -gt 0 ]]; do 137 | case "$1" in 138 | -h|-help) usage; exit 1 ;; 139 | -v|-verbose) verbose=1 && shift ;; 140 | -d|-debug) debug=1 && shift ;; 141 | 142 | -ivy) require_arg path "$1" "$2" && addJava "-Dsbt.ivy.home=$2" && shift 2 ;; 143 | -mem) require_arg integer "$1" "$2" && sbt_mem="$2" && shift 2 ;; 144 | -jvm-debug) require_arg port "$1" "$2" && addDebugger $2 && shift 2 ;; 145 | -batch) exec partitioner.get.getPartition(k)) 81 | val partitions = ksByPartition.keys.toSeq 82 | // TODO: avoid sending all keys to all partitions by creating and zipping an RDD of keys 83 | val results: Array[Array[(K, V)]] = context.runJob(partitionsRDD, 84 | (context: TaskContext, partIter: Iterator[IndexedRDDPartition[K, V]]) => { 85 | if (partIter.hasNext && ksByPartition.contains(context.partitionId)) { 86 | val part = partIter.next() 87 | val ksForPartition = ksByPartition.get(context.partitionId).get 88 | part.multiget(ksForPartition).toArray 89 | } else { 90 | Array.empty[(K, V)] 91 | } 92 | }, partitions) 93 | results.flatten.toMap 94 | } 95 | 96 | /** 97 | * Unconditionally updates the specified key to have the specified value. Returns a new IndexedRDD 98 | * that reflects the modification. 99 | * 100 | * Some implementations may not support this operation and will throw 101 | * `UnsupportedOperationException`. 102 | */ 103 | def put(k: K, v: V): IndexedRDD[K, V] = multiput(Map(k -> v)) 104 | 105 | /** 106 | * Unconditionally updates the keys in `kvs` to their corresponding values. Returns a new 107 | * IndexedRDD that reflects the modification. 108 | * 109 | * Some implementations may not support this operation and will throw 110 | * `UnsupportedOperationException`. 111 | */ 112 | def multiput(kvs: Map[K, V]): IndexedRDD[K, V] = 113 | multiput(kvs, (id: K, a: V) => a, (id: K, a: V, b: V) => b) 114 | 115 | /** 116 | * Unconditionally updates the keys in `kvs` to their corresponding values. Returns a new 117 | * IndexedRDD that reflects the modification. 118 | * 119 | * Some implementations may not support this operation and will throw 120 | * `UnsupportedOperationException`. 121 | */ 122 | def multiputRDD(kvs: RDD[(K, V)]): IndexedRDD[K, V] = 123 | multiputRDD(kvs, (id: K, a: V) => a, (id: K, a: V, b: V) => b) 124 | 125 | /** 126 | * Updates the keys in `kvs` to their corresponding values, running `merge` on old and new values 127 | * if necessary. Returns a new IndexedRDD that reflects the modification. 128 | * 129 | * Some implementations may not support this operation and will throw 130 | * `UnsupportedOperationException`. 131 | */ 132 | def multiput(kvs: Map[K, V], merge: (K, V, V) => V): IndexedRDD[K, V] = 133 | multiput(kvs, (id: K, a: V) => a, merge) 134 | 135 | /** 136 | * Updates the keys in `kvs` to their corresponding values, running `merge` on old and new values 137 | * if necessary. Returns a new IndexedRDD that reflects the modification. 
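   * For example, `indexed.multiput(Map(1L -> 5), (k, oldV, newV) => oldV + newV)` adds 5 to the
   * existing value of key 1, or inserts 5 if the key is absent.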
138 | * 139 | * Some implementations may not support this operation and will throw 140 | * `UnsupportedOperationException`. 141 | */ 142 | def multiputRDD(kvs: RDD[(K, V)], merge: (K, V, V) => V): IndexedRDD[K, V] = 143 | multiputRDD(kvs, (id: K, a: V) => a, merge) 144 | 145 | /** 146 | * Updates the keys in `kvs` to their corresponding values, running `merge` on old and new values 147 | * if necessary. Returns a new IndexedRDD that reflects the modification. 148 | * 149 | * Some implementations may not support this operation and will throw 150 | * `UnsupportedOperationException`. 151 | */ 152 | def multiput[U: ClassTag](kvs: Map[K, U], project: (K, U) => V, merge: (K, V, U) => V): IndexedRDD[K, V] = 153 | multiputRDD(context.parallelize(kvs.toSeq), project, merge) 154 | 155 | /** 156 | * Updates the keys in `kvs` to their corresponding values, running `merge` on old and new values 157 | * if necessary. Returns a new IndexedRDD that reflects the modification. 158 | * 159 | * Some implementations may not support this operation and will throw 160 | * `UnsupportedOperationException`. 161 | */ 162 | def multiputRDD[U: ClassTag](updates: RDD[(K, U)], project: (K, U) => V, merge: (K, V, U) => V): IndexedRDD[K, V] = { 163 | zipPartitionsWithOther(updates.partitionBy(partitioner.get))(new MultiputZipper(project, merge)) 164 | } 165 | 166 | /** 167 | * Deletes the specified keys. Returns a new IndexedRDD that reflects the deletions. 168 | * 169 | * Some implementations may not support this operation and will throw 170 | * `UnsupportedOperationException`. 171 | */ 172 | def delete(ks: Array[K]): IndexedRDD[K, V] = { 173 | val deletions = context.parallelize(ks.map(k => (k, ()))).partitionBy(partitioner.get) 174 | zipPartitionsWithOther(deletions)(new DeleteZipper) 175 | } 176 | 177 | /** Applies a function to each partition of this IndexedRDD. */ 178 | private def mapIndexedRDDPartitions[K2: ClassTag, V2: ClassTag]( 179 | f: IndexedRDDPartition[K, V] => IndexedRDDPartition[K2, V2]): IndexedRDD[K2, V2] = { 180 | val newPartitionsRDD = partitionsRDD.mapPartitions(_.map(f), preservesPartitioning = true) 181 | new IndexedRDD(newPartitionsRDD) 182 | } 183 | 184 | /** Applies a function to corresponding partitions of `this` and another IndexedRDD. */ 185 | private def zipIndexedRDDPartitions[V2: ClassTag, V3: ClassTag](other: IndexedRDD[K, V2]) 186 | (f: ZipPartitionsFunction[V2, V3]): IndexedRDD[K, V3] = { 187 | assert(partitioner == other.partitioner) 188 | val newPartitionsRDD = partitionsRDD.zipPartitions(other.partitionsRDD, true)(f) 189 | new IndexedRDD(newPartitionsRDD) 190 | } 191 | 192 | /** Applies a function to corresponding partitions of `this` and a pair RDD. */ 193 | private def zipPartitionsWithOther[V2: ClassTag, V3: ClassTag](other: RDD[(K, V2)]) 194 | (f: OtherZipPartitionsFunction[V2, V3]): IndexedRDD[K, V3] = { 195 | val partitioned = other.partitionBy(partitioner.get) 196 | val newPartitionsRDD = partitionsRDD.zipPartitions(partitioned, true)(f) 197 | new IndexedRDD(newPartitionsRDD) 198 | } 199 | 200 | /** 201 | * Restricts the entries to those satisfying the given predicate. This operation preserves the 202 | * index for efficient joins with the original IndexedRDD and is implemented using soft deletions. 
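   * For example, `indexed.filter { case (k, v) => v != 0 }` drops zero-valued entries while keeping
   * the index intact for later joins.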
203 | * 204 | * @param pred the user defined predicate, which takes a tuple to conform to the `RDD[(K, V)]` 205 | * interface 206 | */ 207 | override def filter(pred: Tuple2[K, V] => Boolean): IndexedRDD[K, V] = 208 | this.mapIndexedRDDPartitions(_.filter(Function.untupled(pred))) 209 | 210 | /** Maps each value, preserving the index. */ 211 | def mapValues[V2: ClassTag](f: V => V2): IndexedRDD[K, V2] = 212 | this.mapIndexedRDDPartitions(_.mapValues((vid, attr) => f(attr))) 213 | 214 | /** Maps each value, supplying the corresponding key and preserving the index. */ 215 | def mapValues[V2: ClassTag](f: (K, V) => V2): IndexedRDD[K, V2] = 216 | this.mapIndexedRDDPartitions(_.mapValues(f)) 217 | 218 | /** 219 | * Intersects `this` and `other` and keeps only elements with differing values. For these 220 | * elements, keeps the values from `this`. 221 | */ 222 | def diff(other: RDD[(K, V)]): IndexedRDD[K, V] = other match { 223 | case other: IndexedRDD[K, V] if partitioner == other.partitioner => 224 | this.zipIndexedRDDPartitions(other)(new DiffZipper) 225 | case _ => 226 | this.zipPartitionsWithOther(other)(new OtherDiffZipper) 227 | } 228 | 229 | /** 230 | * Joins `this` with `other`, running `f` on the values of all keys in both sets. Note that for 231 | * efficiency `other` must be an IndexedRDD, not just a pair RDD. Use [[aggregateUsingIndex]] to 232 | * construct an IndexedRDD co-partitioned with `this`. 233 | * 234 | * @param maybeLazy if true, a joined "view" of the input RDDs (that preserves the underlying 235 | * indices) may be returned 236 | */ 237 | def fullOuterJoin[V2: ClassTag, W: ClassTag] 238 | (other: RDD[(K, V2)], maybeLazy: Boolean = false) 239 | (f: (K, Option[V], Option[V2]) => W): IndexedRDD[K, W] = other match { 240 | case other: IndexedRDD[K, V2] if partitioner == other.partitioner => { 241 | val castFn = implicitly[ClassTag[(K, Option[V], Option[V]) => V]] 242 | val castRDD = implicitly[ClassTag[IndexedRDD[K, V]]] 243 | (other, f) match { 244 | case (castRDD(other), castFn(f)) if maybeLazy => 245 | this.zipIndexedRDDPartitions(other)(new LazyFullOuterJoinZipper(f)).asInstanceOf[IndexedRDD[K, W]] 246 | case (other, f) => 247 | this.zipIndexedRDDPartitions(other)(new FullOuterJoinZipper(f)) 248 | } 249 | } 250 | case _ => 251 | this.zipPartitionsWithOther(other)(new OtherFullOuterJoinZipper(f)) 252 | } 253 | 254 | /** 255 | * Left outer joins `this` with `other`, running `f` on the values of corresponding keys. Because 256 | * values in `this` with no corresponding entries in `other` are preserved, `f` cannot change the 257 | * value type. 258 | */ 259 | def join[U: ClassTag] 260 | (other: RDD[(K, U)])(f: (K, V, U) => V): IndexedRDD[K, V] = other match { 261 | case other: IndexedRDD[K, U] if partitioner == other.partitioner => 262 | this.zipIndexedRDDPartitions(other)(new JoinZipper(f)) 263 | case _ => 264 | this.zipPartitionsWithOther(other)(new OtherJoinZipper(f)) 265 | } 266 | 267 | /** Left outer joins `this` with `other`, running `f` on all values of `this`. */ 268 | def leftJoin[V2: ClassTag, V3: ClassTag] 269 | (other: RDD[(K, V2)])(f: (K, V, Option[V2]) => V3): IndexedRDD[K, V3] = other match { 270 | case other: IndexedRDD[K, V2] if partitioner == other.partitioner => 271 | this.zipIndexedRDDPartitions(other)(new LeftJoinZipper(f)) 272 | case _ => 273 | this.zipPartitionsWithOther(other)(new OtherLeftJoinZipper(f)) 274 | } 275 | 276 | /** Inner joins `this` with `other`, running `f` on the values of corresponding keys. 
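   * For example, `a.innerJoin(b) { (k, x, y) => (x, y) }` keeps only keys present in both RDDs and
   * pairs their values.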
*/ 277 | def innerJoin[V2: ClassTag, V3: ClassTag](other: RDD[(K, V2)]) 278 | (f: (K, V, V2) => V3): IndexedRDD[K, V3] = other match { 279 | case other: IndexedRDD[K, V2] if partitioner == other.partitioner => 280 | this.zipIndexedRDDPartitions(other)(new InnerJoinZipper(f)) 281 | case _ => 282 | this.zipPartitionsWithOther(other)(new OtherInnerJoinZipper(f)) 283 | } 284 | 285 | /** 286 | * Creates a new IndexedRDD with values from `elems` that may share an index with `this`, 287 | * merging duplicate keys in `elems` arbitrarily. 288 | */ 289 | def createUsingIndex[V2: ClassTag](elems: RDD[(K, V2)]): IndexedRDD[K, V2] = { 290 | this.zipPartitionsWithOther(elems)(new CreateUsingIndexZipper) 291 | } 292 | 293 | /** Creates a new IndexedRDD with values from `elems` that may share an index with `this`. */ 294 | def aggregateUsingIndex[V2: ClassTag]( 295 | elems: RDD[(K, V2)], reduceFunc: (V2, V2) => V2): IndexedRDD[K, V2] = { 296 | this.zipPartitionsWithOther(elems)(new AggregateUsingIndexZipper(reduceFunc)) 297 | } 298 | 299 | /** 300 | * Optionally rebuilds the indexes of this IndexedRDD. Depending on the implementation, this may 301 | * remove tombstoned entries and the resulting IndexedRDD may not support efficient joins with the 302 | * original one. 303 | */ 304 | def reindex(): IndexedRDD[K, V] = this.mapIndexedRDDPartitions(_.reindex()) 305 | 306 | // The following functions could have been anonymous, but we name them to work around a Scala 307 | // compiler bug related to specialization. 308 | 309 | private type ZipPartitionsFunction[V2, V3] = 310 | Function2[Iterator[IndexedRDDPartition[K, V]], Iterator[IndexedRDDPartition[K, V2]], 311 | Iterator[IndexedRDDPartition[K, V3]]] 312 | 313 | private type OtherZipPartitionsFunction[V2, V3] = 314 | Function2[Iterator[IndexedRDDPartition[K, V]], Iterator[(K, V2)], 315 | Iterator[IndexedRDDPartition[K, V3]]] 316 | 317 | private class MultiputZipper[U](z: (K, U) => V, f: (K, V, U) => V) 318 | extends OtherZipPartitionsFunction[U, V] with Serializable { 319 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[(K, U)]) 320 | : Iterator[IndexedRDDPartition[K, V]] = { 321 | val thisPart = thisIter.next() 322 | Iterator(thisPart.multiput(otherIter, z, f)) 323 | } 324 | } 325 | 326 | private class DeleteZipper extends OtherZipPartitionsFunction[Unit, V] with Serializable { 327 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[(K, Unit)]) 328 | : Iterator[IndexedRDDPartition[K, V]] = { 329 | val thisPart = thisIter.next() 330 | Iterator(thisPart.delete(otherIter.map(_._1))) 331 | } 332 | } 333 | 334 | private class DiffZipper extends ZipPartitionsFunction[V, V] with Serializable { 335 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[IndexedRDDPartition[K, V]]): Iterator[IndexedRDDPartition[K, V]] = { 336 | val thisPart = thisIter.next() 337 | val otherPart = otherIter.next() 338 | Iterator(thisPart.diff(otherPart)) 339 | } 340 | } 341 | 342 | private class OtherDiffZipper extends OtherZipPartitionsFunction[V, V] with Serializable { 343 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[(K, V)]): Iterator[IndexedRDDPartition[K, V]] = { 344 | val thisPart = thisIter.next() 345 | Iterator(thisPart.diff(otherIter)) 346 | } 347 | } 348 | 349 | private class FullOuterJoinZipper[V2: ClassTag, W: ClassTag](f: (K, Option[V], Option[V2]) => W) 350 | extends ZipPartitionsFunction[V2, W] with Serializable { 351 | def apply( 352 | thisIter: 
Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[IndexedRDDPartition[K, V2]]) 353 | : Iterator[IndexedRDDPartition[K, W]] = { 354 | val thisPart = thisIter.next() 355 | val otherPart = otherIter.next() 356 | Iterator(thisPart.fullOuterJoin(otherPart)(f)) 357 | } 358 | } 359 | 360 | private class LazyFullOuterJoinZipper(f: (K, Option[V], Option[V]) => V) 361 | extends ZipPartitionsFunction[V, V] with Serializable { 362 | def apply( 363 | thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[IndexedRDDPartition[K, V]]) 364 | : Iterator[IndexedRDDPartition[K, V]] = { 365 | val thisPart = thisIter.next() 366 | val otherPart = otherIter.next() 367 | (thisPart, otherPart) match { 368 | case (thisPart: LazyPartition[K, V], otherPart: LazyPartition[K, V]) if thisPart.reducer == f && otherPart.reducer == f => 369 | Iterator(new LazyPartition(thisPart.partitions ++ otherPart.partitions, f)) 370 | case (thisPart: LazyPartition[K, V], _) if thisPart.reducer == f => 371 | Iterator(new LazyPartition(thisPart.partitions :+ otherPart, f)) 372 | case (_, otherPart: LazyPartition[K, V]) if otherPart.reducer == f => 373 | Iterator(new LazyPartition(thisPart +: otherPart.partitions, f)) 374 | case _ => 375 | Iterator(new LazyPartition(Seq(thisPart, otherPart), f)) 376 | } 377 | } 378 | } 379 | 380 | private class OtherFullOuterJoinZipper[V2: ClassTag, W: ClassTag](f: (K, Option[V], Option[V2]) => W) 381 | extends OtherZipPartitionsFunction[V2, W] with Serializable { 382 | def apply( 383 | thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[(K, V2)]) 384 | : Iterator[IndexedRDDPartition[K, W]] = { 385 | val thisPart = thisIter.next() 386 | Iterator(thisPart.fullOuterJoin(otherIter)(f)) 387 | } 388 | } 389 | 390 | private class JoinZipper[U: ClassTag](f: (K, V, U) => V) 391 | extends ZipPartitionsFunction[U, V] with Serializable { 392 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[IndexedRDDPartition[K, U]]): Iterator[IndexedRDDPartition[K, V]] = { 393 | val thisPart = thisIter.next() 394 | val otherPart = otherIter.next() 395 | Iterator(thisPart.join(otherPart)(f)) 396 | } 397 | } 398 | 399 | private class OtherJoinZipper[U: ClassTag](f: (K, V, U) => V) 400 | extends OtherZipPartitionsFunction[U, V] with Serializable { 401 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[(K, U)]): Iterator[IndexedRDDPartition[K, V]] = { 402 | val thisPart = thisIter.next() 403 | Iterator(thisPart.join(otherIter)(f)) 404 | } 405 | } 406 | 407 | private class LeftJoinZipper[V2: ClassTag, V3: ClassTag](f: (K, V, Option[V2]) => V3) 408 | extends ZipPartitionsFunction[V2, V3] with Serializable { 409 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[IndexedRDDPartition[K, V2]]): Iterator[IndexedRDDPartition[K, V3]] = { 410 | val thisPart = thisIter.next() 411 | val otherPart = otherIter.next() 412 | Iterator(thisPart.leftJoin(otherPart)(f)) 413 | } 414 | } 415 | 416 | private class OtherLeftJoinZipper[V2: ClassTag, V3: ClassTag](f: (K, V, Option[V2]) => V3) 417 | extends OtherZipPartitionsFunction[V2, V3] with Serializable { 418 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[(K, V2)]): Iterator[IndexedRDDPartition[K, V3]] = { 419 | val thisPart = thisIter.next() 420 | Iterator(thisPart.leftJoin(otherIter)(f)) 421 | } 422 | } 423 | 424 | private class InnerJoinZipper[V2: ClassTag, V3: ClassTag](f: (K, V, V2) => V3) 425 | extends ZipPartitionsFunction[V2, V3] with Serializable { 
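    // Each iterator holds exactly one IndexedRDDPartition for the corresponding Spark partition;
    // the actual join is delegated to the partition-level innerJoin.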
426 | def apply( 427 | thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[IndexedRDDPartition[K, V2]]) 428 | : Iterator[IndexedRDDPartition[K, V3]] = { 429 | val thisPart = thisIter.next() 430 | val otherPart = otherIter.next() 431 | Iterator(thisPart.innerJoin(otherPart)(f)) 432 | } 433 | } 434 | 435 | private class OtherInnerJoinZipper[V2: ClassTag, V3: ClassTag](f: (K, V, V2) => V3) 436 | extends OtherZipPartitionsFunction[V2, V3] with Serializable { 437 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[(K, V2)]) 438 | : Iterator[IndexedRDDPartition[K, V3]] = { 439 | val thisPart = thisIter.next() 440 | Iterator(thisPart.innerJoin(otherIter)(f)) 441 | } 442 | } 443 | 444 | private class CreateUsingIndexZipper[V2: ClassTag] 445 | extends OtherZipPartitionsFunction[V2, V2] with Serializable { 446 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[(K, V2)]): Iterator[IndexedRDDPartition[K, V2]] = { 447 | val thisPart = thisIter.next() 448 | Iterator(thisPart.createUsingIndex(otherIter)) 449 | } 450 | } 451 | 452 | private class AggregateUsingIndexZipper[V2: ClassTag](reduceFunc: (V2, V2) => V2) 453 | extends OtherZipPartitionsFunction[V2, V2] with Serializable { 454 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[(K, V2)]): Iterator[IndexedRDDPartition[K, V2]] = { 455 | val thisPart = thisIter.next() 456 | Iterator(thisPart.aggregateUsingIndex(otherIter, reduceFunc)) 457 | } 458 | } 459 | } 460 | 461 | object IndexedRDD { 462 | /** 463 | * Constructs an updatable IndexedRDD from an RDD of pairs, merging duplicate keys arbitrarily. 464 | */ 465 | def apply[K: ClassTag : KeySerializer, V: ClassTag] 466 | (elems: RDD[(K, V)]): IndexedRDD[K, V] = updatable(elems) 467 | 468 | /** 469 | * Constructs an updatable IndexedRDD from an RDD of pairs, merging duplicate keys arbitrarily. 470 | */ 471 | def updatable[K: ClassTag : KeySerializer, V: ClassTag] 472 | (elems: RDD[(K, V)]) 473 | : IndexedRDD[K, V] = updatable[K, V, V](elems, (id, a) => a, (id, a, b) => b) 474 | 475 | /** Constructs an IndexedRDD from an RDD of pairs. 
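   * `z` projects an incoming value to `V` when its key is new; `f` merges an existing `V` with a
   * later duplicate. Illustratively, `updatable(pairs, (k: Long, u: Int) => u, (k: Long, v: Int, u: Int) => v + u)`
   * sums the values of duplicate keys.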
*/ 476 | def updatable[K: ClassTag : KeySerializer, U: ClassTag, V: ClassTag] 477 | (elems: RDD[(K, U)], z: (K, U) => V, f: (K, V, U) => V) 478 | : IndexedRDD[K, V] = { 479 | val elemsPartitioned = 480 | if (elems.partitioner.isDefined) elems 481 | else elems.partitionBy(new HashPartitioner(elems.partitions.size)) 482 | val partitions = elemsPartitioned.mapPartitions[IndexedRDDPartition[K, V]]( 483 | iter => Iterator(PARTPartition(iter, z, f)), 484 | preservesPartitioning = true) 485 | new IndexedRDD(partitions) 486 | } 487 | 488 | implicit val longSer = new LongSerializer 489 | implicit val stringSer = new StringSerializer 490 | implicit val shortSer = new ShortSerializer 491 | implicit val charSer = new CharSerializer 492 | implicit val intSet = new IntSerializer 493 | implicit val bigintSer = new BigIntSerializer 494 | implicit val uuidSer = new UUIDSerializer 495 | 496 | implicit def tuple2Ser[A, B]( 497 | implicit aSer: KeySerializer[A], bSer: KeySerializer[B]): Tuple2Serializer[A, B] = 498 | new Tuple2Serializer()(aSer, bSer) 499 | } 500 | -------------------------------------------------------------------------------- /src/main/scala/edu/berkeley/cs/amplab/spark/indexedrdd/IndexedRDDPartition.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package edu.berkeley.cs.amplab.spark.indexedrdd 19 | 20 | import scala.reflect.ClassTag 21 | 22 | /** 23 | * A map of key-value `(K, V)` pairs that enforces key uniqueness and pre-indexes the entries for 24 | * fast lookups, joins, and optionally updates. To construct an `IndexedRDDPartition`, use one of 25 | * the constructors in the [[edu.berkeley.cs.amplab.spark.indexedrdd.IndexedRDDPartition$ 26 | * IndexedRDDPartition object]]. 27 | * 28 | * @tparam K the key associated with each entry in the set. 29 | * @tparam V the value associated with each entry in the set. 30 | */ 31 | private[indexedrdd] abstract class IndexedRDDPartition[K, V] extends Serializable { 32 | 33 | protected implicit def kTag: ClassTag[K] 34 | protected implicit def vTag: ClassTag[V] 35 | 36 | def size: Long 37 | 38 | /** Return the value for the given key. */ 39 | def apply(k: K): Option[V] 40 | 41 | def isDefined(k: K): Boolean = 42 | apply(k).isDefined 43 | 44 | def iterator: Iterator[(K, V)] 45 | 46 | /** 47 | * Gets the values corresponding to the specified keys, if any. 48 | */ 49 | def multiget(ks: Array[K]): Iterator[(K, V)] 50 | 51 | /** 52 | * Updates the keys in `kvs` to their corresponding values generated by running `f` on old and new 53 | * values, if an old value exists, or `z` otherwise. Returns a new IndexedRDDPartition that 54 | * reflects the modification. 
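   * Illustratively, with `z = (k, u) => u` and `f = (k, v, u) => v + u`, absent keys are inserted
   * as-is and existing keys accumulate the new value.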
55 | */ 56 | def multiput[U]( 57 | kvs: Iterator[(K, U)], z: (K, U) => V, f: (K, V, U) => V): IndexedRDDPartition[K, V] = 58 | throw new UnsupportedOperationException("modifications not supported") 59 | 60 | /** Deletes the specified keys. Returns a new IndexedRDDPartition that reflects the deletions. */ 61 | def delete(ks: Iterator[K]): IndexedRDDPartition[K, V] = 62 | throw new UnsupportedOperationException("modifications not supported") 63 | 64 | /** Maps each value, supplying the corresponding key and preserving the index. */ 65 | def mapValues[V2: ClassTag](f: (K, V) => V2): IndexedRDDPartition[K, V2] 66 | 67 | /** 68 | * Restricts the entries to those satisfying the given predicate. 69 | */ 70 | def filter(pred: (K, V) => Boolean): IndexedRDDPartition[K, V] 71 | 72 | /** 73 | * Intersects `this` and `other` and keeps only elements with differing values. For these 74 | * elements, keeps the values from `this`. 75 | */ 76 | def diff(other: IndexedRDDPartition[K, V]): IndexedRDDPartition[K, V] 77 | 78 | /** 79 | * Intersects `this` and `other` and keeps only elements with differing values. For these 80 | * elements, keeps the values from `this`. 81 | */ 82 | def diff(other: Iterator[(K, V)]): IndexedRDDPartition[K, V] 83 | 84 | /** Joins `this` with `other`, running `f` on the values of all keys in both sets. */ 85 | def fullOuterJoin[V2: ClassTag, W: ClassTag] 86 | (other: IndexedRDDPartition[K, V2]) 87 | (f: (K, Option[V], Option[V2]) => W): IndexedRDDPartition[K, W] 88 | 89 | /** Joins `this` with `other`, running `f` on the values of all keys in both sets. */ 90 | def fullOuterJoin[V2: ClassTag, W: ClassTag] 91 | (other: Iterator[(K, V2)]) 92 | (f: (K, Option[V], Option[V2]) => W): IndexedRDDPartition[K, W] 93 | 94 | /** 95 | * Left outer joins `this` with `other`, running `f` on the values of corresponding keys. Because 96 | * values in `this` with no corresponding entries in `other` are preserved, `f` cannot change the 97 | * value type. 98 | */ 99 | def join[U: ClassTag] 100 | (other: IndexedRDDPartition[K, U]) 101 | (f: (K, V, U) => V): IndexedRDDPartition[K, V] 102 | 103 | /** 104 | * Left outer joins `this` with `other`, running `f` on the values of corresponding keys. Because 105 | * values in `this` with no corresponding entries in `other` are preserved, `f` cannot change the 106 | * value type. 107 | */ 108 | def join[U: ClassTag] 109 | (other: Iterator[(K, U)]) 110 | (f: (K, V, U) => V): IndexedRDDPartition[K, V] 111 | 112 | /** Left outer joins `this` with `other`, running `f` on all values of `this`. */ 113 | def leftJoin[V2: ClassTag, V3: ClassTag] 114 | (other: IndexedRDDPartition[K, V2]) 115 | (f: (K, V, Option[V2]) => V3): IndexedRDDPartition[K, V3] 116 | 117 | /** Left outer joins `this` with `other`, running `f` on all values of `this`. */ 118 | def leftJoin[V2: ClassTag, V3: ClassTag] 119 | (other: Iterator[(K, V2)]) 120 | (f: (K, V, Option[V2]) => V3): IndexedRDDPartition[K, V3] 121 | 122 | /** Inner joins `this` with `other`, running `f` on the values of corresponding keys. */ 123 | def innerJoin[U: ClassTag, V2: ClassTag] 124 | (other: IndexedRDDPartition[K, U]) 125 | (f: (K, V, U) => V2): IndexedRDDPartition[K, V2] 126 | 127 | /** Inner joins `this` with `other`, running `f` on the values of corresponding keys. 
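   * Unlike the overload above, `other` here is a plain iterator of pairs rather than an indexed partition.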
*/ 128 | def innerJoin[U: ClassTag, V2: ClassTag] 129 | (other: Iterator[(K, U)]) 130 | (f: (K, V, U) => V2): IndexedRDDPartition[K, V2] 131 | 132 | /** 133 | * Creates a new partition with values from `elems` that may share an index with `this`, 134 | * merging duplicate keys in `elems` arbitrarily. 135 | */ 136 | def createUsingIndex[V2: ClassTag](elems: Iterator[(K, V2)]): IndexedRDDPartition[K, V2] 137 | 138 | /** Creates a new partition with values from `elems` that shares an index with `this`. */ 139 | def aggregateUsingIndex[V2: ClassTag]( 140 | elems: Iterator[(K, V2)], reduceFunc: (V2, V2) => V2): IndexedRDDPartition[K, V2] 141 | 142 | /** 143 | * Optionally rebuilds the indexes of this partition. Depending on the implementation, this may 144 | * remove tombstoned entries and the resulting partition may support efficient joins with the 145 | * original one. 146 | */ 147 | def reindex(): IndexedRDDPartition[K, V] 148 | } 149 | -------------------------------------------------------------------------------- /src/main/scala/edu/berkeley/cs/amplab/spark/indexedrdd/KeySerializer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package edu.berkeley.cs.amplab.spark.indexedrdd 19 | 20 | import java.util.UUID 21 | 22 | /** 23 | * Serializer for storing arbitrary key types as byte arrays for PART. 24 | * 25 | * If serialized keys may be of variable length, they should be terminated with a unique value, 26 | * because keys in PART cannot be prefixes of other keys. 
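 * The provided `StringSerializer` and `BigIntSerializer` achieve this by prefixing each encoding
 * with a 4-byte length, so no encoded key can be a proper prefix of another.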
27 | */ 28 | trait KeySerializer[K] extends Serializable { 29 | def toBytes(k: K): Array[Byte] 30 | def fromBytes(b: Array[Byte]): K 31 | } 32 | 33 | class LongSerializer extends KeySerializer[Long] { 34 | override def toBytes(k: Long) = Array( 35 | ((k >> 56) & 0xFF).toByte, 36 | ((k >> 48) & 0xFF).toByte, 37 | ((k >> 40) & 0xFF).toByte, 38 | ((k >> 32) & 0xFF).toByte, 39 | ((k >> 24) & 0xFF).toByte, 40 | ((k >> 16) & 0xFF).toByte, 41 | ((k >> 8) & 0xFF).toByte, 42 | ( k & 0xFF).toByte) 43 | 44 | override def fromBytes(b: Array[Byte]): Long = 45 | ( (b(0).toLong << 56) & (0xFFL << 56) | 46 | (b(1).toLong << 48) & (0xFFL << 48) | 47 | (b(2).toLong << 40) & (0xFFL << 40) | 48 | (b(3).toLong << 32) & (0xFFL << 32) | 49 | (b(4).toLong << 24) & (0xFFL << 24) | 50 | (b(5).toLong << 16) & (0xFFL << 16) | 51 | (b(6).toLong << 8) & (0xFFL << 8) | 52 | b(7).toLong & 0xFFL) 53 | } 54 | 55 | class IntSerializer extends KeySerializer[Int] { 56 | override def toBytes(k: Int) = Array( 57 | ((k >> 24) & 0xFF).toByte, 58 | ((k >> 16) & 0xFF).toByte, 59 | ((k >> 8) & 0xFF).toByte, 60 | ( k & 0xFF).toByte) 61 | 62 | override def fromBytes(b: Array[Byte]): Int = 63 | (b(0).toInt << 24) & (0xFF << 24) | 64 | (b(1).toInt << 16) & (0xFF << 16) | 65 | (b(2).toInt << 8) & (0xFF << 8) | 66 | b(3).toInt & 0xFF 67 | } 68 | 69 | class BigIntSerializer extends KeySerializer[BigInt] { 70 | override def toBytes(k: BigInt) = { 71 | // Prepend the BigInt bit length to ensure no key is a prefix of any other 72 | val lengthBytes = Array( 73 | ((k.bitLength >> 24) & 0xFF).toByte, 74 | ((k.bitLength >> 16) & 0xFF).toByte, 75 | ((k.bitLength >> 8) & 0xFF).toByte, 76 | ( k.bitLength & 0xFF).toByte) 77 | lengthBytes ++ k.toByteArray 78 | } 79 | override def fromBytes(b: Array[Byte]): BigInt = BigInt.apply(b.drop(4)) 80 | } 81 | 82 | class ShortSerializer extends KeySerializer[Short] { 83 | override def toBytes(k: Short) = Array( 84 | ((k >> 8) & 0xFF).toByte, 85 | ( k & 0xFF).toByte) 86 | override def fromBytes(b: Array[Byte]): Short = 87 | ((b(0).toInt << 8) & (0xFF << 8) | 88 | b(1).toInt & 0xFF).toShort 89 | } 90 | 91 | class CharSerializer extends KeySerializer[Char] { 92 | override def toBytes(k: Char) = Array( 93 | ((k >> 8) & 0xFF).toByte, 94 | ( k & 0xFF).toByte) 95 | override def fromBytes(b: Array[Byte]): Char = 96 | ((b(0).toInt << 8) & (0xFF << 8) | 97 | b(1).toInt & 0xFF).toChar 98 | } 99 | 100 | class UUIDSerializer(val longSer: LongSerializer = new LongSerializer) extends KeySerializer[UUID] { 101 | override def toBytes(k: UUID) = 102 | (longSer.toBytes(k.getMostSignificantBits) ++ 103 | longSer.toBytes(k.getLeastSignificantBits)) 104 | override def fromBytes(b: Array[Byte]): UUID = 105 | new UUID( 106 | longSer.fromBytes(b.take(8)), 107 | longSer.fromBytes(b.takeRight(8))) 108 | } 109 | 110 | class StringSerializer extends KeySerializer[String] { 111 | override def toBytes(k: String) = { 112 | val result = new Array[Byte](4 + k.length * 2) 113 | 114 | // Prepend the string length to ensure no key is a prefix of any other 115 | result(0) = ((k.length >> 24) & 0xFF).toByte 116 | result(1) = ((k.length >> 16) & 0xFF).toByte 117 | result(2) = ((k.length >> 8) & 0xFF).toByte 118 | result(3) = ( k.length & 0xFF).toByte 119 | 120 | var i = 0 121 | while (i < k.length) { 122 | result(4 + 2 * i) = ((k(i) >> 8) & 0xFF).toByte 123 | result(4 + 2 * i + 1) = ( k(i) & 0xFF).toByte 124 | i += 1 125 | } 126 | 127 | result 128 | } 129 | 130 | override def fromBytes(b: Array[Byte]): String = { 131 | val result = new 
Array[Char]((b.length - 4) / 2) 132 | 133 | var i = 0 134 | while (i < result.length) { 135 | result(i) = 136 | ((b(4 + 2 * i) << 8) & (0xFF << 8) | 137 | (b(4 + 2 * i + 1) & 0xFF)).toChar 138 | i += 1 139 | } 140 | 141 | new String(result) 142 | } 143 | } 144 | 145 | class Tuple2Serializer[A, B]( 146 | implicit aSer: KeySerializer[A], bSer: KeySerializer[B]) 147 | extends KeySerializer[(A, B)] { 148 | 149 | override def toBytes(k: (A, B)) = { 150 | val aBytes = aSer.toBytes(k._1) 151 | val bBytes = bSer.toBytes(k._2) 152 | 153 | val result = new Array[Byte](4 + aBytes.length + bBytes.length) 154 | 155 | // Prepend the length of aBytes so we know where the boundary is when reading 156 | result(0) = ((aBytes.length >> 24) & 0xFF).toByte 157 | result(1) = ((aBytes.length >> 16) & 0xFF).toByte 158 | result(2) = ((aBytes.length >> 8) & 0xFF).toByte 159 | result(3) = ( aBytes.length & 0xFF).toByte 160 | 161 | aBytes.copyToArray(result, 4) 162 | bBytes.copyToArray(result, 4 + aBytes.length) 163 | 164 | result 165 | } 166 | 167 | override def fromBytes(b: Array[Byte]): (A, B) = { 168 | val aLength = 169 | ( (b(0).toInt << 24) & (0xFF << 24) | 170 | (b(1).toInt << 16) & (0xFF << 16) | 171 | (b(2).toInt << 8) & (0xFF << 8) | 172 | b(3).toInt & 0xFF) 173 | (aSer.fromBytes(b.slice(4, 4 + aLength)), 174 | bSer.fromBytes(b.drop(4 + aLength))) 175 | } 176 | } 177 | -------------------------------------------------------------------------------- /src/main/scala/edu/berkeley/cs/amplab/spark/indexedrdd/impl/LazyPartition.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package edu.berkeley.cs.amplab.spark.indexedrdd 19 | 20 | import scala.reflect.ClassTag 21 | import scala.collection.Traversable 22 | 23 | /** 24 | * A wrapper around several IndexedRDDPartition that avoids rebuilding 25 | * the index for the combined partitions. Instead, each operation probes 26 | * the nested partitions and merges the results. 27 | */ 28 | 29 | private[indexedrdd] class LazyPartition[K, V] 30 | (val partitions: Seq[IndexedRDDPartition[K, V]], 31 | val reducer: (K, Option[V], Option[V]) => V) 32 | (override implicit val kTag: ClassTag[K], 33 | override implicit val vTag: ClassTag[V]) 34 | extends IndexedRDDPartition[K, V] { 35 | 36 | @transient private lazy val cached: IndexedRDDPartition[K, V] = 37 | partitions.reduce((a, b) => a.fullOuterJoin(b)(reducer)) 38 | 39 | def size: Long = 40 | cached.size 41 | 42 | /** Return the value for the given key. */ 43 | def apply(k: K): Option[V] = 44 | partitions. 45 | map(_(k)). 
46 | reduce((a, b) => Option(reducer(k, a, b))) 47 | 48 | override def isDefined(k: K): Boolean = 49 | partitions.find(_.isDefined(k)).isDefined 50 | 51 | def iterator: Iterator[(K, V)] = 52 | cached.iterator 53 | 54 | /** 55 | * Query each partition independently, then merge the results by key. This 56 | * could be more efficient if multiget returned ordered results! 57 | */ 58 | def multiget(ks: Array[K]): Iterator[(K, V)] = 59 | partitions. 60 | flatMap(_.multiget(ks)). 61 | groupBy(_._1). 62 | map { 63 | case (k, vs) => 64 | val v = vs.map(_._2).reduce((v1, v2) => reducer(k, Some(v1), Some(v2))) 65 | (k, v) 66 | }. 67 | iterator 68 | 69 | /** 70 | * We have to re-index as we don't know how to reduce the mapped values. 71 | */ 72 | def mapValues[V2: ClassTag](f: (K, V) => V2): IndexedRDDPartition[K, V2] = 73 | cached.mapValues(f) 74 | 75 | def filter(pred: (K, V) => Boolean): IndexedRDDPartition[K, V] = 76 | new LazyPartition(partitions.map(_.filter(pred)), reducer) 77 | 78 | def diff(other: IndexedRDDPartition[K, V]): IndexedRDDPartition[K, V] = 79 | cached.diff(other) 80 | 81 | def diff(other: Iterator[(K, V)]): IndexedRDDPartition[K, V] = 82 | cached.diff(other) 83 | 84 | def fullOuterJoin[V2: ClassTag, W: ClassTag] 85 | (other: IndexedRDDPartition[K, V2]) 86 | (f: (K, Option[V], Option[V2]) => W): IndexedRDDPartition[K, W] = 87 | cached.fullOuterJoin(other)(f) 88 | 89 | def fullOuterJoin[V2: ClassTag, W: ClassTag] 90 | (other: Iterator[(K, V2)]) 91 | (f: (K, Option[V], Option[V2]) => W): IndexedRDDPartition[K, W] = 92 | cached.fullOuterJoin(other)(f) 93 | 94 | def join[U: ClassTag] 95 | (other: IndexedRDDPartition[K, U]) 96 | (f: (K, V, U) => V): IndexedRDDPartition[K, V] = 97 | cached.join(other)(f) 98 | 99 | def join[U: ClassTag] 100 | (other: Iterator[(K, U)]) 101 | (f: (K, V, U) => V): IndexedRDDPartition[K, V] = 102 | cached.join(other)(f) 103 | 104 | def leftJoin[V2: ClassTag, V3: ClassTag] 105 | (other: IndexedRDDPartition[K, V2]) 106 | (f: (K, V, Option[V2]) => V3): IndexedRDDPartition[K, V3] = 107 | cached.leftJoin(other)(f) 108 | 109 | def leftJoin[V2: ClassTag, V3: ClassTag] 110 | (other: Iterator[(K, V2)]) 111 | (f: (K, V, Option[V2]) => V3): IndexedRDDPartition[K, V3] = 112 | cached.leftJoin(other)(f) 113 | 114 | def innerJoin[U: ClassTag, V2: ClassTag] 115 | (other: IndexedRDDPartition[K, U]) 116 | (f: (K, V, U) => V2): IndexedRDDPartition[K, V2] = 117 | cached.innerJoin(other)(f) 118 | 119 | def innerJoin[U: ClassTag, V2: ClassTag] 120 | (other: Iterator[(K, U)]) 121 | (f: (K, V, U) => V2): IndexedRDDPartition[K, V2] = 122 | cached.innerJoin(other)(f) 123 | 124 | def createUsingIndex[V2: ClassTag](elems: Iterator[(K, V2)]): IndexedRDDPartition[K, V2] = 125 | cached.createUsingIndex(elems) 126 | 127 | def aggregateUsingIndex[V2: ClassTag]( 128 | elems: Iterator[(K, V2)], reduceFunc: (V2, V2) => V2): IndexedRDDPartition[K, V2] = 129 | cached.aggregateUsingIndex(elems, reduceFunc) 130 | 131 | /** 132 | * Forces the partitions to re-index, and rebuilds the combined index. 
133 | */ 134 | def reindex(): IndexedRDDPartition[K, V] = 135 | partitions.map(_.reindex).reduce((a, b) => a.fullOuterJoin(b)(reducer)) 136 | } 137 | -------------------------------------------------------------------------------- /src/main/scala/edu/berkeley/cs/amplab/spark/indexedrdd/impl/PARTPartition.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package edu.berkeley.cs.amplab.spark.indexedrdd.impl 19 | 20 | import scala.reflect.ClassTag 21 | import scala.collection.JavaConversions._ 22 | 23 | import edu.berkeley.cs.amplab.spark.indexedrdd._ 24 | import com.ankurdave.part.ArtTree 25 | 26 | private[indexedrdd] class PARTPartition[K, V] 27 | (protected val map: ArtTree) 28 | (override implicit val kTag: ClassTag[K], 29 | override implicit val vTag: ClassTag[V], 30 | implicit val kSer: KeySerializer[K]) 31 | extends IndexedRDDPartition[K, V] { 32 | 33 | protected def withMap[V2: ClassTag] 34 | (map: ArtTree): PARTPartition[K, V2] = { 35 | new PARTPartition(map) 36 | } 37 | 38 | override def size: Long = map.size() 39 | 40 | override def apply(k: K): Option[V] = Option(map.search(kSer.toBytes(k)).asInstanceOf[V]) 41 | 42 | override def iterator: Iterator[(K, V)] = 43 | map.iterator.map(kv => (kSer.fromBytes(kv._1), kv._2.asInstanceOf[V])) 44 | 45 | private def rawIterator: Iterator[(Array[Byte], V)] = 46 | map.iterator.map(kv => (kv._1, kv._2.asInstanceOf[V])) 47 | 48 | override def multiget(ks: Array[K]): Iterator[(K, V)] = 49 | ks.flatMap { k => this(k).map(v => (k, v)) }.iterator 50 | 51 | override def multiput[U]( 52 | kvs: Iterator[(K, U)], z: (K, U) => V, f: (K, V, U) => V): IndexedRDDPartition[K, V] = { 53 | val newMap = map.snapshot() 54 | for (ku <- kvs) { 55 | val kBytes = kSer.toBytes(ku._1) 56 | val oldV = newMap.search(kBytes).asInstanceOf[V] 57 | val newV = if (oldV == null) z(ku._1, ku._2) else f(ku._1, oldV, ku._2) 58 | newMap.insert(kBytes, newV) 59 | } 60 | this.withMap[V](newMap) 61 | } 62 | 63 | override def delete(ks: Iterator[K]): IndexedRDDPartition[K, V] = { 64 | val newMap = map.snapshot() 65 | for (k <- ks) { 66 | newMap.delete(kSer.toBytes(k)) 67 | } 68 | this.withMap[V](newMap) 69 | } 70 | 71 | override def mapValues[V2: ClassTag](f: (K, V) => V2): IndexedRDDPartition[K, V2] = { 72 | val newMap = new ArtTree 73 | for (kv <- rawIterator) newMap.insert(kv._1, f(kSer.fromBytes(kv._1), kv._2)) 74 | this.withMap[V2](newMap) 75 | } 76 | 77 | override def filter(pred: (K, V) => Boolean): IndexedRDDPartition[K, V] = { 78 | val newMap = new ArtTree 79 | for (kv <- rawIterator if pred(kSer.fromBytes(kv._1), kv._2)) { 80 | newMap.insert(kv._1, kv._2) 81 | } 82 | this.withMap[V](newMap) 
83 | } 84 | 85 | override def diff(other: IndexedRDDPartition[K, V]): IndexedRDDPartition[K, V] = other match { 86 | case other: PARTPartition[K, V] => 87 | val newMap = new ArtTree 88 | for (kv <- rawIterator) { 89 | val otherV = other.map.search(kv._1).asInstanceOf[V] 90 | if (otherV != null && otherV != kv._2) { 91 | newMap.insert(kv._1, kv._2) 92 | } 93 | } 94 | this.withMap[V](newMap) 95 | 96 | case _ => 97 | diff(other.iterator) 98 | } 99 | 100 | override def diff(other: Iterator[(K, V)]): IndexedRDDPartition[K, V] = 101 | diff(PARTPartition(other)) 102 | 103 | override def fullOuterJoin[V2: ClassTag, W: ClassTag] 104 | (other: IndexedRDDPartition[K, V2]) 105 | (f: (K, Option[V], Option[V2]) => W): IndexedRDDPartition[K, W] = other match { 106 | case other: PARTPartition[K, V2] => 107 | val newMap = new ArtTree 108 | // Scan `this` and probe `other`, adding all elements in `this` 109 | for (kv <- rawIterator) { 110 | val newV = f( 111 | kSer.fromBytes(kv._1), 112 | Some(kv._2), 113 | Option(other.map.search(kv._1).asInstanceOf[V2])) 114 | newMap.insert(kv._1, newV) 115 | } 116 | // Scan `other` and probe `this`, adding only the elements present in `other` but not `this` 117 | for (kv <- other.rawIterator) { 118 | if (this.map.search(kv._1) == null) { 119 | val newV = f( 120 | kSer.fromBytes(kv._1), 121 | None, 122 | Some(kv._2)) 123 | newMap.insert(kv._1, newV) 124 | } 125 | } 126 | this.withMap[W](newMap) 127 | 128 | case _ => 129 | fullOuterJoin(other.iterator)(f) 130 | } 131 | 132 | override def fullOuterJoin[V2: ClassTag, W: ClassTag] 133 | (other: Iterator[(K, V2)]) 134 | (f: (K, Option[V], Option[V2]) => W): IndexedRDDPartition[K, W] = 135 | fullOuterJoin(PARTPartition(other))(f) 136 | 137 | override def join[U: ClassTag] 138 | (other: IndexedRDDPartition[K, U]) 139 | (f: (K, V, U) => V): IndexedRDDPartition[K, V] = join(other.iterator)(f) 140 | 141 | override def join[U: ClassTag] 142 | (other: Iterator[(K, U)]) 143 | (f: (K, V, U) => V): IndexedRDDPartition[K, V] = { 144 | val newMap = map.snapshot() 145 | for (ku <- other) { 146 | val kBytes = kSer.toBytes(ku._1) 147 | val oldV = newMap.search(kBytes).asInstanceOf[V] 148 | if (oldV != null) { 149 | val newV = f(ku._1, oldV, ku._2) 150 | newMap.insert(kBytes, newV) 151 | } 152 | } 153 | this.withMap[V](newMap) 154 | } 155 | 156 | override def leftJoin[V2: ClassTag, V3: ClassTag] 157 | (other: IndexedRDDPartition[K, V2]) 158 | (f: (K, V, Option[V2]) => V3): IndexedRDDPartition[K, V3] = other match { 159 | case other: PARTPartition[K, V2] => 160 | // Scan `this` and probe `other` 161 | val newMap = new ArtTree 162 | for (kv <- rawIterator) { 163 | val newV = f(kSer.fromBytes(kv._1), kv._2, Option(other.map.search(kv._1).asInstanceOf[V2])) 164 | newMap.insert(kv._1, newV) 165 | } 166 | this.withMap[V3](newMap) 167 | 168 | case _ => 169 | leftJoin(other.iterator)(f) 170 | } 171 | 172 | override def leftJoin[V2: ClassTag, V3: ClassTag] 173 | (other: Iterator[(K, V2)]) 174 | (f: (K, V, Option[V2]) => V3): IndexedRDDPartition[K, V3] = 175 | leftJoin(PARTPartition(other))(f) 176 | 177 | override def innerJoin[U: ClassTag, V2: ClassTag] 178 | (other: IndexedRDDPartition[K, U]) 179 | (f: (K, V, U) => V2): IndexedRDDPartition[K, V2] = other match { 180 | case other: PARTPartition[K, U] => 181 | // Scan `this` and probe `other` 182 | val newMap = new ArtTree 183 | for (kv <- rawIterator) { 184 | val otherV = other.map.search(kv._1).asInstanceOf[U] 185 | if (otherV != null) newMap.insert(kv._1, f(kSer.fromBytes(kv._1), kv._2, otherV)) 186 
| } 187 | this.withMap[V2](newMap) 188 | 189 | case _ => 190 | innerJoin(other.iterator)(f) 191 | } 192 | 193 | override def innerJoin[U: ClassTag, V2: ClassTag] 194 | (other: Iterator[(K, U)]) 195 | (f: (K, V, U) => V2): IndexedRDDPartition[K, V2] = 196 | innerJoin(PARTPartition(other))(f) 197 | 198 | override def createUsingIndex[V2: ClassTag](elems: Iterator[(K, V2)]): IndexedRDDPartition[K, V2] = 199 | PARTPartition(elems) 200 | 201 | override def aggregateUsingIndex[V2: ClassTag]( 202 | elems: Iterator[(K, V2)], reduceFunc: (V2, V2) => V2): IndexedRDDPartition[K, V2] = 203 | PARTPartition[K, V2, V2](elems, (id, a) => a, (id, a, b) => reduceFunc(a, b)) 204 | 205 | override def reindex(): IndexedRDDPartition[K, V] = this 206 | } 207 | 208 | private[indexedrdd] object PARTPartition { 209 | def apply[K: ClassTag, V: ClassTag] 210 | (iter: Iterator[(K, V)])(implicit kSer: KeySerializer[K]) = 211 | apply[K, V, V](iter, (id, a) => a, (id, a, b) => b) 212 | 213 | def apply[K: ClassTag, U: ClassTag, V: ClassTag] 214 | (iter: Iterator[(K, U)], z: (K, U) => V, f: (K, V, U) => V) 215 | (implicit kSer: KeySerializer[K]): PARTPartition[K, V] = { 216 | val map = new ArtTree 217 | iter.foreach { ku => 218 | val kBytes = kSer.toBytes(ku._1) 219 | val oldV = map.search(kBytes).asInstanceOf[V] 220 | val newV = if (oldV == null) z(ku._1, ku._2) else f(ku._1, oldV, ku._2) 221 | map.insert(kBytes, newV) 222 | } 223 | new PARTPartition(map) 224 | } 225 | } 226 | -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the file core/target/unit-tests.log 19 | log4j.rootCategory=INFO, file 20 | log4j.appender.file=org.apache.log4j.FileAppender 21 | log4j.appender.file.append=false 22 | log4j.appender.file.file=target/unit-tests.log 23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %p %c{1}: %m%n 25 | 26 | # Ignore messages below warning level from Jetty, because it's a bit verbose 27 | log4j.logger.org.eclipse.jetty=WARN 28 | org.eclipse.jetty.LEVEL=WARN 29 | -------------------------------------------------------------------------------- /src/test/scala/edu/berkeley/cs/amplab/spark/indexedrdd/IndexedRDDSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package edu.berkeley.cs.amplab.spark.indexedrdd 19 | 20 | import scala.collection.immutable.LongMap 21 | import scala.reflect.ClassTag 22 | import org.apache.spark.HashPartitioner 23 | 24 | import org.apache.spark.SparkContext 25 | import org.apache.spark.rdd.RDD 26 | import org.scalatest.FunSuite 27 | 28 | abstract class IndexedRDDSuite extends FunSuite with SharedSparkContext { 29 | 30 | def create[V: ClassTag](elems: RDD[(Long, V)]): IndexedRDD[Long, V] 31 | 32 | def pairs(sc: SparkContext, n: Int) = { 33 | create(sc.parallelize((0 to n).map(x => (x.toLong, x)), 5)) 34 | } 35 | 36 | test("get, multiget") { 37 | val n = 100 38 | val ps = pairs(sc, n).cache() 39 | assert(ps.multiget(Array(-1L, 0L, 1L, 98L)) === LongMap(0L -> 0, 1L -> 1, 98L -> 98)) 40 | assert(ps.get(-1L) === None) 41 | assert(ps.get(97L) === Some(97)) 42 | val evens = ps.filter(q => ((q._2 % 2) == 0)).cache() 43 | assert(evens.multiget(Array(-1L, 0L, 1L, 98L)) === LongMap(0L -> 0, 98L -> 98)) 44 | assert(evens.get(97L) === None) 45 | } 46 | 47 | test("filter") { 48 | val n = 100 49 | val ps = pairs(sc, n) 50 | val evens = ps.filter(q => ((q._2 % 2) == 0)) 51 | assert(evens.count === (0 to n).filter(_ % 2 == 0).size) 52 | } 53 | 54 | test("mapValues") { 55 | val n = 100 56 | val ps = pairs(sc, n) 57 | val negatives = ps.mapValues(x => -x).cache() // Allow joining b with a derived RDD of b 58 | assert(negatives.count === n + 1) 59 | } 60 | 61 | test("diff") { 62 | val n = 100 63 | val ps = pairs(sc, n).cache() 64 | val flipEvens = ps.mapValues(x => if (x % 2 == 0) -x else x).cache() 65 | // diff should keep only the changed values 66 | assert(ps.diff(flipEvens).map(_._2).collect().toSet === (2 to n by 2).toSet) 67 | } 68 | 69 | test("diff with pair RDD") { 70 | val n = 100 71 | val ps = pairs(sc, n).cache() 72 | val flipEvens: RDD[(Long, Int)] = 73 | sc.parallelize(0L to 100L) 74 | .map(id => if (id % 2 == 0) (id, -id.toInt) else (id, id.toInt)).cache() 75 | // diff should keep only the changed values 76 | assert(ps.diff(flipEvens).map(_._2).collect().toSet === (2 to n by 2).toSet) 77 | } 78 | 79 | test("diff with non-equal number of partitions") { 80 | val a = create(sc.parallelize(0 until 24, 3).map(i => (i.toLong, 0))) 81 | val b = create(sc.parallelize(8 until 16, 2).map(i => (i.toLong, 1))) 82 | assert(a.partitions.size != b.partitions.size) 83 | val c = b.diff(a) 84 | assert(c.map(_._1).collect.toSet === (8 until 16).toSet) 85 | } 86 | 87 | test("fullOuterJoin") { 88 | Seq(true, false).foreach { maybeLazy => 89 | val n = 200 90 | val bStart = 50 91 | val aEnd = 100 92 | val common = create(sc.parallelize((0 until n).map(x => (x.toLong, x)), 5)).cache() 93 | val a = common.filter(kv => kv._1 < aEnd).cache() 94 | val b = common.filter(kv => kv._1 >= bStart).cache() 95 | val sum = 
a.fullOuterJoin(b, maybeLazy) { (id, aOpt, bOpt) => aOpt.getOrElse(0) + bOpt.getOrElse(0) } 96 | val expected = ((0 until bStart).map(x => (x.toLong, x)) ++ 97 | (bStart until aEnd).map(x => (x.toLong, x * 2)) ++ 98 | (aEnd until n).map(x => (x.toLong, x))).toSet 99 | 100 | // fullOuterJoin with another IndexedRDD with the same index 101 | assert(sum.collect.toSet === expected) 102 | 103 | // fullOuterJoin with another IndexedRDD with a different index 104 | val b2 = create(b.map(identity)) 105 | val sum2 = a.fullOuterJoin(b2, maybeLazy) { (id, aOpt, bOpt) => aOpt.getOrElse(0) + bOpt.getOrElse(0) } 106 | assert(sum2.collect.toSet === expected) 107 | } 108 | } 109 | 110 | test("leftJoin") { 111 | val n = 100 112 | val ps = pairs(sc, n).cache() 113 | val evens = ps.filter(q => ((q._2 % 2) == 0)).cache() 114 | // leftJoin with another IndexedRDD 115 | assert(ps.leftJoin(evens) { (id, a, bOpt) => a - bOpt.getOrElse(0) }.collect.toSet === 116 | (0 to n by 2).map(x => (x.toLong, 0)).toSet ++ (1 to n by 2).map(x => (x.toLong, x)).toSet) 117 | // leftJoin with an RDD 118 | val evensRDD = evens.map(identity) 119 | assert(ps.leftJoin(evensRDD) { (id, a, bOpt) => a - bOpt.getOrElse(0) }.collect.toSet === 120 | (0 to n by 2).map(x => (x.toLong, 0)).toSet ++ (1 to n by 2).map(x => (x.toLong, x)).toSet) 121 | } 122 | 123 | test("leftJoin vertices with non-equal number of partitions") { 124 | val a = create(sc.parallelize(0 until 100, 2).map(i => (i.toLong, 1))) 125 | val b = create( 126 | a.filter(v => v._1 % 2 == 0).partitionBy(new HashPartitioner(3))) 127 | assert(a.partitions.size != b.partitions.size) 128 | val c = a.leftJoin(b) { (vid, old, newOpt) => 129 | old - newOpt.getOrElse(0) 130 | } 131 | assert(c.filter(v => v._2 != 0).map(_._1).collect.toSet == (1 to 99 by 2).toSet) 132 | } 133 | 134 | test("join") { 135 | val n = 100 136 | val ps = pairs(sc, n).cache() 137 | val evens = ps.filter(q => ((q._2 % 2) == 0)).cache() 138 | // join with another IndexedRDD 139 | assert(ps.join(evens) { (id, a, b) => a - b }.collect.toSet === 140 | (0 to n by 2).map(x => (x.toLong, 0)).toSet ++ (1 to n by 2).map(x => (x.toLong, x)).toSet) 141 | // join with an RDD 142 | val evensRDD = evens.map(identity) 143 | assert(ps.join(evensRDD) { (id, a, b) => a - b }.collect.toSet === 144 | (0 to n by 2).map(x => (x.toLong, 0)).toSet ++ (1 to n by 2).map(x => (x.toLong, x)).toSet) 145 | } 146 | 147 | test("innerJoin") { 148 | val n = 100 149 | val ps = pairs(sc, n).cache() 150 | val evens = ps.filter(q => ((q._2 % 2) == 0)).cache() 151 | // innerJoin with another IndexedRDD 152 | assert(ps.innerJoin(evens) { (id, a, b) => a - b }.collect.toSet === 153 | (0 to n by 2).map(x => (x.toLong, 0)).toSet) 154 | // innerJoin with an RDD 155 | val evensRDD = evens.map(identity) 156 | assert(ps.innerJoin(evensRDD) { (id, a, b) => a - b }.collect.toSet === 157 | (0 to n by 2).map(x => (x.toLong, 0)).toSet) 158 | } 159 | 160 | test("innerJoin with non-equal number of partitions") { 161 | val a = create(sc.parallelize(0 until 100, 2).map(i => (i.toLong, 1))) 162 | val b = create( 163 | a.filter(v => v._1 % 2 == 0).partitionBy(new HashPartitioner(3))) 164 | assert(a.partitions.size != b.partitions.size) 165 | val c = a.innerJoin(b) { (vid, old, newVal) => 166 | old - newVal 167 | } 168 | assert(c.filter(v => v._2 == 0).map(_._1).collect.toSet == (0 to 98 by 2).toSet) 169 | } 170 | 171 | test("aggregateUsingIndex") { 172 | val n = 100 173 | val ps = pairs(sc, n) 174 | val messageTargets = (0 to n) ++ (0 to n by 2) 175 | val messages = 
sc.parallelize(messageTargets.map(x => (x.toLong, 1))) 176 | assert(ps.aggregateUsingIndex[Int](messages, _ + _).collect.toSet === 177 | (0 to n).map(x => (x.toLong, if (x % 2 == 0) 2 else 1)).toSet) 178 | 179 | val messagesWithNew = List((0L, 1), (-1L, 1)) 180 | assert(ps.aggregateUsingIndex[Int](sc.parallelize(messagesWithNew), _ + _).collect.toSet === 181 | messagesWithNew.toSet) 182 | } 183 | } 184 | 185 | class UpdatableIndexedRDDSuite extends IndexedRDDSuite { 186 | override def create[V: ClassTag](elems: RDD[(Long, V)]): IndexedRDD[Long, V] = { 187 | import IndexedRDD._ 188 | IndexedRDD.updatable(elems) 189 | } 190 | 191 | test("put, multiput") { 192 | val n = 100 193 | val ps = pairs(sc, n).cache() 194 | assert(ps.multiput[Int](Map(0L -> 1, 1L -> 1), (id, a) => a, SumFunction).collect.toSet === 195 | Set(0L -> 1, 1L -> 2) ++ (2 to n).map(x => (x.toLong, x)).toSet) 196 | assert(ps.multiput[Int](Map(-1L -> -1, 0L -> 1), (id, a) => a, SumFunction).collect.toSet === 197 | Set(-1L -> -1, 0L -> 1) ++ (1 to n).map(x => (x.toLong, x)).toSet) 198 | assert(ps.multiput(Map(-1L -> -1, 0L -> 1, 1L -> 1)).collect.toSet === 199 | Set(-1L -> -1, 0L -> 1, 1L -> 1) ++ (2 to n).map(x => (x.toLong, x)).toSet) 200 | assert(ps.multiputRDD[Int](sc.parallelize(Seq(0L -> 1, 1L -> 1)), (id, a) => a, SumFunction).collect.toSet === 201 | Set(0L -> 1, 1L -> 2) ++ (2 to n).map(x => (x.toLong, x)).toSet) 202 | assert(ps.multiputRDD[Int](sc.parallelize(Seq(-1L -> -1, 0L -> 1)), (id, a) => a, SumFunction).collect.toSet === 203 | Set(-1L -> -1, 0L -> 1) ++ (1 to n).map(x => (x.toLong, x)).toSet) 204 | assert(ps.multiputRDD(sc.parallelize(Seq(-1L -> -1, 0L -> 1, 1L -> 1))).collect.toSet === 205 | Set(-1L -> -1, 0L -> 1, 1L -> 1) ++ (2 to n).map(x => (x.toLong, x)).toSet) 206 | assert(ps.put(-1L, -1).collect.toSet === 207 | Set(-1L -> -1) ++ (0 to n).map(x => (x.toLong, x)).toSet) 208 | assert(ps.put(0L, 1).collect.toSet === 209 | Set(0L -> 1) ++ (1 to n).map(x => (x.toLong, x)).toSet) 210 | } 211 | 212 | test("delete") { 213 | val n = 100 214 | val ps = pairs(sc, n).cache() 215 | assert(ps.delete(Array(0L)).collect.toSet === (1 to n).map(x => (x.toLong, x)).toSet) 216 | assert(ps.delete(Array(-1L)).collect.toSet === (0 to n).map(x => (x.toLong, x)).toSet) 217 | } 218 | } 219 | 220 | // Declared outside of test suite to avoid closure capture 221 | private object SumFunction extends Function3[Long, Int, Int, Int] with Serializable { 222 | def apply(id: Long, a: Int, b: Int) = a + b 223 | } 224 | -------------------------------------------------------------------------------- /src/test/scala/edu/berkeley/cs/amplab/spark/indexedrdd/KeySerializerSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package edu.berkeley.cs.amplab.spark.indexedrdd 19 | 20 | import java.util.UUID 21 | 22 | import org.scalacheck.Arbitrary 23 | import org.scalacheck.Gen 24 | import org.scalatest.FunSuite 25 | import org.scalatest.Matchers 26 | import org.scalatest.prop.GeneratorDrivenPropertyChecks 27 | 28 | class KeySerializerSuite extends FunSuite with GeneratorDrivenPropertyChecks with Matchers { 29 | 30 | test("long") { 31 | val ser = new LongSerializer 32 | forAll { (a: Long) => 33 | ser.fromBytes(ser.toBytes(a)) should be === a 34 | } 35 | } 36 | 37 | test("string") { 38 | val ser = new StringSerializer 39 | 40 | forAll { (a: String) => 41 | ser.fromBytes(ser.toBytes(a)) should be === a 42 | } 43 | 44 | forAll { (a: String, b: String) => 45 | whenever (a != b) { 46 | val aSer = ser.toBytes(a) 47 | val bSer = ser.toBytes(b) 48 | assert(!aSer.startsWith(bSer)) 49 | assert(!bSer.startsWith(aSer)) 50 | } 51 | } 52 | } 53 | 54 | test("short") { 55 | val ser = new ShortSerializer 56 | forAll { (a: Short) => 57 | ser.fromBytes(ser.toBytes(a)) should be === a 58 | } 59 | } 60 | 61 | test("int") { 62 | val ser = new IntSerializer 63 | forAll { (a: Int) => 64 | ser.fromBytes(ser.toBytes(a)) should be === a 65 | } 66 | } 67 | 68 | implicit val arbUUID: Arbitrary[UUID] = Arbitrary(Gen.uuid) 69 | 70 | test("UUID") { 71 | val ser = new UUIDSerializer 72 | forAll { (a: UUID) => 73 | ser.fromBytes(ser.toBytes(a)) should be === a 74 | } 75 | } 76 | 77 | test("bigint") { 78 | val ser = new BigIntSerializer 79 | 80 | forAll { (a: BigInt) => 81 | ser.fromBytes(ser.toBytes(a)) should be === a 82 | } 83 | 84 | forAll { (a: BigInt, b: BigInt) => 85 | whenever (a != b) { 86 | val aSer = ser.toBytes(a) 87 | val bSer = ser.toBytes(b) 88 | assert(!aSer.startsWith(bSer)) 89 | assert(!bSer.startsWith(aSer)) 90 | } 91 | } 92 | } 93 | 94 | def tuple2Test[A: Arbitrary, B: Arbitrary]( 95 | aSer: KeySerializer[A], bSer: KeySerializer[B]): Unit = { 96 | val ser = new Tuple2Serializer[A, B]()(aSer, bSer) 97 | 98 | forAll { (a: A, b: B) => 99 | ser.fromBytes(ser.toBytes(Tuple2(a, b))) should be === (a, b) 100 | } 101 | 102 | forAll { (a: (A, B), b: (A, B)) => 103 | whenever (a != b) { 104 | val aSer = ser.toBytes(a) 105 | val bSer = ser.toBytes(b) 106 | assert(!aSer.startsWith(bSer)) 107 | assert(!bSer.startsWith(aSer)) 108 | } 109 | } 110 | } 111 | 112 | test("Tuple2") { 113 | val stringSer = new StringSerializer 114 | val longSer = new LongSerializer 115 | val intSer = new IntSerializer 116 | val shortSer = new ShortSerializer 117 | val bigintSer = new BigIntSerializer 118 | val uuidSer = new UUIDSerializer 119 | 120 | tuple2Test[Long, Long](longSer, longSer) 121 | tuple2Test[String, Long](stringSer, longSer) 122 | tuple2Test[Long, String](longSer, stringSer) 123 | tuple2Test[String, String](stringSer, stringSer) 124 | tuple2Test[Short, Short](shortSer, shortSer) 125 | tuple2Test[Short, Int](shortSer, intSer) 126 | tuple2Test[Int, Int](intSer, intSer) 127 | tuple2Test[Int, BigInt](intSer, bigintSer) 128 | tuple2Test[BigInt, BigInt](bigintSer, bigintSer) 129 | tuple2Test[Int, UUID](intSer, uuidSer) 130 | tuple2Test[UUID, UUID](uuidSer, uuidSer) 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /src/test/scala/edu/berkeley/cs/amplab/spark/indexedrdd/SharedSparkContext.scala: -------------------------------------------------------------------------------- 
1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package edu.berkeley.cs.amplab.spark.indexedrdd 19 | 20 | import org.apache.spark.SparkConf 21 | import org.apache.spark.SparkContext 22 | import org.scalatest.BeforeAndAfterAll 23 | import org.scalatest.Suite 24 | 25 | /** Shares a local `SparkContext` between all tests in a suite and closes it at the end */ 26 | trait SharedSparkContext extends BeforeAndAfterAll { self: Suite => 27 | 28 | @transient private var _sc: SparkContext = _ 29 | 30 | def sc: SparkContext = _sc 31 | 32 | var conf = new SparkConf(false) 33 | 34 | override def beforeAll() { 35 | _sc = new SparkContext("local", "test", conf) 36 | super.beforeAll() 37 | } 38 | 39 | override def afterAll() { 40 | if (_sc != null) { 41 | _sc.stop() 42 | } 43 | // To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown 44 | System.clearProperty("spark.driver.port") 45 | _sc = null 46 | super.afterAll() 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/test/scala/edu/berkeley/cs/amplab/spark/indexedrdd/impl/IndexedRDDPartitionSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package edu.berkeley.cs.amplab.spark.indexedrdd.impl 19 | 20 | import scala.reflect.ClassTag 21 | import edu.berkeley.cs.amplab.spark.indexedrdd._ 22 | 23 | import org.apache.spark.SparkConf 24 | import org.apache.spark.serializer.JavaSerializer 25 | import org.apache.spark.serializer.KryoSerializer 26 | import org.scalatest.FunSuite 27 | 28 | abstract class IndexedRDDPartitionSuite extends FunSuite { 29 | 30 | def create[V: ClassTag](iter: Iterator[(Long, V)]): IndexedRDDPartition[Long, V] 31 | 32 | test("serialization") { 33 | val elems = Set((0L, 1), (1L, 1), (2L, 1)) 34 | val vp = create(elems.iterator) 35 | val javaSer = new JavaSerializer(new SparkConf()) 36 | val kryoSer = new KryoSerializer(new SparkConf() 37 | .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")) 38 | 39 | for (ser <- List(javaSer, kryoSer); s = ser.newInstance()) { 40 | val vpSer: IndexedRDDPartition[Long, Int] = s.deserialize(s.serialize(vp)) 41 | assert(vpSer.iterator.toSet === elems) 42 | } 43 | } 44 | 45 | test("get") { 46 | val elems = Set((0L, 1), (1L, 1), (2L, 1)) 47 | val vp = create(elems.iterator) 48 | assert(vp(0L) == Some(1)) 49 | assert(vp(1L) == Some(1)) 50 | assert(vp(2L) == Some(1)) 51 | assert(vp(3L) == None) 52 | 53 | assert(vp.multiget(Array(1L, 2L, 3L)).size == 2) 54 | } 55 | } 56 | 57 | class PARTPartitionSuite extends IndexedRDDPartitionSuite { 58 | override def create[V: ClassTag](iter: Iterator[(Long, V)]) = { 59 | import IndexedRDD._ 60 | PARTPartition(iter) 61 | } 62 | } 63 | 64 | class LazyPartitionSuite extends IndexedRDDPartitionSuite { 65 | override def create[V: ClassTag](iter: Iterator[(Long, V)]) = { 66 | import IndexedRDD._ 67 | val it = iter.toSeq 68 | new LazyPartition( 69 | Seq(PARTPartition(it.iterator), PARTPartition(it.iterator)), 70 | (id, a, b) => (a ++ b).headOption.getOrElse(null.asInstanceOf[V])) 71 | } 72 | } 73 | --------------------------------------------------------------------------------
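A note on extending the key serializers shown in KeySerializer.scala: they all follow two conventions, (1) fixed-width keys (Long, Int, Short, Char, UUID) are written big-endian, most significant byte first, and (2) variable-width keys (String, BigInt, Tuple2) carry a length prefix so that no encoded key is a byte-prefix of another, the property the in-code comments call out ("ensure no key is a prefix of any other"). A custom key type can usually be supported by composing the existing serializers rather than hand-rolling the byte manipulation. The sketch below is illustrative only: AccountKey and AccountKeySerializer are hypothetical names invented for this example and are not part of the repository.

import edu.berkeley.cs.amplab.spark.indexedrdd.{KeySerializer, LongSerializer, StringSerializer, Tuple2Serializer}

// Hypothetical composite key used only for illustration.
case class AccountKey(userId: Long, region: String)

// Sketch: delegate to Tuple2Serializer, which already writes the length of the
// first component, so the resulting encoding stays prefix-free.
class AccountKeySerializer extends KeySerializer[AccountKey] {
  private val tupleSer =
    new Tuple2Serializer[Long, String]()(new LongSerializer, new StringSerializer)

  override def toBytes(k: AccountKey): Array[Byte] =
    tupleSer.toBytes((k.userId, k.region))

  override def fromBytes(b: Array[Byte]): AccountKey = {
    val (userId, region) = tupleSer.fromBytes(b)
    AccountKey(userId, region)
  }
}

Such a serializer round-trips the same way the property checks in KeySerializerSuite exercise the built-in ones: fromBytes(toBytes(k)) returns the original key, and no two distinct encoded keys are prefixes of each other.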