├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── build.sbt ├── project ├── build.properties └── plugins.sbt ├── sbt ├── sbt └── sbt-launch-lib.bash └── src ├── main └── scala │ └── edu │ └── berkeley │ └── cs │ └── amplab │ └── spark │ └── indexedrdd │ ├── IndexedRDD.scala │ ├── IndexedRDDPartition.scala │ ├── KeySerializer.scala │ └── impl │ ├── LazyPartition.scala │ └── PARTPartition.scala └── test ├── resources └── log4j.properties └── scala └── edu └── berkeley └── cs └── amplab └── spark └── indexedrdd ├── IndexedRDDSuite.scala ├── KeySerializerSuite.scala ├── SharedSparkContext.scala └── impl └── IndexedRDDPartitionSuite.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | sbt/sbt-launch*.jar 5 | .idea/ 6 | .idea_modules/ 7 | .cache 8 | .history 9 | .lib/ 10 | dist/* 11 | target/ 12 | lib_managed/ 13 | src_managed/ 14 | project/boot/ 15 | project/plugins/project/ 16 | 17 | .scala_dependencies 18 | .worksheet 19 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.11.6 4 | - 2.10.6 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. 
For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # IndexedRDD for Apache Spark 2 | 3 | An efficient updatable key-value store for [Apache Spark](http://spark.apache.org). 4 | 5 | IndexedRDD extends `RDD[(K, V)]` by enforcing key uniqueness and pre-indexing the entries for efficient joins and point lookups, updates, and deletions. It is implemented by (1) hash-partitioning the entries by key, (2) maintaining a radix tree ([PART](https://github.com/ankurdave/part)) index within each partition, and (3) using this immutable and efficiently updatable data structure to enable efficient modifications and deletions. 6 | 7 | ## Usage 8 | 9 | Add the dependency to your SBT project by adding the following to `build.sbt` (see the [Spark Packages listing](http://spark-packages.org/package/amplab/spark-indexedrdd) for spark-submit and Maven instructions): 10 | 11 | ```scala 12 | resolvers += "Spark Packages Repo" at "http://dl.bintray.com/spark-packages/maven" 13 | 14 | libraryDependencies += "amplab" % "spark-indexedrdd" % "0.3" 15 | ``` 16 | 17 | Then use IndexedRDD as follows: 18 | 19 | ```scala 20 | import edu.berkeley.cs.amplab.spark.indexedrdd.IndexedRDD 21 | import edu.berkeley.cs.amplab.spark.indexedrdd.IndexedRDD._ 22 | 23 | // Create an RDD of key-value pairs with Long keys. 24 | val rdd = sc.parallelize((1 to 1000000).map(x => (x.toLong, 0))) 25 | // Construct an IndexedRDD from the pairs, hash-partitioning and indexing 26 | // the entries. 27 | val indexed = IndexedRDD(rdd).cache() 28 | 29 | // Perform a point update. 30 | val indexed2 = indexed.put(1234L, 10873).cache() 31 | // Perform a point lookup. Note that the original IndexedRDD remains 32 | // unmodified. 
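// (get returns an Option: a present key yields Some(value); a missing key yields None.)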
33 | indexed2.get(1234L) // => Some(10873) 34 | indexed.get(1234L) // => Some(0) 35 | 36 | // Efficiently join derived IndexedRDD with original. 37 | val indexed3 = indexed.innerJoin(indexed2) { (id, a, b) => b }.filter(_._2 != 0) 38 | indexed3.collect // => Array((1234L, 10873)) 39 | 40 | // Perform insertions and deletions. 41 | val indexed4 = indexed2.put(-100L, 111).delete(Array(998L, 999L)).cache() 42 | indexed2.get(-100L) // => None 43 | indexed4.get(-100L) // => Some(111) 44 | indexed2.get(999L) // => Some(0) 45 | indexed4.get(999L) // => None 46 | ``` 47 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "spark-indexedrdd" 2 | version := "0.4.0" 3 | organization := "edu.berkeley.cs.amplab" 4 | 5 | scalaVersion := "2.11.8" 6 | crossScalaVersions := Seq("2.10.6", "2.11.6") 7 | 8 | spName := "amplab/spark-indexedrdd" 9 | sparkVersion := "2.1.0" 10 | sparkComponents += "core" 11 | 12 | resolvers += "Repo at github.com/ankurdave/maven-repo" at "https://raw.githubusercontent.com/ankurdave/maven-repo/master" 13 | 14 | libraryDependencies ++= Seq( 15 | "com.ankurdave" % "part_2.10" % "0.1", // artifact is not published for 2.11, but it only contains Java code anyway 16 | "org.scalatest" %% "scalatest" % "2.2.4" % "test", 17 | "org.scalacheck" %% "scalacheck" % "1.12.2" % "test" 18 | ) 19 | 20 | publishMavenStyle := true 21 | 22 | licenses += "Apache-2.0" -> url("http://www.apache.org/licenses/LICENSE-2.0.html") 23 | 24 | pomExtra := 25 | https://github.com/amplab/spark-indexedrdd 26 | 27 | git@github.com:amplab/spark-indexedrdd.git 28 | scm:git:git@github.com:amplab/spark-indexedrdd.git 29 | 30 | 31 | 32 | ankurdave 33 | Ankur Dave 34 | https://github.com/ankurdave 35 | 36 | 37 | 38 | 39 | // Run tests with more memory 40 | javaOptions in test += "-Xmx2G" 41 | 42 | fork in test := true 43 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
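# Note: the bundled sbt launcher script reads sbt.version below to download the matching sbt-launch.jar.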
16 | # 17 | sbt.version=0.13.13 18 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | scalaVersion := "2.10.4" 2 | 3 | // resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns) 4 | 5 | // resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/" 6 | 7 | // resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/" 8 | 9 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0") 10 | 11 | resolvers += "Spark Package Main Repo" at "https://dl.bintray.com/spark-packages/maven" 12 | 13 | addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.2") 14 | -------------------------------------------------------------------------------- /sbt/sbt: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | realpath () { 4 | ( 5 | TARGET_FILE="$1" 6 | 7 | cd "$(dirname "$TARGET_FILE")" 8 | TARGET_FILE="$(basename "$TARGET_FILE")" 9 | 10 | COUNT=0 11 | while [ -L "$TARGET_FILE" -a $COUNT -lt 100 ] 12 | do 13 | TARGET_FILE="$(readlink "$TARGET_FILE")" 14 | cd $(dirname "$TARGET_FILE") 15 | TARGET_FILE="$(basename $TARGET_FILE)" 16 | COUNT=$(($COUNT + 1)) 17 | done 18 | 19 | echo "$(pwd -P)/"$TARGET_FILE"" 20 | ) 21 | } 22 | 23 | . "$(dirname "$(realpath "$0")")"/sbt-launch-lib.bash 24 | 25 | 26 | declare -r noshare_opts="-Dsbt.global.base=project/.sbtboot -Dsbt.boot.directory=project/.boot -Dsbt.ivy.home=project/.ivy" 27 | declare -r sbt_opts_file=".sbtopts" 28 | declare -r etc_sbt_opts_file="/etc/sbt/sbtopts" 29 | 30 | usage() { 31 | cat < path to global settings/plugins directory (default: ~/.sbt) 40 | -sbt-boot path to shared boot directory (default: ~/.sbt/boot in 0.11 series) 41 | -ivy path to local Ivy repository (default: ~/.ivy2) 42 | -mem set memory options (default: $sbt_mem, which is $(get_mem_opts $sbt_mem)) 43 | -no-share use all local caches; no sharing 44 | -no-global uses global caches, but does not use global ~/.sbt directory. 45 | -jvm-debug Turn on JVM debugging, open at the given port. 46 | -batch Disable interactive mode 47 | 48 | # sbt version (default: from project/build.properties if present, else latest release) 49 | -sbt-version use the specified version of sbt 50 | -sbt-jar use the specified jar as the sbt launcher 51 | -sbt-rc use an RC version of sbt 52 | -sbt-snapshot use a snapshot version of sbt 53 | 54 | # java version (default: java from PATH, currently $(java -version 2>&1 | grep version)) 55 | -java-home alternate JAVA_HOME 56 | 57 | # jvm options and output control 58 | JAVA_OPTS environment variable, if unset uses "$java_opts" 59 | SBT_OPTS environment variable, if unset uses "$default_sbt_opts" 60 | .sbtopts if this file exists in the current directory, it is 61 | prepended to the runner args 62 | /etc/sbt/sbtopts if this file exists, it is prepended to the runner args 63 | -Dkey=val pass -Dkey=val directly to the java runtime 64 | -J-X pass option -X directly to the java runtime 65 | (-J is stripped) 66 | -S-X add -X to sbt's scalacOptions (-J is stripped) 67 | -PmavenProfiles Enable a maven profile for the build. 68 | 69 | In the case of duplicated or conflicting options, the order above 70 | shows precedence: JAVA_OPTS lowest, command line options highest. 
71 | EOM 72 | } 73 | 74 | process_my_args () { 75 | while [[ $# -gt 0 ]]; do 76 | case "$1" in 77 | -no-colors) addJava "-Dsbt.log.noformat=true" && shift ;; 78 | -no-share) addJava "$noshare_opts" && shift ;; 79 | -no-global) addJava "-Dsbt.global.base=$(pwd)/project/.sbtboot" && shift ;; 80 | -sbt-boot) require_arg path "$1" "$2" && addJava "-Dsbt.boot.directory=$2" && shift 2 ;; 81 | -sbt-dir) require_arg path "$1" "$2" && addJava "-Dsbt.global.base=$2" && shift 2 ;; 82 | -debug-inc) addJava "-Dxsbt.inc.debug=true" && shift ;; 83 | -batch) exec &2 "$@" 31 | } 32 | vlog () { 33 | [[ $verbose || $debug ]] && echoerr "$@" 34 | } 35 | dlog () { 36 | [[ $debug ]] && echoerr "$@" 37 | } 38 | 39 | acquire_sbt_jar () { 40 | SBT_VERSION=`awk -F "=" '/sbt\\.version/ {print $2}' ./project/build.properties` 41 | URL1=http://dl.bintray.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar 42 | URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar 43 | JAR=sbt/sbt-launch-${SBT_VERSION}.jar 44 | 45 | sbt_jar=$JAR 46 | 47 | if [[ ! -f "$sbt_jar" ]]; then 48 | # Download sbt launch jar if it hasn't been downloaded yet 49 | if [ ! -f "${JAR}" ]; then 50 | # Download 51 | printf "Attempting to fetch sbt\n" 52 | JAR_DL="${JAR}.part" 53 | if hash curl 2>/dev/null; then 54 | (curl -L --silent ${URL1} > "${JAR_DL}" || curl -L --silent ${URL2} > "${JAR_DL}") && mv "${JAR_DL}" "${JAR}" 55 | elif hash wget 2>/dev/null; then 56 | (wget --quiet ${URL1} -O "${JAR_DL}" || wget --quiet ${URL2} -O "${JAR_DL}") && mv "${JAR_DL}" "${JAR}" 57 | else 58 | printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n" 59 | exit -1 60 | fi 61 | fi 62 | if [ ! -f "${JAR}" ]; then 63 | # We failed to download 64 | printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n" 65 | exit -1 66 | fi 67 | printf "Launching sbt from ${JAR}\n" 68 | fi 69 | } 70 | 71 | execRunner () { 72 | # print the arguments one to a line, quoting any containing spaces 73 | [[ $verbose || $debug ]] && echo "# Executing command line:" && { 74 | for arg; do 75 | if printf "%s\n" "$arg" | grep -q ' '; then 76 | printf "\"%s\"\n" "$arg" 77 | else 78 | printf "%s\n" "$arg" 79 | fi 80 | done 81 | echo "" 82 | } 83 | 84 | exec "$@" 85 | } 86 | 87 | addJava () { 88 | dlog "[addJava] arg = '$1'" 89 | java_args=( "${java_args[@]}" "$1" ) 90 | } 91 | 92 | enableProfile () { 93 | dlog "[enableProfile] arg = '$1'" 94 | maven_profiles=( "${maven_profiles[@]}" "$1" ) 95 | export SBT_MAVEN_PROFILES="${maven_profiles[@]}" 96 | } 97 | 98 | addSbt () { 99 | dlog "[addSbt] arg = '$1'" 100 | sbt_commands=( "${sbt_commands[@]}" "$1" ) 101 | } 102 | addResidual () { 103 | dlog "[residual] arg = '$1'" 104 | residual_args=( "${residual_args[@]}" "$1" ) 105 | } 106 | addDebugger () { 107 | addJava "-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=$1" 108 | } 109 | 110 | # a ham-fisted attempt to move some memory settings in concert 111 | # so they need not be dicked around with individually. 
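# Illustrative expansion: with the default of 2048 MB, get_mem_opts yields
#   -Xms2048m -Xmx2048m -XX:MaxPermSize=512m -XX:ReservedCodeCacheSize=256m
# (perm is mem/4 clamped to the range 256-4096 MB; the code cache is half of perm).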
112 | get_mem_opts () { 113 | local mem=${1:-2048} 114 | local perm=$(( $mem / 4 )) 115 | (( $perm > 256 )) || perm=256 116 | (( $perm < 4096 )) || perm=4096 117 | local codecache=$(( $perm / 2 )) 118 | 119 | echo "-Xms${mem}m -Xmx${mem}m -XX:MaxPermSize=${perm}m -XX:ReservedCodeCacheSize=${codecache}m" 120 | } 121 | 122 | require_arg () { 123 | local type="$1" 124 | local opt="$2" 125 | local arg="$3" 126 | if [[ -z "$arg" ]] || [[ "${arg:0:1}" == "-" ]]; then 127 | die "$opt requires <$type> argument" 128 | fi 129 | } 130 | 131 | is_function_defined() { 132 | declare -f "$1" > /dev/null 133 | } 134 | 135 | process_args () { 136 | while [[ $# -gt 0 ]]; do 137 | case "$1" in 138 | -h|-help) usage; exit 1 ;; 139 | -v|-verbose) verbose=1 && shift ;; 140 | -d|-debug) debug=1 && shift ;; 141 | 142 | -ivy) require_arg path "$1" "$2" && addJava "-Dsbt.ivy.home=$2" && shift 2 ;; 143 | -mem) require_arg integer "$1" "$2" && sbt_mem="$2" && shift 2 ;; 144 | -jvm-debug) require_arg port "$1" "$2" && addDebugger $2 && shift 2 ;; 145 | -batch) exec partitioner.get.getPartition(k)) 81 | val partitions = ksByPartition.keys.toSeq 82 | // TODO: avoid sending all keys to all partitions by creating and zipping an RDD of keys 83 | val results: Array[Array[(K, V)]] = context.runJob(partitionsRDD, 84 | (context: TaskContext, partIter: Iterator[IndexedRDDPartition[K, V]]) => { 85 | if (partIter.hasNext && ksByPartition.contains(context.partitionId)) { 86 | val part = partIter.next() 87 | val ksForPartition = ksByPartition.get(context.partitionId).get 88 | part.multiget(ksForPartition).toArray 89 | } else { 90 | Array.empty[(K, V)] 91 | } 92 | }, partitions) 93 | results.flatten.toMap 94 | } 95 | 96 | /** 97 | * Unconditionally updates the specified key to have the specified value. Returns a new IndexedRDD 98 | * that reflects the modification. 99 | * 100 | * Some implementations may not support this operation and will throw 101 | * `UnsupportedOperationException`. 102 | */ 103 | def put(k: K, v: V): IndexedRDD[K, V] = multiput(Map(k -> v)) 104 | 105 | /** 106 | * Unconditionally updates the keys in `kvs` to their corresponding values. Returns a new 107 | * IndexedRDD that reflects the modification. 108 | * 109 | * Some implementations may not support this operation and will throw 110 | * `UnsupportedOperationException`. 111 | */ 112 | def multiput(kvs: Map[K, V]): IndexedRDD[K, V] = 113 | multiput(kvs, (id: K, a: V) => a, (id: K, a: V, b: V) => b) 114 | 115 | /** 116 | * Unconditionally updates the keys in `kvs` to their corresponding values. Returns a new 117 | * IndexedRDD that reflects the modification. 118 | * 119 | * Some implementations may not support this operation and will throw 120 | * `UnsupportedOperationException`. 121 | */ 122 | def multiputRDD(kvs: RDD[(K, V)]): IndexedRDD[K, V] = 123 | multiputRDD(kvs, (id: K, a: V) => a, (id: K, a: V, b: V) => b) 124 | 125 | /** 126 | * Updates the keys in `kvs` to their corresponding values, running `merge` on old and new values 127 | * if necessary. Returns a new IndexedRDD that reflects the modification. 128 | * 129 | * Some implementations may not support this operation and will throw 130 | * `UnsupportedOperationException`. 131 | */ 132 | def multiput(kvs: Map[K, V], merge: (K, V, V) => V): IndexedRDD[K, V] = 133 | multiput(kvs, (id: K, a: V) => a, merge) 134 | 135 | /** 136 | * Updates the keys in `kvs` to their corresponding values, running `merge` on old and new values 137 | * if necessary. Returns a new IndexedRDD that reflects the modification. 
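   * For example, `indexed.multiput(Map(1L -> 5), (k, oldV, newV) => oldV + newV)` adds 5 to the
   * existing value of key 1, or inserts 5 if the key is absent.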
138 | * 139 | * Some implementations may not support this operation and will throw 140 | * `UnsupportedOperationException`. 141 | */ 142 | def multiputRDD(kvs: RDD[(K, V)], merge: (K, V, V) => V): IndexedRDD[K, V] = 143 | multiputRDD(kvs, (id: K, a: V) => a, merge) 144 | 145 | /** 146 | * Updates the keys in `kvs` to their corresponding values, running `merge` on old and new values 147 | * if necessary. Returns a new IndexedRDD that reflects the modification. 148 | * 149 | * Some implementations may not support this operation and will throw 150 | * `UnsupportedOperationException`. 151 | */ 152 | def multiput[U: ClassTag](kvs: Map[K, U], project: (K, U) => V, merge: (K, V, U) => V): IndexedRDD[K, V] = 153 | multiputRDD(context.parallelize(kvs.toSeq), project, merge) 154 | 155 | /** 156 | * Updates the keys in `kvs` to their corresponding values, running `merge` on old and new values 157 | * if necessary. Returns a new IndexedRDD that reflects the modification. 158 | * 159 | * Some implementations may not support this operation and will throw 160 | * `UnsupportedOperationException`. 161 | */ 162 | def multiputRDD[U: ClassTag](updates: RDD[(K, U)], project: (K, U) => V, merge: (K, V, U) => V): IndexedRDD[K, V] = { 163 | zipPartitionsWithOther(updates.partitionBy(partitioner.get))(new MultiputZipper(project, merge)) 164 | } 165 | 166 | /** 167 | * Deletes the specified keys. Returns a new IndexedRDD that reflects the deletions. 168 | * 169 | * Some implementations may not support this operation and will throw 170 | * `UnsupportedOperationException`. 171 | */ 172 | def delete(ks: Array[K]): IndexedRDD[K, V] = { 173 | val deletions = context.parallelize(ks.map(k => (k, ()))).partitionBy(partitioner.get) 174 | zipPartitionsWithOther(deletions)(new DeleteZipper) 175 | } 176 | 177 | /** Applies a function to each partition of this IndexedRDD. */ 178 | private def mapIndexedRDDPartitions[K2: ClassTag, V2: ClassTag]( 179 | f: IndexedRDDPartition[K, V] => IndexedRDDPartition[K2, V2]): IndexedRDD[K2, V2] = { 180 | val newPartitionsRDD = partitionsRDD.mapPartitions(_.map(f), preservesPartitioning = true) 181 | new IndexedRDD(newPartitionsRDD) 182 | } 183 | 184 | /** Applies a function to corresponding partitions of `this` and another IndexedRDD. */ 185 | private def zipIndexedRDDPartitions[V2: ClassTag, V3: ClassTag](other: IndexedRDD[K, V2]) 186 | (f: ZipPartitionsFunction[V2, V3]): IndexedRDD[K, V3] = { 187 | assert(partitioner == other.partitioner) 188 | val newPartitionsRDD = partitionsRDD.zipPartitions(other.partitionsRDD, true)(f) 189 | new IndexedRDD(newPartitionsRDD) 190 | } 191 | 192 | /** Applies a function to corresponding partitions of `this` and a pair RDD. */ 193 | private def zipPartitionsWithOther[V2: ClassTag, V3: ClassTag](other: RDD[(K, V2)]) 194 | (f: OtherZipPartitionsFunction[V2, V3]): IndexedRDD[K, V3] = { 195 | val partitioned = other.partitionBy(partitioner.get) 196 | val newPartitionsRDD = partitionsRDD.zipPartitions(partitioned, true)(f) 197 | new IndexedRDD(newPartitionsRDD) 198 | } 199 | 200 | /** 201 | * Restricts the entries to those satisfying the given predicate. This operation preserves the 202 | * index for efficient joins with the original IndexedRDD and is implemented using soft deletions. 
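   * For example, `indexed.filter { case (k, v) => v != 0 }` drops zero-valued entries while keeping
   * the index intact for later joins.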
203 | * 204 | * @param pred the user defined predicate, which takes a tuple to conform to the `RDD[(K, V)]` 205 | * interface 206 | */ 207 | override def filter(pred: Tuple2[K, V] => Boolean): IndexedRDD[K, V] = 208 | this.mapIndexedRDDPartitions(_.filter(Function.untupled(pred))) 209 | 210 | /** Maps each value, preserving the index. */ 211 | def mapValues[V2: ClassTag](f: V => V2): IndexedRDD[K, V2] = 212 | this.mapIndexedRDDPartitions(_.mapValues((vid, attr) => f(attr))) 213 | 214 | /** Maps each value, supplying the corresponding key and preserving the index. */ 215 | def mapValues[V2: ClassTag](f: (K, V) => V2): IndexedRDD[K, V2] = 216 | this.mapIndexedRDDPartitions(_.mapValues(f)) 217 | 218 | /** 219 | * Intersects `this` and `other` and keeps only elements with differing values. For these 220 | * elements, keeps the values from `this`. 221 | */ 222 | def diff(other: RDD[(K, V)]): IndexedRDD[K, V] = other match { 223 | case other: IndexedRDD[K, V] if partitioner == other.partitioner => 224 | this.zipIndexedRDDPartitions(other)(new DiffZipper) 225 | case _ => 226 | this.zipPartitionsWithOther(other)(new OtherDiffZipper) 227 | } 228 | 229 | /** 230 | * Joins `this` with `other`, running `f` on the values of all keys in both sets. Note that for 231 | * efficiency `other` must be an IndexedRDD, not just a pair RDD. Use [[aggregateUsingIndex]] to 232 | * construct an IndexedRDD co-partitioned with `this`. 233 | * 234 | * @param maybeLazy if true, a joined "view" of the input RDDs (that preserves the underlying 235 | * indices) may be returned 236 | */ 237 | def fullOuterJoin[V2: ClassTag, W: ClassTag] 238 | (other: RDD[(K, V2)], maybeLazy: Boolean = false) 239 | (f: (K, Option[V], Option[V2]) => W): IndexedRDD[K, W] = other match { 240 | case other: IndexedRDD[K, V2] if partitioner == other.partitioner => { 241 | val castFn = implicitly[ClassTag[(K, Option[V], Option[V]) => V]] 242 | val castRDD = implicitly[ClassTag[IndexedRDD[K, V]]] 243 | (other, f) match { 244 | case (castRDD(other), castFn(f)) if maybeLazy => 245 | this.zipIndexedRDDPartitions(other)(new LazyFullOuterJoinZipper(f)).asInstanceOf[IndexedRDD[K, W]] 246 | case (other, f) => 247 | this.zipIndexedRDDPartitions(other)(new FullOuterJoinZipper(f)) 248 | } 249 | } 250 | case _ => 251 | this.zipPartitionsWithOther(other)(new OtherFullOuterJoinZipper(f)) 252 | } 253 | 254 | /** 255 | * Left outer joins `this` with `other`, running `f` on the values of corresponding keys. Because 256 | * values in `this` with no corresponding entries in `other` are preserved, `f` cannot change the 257 | * value type. 258 | */ 259 | def join[U: ClassTag] 260 | (other: RDD[(K, U)])(f: (K, V, U) => V): IndexedRDD[K, V] = other match { 261 | case other: IndexedRDD[K, U] if partitioner == other.partitioner => 262 | this.zipIndexedRDDPartitions(other)(new JoinZipper(f)) 263 | case _ => 264 | this.zipPartitionsWithOther(other)(new OtherJoinZipper(f)) 265 | } 266 | 267 | /** Left outer joins `this` with `other`, running `f` on all values of `this`. */ 268 | def leftJoin[V2: ClassTag, V3: ClassTag] 269 | (other: RDD[(K, V2)])(f: (K, V, Option[V2]) => V3): IndexedRDD[K, V3] = other match { 270 | case other: IndexedRDD[K, V2] if partitioner == other.partitioner => 271 | this.zipIndexedRDDPartitions(other)(new LeftJoinZipper(f)) 272 | case _ => 273 | this.zipPartitionsWithOther(other)(new OtherLeftJoinZipper(f)) 274 | } 275 | 276 | /** Inner joins `this` with `other`, running `f` on the values of corresponding keys. 
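   * For example, `a.innerJoin(b) { (k, x, y) => (x, y) }` keeps only keys present in both RDDs and
   * pairs their values.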
*/ 277 | def innerJoin[V2: ClassTag, V3: ClassTag](other: RDD[(K, V2)]) 278 | (f: (K, V, V2) => V3): IndexedRDD[K, V3] = other match { 279 | case other: IndexedRDD[K, V2] if partitioner == other.partitioner => 280 | this.zipIndexedRDDPartitions(other)(new InnerJoinZipper(f)) 281 | case _ => 282 | this.zipPartitionsWithOther(other)(new OtherInnerJoinZipper(f)) 283 | } 284 | 285 | /** 286 | * Creates a new IndexedRDD with values from `elems` that may share an index with `this`, 287 | * merging duplicate keys in `elems` arbitrarily. 288 | */ 289 | def createUsingIndex[V2: ClassTag](elems: RDD[(K, V2)]): IndexedRDD[K, V2] = { 290 | this.zipPartitionsWithOther(elems)(new CreateUsingIndexZipper) 291 | } 292 | 293 | /** Creates a new IndexedRDD with values from `elems` that may share an index with `this`. */ 294 | def aggregateUsingIndex[V2: ClassTag]( 295 | elems: RDD[(K, V2)], reduceFunc: (V2, V2) => V2): IndexedRDD[K, V2] = { 296 | this.zipPartitionsWithOther(elems)(new AggregateUsingIndexZipper(reduceFunc)) 297 | } 298 | 299 | /** 300 | * Optionally rebuilds the indexes of this IndexedRDD. Depending on the implementation, this may 301 | * remove tombstoned entries and the resulting IndexedRDD may not support efficient joins with the 302 | * original one. 303 | */ 304 | def reindex(): IndexedRDD[K, V] = this.mapIndexedRDDPartitions(_.reindex()) 305 | 306 | // The following functions could have been anonymous, but we name them to work around a Scala 307 | // compiler bug related to specialization. 308 | 309 | private type ZipPartitionsFunction[V2, V3] = 310 | Function2[Iterator[IndexedRDDPartition[K, V]], Iterator[IndexedRDDPartition[K, V2]], 311 | Iterator[IndexedRDDPartition[K, V3]]] 312 | 313 | private type OtherZipPartitionsFunction[V2, V3] = 314 | Function2[Iterator[IndexedRDDPartition[K, V]], Iterator[(K, V2)], 315 | Iterator[IndexedRDDPartition[K, V3]]] 316 | 317 | private class MultiputZipper[U](z: (K, U) => V, f: (K, V, U) => V) 318 | extends OtherZipPartitionsFunction[U, V] with Serializable { 319 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[(K, U)]) 320 | : Iterator[IndexedRDDPartition[K, V]] = { 321 | val thisPart = thisIter.next() 322 | Iterator(thisPart.multiput(otherIter, z, f)) 323 | } 324 | } 325 | 326 | private class DeleteZipper extends OtherZipPartitionsFunction[Unit, V] with Serializable { 327 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[(K, Unit)]) 328 | : Iterator[IndexedRDDPartition[K, V]] = { 329 | val thisPart = thisIter.next() 330 | Iterator(thisPart.delete(otherIter.map(_._1))) 331 | } 332 | } 333 | 334 | private class DiffZipper extends ZipPartitionsFunction[V, V] with Serializable { 335 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[IndexedRDDPartition[K, V]]): Iterator[IndexedRDDPartition[K, V]] = { 336 | val thisPart = thisIter.next() 337 | val otherPart = otherIter.next() 338 | Iterator(thisPart.diff(otherPart)) 339 | } 340 | } 341 | 342 | private class OtherDiffZipper extends OtherZipPartitionsFunction[V, V] with Serializable { 343 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[(K, V)]): Iterator[IndexedRDDPartition[K, V]] = { 344 | val thisPart = thisIter.next() 345 | Iterator(thisPart.diff(otherIter)) 346 | } 347 | } 348 | 349 | private class FullOuterJoinZipper[V2: ClassTag, W: ClassTag](f: (K, Option[V], Option[V2]) => W) 350 | extends ZipPartitionsFunction[V2, W] with Serializable { 351 | def apply( 352 | thisIter: 
Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[IndexedRDDPartition[K, V2]]) 353 | : Iterator[IndexedRDDPartition[K, W]] = { 354 | val thisPart = thisIter.next() 355 | val otherPart = otherIter.next() 356 | Iterator(thisPart.fullOuterJoin(otherPart)(f)) 357 | } 358 | } 359 | 360 | private class LazyFullOuterJoinZipper(f: (K, Option[V], Option[V]) => V) 361 | extends ZipPartitionsFunction[V, V] with Serializable { 362 | def apply( 363 | thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[IndexedRDDPartition[K, V]]) 364 | : Iterator[IndexedRDDPartition[K, V]] = { 365 | val thisPart = thisIter.next() 366 | val otherPart = otherIter.next() 367 | (thisPart, otherPart) match { 368 | case (thisPart: LazyPartition[K, V], otherPart: LazyPartition[K, V]) if thisPart.reducer == f && otherPart.reducer == f => 369 | Iterator(new LazyPartition(thisPart.partitions ++ otherPart.partitions, f)) 370 | case (thisPart: LazyPartition[K, V], _) if thisPart.reducer == f => 371 | Iterator(new LazyPartition(thisPart.partitions :+ otherPart, f)) 372 | case (_, otherPart: LazyPartition[K, V]) if otherPart.reducer == f => 373 | Iterator(new LazyPartition(thisPart +: otherPart.partitions, f)) 374 | case _ => 375 | Iterator(new LazyPartition(Seq(thisPart, otherPart), f)) 376 | } 377 | } 378 | } 379 | 380 | private class OtherFullOuterJoinZipper[V2: ClassTag, W: ClassTag](f: (K, Option[V], Option[V2]) => W) 381 | extends OtherZipPartitionsFunction[V2, W] with Serializable { 382 | def apply( 383 | thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[(K, V2)]) 384 | : Iterator[IndexedRDDPartition[K, W]] = { 385 | val thisPart = thisIter.next() 386 | Iterator(thisPart.fullOuterJoin(otherIter)(f)) 387 | } 388 | } 389 | 390 | private class JoinZipper[U: ClassTag](f: (K, V, U) => V) 391 | extends ZipPartitionsFunction[U, V] with Serializable { 392 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[IndexedRDDPartition[K, U]]): Iterator[IndexedRDDPartition[K, V]] = { 393 | val thisPart = thisIter.next() 394 | val otherPart = otherIter.next() 395 | Iterator(thisPart.join(otherPart)(f)) 396 | } 397 | } 398 | 399 | private class OtherJoinZipper[U: ClassTag](f: (K, V, U) => V) 400 | extends OtherZipPartitionsFunction[U, V] with Serializable { 401 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[(K, U)]): Iterator[IndexedRDDPartition[K, V]] = { 402 | val thisPart = thisIter.next() 403 | Iterator(thisPart.join(otherIter)(f)) 404 | } 405 | } 406 | 407 | private class LeftJoinZipper[V2: ClassTag, V3: ClassTag](f: (K, V, Option[V2]) => V3) 408 | extends ZipPartitionsFunction[V2, V3] with Serializable { 409 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[IndexedRDDPartition[K, V2]]): Iterator[IndexedRDDPartition[K, V3]] = { 410 | val thisPart = thisIter.next() 411 | val otherPart = otherIter.next() 412 | Iterator(thisPart.leftJoin(otherPart)(f)) 413 | } 414 | } 415 | 416 | private class OtherLeftJoinZipper[V2: ClassTag, V3: ClassTag](f: (K, V, Option[V2]) => V3) 417 | extends OtherZipPartitionsFunction[V2, V3] with Serializable { 418 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[(K, V2)]): Iterator[IndexedRDDPartition[K, V3]] = { 419 | val thisPart = thisIter.next() 420 | Iterator(thisPart.leftJoin(otherIter)(f)) 421 | } 422 | } 423 | 424 | private class InnerJoinZipper[V2: ClassTag, V3: ClassTag](f: (K, V, V2) => V3) 425 | extends ZipPartitionsFunction[V2, V3] with Serializable { 
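    // Each iterator holds exactly one IndexedRDDPartition for the corresponding Spark partition;
    // the actual join is delegated to the partition-level innerJoin.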
426 | def apply( 427 | thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[IndexedRDDPartition[K, V2]]) 428 | : Iterator[IndexedRDDPartition[K, V3]] = { 429 | val thisPart = thisIter.next() 430 | val otherPart = otherIter.next() 431 | Iterator(thisPart.innerJoin(otherPart)(f)) 432 | } 433 | } 434 | 435 | private class OtherInnerJoinZipper[V2: ClassTag, V3: ClassTag](f: (K, V, V2) => V3) 436 | extends OtherZipPartitionsFunction[V2, V3] with Serializable { 437 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[(K, V2)]) 438 | : Iterator[IndexedRDDPartition[K, V3]] = { 439 | val thisPart = thisIter.next() 440 | Iterator(thisPart.innerJoin(otherIter)(f)) 441 | } 442 | } 443 | 444 | private class CreateUsingIndexZipper[V2: ClassTag] 445 | extends OtherZipPartitionsFunction[V2, V2] with Serializable { 446 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[(K, V2)]): Iterator[IndexedRDDPartition[K, V2]] = { 447 | val thisPart = thisIter.next() 448 | Iterator(thisPart.createUsingIndex(otherIter)) 449 | } 450 | } 451 | 452 | private class AggregateUsingIndexZipper[V2: ClassTag](reduceFunc: (V2, V2) => V2) 453 | extends OtherZipPartitionsFunction[V2, V2] with Serializable { 454 | def apply(thisIter: Iterator[IndexedRDDPartition[K, V]], otherIter: Iterator[(K, V2)]): Iterator[IndexedRDDPartition[K, V2]] = { 455 | val thisPart = thisIter.next() 456 | Iterator(thisPart.aggregateUsingIndex(otherIter, reduceFunc)) 457 | } 458 | } 459 | } 460 | 461 | object IndexedRDD { 462 | /** 463 | * Constructs an updatable IndexedRDD from an RDD of pairs, merging duplicate keys arbitrarily. 464 | */ 465 | def apply[K: ClassTag : KeySerializer, V: ClassTag] 466 | (elems: RDD[(K, V)]): IndexedRDD[K, V] = updatable(elems) 467 | 468 | /** 469 | * Constructs an updatable IndexedRDD from an RDD of pairs, merging duplicate keys arbitrarily. 470 | */ 471 | def updatable[K: ClassTag : KeySerializer, V: ClassTag] 472 | (elems: RDD[(K, V)]) 473 | : IndexedRDD[K, V] = updatable[K, V, V](elems, (id, a) => a, (id, a, b) => b) 474 | 475 | /** Constructs an IndexedRDD from an RDD of pairs. 
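   * `z` projects an incoming value to `V` when its key is new; `f` merges an existing `V` with a
   * later duplicate. Illustratively, `updatable(pairs, (k: Long, u: Int) => u, (k: Long, v: Int, u: Int) => v + u)`
   * sums the values of duplicate keys.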
*/ 476 | def updatable[K: ClassTag : KeySerializer, U: ClassTag, V: ClassTag] 477 | (elems: RDD[(K, U)], z: (K, U) => V, f: (K, V, U) => V) 478 | : IndexedRDD[K, V] = { 479 | val elemsPartitioned = 480 | if (elems.partitioner.isDefined) elems 481 | else elems.partitionBy(new HashPartitioner(elems.partitions.size)) 482 | val partitions = elemsPartitioned.mapPartitions[IndexedRDDPartition[K, V]]( 483 | iter => Iterator(PARTPartition(iter, z, f)), 484 | preservesPartitioning = true) 485 | new IndexedRDD(partitions) 486 | } 487 | 488 | implicit val longSer = new LongSerializer 489 | implicit val stringSer = new StringSerializer 490 | implicit val shortSer = new ShortSerializer 491 | implicit val charSer = new CharSerializer 492 | implicit val intSet = new IntSerializer 493 | implicit val bigintSer = new BigIntSerializer 494 | implicit val uuidSer = new UUIDSerializer 495 | 496 | implicit def tuple2Ser[A, B]( 497 | implicit aSer: KeySerializer[A], bSer: KeySerializer[B]): Tuple2Serializer[A, B] = 498 | new Tuple2Serializer()(aSer, bSer) 499 | } 500 | -------------------------------------------------------------------------------- /src/main/scala/edu/berkeley/cs/amplab/spark/indexedrdd/IndexedRDDPartition.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package edu.berkeley.cs.amplab.spark.indexedrdd 19 | 20 | import scala.reflect.ClassTag 21 | 22 | /** 23 | * A map of key-value `(K, V)` pairs that enforces key uniqueness and pre-indexes the entries for 24 | * fast lookups, joins, and optionally updates. To construct an `IndexedRDDPartition`, use one of 25 | * the constructors in the [[edu.berkeley.cs.amplab.spark.indexedrdd.IndexedRDDPartition$ 26 | * IndexedRDDPartition object]]. 27 | * 28 | * @tparam K the key associated with each entry in the set. 29 | * @tparam V the value associated with each entry in the set. 30 | */ 31 | private[indexedrdd] abstract class IndexedRDDPartition[K, V] extends Serializable { 32 | 33 | protected implicit def kTag: ClassTag[K] 34 | protected implicit def vTag: ClassTag[V] 35 | 36 | def size: Long 37 | 38 | /** Return the value for the given key. */ 39 | def apply(k: K): Option[V] 40 | 41 | def isDefined(k: K): Boolean = 42 | apply(k).isDefined 43 | 44 | def iterator: Iterator[(K, V)] 45 | 46 | /** 47 | * Gets the values corresponding to the specified keys, if any. 48 | */ 49 | def multiget(ks: Array[K]): Iterator[(K, V)] 50 | 51 | /** 52 | * Updates the keys in `kvs` to their corresponding values generated by running `f` on old and new 53 | * values, if an old value exists, or `z` otherwise. Returns a new IndexedRDDPartition that 54 | * reflects the modification. 
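   * Illustratively, with `z = (k, u) => u` and `f = (k, v, u) => v + u`, absent keys are inserted
   * as-is and existing keys accumulate the new value.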
55 | */ 56 | def multiput[U]( 57 | kvs: Iterator[(K, U)], z: (K, U) => V, f: (K, V, U) => V): IndexedRDDPartition[K, V] = 58 | throw new UnsupportedOperationException("modifications not supported") 59 | 60 | /** Deletes the specified keys. Returns a new IndexedRDDPartition that reflects the deletions. */ 61 | def delete(ks: Iterator[K]): IndexedRDDPartition[K, V] = 62 | throw new UnsupportedOperationException("modifications not supported") 63 | 64 | /** Maps each value, supplying the corresponding key and preserving the index. */ 65 | def mapValues[V2: ClassTag](f: (K, V) => V2): IndexedRDDPartition[K, V2] 66 | 67 | /** 68 | * Restricts the entries to those satisfying the given predicate. 69 | */ 70 | def filter(pred: (K, V) => Boolean): IndexedRDDPartition[K, V] 71 | 72 | /** 73 | * Intersects `this` and `other` and keeps only elements with differing values. For these 74 | * elements, keeps the values from `this`. 75 | */ 76 | def diff(other: IndexedRDDPartition[K, V]): IndexedRDDPartition[K, V] 77 | 78 | /** 79 | * Intersects `this` and `other` and keeps only elements with differing values. For these 80 | * elements, keeps the values from `this`. 81 | */ 82 | def diff(other: Iterator[(K, V)]): IndexedRDDPartition[K, V] 83 | 84 | /** Joins `this` with `other`, running `f` on the values of all keys in both sets. */ 85 | def fullOuterJoin[V2: ClassTag, W: ClassTag] 86 | (other: IndexedRDDPartition[K, V2]) 87 | (f: (K, Option[V], Option[V2]) => W): IndexedRDDPartition[K, W] 88 | 89 | /** Joins `this` with `other`, running `f` on the values of all keys in both sets. */ 90 | def fullOuterJoin[V2: ClassTag, W: ClassTag] 91 | (other: Iterator[(K, V2)]) 92 | (f: (K, Option[V], Option[V2]) => W): IndexedRDDPartition[K, W] 93 | 94 | /** 95 | * Left outer joins `this` with `other`, running `f` on the values of corresponding keys. Because 96 | * values in `this` with no corresponding entries in `other` are preserved, `f` cannot change the 97 | * value type. 98 | */ 99 | def join[U: ClassTag] 100 | (other: IndexedRDDPartition[K, U]) 101 | (f: (K, V, U) => V): IndexedRDDPartition[K, V] 102 | 103 | /** 104 | * Left outer joins `this` with `other`, running `f` on the values of corresponding keys. Because 105 | * values in `this` with no corresponding entries in `other` are preserved, `f` cannot change the 106 | * value type. 107 | */ 108 | def join[U: ClassTag] 109 | (other: Iterator[(K, U)]) 110 | (f: (K, V, U) => V): IndexedRDDPartition[K, V] 111 | 112 | /** Left outer joins `this` with `other`, running `f` on all values of `this`. */ 113 | def leftJoin[V2: ClassTag, V3: ClassTag] 114 | (other: IndexedRDDPartition[K, V2]) 115 | (f: (K, V, Option[V2]) => V3): IndexedRDDPartition[K, V3] 116 | 117 | /** Left outer joins `this` with `other`, running `f` on all values of `this`. */ 118 | def leftJoin[V2: ClassTag, V3: ClassTag] 119 | (other: Iterator[(K, V2)]) 120 | (f: (K, V, Option[V2]) => V3): IndexedRDDPartition[K, V3] 121 | 122 | /** Inner joins `this` with `other`, running `f` on the values of corresponding keys. */ 123 | def innerJoin[U: ClassTag, V2: ClassTag] 124 | (other: IndexedRDDPartition[K, U]) 125 | (f: (K, V, U) => V2): IndexedRDDPartition[K, V2] 126 | 127 | /** Inner joins `this` with `other`, running `f` on the values of corresponding keys. 
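   * Unlike the overload above, `other` here is a plain iterator of pairs rather than an indexed partition.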
*/ 128 | def innerJoin[U: ClassTag, V2: ClassTag] 129 | (other: Iterator[(K, U)]) 130 | (f: (K, V, U) => V2): IndexedRDDPartition[K, V2] 131 | 132 | /** 133 | * Creates a new partition with values from `elems` that may share an index with `this`, 134 | * merging duplicate keys in `elems` arbitrarily. 135 | */ 136 | def createUsingIndex[V2: ClassTag](elems: Iterator[(K, V2)]): IndexedRDDPartition[K, V2] 137 | 138 | /** Creates a new partition with values from `elems` that shares an index with `this`. */ 139 | def aggregateUsingIndex[V2: ClassTag]( 140 | elems: Iterator[(K, V2)], reduceFunc: (V2, V2) => V2): IndexedRDDPartition[K, V2] 141 | 142 | /** 143 | * Optionally rebuilds the indexes of this partition. Depending on the implementation, this may 144 | * remove tombstoned entries and the resulting partition may support efficient joins with the 145 | * original one. 146 | */ 147 | def reindex(): IndexedRDDPartition[K, V] 148 | } 149 | -------------------------------------------------------------------------------- /src/main/scala/edu/berkeley/cs/amplab/spark/indexedrdd/KeySerializer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package edu.berkeley.cs.amplab.spark.indexedrdd 19 | 20 | import java.util.UUID 21 | 22 | /** 23 | * Serializer for storing arbitrary key types as byte arrays for PART. 24 | * 25 | * If serialized keys may be of variable length, they should be terminated with a unique value, 26 | * because keys in PART cannot be prefixes of other keys. 
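 * The provided `StringSerializer` and `BigIntSerializer` achieve this by prefixing each encoding
 * with a 4-byte length, so no encoded key can be a proper prefix of another.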
27 | */ 28 | trait KeySerializer[K] extends Serializable { 29 | def toBytes(k: K): Array[Byte] 30 | def fromBytes(b: Array[Byte]): K 31 | } 32 | 33 | class LongSerializer extends KeySerializer[Long] { 34 | override def toBytes(k: Long) = Array( 35 | ((k >> 56) & 0xFF).toByte, 36 | ((k >> 48) & 0xFF).toByte, 37 | ((k >> 40) & 0xFF).toByte, 38 | ((k >> 32) & 0xFF).toByte, 39 | ((k >> 24) & 0xFF).toByte, 40 | ((k >> 16) & 0xFF).toByte, 41 | ((k >> 8) & 0xFF).toByte, 42 | ( k & 0xFF).toByte) 43 | 44 | override def fromBytes(b: Array[Byte]): Long = 45 | ( (b(0).toLong << 56) & (0xFFL << 56) | 46 | (b(1).toLong << 48) & (0xFFL << 48) | 47 | (b(2).toLong << 40) & (0xFFL << 40) | 48 | (b(3).toLong << 32) & (0xFFL << 32) | 49 | (b(4).toLong << 24) & (0xFFL << 24) | 50 | (b(5).toLong << 16) & (0xFFL << 16) | 51 | (b(6).toLong << 8) & (0xFFL << 8) | 52 | b(7).toLong & 0xFFL) 53 | } 54 | 55 | class IntSerializer extends KeySerializer[Int] { 56 | override def toBytes(k: Int) = Array( 57 | ((k >> 24) & 0xFF).toByte, 58 | ((k >> 16) & 0xFF).toByte, 59 | ((k >> 8) & 0xFF).toByte, 60 | ( k & 0xFF).toByte) 61 | 62 | override def fromBytes(b: Array[Byte]): Int = 63 | (b(0).toInt << 24) & (0xFF << 24) | 64 | (b(1).toInt << 16) & (0xFF << 16) | 65 | (b(2).toInt << 8) & (0xFF << 8) | 66 | b(3).toInt & 0xFF 67 | } 68 | 69 | class BigIntSerializer extends KeySerializer[BigInt] { 70 | override def toBytes(k: BigInt) = { 71 | // Prepend the BigInt bit length to ensure no key is a prefix of any other 72 | val lengthBytes = Array( 73 | ((k.bitLength >> 24) & 0xFF).toByte, 74 | ((k.bitLength >> 16) & 0xFF).toByte, 75 | ((k.bitLength >> 8) & 0xFF).toByte, 76 | ( k.bitLength & 0xFF).toByte) 77 | lengthBytes ++ k.toByteArray 78 | } 79 | override def fromBytes(b: Array[Byte]): BigInt = BigInt.apply(b.drop(4)) 80 | } 81 | 82 | class ShortSerializer extends KeySerializer[Short] { 83 | override def toBytes(k: Short) = Array( 84 | ((k >> 8) & 0xFF).toByte, 85 | ( k & 0xFF).toByte) 86 | override def fromBytes(b: Array[Byte]): Short = 87 | ((b(0).toInt << 8) & (0xFF << 8) | 88 | b(1).toInt & 0xFF).toShort 89 | } 90 | 91 | class CharSerializer extends KeySerializer[Char] { 92 | override def toBytes(k: Char) = Array( 93 | ((k >> 8) & 0xFF).toByte, 94 | ( k & 0xFF).toByte) 95 | override def fromBytes(b: Array[Byte]): Char = 96 | ((b(0).toInt << 8) & (0xFF << 8) | 97 | b(1).toInt & 0xFF).toChar 98 | } 99 | 100 | class UUIDSerializer(val longSer: LongSerializer = new LongSerializer) extends KeySerializer[UUID] { 101 | override def toBytes(k: UUID) = 102 | (longSer.toBytes(k.getMostSignificantBits) ++ 103 | longSer.toBytes(k.getLeastSignificantBits)) 104 | override def fromBytes(b: Array[Byte]): UUID = 105 | new UUID( 106 | longSer.fromBytes(b.take(8)), 107 | longSer.fromBytes(b.takeRight(8))) 108 | } 109 | 110 | class StringSerializer extends KeySerializer[String] { 111 | override def toBytes(k: String) = { 112 | val result = new Array[Byte](4 + k.length * 2) 113 | 114 | // Prepend the string length to ensure no key is a prefix of any other 115 | result(0) = ((k.length >> 24) & 0xFF).toByte 116 | result(1) = ((k.length >> 16) & 0xFF).toByte 117 | result(2) = ((k.length >> 8) & 0xFF).toByte 118 | result(3) = ( k.length & 0xFF).toByte 119 | 120 | var i = 0 121 | while (i < k.length) { 122 | result(4 + 2 * i) = ((k(i) >> 8) & 0xFF).toByte 123 | result(4 + 2 * i + 1) = ( k(i) & 0xFF).toByte 124 | i += 1 125 | } 126 | 127 | result 128 | } 129 | 130 | override def fromBytes(b: Array[Byte]): String = { 131 | val result = new 
Array[Char]((b.length - 4) / 2) 132 | 133 | var i = 0 134 | while (i < result.length) { 135 | result(i) = 136 | ((b(4 + 2 * i) << 8) & (0xFF << 8) | 137 | (b(4 + 2 * i + 1) & 0xFF)).toChar 138 | i += 1 139 | } 140 | 141 | new String(result) 142 | } 143 | } 144 | 145 | class Tuple2Serializer[A, B]( 146 | implicit aSer: KeySerializer[A], bSer: KeySerializer[B]) 147 | extends KeySerializer[(A, B)] { 148 | 149 | override def toBytes(k: (A, B)) = { 150 | val aBytes = aSer.toBytes(k._1) 151 | val bBytes = bSer.toBytes(k._2) 152 | 153 | val result = new Array[Byte](4 + aBytes.length + bBytes.length) 154 | 155 | // Prepend the length of aBytes so we know where the boundary is when reading 156 | result(0) = ((aBytes.length >> 24) & 0xFF).toByte 157 | result(1) = ((aBytes.length >> 16) & 0xFF).toByte 158 | result(2) = ((aBytes.length >> 8) & 0xFF).toByte 159 | result(3) = ( aBytes.length & 0xFF).toByte 160 | 161 | aBytes.copyToArray(result, 4) 162 | bBytes.copyToArray(result, 4 + aBytes.length) 163 | 164 | result 165 | } 166 | 167 | override def fromBytes(b: Array[Byte]): (A, B) = { 168 | val aLength = 169 | ( (b(0).toInt << 24) & (0xFF << 24) | 170 | (b(1).toInt << 16) & (0xFF << 16) | 171 | (b(2).toInt << 8) & (0xFF << 8) | 172 | b(3).toInt & 0xFF) 173 | (aSer.fromBytes(b.slice(4, 4 + aLength)), 174 | bSer.fromBytes(b.drop(4 + aLength))) 175 | } 176 | } 177 | -------------------------------------------------------------------------------- /src/main/scala/edu/berkeley/cs/amplab/spark/indexedrdd/impl/LazyPartition.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package edu.berkeley.cs.amplab.spark.indexedrdd 19 | 20 | import scala.reflect.ClassTag 21 | import scala.collection.Traversable 22 | 23 | /** 24 | * A wrapper around several IndexedRDDPartition that avoids rebuilding 25 | * the index for the combined partitions. Instead, each operation probes 26 | * the nested partitions and merges the results. 27 | */ 28 | 29 | private[indexedrdd] class LazyPartition[K, V] 30 | (val partitions: Seq[IndexedRDDPartition[K, V]], 31 | val reducer: (K, Option[V], Option[V]) => V) 32 | (override implicit val kTag: ClassTag[K], 33 | override implicit val vTag: ClassTag[V]) 34 | extends IndexedRDDPartition[K, V] { 35 | 36 | @transient private lazy val cached: IndexedRDDPartition[K, V] = 37 | partitions.reduce((a, b) => a.fullOuterJoin(b)(reducer)) 38 | 39 | def size: Long = 40 | cached.size 41 | 42 | /** Return the value for the given key. */ 43 | def apply(k: K): Option[V] = 44 | partitions. 45 | map(_(k)). 
46 | reduce((a, b) => Option(reducer(k, a, b))) 47 | 48 | override def isDefined(k: K): Boolean = 49 | partitions.find(_.isDefined(k)).isDefined 50 | 51 | def iterator: Iterator[(K, V)] = 52 | cached.iterator 53 | 54 | /** 55 | * Query each partition independently, then merge the results by key. This 56 | * could be more efficient if multiget returned ordered results! 57 | */ 58 | def multiget(ks: Array[K]): Iterator[(K, V)] = 59 | partitions. 60 | flatMap(_.multiget(ks)). 61 | groupBy(_._1). 62 | map { 63 | case (k, vs) => 64 | val v = vs.map(_._2).reduce((v1, v2) => reducer(k, Some(v1), Some(v2))) 65 | (k, v) 66 | }. 67 | iterator 68 | 69 | /** 70 | * We have to re-index as we don't know how to reduce the mapped values. 71 | */ 72 | def mapValues[V2: ClassTag](f: (K, V) => V2): IndexedRDDPartition[K, V2] = 73 | cached.mapValues(f) 74 | 75 | def filter(pred: (K, V) => Boolean): IndexedRDDPartition[K, V] = 76 | new LazyPartition(partitions.map(_.filter(pred)), reducer) 77 | 78 | def diff(other: IndexedRDDPartition[K, V]): IndexedRDDPartition[K, V] = 79 | cached.diff(other) 80 | 81 | def diff(other: Iterator[(K, V)]): IndexedRDDPartition[K, V] = 82 | cached.diff(other) 83 | 84 | def fullOuterJoin[V2: ClassTag, W: ClassTag] 85 | (other: IndexedRDDPartition[K, V2]) 86 | (f: (K, Option[V], Option[V2]) => W): IndexedRDDPartition[K, W] = 87 | cached.fullOuterJoin(other)(f) 88 | 89 | def fullOuterJoin[V2: ClassTag, W: ClassTag] 90 | (other: Iterator[(K, V2)]) 91 | (f: (K, Option[V], Option[V2]) => W): IndexedRDDPartition[K, W] = 92 | cached.fullOuterJoin(other)(f) 93 | 94 | def join[U: ClassTag] 95 | (other: IndexedRDDPartition[K, U]) 96 | (f: (K, V, U) => V): IndexedRDDPartition[K, V] = 97 | cached.join(other)(f) 98 | 99 | def join[U: ClassTag] 100 | (other: Iterator[(K, U)]) 101 | (f: (K, V, U) => V): IndexedRDDPartition[K, V] = 102 | cached.join(other)(f) 103 | 104 | def leftJoin[V2: ClassTag, V3: ClassTag] 105 | (other: IndexedRDDPartition[K, V2]) 106 | (f: (K, V, Option[V2]) => V3): IndexedRDDPartition[K, V3] = 107 | cached.leftJoin(other)(f) 108 | 109 | def leftJoin[V2: ClassTag, V3: ClassTag] 110 | (other: Iterator[(K, V2)]) 111 | (f: (K, V, Option[V2]) => V3): IndexedRDDPartition[K, V3] = 112 | cached.leftJoin(other)(f) 113 | 114 | def innerJoin[U: ClassTag, V2: ClassTag] 115 | (other: IndexedRDDPartition[K, U]) 116 | (f: (K, V, U) => V2): IndexedRDDPartition[K, V2] = 117 | cached.innerJoin(other)(f) 118 | 119 | def innerJoin[U: ClassTag, V2: ClassTag] 120 | (other: Iterator[(K, U)]) 121 | (f: (K, V, U) => V2): IndexedRDDPartition[K, V2] = 122 | cached.innerJoin(other)(f) 123 | 124 | def createUsingIndex[V2: ClassTag](elems: Iterator[(K, V2)]): IndexedRDDPartition[K, V2] = 125 | cached.createUsingIndex(elems) 126 | 127 | def aggregateUsingIndex[V2: ClassTag]( 128 | elems: Iterator[(K, V2)], reduceFunc: (V2, V2) => V2): IndexedRDDPartition[K, V2] = 129 | cached.aggregateUsingIndex(elems, reduceFunc) 130 | 131 | /** 132 | * Forces the partitions to re-index, and rebuilds the combined index. 
133 | */ 134 | def reindex(): IndexedRDDPartition[K, V] = 135 | partitions.map(_.reindex).reduce((a, b) => a.fullOuterJoin(b)(reducer)) 136 | } 137 | -------------------------------------------------------------------------------- /src/main/scala/edu/berkeley/cs/amplab/spark/indexedrdd/impl/PARTPartition.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package edu.berkeley.cs.amplab.spark.indexedrdd.impl 19 | 20 | import scala.reflect.ClassTag 21 | import scala.collection.JavaConversions._ 22 | 23 | import edu.berkeley.cs.amplab.spark.indexedrdd._ 24 | import com.ankurdave.part.ArtTree 25 | 26 | private[indexedrdd] class PARTPartition[K, V] 27 | (protected val map: ArtTree) 28 | (override implicit val kTag: ClassTag[K], 29 | override implicit val vTag: ClassTag[V], 30 | implicit val kSer: KeySerializer[K]) 31 | extends IndexedRDDPartition[K, V] { 32 | 33 | protected def withMap[V2: ClassTag] 34 | (map: ArtTree): PARTPartition[K, V2] = { 35 | new PARTPartition(map) 36 | } 37 | 38 | override def size: Long = map.size() 39 | 40 | override def apply(k: K): Option[V] = Option(map.search(kSer.toBytes(k)).asInstanceOf[V]) 41 | 42 | override def iterator: Iterator[(K, V)] = 43 | map.iterator.map(kv => (kSer.fromBytes(kv._1), kv._2.asInstanceOf[V])) 44 | 45 | private def rawIterator: Iterator[(Array[Byte], V)] = 46 | map.iterator.map(kv => (kv._1, kv._2.asInstanceOf[V])) 47 | 48 | override def multiget(ks: Array[K]): Iterator[(K, V)] = 49 | ks.flatMap { k => this(k).map(v => (k, v)) }.iterator 50 | 51 | override def multiput[U]( 52 | kvs: Iterator[(K, U)], z: (K, U) => V, f: (K, V, U) => V): IndexedRDDPartition[K, V] = { 53 | val newMap = map.snapshot() 54 | for (ku <- kvs) { 55 | val kBytes = kSer.toBytes(ku._1) 56 | val oldV = newMap.search(kBytes).asInstanceOf[V] 57 | val newV = if (oldV == null) z(ku._1, ku._2) else f(ku._1, oldV, ku._2) 58 | newMap.insert(kBytes, newV) 59 | } 60 | this.withMap[V](newMap) 61 | } 62 | 63 | override def delete(ks: Iterator[K]): IndexedRDDPartition[K, V] = { 64 | val newMap = map.snapshot() 65 | for (k <- ks) { 66 | newMap.delete(kSer.toBytes(k)) 67 | } 68 | this.withMap[V](newMap) 69 | } 70 | 71 | override def mapValues[V2: ClassTag](f: (K, V) => V2): IndexedRDDPartition[K, V2] = { 72 | val newMap = new ArtTree 73 | for (kv <- rawIterator) newMap.insert(kv._1, f(kSer.fromBytes(kv._1), kv._2)) 74 | this.withMap[V2](newMap) 75 | } 76 | 77 | override def filter(pred: (K, V) => Boolean): IndexedRDDPartition[K, V] = { 78 | val newMap = new ArtTree 79 | for (kv <- rawIterator if pred(kSer.fromBytes(kv._1), kv._2)) { 80 | newMap.insert(kv._1, kv._2) 81 | } 82 | this.withMap[V](newMap) 
83 | } 84 | 85 | override def diff(other: IndexedRDDPartition[K, V]): IndexedRDDPartition[K, V] = other match { 86 | case other: PARTPartition[K, V] => 87 | val newMap = new ArtTree 88 | for (kv <- rawIterator) { 89 | val otherV = other.map.search(kv._1).asInstanceOf[V] 90 | if (otherV != null && otherV != kv._2) { 91 | newMap.insert(kv._1, kv._2) 92 | } 93 | } 94 | this.withMap[V](newMap) 95 | 96 | case _ => 97 | diff(other.iterator) 98 | } 99 | 100 | override def diff(other: Iterator[(K, V)]): IndexedRDDPartition[K, V] = 101 | diff(PARTPartition(other)) 102 | 103 | override def fullOuterJoin[V2: ClassTag, W: ClassTag] 104 | (other: IndexedRDDPartition[K, V2]) 105 | (f: (K, Option[V], Option[V2]) => W): IndexedRDDPartition[K, W] = other match { 106 | case other: PARTPartition[K, V2] => 107 | val newMap = new ArtTree 108 | // Scan `this` and probe `other`, adding all elements in `this` 109 | for (kv <- rawIterator) { 110 | val newV = f( 111 | kSer.fromBytes(kv._1), 112 | Some(kv._2), 113 | Option(other.map.search(kv._1).asInstanceOf[V2])) 114 | newMap.insert(kv._1, newV) 115 | } 116 | // Scan `other` and probe `this`, adding only the elements present in `other` but not `this` 117 | for (kv <- other.rawIterator) { 118 | if (this.map.search(kv._1) == null) { 119 | val newV = f( 120 | kSer.fromBytes(kv._1), 121 | None, 122 | Some(kv._2)) 123 | newMap.insert(kv._1, newV) 124 | } 125 | } 126 | this.withMap[W](newMap) 127 | 128 | case _ => 129 | fullOuterJoin(other.iterator)(f) 130 | } 131 | 132 | override def fullOuterJoin[V2: ClassTag, W: ClassTag] 133 | (other: Iterator[(K, V2)]) 134 | (f: (K, Option[V], Option[V2]) => W): IndexedRDDPartition[K, W] = 135 | fullOuterJoin(PARTPartition(other))(f) 136 | 137 | override def join[U: ClassTag] 138 | (other: IndexedRDDPartition[K, U]) 139 | (f: (K, V, U) => V): IndexedRDDPartition[K, V] = join(other.iterator)(f) 140 | 141 | override def join[U: ClassTag] 142 | (other: Iterator[(K, U)]) 143 | (f: (K, V, U) => V): IndexedRDDPartition[K, V] = { 144 | val newMap = map.snapshot() 145 | for (ku <- other) { 146 | val kBytes = kSer.toBytes(ku._1) 147 | val oldV = newMap.search(kBytes).asInstanceOf[V] 148 | if (oldV != null) { 149 | val newV = f(ku._1, oldV, ku._2) 150 | newMap.insert(kBytes, newV) 151 | } 152 | } 153 | this.withMap[V](newMap) 154 | } 155 | 156 | override def leftJoin[V2: ClassTag, V3: ClassTag] 157 | (other: IndexedRDDPartition[K, V2]) 158 | (f: (K, V, Option[V2]) => V3): IndexedRDDPartition[K, V3] = other match { 159 | case other: PARTPartition[K, V2] => 160 | // Scan `this` and probe `other` 161 | val newMap = new ArtTree 162 | for (kv <- rawIterator) { 163 | val newV = f(kSer.fromBytes(kv._1), kv._2, Option(other.map.search(kv._1).asInstanceOf[V2])) 164 | newMap.insert(kv._1, newV) 165 | } 166 | this.withMap[V3](newMap) 167 | 168 | case _ => 169 | leftJoin(other.iterator)(f) 170 | } 171 | 172 | override def leftJoin[V2: ClassTag, V3: ClassTag] 173 | (other: Iterator[(K, V2)]) 174 | (f: (K, V, Option[V2]) => V3): IndexedRDDPartition[K, V3] = 175 | leftJoin(PARTPartition(other))(f) 176 | 177 | override def innerJoin[U: ClassTag, V2: ClassTag] 178 | (other: IndexedRDDPartition[K, U]) 179 | (f: (K, V, U) => V2): IndexedRDDPartition[K, V2] = other match { 180 | case other: PARTPartition[K, U] => 181 | // Scan `this` and probe `other` 182 | val newMap = new ArtTree 183 | for (kv <- rawIterator) { 184 | val otherV = other.map.search(kv._1).asInstanceOf[U] 185 | if (otherV != null) newMap.insert(kv._1, f(kSer.fromBytes(kv._1), kv._2, otherV)) 186 
| } 187 | this.withMap[V2](newMap) 188 | 189 | case _ => 190 | innerJoin(other.iterator)(f) 191 | } 192 | 193 | override def innerJoin[U: ClassTag, V2: ClassTag] 194 | (other: Iterator[(K, U)]) 195 | (f: (K, V, U) => V2): IndexedRDDPartition[K, V2] = 196 | innerJoin(PARTPartition(other))(f) 197 | 198 | override def createUsingIndex[V2: ClassTag](elems: Iterator[(K, V2)]): IndexedRDDPartition[K, V2] = 199 | PARTPartition(elems) 200 | 201 | override def aggregateUsingIndex[V2: ClassTag]( 202 | elems: Iterator[(K, V2)], reduceFunc: (V2, V2) => V2): IndexedRDDPartition[K, V2] = 203 | PARTPartition[K, V2, V2](elems, (id, a) => a, (id, a, b) => reduceFunc(a, b)) 204 | 205 | override def reindex(): IndexedRDDPartition[K, V] = this 206 | } 207 | 208 | private[indexedrdd] object PARTPartition { 209 | def apply[K: ClassTag, V: ClassTag] 210 | (iter: Iterator[(K, V)])(implicit kSer: KeySerializer[K]) = 211 | apply[K, V, V](iter, (id, a) => a, (id, a, b) => b) 212 | 213 | def apply[K: ClassTag, U: ClassTag, V: ClassTag] 214 | (iter: Iterator[(K, U)], z: (K, U) => V, f: (K, V, U) => V) 215 | (implicit kSer: KeySerializer[K]): PARTPartition[K, V] = { 216 | val map = new ArtTree 217 | iter.foreach { ku => 218 | val kBytes = kSer.toBytes(ku._1) 219 | val oldV = map.search(kBytes).asInstanceOf[V] 220 | val newV = if (oldV == null) z(ku._1, ku._2) else f(ku._1, oldV, ku._2) 221 | map.insert(kBytes, newV) 222 | } 223 | new PARTPartition(map) 224 | } 225 | } 226 | -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the file core/target/unit-tests.log 19 | log4j.rootCategory=INFO, file 20 | log4j.appender.file=org.apache.log4j.FileAppender 21 | log4j.appender.file.append=false 22 | log4j.appender.file.file=target/unit-tests.log 23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %p %c{1}: %m%n 25 | 26 | # Ignore messages below warning level from Jetty, because it's a bit verbose 27 | log4j.logger.org.eclipse.jetty=WARN 28 | org.eclipse.jetty.LEVEL=WARN 29 | -------------------------------------------------------------------------------- /src/test/scala/edu/berkeley/cs/amplab/spark/indexedrdd/IndexedRDDSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package edu.berkeley.cs.amplab.spark.indexedrdd 19 | 20 | import scala.collection.immutable.LongMap 21 | import scala.reflect.ClassTag 22 | import org.apache.spark.HashPartitioner 23 | 24 | import org.apache.spark.SparkContext 25 | import org.apache.spark.rdd.RDD 26 | import org.scalatest.FunSuite 27 | 28 | abstract class IndexedRDDSuite extends FunSuite with SharedSparkContext { 29 | 30 | def create[V: ClassTag](elems: RDD[(Long, V)]): IndexedRDD[Long, V] 31 | 32 | def pairs(sc: SparkContext, n: Int) = { 33 | create(sc.parallelize((0 to n).map(x => (x.toLong, x)), 5)) 34 | } 35 | 36 | test("get, multiget") { 37 | val n = 100 38 | val ps = pairs(sc, n).cache() 39 | assert(ps.multiget(Array(-1L, 0L, 1L, 98L)) === LongMap(0L -> 0, 1L -> 1, 98L -> 98)) 40 | assert(ps.get(-1L) === None) 41 | assert(ps.get(97L) === Some(97)) 42 | val evens = ps.filter(q => ((q._2 % 2) == 0)).cache() 43 | assert(evens.multiget(Array(-1L, 0L, 1L, 98L)) === LongMap(0L -> 0, 98L -> 98)) 44 | assert(evens.get(97L) === None) 45 | } 46 | 47 | test("filter") { 48 | val n = 100 49 | val ps = pairs(sc, n) 50 | val evens = ps.filter(q => ((q._2 % 2) == 0)) 51 | assert(evens.count === (0 to n).filter(_ % 2 == 0).size) 52 | } 53 | 54 | test("mapValues") { 55 | val n = 100 56 | val ps = pairs(sc, n) 57 | val negatives = ps.mapValues(x => -x).cache() // Allow joining b with a derived RDD of b 58 | assert(negatives.count === n + 1) 59 | } 60 | 61 | test("diff") { 62 | val n = 100 63 | val ps = pairs(sc, n).cache() 64 | val flipEvens = ps.mapValues(x => if (x % 2 == 0) -x else x).cache() 65 | // diff should keep only the changed values 66 | assert(ps.diff(flipEvens).map(_._2).collect().toSet === (2 to n by 2).toSet) 67 | } 68 | 69 | test("diff with pair RDD") { 70 | val n = 100 71 | val ps = pairs(sc, n).cache() 72 | val flipEvens: RDD[(Long, Int)] = 73 | sc.parallelize(0L to 100L) 74 | .map(id => if (id % 2 == 0) (id, -id.toInt) else (id, id.toInt)).cache() 75 | // diff should keep only the changed values 76 | assert(ps.diff(flipEvens).map(_._2).collect().toSet === (2 to n by 2).toSet) 77 | } 78 | 79 | test("diff with non-equal number of partitions") { 80 | val a = create(sc.parallelize(0 until 24, 3).map(i => (i.toLong, 0))) 81 | val b = create(sc.parallelize(8 until 16, 2).map(i => (i.toLong, 1))) 82 | assert(a.partitions.size != b.partitions.size) 83 | val c = b.diff(a) 84 | assert(c.map(_._1).collect.toSet === (8 until 16).toSet) 85 | } 86 | 87 | test("fullOuterJoin") { 88 | Seq(true, false).foreach { maybeLazy => 89 | val n = 200 90 | val bStart = 50 91 | val aEnd = 100 92 | val common = create(sc.parallelize((0 until n).map(x => (x.toLong, x)), 5)).cache() 93 | val a = common.filter(kv => kv._1 < aEnd).cache() 94 | val b = common.filter(kv => kv._1 >= bStart).cache() 95 | val sum = 
a.fullOuterJoin(b, maybeLazy) { (id, aOpt, bOpt) => aOpt.getOrElse(0) + bOpt.getOrElse(0) } 96 | val expected = ((0 until bStart).map(x => (x.toLong, x)) ++ 97 | (bStart until aEnd).map(x => (x.toLong, x * 2)) ++ 98 | (aEnd until n).map(x => (x.toLong, x))).toSet 99 | 100 | // fullOuterJoin with another IndexedRDD with the same index 101 | assert(sum.collect.toSet === expected) 102 | 103 | // fullOuterJoin with another IndexedRDD with a different index 104 | val b2 = create(b.map(identity)) 105 | val sum2 = a.fullOuterJoin(b2, maybeLazy) { (id, aOpt, bOpt) => aOpt.getOrElse(0) + bOpt.getOrElse(0) } 106 | assert(sum2.collect.toSet === expected) 107 | } 108 | } 109 | 110 | test("leftJoin") { 111 | val n = 100 112 | val ps = pairs(sc, n).cache() 113 | val evens = ps.filter(q => ((q._2 % 2) == 0)).cache() 114 | // leftJoin with another IndexedRDD 115 | assert(ps.leftJoin(evens) { (id, a, bOpt) => a - bOpt.getOrElse(0) }.collect.toSet === 116 | (0 to n by 2).map(x => (x.toLong, 0)).toSet ++ (1 to n by 2).map(x => (x.toLong, x)).toSet) 117 | // leftJoin with an RDD 118 | val evensRDD = evens.map(identity) 119 | assert(ps.leftJoin(evensRDD) { (id, a, bOpt) => a - bOpt.getOrElse(0) }.collect.toSet === 120 | (0 to n by 2).map(x => (x.toLong, 0)).toSet ++ (1 to n by 2).map(x => (x.toLong, x)).toSet) 121 | } 122 | 123 | test("leftJoin vertices with non-equal number of partitions") { 124 | val a = create(sc.parallelize(0 until 100, 2).map(i => (i.toLong, 1))) 125 | val b = create( 126 | a.filter(v => v._1 % 2 == 0).partitionBy(new HashPartitioner(3))) 127 | assert(a.partitions.size != b.partitions.size) 128 | val c = a.leftJoin(b) { (vid, old, newOpt) => 129 | old - newOpt.getOrElse(0) 130 | } 131 | assert(c.filter(v => v._2 != 0).map(_._1).collect.toSet == (1 to 99 by 2).toSet) 132 | } 133 | 134 | test("join") { 135 | val n = 100 136 | val ps = pairs(sc, n).cache() 137 | val evens = ps.filter(q => ((q._2 % 2) == 0)).cache() 138 | // join with another IndexedRDD 139 | assert(ps.join(evens) { (id, a, b) => a - b }.collect.toSet === 140 | (0 to n by 2).map(x => (x.toLong, 0)).toSet ++ (1 to n by 2).map(x => (x.toLong, x)).toSet) 141 | // join with an RDD 142 | val evensRDD = evens.map(identity) 143 | assert(ps.join(evensRDD) { (id, a, b) => a - b }.collect.toSet === 144 | (0 to n by 2).map(x => (x.toLong, 0)).toSet ++ (1 to n by 2).map(x => (x.toLong, x)).toSet) 145 | } 146 | 147 | test("innerJoin") { 148 | val n = 100 149 | val ps = pairs(sc, n).cache() 150 | val evens = ps.filter(q => ((q._2 % 2) == 0)).cache() 151 | // innerJoin with another IndexedRDD 152 | assert(ps.innerJoin(evens) { (id, a, b) => a - b }.collect.toSet === 153 | (0 to n by 2).map(x => (x.toLong, 0)).toSet) 154 | // innerJoin with an RDD 155 | val evensRDD = evens.map(identity) 156 | assert(ps.innerJoin(evensRDD) { (id, a, b) => a - b }.collect.toSet === 157 | (0 to n by 2).map(x => (x.toLong, 0)).toSet) 158 | } 159 | 160 | test("innerJoin with non-equal number of partitions") { 161 | val a = create(sc.parallelize(0 until 100, 2).map(i => (i.toLong, 1))) 162 | val b = create( 163 | a.filter(v => v._1 % 2 == 0).partitionBy(new HashPartitioner(3))) 164 | assert(a.partitions.size != b.partitions.size) 165 | val c = a.innerJoin(b) { (vid, old, newVal) => 166 | old - newVal 167 | } 168 | assert(c.filter(v => v._2 == 0).map(_._1).collect.toSet == (0 to 98 by 2).toSet) 169 | } 170 | 171 | test("aggregateUsingIndex") { 172 | val n = 100 173 | val ps = pairs(sc, n) 174 | val messageTargets = (0 to n) ++ (0 to n by 2) 175 | val messages = 
sc.parallelize(messageTargets.map(x => (x.toLong, 1))) 176 | assert(ps.aggregateUsingIndex[Int](messages, _ + _).collect.toSet === 177 | (0 to n).map(x => (x.toLong, if (x % 2 == 0) 2 else 1)).toSet) 178 | 179 | val messagesWithNew = List((0L, 1), (-1L, 1)) 180 | assert(ps.aggregateUsingIndex[Int](sc.parallelize(messagesWithNew), _ + _).collect.toSet === 181 | messagesWithNew.toSet) 182 | } 183 | } 184 | 185 | class UpdatableIndexedRDDSuite extends IndexedRDDSuite { 186 | override def create[V: ClassTag](elems: RDD[(Long, V)]): IndexedRDD[Long, V] = { 187 | import IndexedRDD._ 188 | IndexedRDD.updatable(elems) 189 | } 190 | 191 | test("put, multiput") { 192 | val n = 100 193 | val ps = pairs(sc, n).cache() 194 | assert(ps.multiput[Int](Map(0L -> 1, 1L -> 1), (id, a) => a, SumFunction).collect.toSet === 195 | Set(0L -> 1, 1L -> 2) ++ (2 to n).map(x => (x.toLong, x)).toSet) 196 | assert(ps.multiput[Int](Map(-1L -> -1, 0L -> 1), (id, a) => a, SumFunction).collect.toSet === 197 | Set(-1L -> -1, 0L -> 1) ++ (1 to n).map(x => (x.toLong, x)).toSet) 198 | assert(ps.multiput(Map(-1L -> -1, 0L -> 1, 1L -> 1)).collect.toSet === 199 | Set(-1L -> -1, 0L -> 1, 1L -> 1) ++ (2 to n).map(x => (x.toLong, x)).toSet) 200 | assert(ps.multiputRDD[Int](sc.parallelize(Seq(0L -> 1, 1L -> 1)), (id, a) => a, SumFunction).collect.toSet === 201 | Set(0L -> 1, 1L -> 2) ++ (2 to n).map(x => (x.toLong, x)).toSet) 202 | assert(ps.multiputRDD[Int](sc.parallelize(Seq(-1L -> -1, 0L -> 1)), (id, a) => a, SumFunction).collect.toSet === 203 | Set(-1L -> -1, 0L -> 1) ++ (1 to n).map(x => (x.toLong, x)).toSet) 204 | assert(ps.multiputRDD(sc.parallelize(Seq(-1L -> -1, 0L -> 1, 1L -> 1))).collect.toSet === 205 | Set(-1L -> -1, 0L -> 1, 1L -> 1) ++ (2 to n).map(x => (x.toLong, x)).toSet) 206 | assert(ps.put(-1L, -1).collect.toSet === 207 | Set(-1L -> -1) ++ (0 to n).map(x => (x.toLong, x)).toSet) 208 | assert(ps.put(0L, 1).collect.toSet === 209 | Set(0L -> 1) ++ (1 to n).map(x => (x.toLong, x)).toSet) 210 | } 211 | 212 | test("delete") { 213 | val n = 100 214 | val ps = pairs(sc, n).cache() 215 | assert(ps.delete(Array(0L)).collect.toSet === (1 to n).map(x => (x.toLong, x)).toSet) 216 | assert(ps.delete(Array(-1L)).collect.toSet === (0 to n).map(x => (x.toLong, x)).toSet) 217 | } 218 | } 219 | 220 | // Declared outside of test suite to avoid closure capture 221 | private object SumFunction extends Function3[Long, Int, Int, Int] with Serializable { 222 | def apply(id: Long, a: Int, b: Int) = a + b 223 | } 224 | -------------------------------------------------------------------------------- /src/test/scala/edu/berkeley/cs/amplab/spark/indexedrdd/KeySerializerSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package edu.berkeley.cs.amplab.spark.indexedrdd 19 | 20 | import java.util.UUID 21 | 22 | import org.scalacheck.Arbitrary 23 | import org.scalacheck.Gen 24 | import org.scalatest.FunSuite 25 | import org.scalatest.Matchers 26 | import org.scalatest.prop.GeneratorDrivenPropertyChecks 27 | 28 | class KeySerializerSuite extends FunSuite with GeneratorDrivenPropertyChecks with Matchers { 29 | 30 | test("long") { 31 | val ser = new LongSerializer 32 | forAll { (a: Long) => 33 | ser.fromBytes(ser.toBytes(a)) should be === a 34 | } 35 | } 36 | 37 | test("string") { 38 | val ser = new StringSerializer 39 | 40 | forAll { (a: String) => 41 | ser.fromBytes(ser.toBytes(a)) should be === a 42 | } 43 | 44 | forAll { (a: String, b: String) => 45 | whenever (a != b) { 46 | val aSer = ser.toBytes(a) 47 | val bSer = ser.toBytes(b) 48 | assert(!aSer.startsWith(bSer)) 49 | assert(!bSer.startsWith(aSer)) 50 | } 51 | } 52 | } 53 | 54 | test("short") { 55 | val ser = new ShortSerializer 56 | forAll { (a: Short) => 57 | ser.fromBytes(ser.toBytes(a)) should be === a 58 | } 59 | } 60 | 61 | test("int") { 62 | val ser = new IntSerializer 63 | forAll { (a: Int) => 64 | ser.fromBytes(ser.toBytes(a)) should be === a 65 | } 66 | } 67 | 68 | implicit val arbUUID: Arbitrary[UUID] = Arbitrary(Gen.uuid) 69 | 70 | test("UUID") { 71 | val ser = new UUIDSerializer 72 | forAll { (a: UUID) => 73 | ser.fromBytes(ser.toBytes(a)) should be === a 74 | } 75 | } 76 | 77 | test("bigint") { 78 | val ser = new BigIntSerializer 79 | 80 | forAll { (a: BigInt) => 81 | ser.fromBytes(ser.toBytes(a)) should be === a 82 | } 83 | 84 | forAll { (a: BigInt, b: BigInt) => 85 | whenever (a != b) { 86 | val aSer = ser.toBytes(a) 87 | val bSer = ser.toBytes(b) 88 | assert(!aSer.startsWith(bSer)) 89 | assert(!bSer.startsWith(aSer)) 90 | } 91 | } 92 | } 93 | 94 | def tuple2Test[A: Arbitrary, B: Arbitrary]( 95 | aSer: KeySerializer[A], bSer: KeySerializer[B]): Unit = { 96 | val ser = new Tuple2Serializer[A, B]()(aSer, bSer) 97 | 98 | forAll { (a: A, b: B) => 99 | ser.fromBytes(ser.toBytes(Tuple2(a, b))) should be === (a, b) 100 | } 101 | 102 | forAll { (a: (A, B), b: (A, B)) => 103 | whenever (a != b) { 104 | val aSer = ser.toBytes(a) 105 | val bSer = ser.toBytes(b) 106 | assert(!aSer.startsWith(bSer)) 107 | assert(!bSer.startsWith(aSer)) 108 | } 109 | } 110 | } 111 | 112 | test("Tuple2") { 113 | val stringSer = new StringSerializer 114 | val longSer = new LongSerializer 115 | val intSer = new IntSerializer 116 | val shortSer = new ShortSerializer 117 | val bigintSer = new BigIntSerializer 118 | val uuidSer = new UUIDSerializer 119 | 120 | tuple2Test[Long, Long](longSer, longSer) 121 | tuple2Test[String, Long](stringSer, longSer) 122 | tuple2Test[Long, String](longSer, stringSer) 123 | tuple2Test[String, String](stringSer, stringSer) 124 | tuple2Test[Short, Short](shortSer, shortSer) 125 | tuple2Test[Short, Int](shortSer, intSer) 126 | tuple2Test[Int, Int](intSer, intSer) 127 | tuple2Test[Int, BigInt](intSer, bigintSer) 128 | tuple2Test[BigInt, BigInt](bigintSer, bigintSer) 129 | tuple2Test[Int, UUID](intSer, uuidSer) 130 | tuple2Test[UUID, UUID](uuidSer, uuidSer) 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /src/test/scala/edu/berkeley/cs/amplab/spark/indexedrdd/SharedSparkContext.scala: -------------------------------------------------------------------------------- 
1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package edu.berkeley.cs.amplab.spark.indexedrdd 19 | 20 | import org.apache.spark.SparkConf 21 | import org.apache.spark.SparkContext 22 | import org.scalatest.BeforeAndAfterAll 23 | import org.scalatest.Suite 24 | 25 | /** Shares a local `SparkContext` between all tests in a suite and closes it at the end */ 26 | trait SharedSparkContext extends BeforeAndAfterAll { self: Suite => 27 | 28 | @transient private var _sc: SparkContext = _ 29 | 30 | def sc: SparkContext = _sc 31 | 32 | var conf = new SparkConf(false) 33 | 34 | override def beforeAll() { 35 | _sc = new SparkContext("local", "test", conf) 36 | super.beforeAll() 37 | } 38 | 39 | override def afterAll() { 40 | if (_sc != null) { 41 | _sc.stop() 42 | } 43 | // To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown 44 | System.clearProperty("spark.driver.port") 45 | _sc = null 46 | super.afterAll() 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/test/scala/edu/berkeley/cs/amplab/spark/indexedrdd/impl/IndexedRDDPartitionSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package edu.berkeley.cs.amplab.spark.indexedrdd.impl 19 | 20 | import scala.reflect.ClassTag 21 | import edu.berkeley.cs.amplab.spark.indexedrdd._ 22 | 23 | import org.apache.spark.SparkConf 24 | import org.apache.spark.serializer.JavaSerializer 25 | import org.apache.spark.serializer.KryoSerializer 26 | import org.scalatest.FunSuite 27 | 28 | abstract class IndexedRDDPartitionSuite extends FunSuite { 29 | 30 | def create[V: ClassTag](iter: Iterator[(Long, V)]): IndexedRDDPartition[Long, V] 31 | 32 | test("serialization") { 33 | val elems = Set((0L, 1), (1L, 1), (2L, 1)) 34 | val vp = create(elems.iterator) 35 | val javaSer = new JavaSerializer(new SparkConf()) 36 | val kryoSer = new KryoSerializer(new SparkConf() 37 | .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")) 38 | 39 | for (ser <- List(javaSer, kryoSer); s = ser.newInstance()) { 40 | val vpSer: IndexedRDDPartition[Long, Int] = s.deserialize(s.serialize(vp)) 41 | assert(vpSer.iterator.toSet === elems) 42 | } 43 | } 44 | 45 | test("get") { 46 | val elems = Set((0L, 1), (1L, 1), (2L, 1)) 47 | val vp = create(elems.iterator) 48 | assert(vp(0L) == Some(1)) 49 | assert(vp(1L) == Some(1)) 50 | assert(vp(2L) == Some(1)) 51 | assert(vp(3L) == None) 52 | 53 | assert(vp.multiget(Array(1L, 2L, 3L)).size == 2) 54 | } 55 | } 56 | 57 | class PARTPartitionSuite extends IndexedRDDPartitionSuite { 58 | override def create[V: ClassTag](iter: Iterator[(Long, V)]) = { 59 | import IndexedRDD._ 60 | PARTPartition(iter) 61 | } 62 | } 63 | 64 | class LazyPartitionSuite extends IndexedRDDPartitionSuite { 65 | override def create[V: ClassTag](iter: Iterator[(Long, V)]) = { 66 | import IndexedRDD._ 67 | val it = iter.toSeq 68 | new LazyPartition( 69 | Seq(PARTPartition(it.iterator), PARTPartition(it.iterator)), 70 | (id, a, b) => (a ++ b).headOption.getOrElse(null.asInstanceOf[V])) 71 | } 72 | } 73 | --------------------------------------------------------------------------------
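A note on extending the key serializers shown in KeySerializer.scala: they all follow two conventions, (1) fixed-width keys (Long, Int, Short, Char, UUID) are written big-endian, most significant byte first, and (2) variable-width keys (String, BigInt, Tuple2) carry a length prefix so that no encoded key is a byte-prefix of another, the property the in-code comments call out ("ensure no key is a prefix of any other"). A custom key type can usually be supported by composing the existing serializers rather than hand-rolling the byte manipulation. The sketch below is illustrative only: AccountKey and AccountKeySerializer are hypothetical names invented for this example and are not part of the repository.

import edu.berkeley.cs.amplab.spark.indexedrdd.{KeySerializer, LongSerializer, StringSerializer, Tuple2Serializer}

// Hypothetical composite key used only for illustration.
case class AccountKey(userId: Long, region: String)

// Sketch: delegate to Tuple2Serializer, which already writes the length of the
// first component, so the resulting encoding stays prefix-free.
class AccountKeySerializer extends KeySerializer[AccountKey] {
  private val tupleSer =
    new Tuple2Serializer[Long, String]()(new LongSerializer, new StringSerializer)

  override def toBytes(k: AccountKey): Array[Byte] =
    tupleSer.toBytes((k.userId, k.region))

  override def fromBytes(b: Array[Byte]): AccountKey = {
    val (userId, region) = tupleSer.fromBytes(b)
    AccountKey(userId, region)
  }
}

Such a serializer round-trips the same way the property checks in KeySerializerSuite exercise the built-in ones: fromBytes(toBytes(k)) returns the original key, and no two distinct encoded keys are prefixes of each other.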