├── README.md └── src └── main ├── java └── org │ └── apache │ └── spark │ └── graphx │ ├── TripletFields.java │ └── impl │ └── EdgeActiveness.java └── scala └── org └── apache └── spark └── graphx ├── Edge.scala ├── EdgeContext.scala ├── EdgeDirection.scala ├── EdgeRDD.scala ├── EdgeTriplet.scala ├── Graph.scala ├── GraphKryoRegistrator.scala ├── GraphLoader.scala ├── GraphOps.scala ├── GraphXUtils.scala ├── PartitionStrategy.scala ├── Pregel.scala ├── VertexRDD.scala ├── impl ├── EdgePartition.scala ├── EdgePartitionBuilder.scala ├── EdgeRDDImpl.scala ├── GraphImpl.scala ├── ReplicatedVertexView.scala ├── RoutingTablePartition.scala ├── ShippableVertexPartition.scala ├── VertexPartition.scala ├── VertexPartitionBase.scala ├── VertexPartitionBaseOps.scala ├── VertexRDDImpl.scala └── package.scala ├── lib ├── ConnectedComponents.scala ├── LabelPropagation.scala ├── PageRank.scala ├── SVDPlusPlus.scala ├── ShortestPaths.scala ├── StronglyConnectedComponents.scala ├── TriangleCount.scala ├── package-info.java └── package.scala ├── package-info.java ├── package.scala └── util ├── BytecodeUtils.scala ├── GraphGenerators.scala ├── collection └── GraphXPrimitiveKeyOpenHashMap.scala ├── package-info.java └── package.scala
/README.md:
--------------------------------------------------------------------------------
# Source code of GraphX, Spark's graph computation engine, with comments translated into Chinese

--------------------------------------------------------------------------------
/src/main/java/org/apache/spark/graphx/TripletFields.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.graphx;

import java.io.Serializable;

/**
 * Represents a subset of the fields of an [[EdgeTriplet]] or [[EdgeContext]]. This allows the
 * system to populate only those fields for efficiency.
 */
public class TripletFields implements Serializable {

  /** Indicates whether the source vertex attribute is included. */
  public final boolean useSrc;

  /** Indicates whether the destination vertex attribute is included. */
  public final boolean useDst;

  /** Indicates whether the edge attribute is included. */
  public final boolean useEdge;

  /** Constructs a default TripletFields in which all fields are included. */
  public TripletFields() {
    this(true, true, true);
  }

  public TripletFields(boolean useSrc, boolean useDst, boolean useEdge) {
    this.useSrc = useSrc;
    this.useDst = useDst;
    this.useEdge = useEdge;
  }

  /**
   * None of the triplet fields are exposed.
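   * <p>
   * A minimal usage sketch (Scala; {@code graph} is a hypothetical {@code Graph[Int, Int]} that is
   * not defined in this file). Passing one of these presets to {@code aggregateMessages} lets
   * GraphX skip shipping vertex attributes the send function never reads; counting in-degrees
   * needs none of them:
   * <pre>
   * // illustrative only: the message "1" reads neither vertex nor edge attributes
   * val inDegrees =
   *   graph.aggregateMessages[Int](ctx => ctx.sendToDst(1), _ + _, TripletFields.None)
   * </pre>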
50 | */ 51 | public static final TripletFields None = new TripletFields(false, false, false); 52 | 53 | /** 54 | * Expose only the edge field and not the source or destination field. 55 | */ 56 | public static final TripletFields EdgeOnly = new TripletFields(false, false, true); 57 | 58 | /** 59 | * Expose the source and edge fields but not the destination field. (Same as Src) 60 | */ 61 | public static final TripletFields Src = new TripletFields(true, false, true); 62 | 63 | /** 64 | * Expose the destination and edge fields but not the source field. (Same as Dst) 65 | */ 66 | public static final TripletFields Dst = new TripletFields(false, true, true); 67 | 68 | /** 69 | * Expose all the fields (source, edge, and destination). 70 | */ 71 | public static final TripletFields All = new TripletFields(true, true, true); 72 | } 73 | -------------------------------------------------------------------------------- /src/main/java/org/apache/spark/graphx/impl/EdgeActiveness.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.impl; 19 | 20 | /** 21 | * Criteria for filtering edges based on activeness. For internal use only. 22 | */ 23 | public enum EdgeActiveness { 24 | /** Neither the source vertex nor the destination vertex need be active. */ 25 | Neither, 26 | /** The source vertex must be active. */ 27 | SrcOnly, 28 | /** The destination vertex must be active. */ 29 | DstOnly, 30 | /** Both vertices must be active. */ 31 | Both, 32 | /** At least one vertex must be active. */ 33 | Either 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/Edge.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.graphx 19 | 20 | import org.apache.spark.util.collection.SortDataFormat 21 | 22 | /** 23 | * A single directed edge consisting of a source id, target id, 24 | * and the data associated with the edge. 25 | * 26 | * @tparam ED type of the edge attribute 27 | * 28 | * @param srcId The vertex id of the source vertex 29 | * @param dstId The vertex id of the target vertex 30 | * @param attr The attribute associated with the edge 31 | */ 32 | case class Edge[@specialized(Char, Int, Boolean, Byte, Long, Float, Double) ED] ( 33 | var srcId: VertexId = 0, 34 | var dstId: VertexId = 0, 35 | var attr: ED = null.asInstanceOf[ED]) 36 | extends Serializable { 37 | 38 | /** 39 | * Given one vertex in the edge return the other vertex. 40 | * 41 | * @param vid the id one of the two vertices on the edge. 42 | * @return the id of the other vertex on the edge. 43 | */ 44 | def otherVertexId(vid: VertexId): VertexId = 45 | if (srcId == vid) dstId else { assert(dstId == vid); srcId } 46 | 47 | /** 48 | * Return the relative direction of the edge to the corresponding 49 | * vertex. 50 | * 51 | * @param vid the id of one of the two vertices in the edge. 52 | * @return the relative direction of the edge to the corresponding 53 | * vertex. 54 | */ 55 | def relativeDirection(vid: VertexId): EdgeDirection = 56 | if (vid == srcId) EdgeDirection.Out else { assert(vid == dstId); EdgeDirection.In } 57 | } 58 | 59 | object Edge { 60 | private[graphx] def lexicographicOrdering[ED] = new Ordering[Edge[ED]] { 61 | override def compare(a: Edge[ED], b: Edge[ED]): Int = { 62 | if (a.srcId == b.srcId) { 63 | if (a.dstId == b.dstId) 0 64 | else if (a.dstId < b.dstId) -1 65 | else 1 66 | } else if (a.srcId < b.srcId) -1 67 | else 1 68 | } 69 | } 70 | 71 | private[graphx] def edgeArraySortDataFormat[ED] = new SortDataFormat[Edge[ED], Array[Edge[ED]]] { 72 | override def getKey(data: Array[Edge[ED]], pos: Int): Edge[ED] = { 73 | data(pos) 74 | } 75 | 76 | override def swap(data: Array[Edge[ED]], pos0: Int, pos1: Int): Unit = { 77 | val tmp = data(pos0) 78 | data(pos0) = data(pos1) 79 | data(pos1) = tmp 80 | } 81 | 82 | override def copyElement( 83 | src: Array[Edge[ED]], srcPos: Int, 84 | dst: Array[Edge[ED]], dstPos: Int) { 85 | dst(dstPos) = src(srcPos) 86 | } 87 | 88 | override def copyRange( 89 | src: Array[Edge[ED]], srcPos: Int, 90 | dst: Array[Edge[ED]], dstPos: Int, length: Int) { 91 | System.arraycopy(src, srcPos, dst, dstPos, length) 92 | } 93 | 94 | override def allocate(length: Int): Array[Edge[ED]] = { 95 | new Array[Edge[ED]](length) 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/EdgeContext.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx 19 | 20 | /** 21 | * Represents an edge along with its neighboring vertices and allows sending messages along the 22 | * edge. Used in [[Graph#aggregateMessages]]. 23 | */ 24 | abstract class EdgeContext[VD, ED, A] { 25 | /** The vertex id of the edge's source vertex. */ 26 | def srcId: VertexId 27 | /** The vertex id of the edge's destination vertex. */ 28 | def dstId: VertexId 29 | /** The vertex attribute of the edge's source vertex. */ 30 | def srcAttr: VD 31 | /** The vertex attribute of the edge's destination vertex. */ 32 | def dstAttr: VD 33 | /** The attribute associated with the edge. */ 34 | def attr: ED 35 | 36 | /** Sends a message to the source vertex. */ 37 | def sendToSrc(msg: A): Unit 38 | /** Sends a message to the destination vertex. */ 39 | def sendToDst(msg: A): Unit 40 | 41 | /** Converts the edge and vertex properties into an [[EdgeTriplet]] for convenience. */ 42 | def toEdgeTriplet: EdgeTriplet[VD, ED] = { 43 | val et = new EdgeTriplet[VD, ED] 44 | et.srcId = srcId 45 | et.srcAttr = srcAttr 46 | et.dstId = dstId 47 | et.dstAttr = dstAttr 48 | et.attr = attr 49 | et 50 | } 51 | } 52 | 53 | object EdgeContext { 54 | 55 | /** 56 | * Extractor mainly used for Graph#aggregateMessages*. 57 | * Example: 58 | * {{{ 59 | * val messages = graph.aggregateMessages( 60 | * case ctx @ EdgeContext(_, _, _, _, attr) => 61 | * ctx.sendToDst(attr) 62 | * , _ + _) 63 | * }}} 64 | */ 65 | def unapply[VD, ED, A](edge: EdgeContext[VD, ED, A]): Some[(VertexId, VertexId, VD, VD, ED)] = 66 | Some(edge.srcId, edge.dstId, edge.srcAttr, edge.dstAttr, edge.attr) 67 | } 68 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/EdgeDirection.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx 19 | 20 | /** 21 | * The direction of a directed edge relative to a vertex. 22 | */ 23 | class EdgeDirection private (private val name: String) extends Serializable { 24 | /** 25 | * Reverse the direction of an edge. An in becomes out, 26 | * out becomes in and both and either remain the same. 
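   *
   * Illustrative sketch (not part of the original comment):
   * {{{
   * EdgeDirection.In.reverse     // EdgeDirection.Out
   * EdgeDirection.Out.reverse    // EdgeDirection.In
   * EdgeDirection.Either.reverse // EdgeDirection.Either
   * EdgeDirection.Both.reverse   // EdgeDirection.Both
   * }}}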
27 | */ 28 | def reverse: EdgeDirection = this match { 29 | case EdgeDirection.In => EdgeDirection.Out 30 | case EdgeDirection.Out => EdgeDirection.In 31 | case EdgeDirection.Either => EdgeDirection.Either 32 | case EdgeDirection.Both => EdgeDirection.Both 33 | } 34 | 35 | override def toString: String = "EdgeDirection." + name 36 | 37 | override def equals(o: Any): Boolean = o match { 38 | case other: EdgeDirection => other.name == name 39 | case _ => false 40 | } 41 | 42 | override def hashCode: Int = name.hashCode 43 | } 44 | 45 | 46 | /** 47 | * A set of [[EdgeDirection]]s. 48 | */ 49 | object EdgeDirection { 50 | /** Edges arriving at a vertex. */ 51 | final val In: EdgeDirection = new EdgeDirection("In") 52 | 53 | /** Edges originating from a vertex. */ 54 | final val Out: EdgeDirection = new EdgeDirection("Out") 55 | 56 | /** Edges originating from *or* arriving at a vertex of interest. */ 57 | final val Either: EdgeDirection = new EdgeDirection("Either") 58 | 59 | /** Edges originating from *and* arriving at a vertex of interest. */ 60 | final val Both: EdgeDirection = new EdgeDirection("Both") 61 | } 62 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/EdgeRDD.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx 19 | 20 | import scala.language.existentials 21 | import scala.reflect.ClassTag 22 | 23 | import org.apache.spark.Dependency 24 | import org.apache.spark.Partition 25 | import org.apache.spark.SparkContext 26 | import org.apache.spark.TaskContext 27 | import org.apache.spark.rdd.RDD 28 | import org.apache.spark.storage.StorageLevel 29 | 30 | import org.apache.spark.graphx.impl.EdgePartition 31 | import org.apache.spark.graphx.impl.EdgePartitionBuilder 32 | import org.apache.spark.graphx.impl.EdgeRDDImpl 33 | 34 | /** 35 | * `EdgeRDD[ED, VD]` extends `RDD[Edge[ED]]` by storing the edges in columnar format on each 36 | * partition for performance. It may additionally store the vertex attributes associated with each 37 | * edge to provide the triplet view. Shipping of the vertex attributes is managed by 38 | * `impl.ReplicatedVertexView`. 
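 *
 * A minimal construction sketch (assumes an existing `sc: SparkContext`; not taken from the
 * original sources). An `EdgeRDD` is normally obtained from a `Graph`, or built directly from a
 * plain `RDD[Edge[ED]]`:
 * {{{
 * val raw = sc.parallelize(Seq(Edge(1L, 2L, "follows"), Edge(2L, 3L, "likes")))
 * // the second type parameter (here Int) is only the vertex-attribute type that may be joined in
 * val edgeRdd = EdgeRDD.fromEdges[String, Int](raw)
 * }}}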
 */
abstract class EdgeRDD[ED](
    sc: SparkContext,
    deps: Seq[Dependency[_]]) extends RDD[Edge[ED]](sc, deps) {

  // scalastyle:off structural.type
  private[graphx] def partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])] forSome { type VD }
  // scalastyle:on structural.type

  // Obtain the partitions that make up this EdgeRDD
  override protected def getPartitions: Array[Partition] = partitionsRDD.partitions

  override def compute(part: Partition, context: TaskContext): Iterator[Edge[ED]] = {
    val p = firstParent[(PartitionID, EdgePartition[ED, _])].iterator(part, context)
    if (p.hasNext) {
      p.next()._2.iterator.map(_.copy())
    } else {
      Iterator.empty
    }
  }

  /**
   * Map the values in an edge partitioning preserving the structure but changing the values.
   * (Maps over the edge values.)
   * @tparam ED2 the new edge value type
   * @param f the function from an edge to a new edge value
   * @return a new EdgeRDD containing the new edge values
   */
  def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDD[ED2]

  /**
   * Reverse all the edges in this RDD.
   *
   * @return a new EdgeRDD containing all the edges reversed
   */
  def reverse: EdgeRDD[ED]

  /**
   * Inner joins this EdgeRDD with another EdgeRDD, assuming both are partitioned using the same
   * [[PartitionStrategy]].
   * (Inner join of matching edges, i.e. edges that share the same endpoints.)
   * @param other the EdgeRDD to join with
   * @param f the join function applied to corresponding values of `this` and `other`
   * @return a new EdgeRDD containing only edges that appear in both `this` and `other`,
   *         with values supplied by `f`
   */
  def innerJoin[ED2: ClassTag, ED3: ClassTag]
      (other: EdgeRDD[ED2])
      (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDD[ED3]

  /**
   * Changes the target storage level while preserving all other properties of the
   * EdgeRDD. Operations on the returned EdgeRDD will preserve this storage level.
   * (Changes the storage level.)
   * This does not actually trigger a cache; to do this, call
   * [[org.apache.spark.graphx.EdgeRDD#cache]] on the returned EdgeRDD.
   */
  private[graphx] def withTargetStorageLevel(targetStorageLevel: StorageLevel): EdgeRDD[ED]
}

object EdgeRDD {
  /**
   * Creates an EdgeRDD from a set of edges.
   * (Builds an EdgeRDD from an RDD[Edge[ED]].)
   * @tparam ED the edge attribute type
   * @tparam VD the type of the vertex attributes that may be joined with the returned EdgeRDD
   */
  def fromEdges[ED: ClassTag, VD: ClassTag](edges: RDD[Edge[ED]]): EdgeRDDImpl[ED, VD] = {
    val edgePartitions = edges.mapPartitionsWithIndex { (pid, iter) =>
      val builder = new EdgePartitionBuilder[ED, VD]
      iter.foreach { e =>
        builder.add(e.srcId, e.dstId, e.attr)
      }
      Iterator((pid, builder.toEdgePartition))
    }
    EdgeRDD.fromEdgePartitions(edgePartitions)
  }

  /**
   * Creates an EdgeRDD from already-constructed edge partitions.
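   *
   * Internal-flow sketch (illustrative only; it mirrors what `fromEdges` above already does for a
   * plain `edges: RDD[Edge[ED]]`):
   * {{{
   * val parts = edges.mapPartitionsWithIndex { (pid, iter) =>
   *   val builder = new EdgePartitionBuilder[ED, VD]
   *   iter.foreach(e => builder.add(e.srcId, e.dstId, e.attr))
   *   Iterator((pid, builder.toEdgePartition))
   * }
   * val edgeRdd = EdgeRDD.fromEdgePartitions(parts)
   * }}}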
   * (Builds an EdgeRDD from the given edge partitions.)
   * @tparam ED the edge attribute type
   * @tparam VD the type of the vertex attributes that may be joined with the returned EdgeRDD
   */
  private[graphx] def fromEdgePartitions[ED: ClassTag, VD: ClassTag](
      edgePartitions: RDD[(Int, EdgePartition[ED, VD])]): EdgeRDDImpl[ED, VD] = {
    new EdgeRDDImpl(edgePartitions)
  }
}

--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/graphx/EdgeTriplet.scala:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.graphx

/**
 * An edge triplet represents an edge along with the vertex attributes of its neighboring vertices.
 *
 * @tparam VD the type of the vertex attribute.
 * @tparam ED the type of the edge attribute
 */
class EdgeTriplet[VD, ED] extends Edge[ED] {
  /**
   * The source vertex attribute
   */
  var srcAttr: VD = _ // nullValue[VD]

  /**
   * The destination vertex attribute
   */
  var dstAttr: VD = _ // nullValue[VD]

  /**
   * Set the edge properties of this triplet.
   * (Note: srcId, dstId and attr are inherited from the superclass.)
   */
  protected[spark] def set(other: Edge[ED]): EdgeTriplet[VD, ED] = {
    srcId = other.srcId
    dstId = other.dstId
    attr = other.attr
    this
  }

  /**
   * Given one vertex in the edge return the other vertex.
   * (Given one vertex of the edge, returns the attribute of the other vertex.)
   * @param vid the id one of the two vertices on the edge
   * @return the attribute for the other vertex on the edge
   */
  def otherVertexAttr(vid: VertexId): VD =
    if (srcId == vid) dstAttr else { assert(dstId == vid); srcAttr }

  /**
   * Get the vertex object for the given vertex in the edge.
   * (Returns the attr of the vertex with the given VertexId.)
   *
   * @param vid the id of one of the two vertices on the edge
   * @return the attr for the vertex with that id
   */
  def vertexAttr(vid: VertexId): VD =
    if (srcId == vid) srcAttr else { assert(dstId == vid); dstAttr }

  override def toString: String = ((srcId, srcAttr), (dstId, dstAttr), attr).toString()

  // Convert to a tuple
  def toTuple: ((VertexId, VD), (VertexId, VD), ED) = ((srcId, srcAttr), (dstId, dstAttr), attr)
}

--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/graphx/GraphKryoRegistrator.scala:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx 19 | 20 | import com.esotericsoftware.kryo.Kryo 21 | 22 | import org.apache.spark.serializer.KryoRegistrator 23 | import org.apache.spark.util.BoundedPriorityQueue 24 | import org.apache.spark.util.collection.BitSet 25 | 26 | import org.apache.spark.graphx.impl._ 27 | import org.apache.spark.graphx.util.collection.GraphXPrimitiveKeyOpenHashMap 28 | import org.apache.spark.util.collection.OpenHashSet 29 | 30 | /** 31 | * Registers GraphX classes with Kryo for improved performance. 32 | */ 33 | @deprecated("Register GraphX classes with Kryo using GraphXUtils.registerKryoClasses", "1.2.0") 34 | class GraphKryoRegistrator extends KryoRegistrator { 35 | 36 | def registerClasses(kryo: Kryo) { 37 | kryo.register(classOf[Edge[Object]]) 38 | kryo.register(classOf[(VertexId, Object)]) 39 | kryo.register(classOf[EdgePartition[Object, Object]]) 40 | kryo.register(classOf[BitSet]) 41 | kryo.register(classOf[VertexIdToIndexMap]) 42 | kryo.register(classOf[VertexAttributeBlock[Object]]) 43 | kryo.register(classOf[PartitionStrategy]) 44 | kryo.register(classOf[BoundedPriorityQueue[Object]]) 45 | kryo.register(classOf[EdgeDirection]) 46 | kryo.register(classOf[GraphXPrimitiveKeyOpenHashMap[VertexId, Int]]) 47 | kryo.register(classOf[OpenHashSet[Int]]) 48 | kryo.register(classOf[OpenHashSet[Long]]) 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/GraphLoader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx 19 | 20 | import org.apache.spark.storage.StorageLevel 21 | import org.apache.spark.{Logging, SparkContext} 22 | import org.apache.spark.graphx.impl.{EdgePartitionBuilder, GraphImpl} 23 | 24 | /** 25 | * Provides utilities for loading [[Graph]]s from files. 
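 *
 * A minimal usage sketch (assumes a `SparkContext` named `sc` and a hypothetical edge-list file;
 * not part of the original comment):
 * {{{
 * val graph: Graph[Int, Int] =
 *   GraphLoader.edgeListFile(sc, "/data/followers.txt", canonicalOrientation = true)
 * }}}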
26 | */ 27 | object GraphLoader extends Logging { 28 | 29 | /** 30 | * Loads a graph from an edge list formatted file where each line contains two integers: a source 31 | * id and a target id. Skips lines that begin with `#`. 32 | * 33 | * If desired the edges can be automatically oriented in the positive 34 | * direction (source Id < target Id) by setting `canonicalOrientation` to 35 | * true. 36 | * 37 | * @example Loads a file in the following format: 38 | * {{{ 39 | * # Comment Line 40 | * # Source Id <\t> Target Id 41 | * 1 -5 42 | * 1 2 43 | * 2 7 44 | * 1 8 45 | * }}} 46 | * 47 | * @param sc SparkContext 48 | * @param path the path to the file (e.g., /home/data/file or hdfs://file) 49 | * @param canonicalOrientation whether to orient edges in the positive 50 | * direction 51 | * @param numEdgePartitions the number of partitions for the edge RDD 52 | * Setting this value to -1 will use the default parallelism. 53 | * @param edgeStorageLevel the desired storage level for the edge partitions 54 | * @param vertexStorageLevel the desired storage level for the vertex partitions 55 | */ 56 | def edgeListFile( 57 | sc: SparkContext, 58 | path: String, 59 | canonicalOrientation: Boolean = false, 60 | numEdgePartitions: Int = -1, 61 | edgeStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY, 62 | vertexStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) 63 | : Graph[Int, Int] = 64 | { 65 | val startTime = System.currentTimeMillis 66 | 67 | // Parse the edge data table directly into edge partitions 68 | val lines = 69 | if (numEdgePartitions > 0) { 70 | sc.textFile(path, numEdgePartitions).coalesce(numEdgePartitions) 71 | } else { 72 | sc.textFile(path) 73 | } 74 | val edges = lines.mapPartitionsWithIndex { (pid, iter) => 75 | val builder = new EdgePartitionBuilder[Int, Int] 76 | iter.foreach { line => 77 | if (!line.isEmpty && line(0) != '#') { 78 | val lineArray = line.split("\\s+") 79 | if (lineArray.length < 2) { 80 | throw new IllegalArgumentException("Invalid line: " + line) 81 | } 82 | val srcId = lineArray(0).toLong 83 | val dstId = lineArray(1).toLong 84 | if (canonicalOrientation && srcId > dstId) { 85 | builder.add(dstId, srcId, 1) 86 | } else { 87 | builder.add(srcId, dstId, 1) 88 | } 89 | } 90 | } 91 | Iterator((pid, builder.toEdgePartition)) 92 | }.persist(edgeStorageLevel).setName("GraphLoader.edgeListFile - edges (%s)".format(path)) 93 | edges.count() 94 | 95 | logInfo("It took %d ms to load the edges".format(System.currentTimeMillis - startTime)) 96 | 97 | GraphImpl.fromEdgePartitions(edges, defaultVertexAttr = 1, edgeStorageLevel = edgeStorageLevel, 98 | vertexStorageLevel = vertexStorageLevel) 99 | } // end of edgeListFile 100 | 101 | } 102 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/GraphOps.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx 19 | 20 | import scala.reflect.ClassTag 21 | import scala.util.Random 22 | 23 | import org.apache.spark.SparkException 24 | import org.apache.spark.SparkContext._ 25 | import org.apache.spark.rdd.RDD 26 | 27 | import org.apache.spark.graphx.lib._ 28 | 29 | /** 30 | * Contains additional functionality for [[Graph]]. All operations are expressed in terms of the 31 | * efficient GraphX API. This class is implicitly constructed for each Graph object. 32 | * 33 | * @tparam VD the vertex attribute type 34 | * @tparam ED the edge attribute type 35 | */ 36 | class GraphOps[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]) extends Serializable { 37 | 38 | /** The number of edges in the graph. */ 39 | @transient lazy val numEdges: Long = graph.edges.count() 40 | 41 | /** The number of vertices in the graph. */ 42 | @transient lazy val numVertices: Long = graph.vertices.count() 43 | 44 | /** 45 | * The in-degree of each vertex in the graph. 46 | * @note Vertices with no in-edges are not returned in the resulting RDD. 47 | */ 48 | @transient lazy val inDegrees: VertexRDD[Int] = 49 | degreesRDD(EdgeDirection.In).setName("GraphOps.inDegrees") 50 | 51 | /** 52 | * The out-degree of each vertex in the graph. 53 | * @note Vertices with no out-edges are not returned in the resulting RDD. 54 | */ 55 | @transient lazy val outDegrees: VertexRDD[Int] = 56 | degreesRDD(EdgeDirection.Out).setName("GraphOps.outDegrees") 57 | 58 | /** 59 | * The degree of each vertex in the graph. 60 | * @note Vertices with no edges are not returned in the resulting RDD. 61 | */ 62 | @transient lazy val degrees: VertexRDD[Int] = 63 | degreesRDD(EdgeDirection.Either).setName("GraphOps.degrees") 64 | 65 | /** 66 | * Computes the neighboring vertex degrees. 67 | * 68 | * @param edgeDirection the direction along which to collect neighboring vertex attributes 69 | */ 70 | private def degreesRDD(edgeDirection: EdgeDirection): VertexRDD[Int] = { 71 | if (edgeDirection == EdgeDirection.In) { 72 | graph.aggregateMessages(_.sendToDst(1), _ + _, TripletFields.None) 73 | } else if (edgeDirection == EdgeDirection.Out) { 74 | graph.aggregateMessages(_.sendToSrc(1), _ + _, TripletFields.None) 75 | } else { // EdgeDirection.Either 76 | graph.aggregateMessages(ctx => { ctx.sendToSrc(1); ctx.sendToDst(1) }, _ + _, 77 | TripletFields.None) 78 | } 79 | } 80 | 81 | /** 82 | * Collect the neighbor vertex ids for each vertex. 
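   *
   * Usage sketch (illustrative only; `graph` is any existing graph):
   * {{{
   * // ids of all neighbours, regardless of edge direction
   * val nbrIds: VertexRDD[Array[VertexId]] = graph.collectNeighborIds(EdgeDirection.Either)
   * }}}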
83 | * 84 | * @param edgeDirection the direction along which to collect 85 | * neighboring vertices 86 | * 87 | * @return the set of neighboring ids for each vertex 88 | */ 89 | def collectNeighborIds(edgeDirection: EdgeDirection): VertexRDD[Array[VertexId]] = { 90 | val nbrs = 91 | if (edgeDirection == EdgeDirection.Either) { 92 | graph.aggregateMessages[Array[VertexId]]( 93 | ctx => { ctx.sendToSrc(Array(ctx.dstId)); ctx.sendToDst(Array(ctx.srcId)) }, 94 | _ ++ _, TripletFields.None) 95 | } else if (edgeDirection == EdgeDirection.Out) { 96 | graph.aggregateMessages[Array[VertexId]]( 97 | ctx => ctx.sendToSrc(Array(ctx.dstId)), 98 | _ ++ _, TripletFields.None) 99 | } else if (edgeDirection == EdgeDirection.In) { 100 | graph.aggregateMessages[Array[VertexId]]( 101 | ctx => ctx.sendToDst(Array(ctx.srcId)), 102 | _ ++ _, TripletFields.None) 103 | } else { 104 | throw new SparkException("It doesn't make sense to collect neighbor ids without a " + 105 | "direction. (EdgeDirection.Both is not supported; use EdgeDirection.Either instead.)") 106 | } 107 | graph.vertices.leftZipJoin(nbrs) { (vid, vdata, nbrsOpt) => 108 | nbrsOpt.getOrElse(Array.empty[VertexId]) 109 | } 110 | } // end of collectNeighborIds 111 | 112 | /** 113 | * Collect the neighbor vertex attributes for each vertex. 114 | * 115 | * @note This function could be highly inefficient on power-law 116 | * graphs where high degree vertices may force a large amount of 117 | * information to be collected to a single location. 118 | * 119 | * @param edgeDirection the direction along which to collect 120 | * neighboring vertices 121 | * 122 | * @return the vertex set of neighboring vertex attributes for each vertex 123 | */ 124 | def collectNeighbors(edgeDirection: EdgeDirection): VertexRDD[Array[(VertexId, VD)]] = { 125 | val nbrs = edgeDirection match { 126 | case EdgeDirection.Either => 127 | graph.aggregateMessages[Array[(VertexId, VD)]]( 128 | ctx => { 129 | ctx.sendToSrc(Array((ctx.dstId, ctx.dstAttr))) 130 | ctx.sendToDst(Array((ctx.srcId, ctx.srcAttr))) 131 | }, 132 | (a, b) => a ++ b, TripletFields.All) 133 | case EdgeDirection.In => 134 | graph.aggregateMessages[Array[(VertexId, VD)]]( 135 | ctx => ctx.sendToDst(Array((ctx.srcId, ctx.srcAttr))), 136 | (a, b) => a ++ b, TripletFields.Src) 137 | case EdgeDirection.Out => 138 | graph.aggregateMessages[Array[(VertexId, VD)]]( 139 | ctx => ctx.sendToSrc(Array((ctx.dstId, ctx.dstAttr))), 140 | (a, b) => a ++ b, TripletFields.Dst) 141 | case EdgeDirection.Both => 142 | throw new SparkException("collectEdges does not support EdgeDirection.Both. Use" + 143 | "EdgeDirection.Either instead.") 144 | } 145 | graph.vertices.leftJoin(nbrs) { (vid, vdata, nbrsOpt) => 146 | nbrsOpt.getOrElse(Array.empty[(VertexId, VD)]) 147 | } 148 | } // end of collectNeighbor 149 | 150 | /** 151 | * Returns an RDD that contains for each vertex v its local edges, 152 | * i.e., the edges that are incident on v, in the user-specified direction. 153 | * Warning: note that singleton vertices, those with no edges in the given 154 | * direction will not be part of the return value. 155 | * 156 | * @note This function could be highly inefficient on power-law 157 | * graphs where high degree vertices may force a large amount of 158 | * information to be collected to a single location. 
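   *
   * Usage sketch (illustrative only):
   * {{{
   * // for every vertex, the edges pointing into it
   * val inEdges: VertexRDD[Array[Edge[ED]]] = graph.collectEdges(EdgeDirection.In)
   * }}}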
159 | * 160 | * @param edgeDirection the direction along which to collect 161 | * the local edges of vertices 162 | * 163 | * @return the local edges for each vertex 164 | */ 165 | def collectEdges(edgeDirection: EdgeDirection): VertexRDD[Array[Edge[ED]]] = { 166 | edgeDirection match { 167 | case EdgeDirection.Either => 168 | graph.aggregateMessages[Array[Edge[ED]]]( 169 | ctx => { 170 | ctx.sendToSrc(Array(new Edge(ctx.srcId, ctx.dstId, ctx.attr))) 171 | ctx.sendToDst(Array(new Edge(ctx.srcId, ctx.dstId, ctx.attr))) 172 | }, 173 | (a, b) => a ++ b, TripletFields.EdgeOnly) 174 | case EdgeDirection.In => 175 | graph.aggregateMessages[Array[Edge[ED]]]( 176 | ctx => ctx.sendToDst(Array(new Edge(ctx.srcId, ctx.dstId, ctx.attr))), 177 | (a, b) => a ++ b, TripletFields.EdgeOnly) 178 | case EdgeDirection.Out => 179 | graph.aggregateMessages[Array[Edge[ED]]]( 180 | ctx => ctx.sendToSrc(Array(new Edge(ctx.srcId, ctx.dstId, ctx.attr))), 181 | (a, b) => a ++ b, TripletFields.EdgeOnly) 182 | case EdgeDirection.Both => 183 | throw new SparkException("collectEdges does not support EdgeDirection.Both. Use" + 184 | "EdgeDirection.Either instead.") 185 | } 186 | } 187 | 188 | /** 189 | * Join the vertices with an RDD and then apply a function from the 190 | * vertex and RDD entry to a new vertex value. The input table 191 | * should contain at most one entry for each vertex. If no entry is 192 | * provided the map function is skipped and the old value is used. 193 | * 194 | * @tparam U the type of entry in the table of updates 195 | * @param table the table to join with the vertices in the graph. 196 | * The table should contain at most one entry for each vertex. 197 | * @param mapFunc the function used to compute the new vertex 198 | * values. The map function is invoked only for vertices with a 199 | * corresponding entry in the table otherwise the old vertex value 200 | * is used. 201 | * 202 | * @example This function is used to update the vertices with new 203 | * values based on external data. For example we could add the out 204 | * degree to each vertex record 205 | * 206 | * {{{ 207 | * val rawGraph: Graph[Int, Int] = GraphLoader.edgeListFile(sc, "webgraph") 208 | * .mapVertices((_, _) => 0) 209 | * val outDeg = rawGraph.outDegrees 210 | * val graph = rawGraph.joinVertices[Int](outDeg) 211 | * ((_, _, outDeg) => outDeg) 212 | * }}} 213 | * 214 | */ 215 | def joinVertices[U: ClassTag](table: RDD[(VertexId, U)])(mapFunc: (VertexId, VD, U) => VD) 216 | : Graph[VD, ED] = { 217 | val uf = (id: VertexId, data: VD, o: Option[U]) => { 218 | o match { 219 | case Some(u) => mapFunc(id, data, u) 220 | case None => data 221 | } 222 | } 223 | graph.outerJoinVertices(table)(uf) 224 | } 225 | 226 | /** 227 | * Filter the graph by computing some values to filter on, and applying the predicates. 228 | * 229 | * @param preprocess a function to compute new vertex and edge data before filtering 230 | * @param epred edge pred to filter on after preprocess, see more details under 231 | * [[org.apache.spark.graphx.Graph#subgraph]] 232 | * @param vpred vertex pred to filter on after prerocess, see more details under 233 | * [[org.apache.spark.graphx.Graph#subgraph]] 234 | * @tparam VD2 vertex type the vpred operates on 235 | * @tparam ED2 edge type the epred operates on 236 | * @return a subgraph of the orginal graph, with its data unchanged 237 | * 238 | * @example This function can be used to filter the graph based on some property, without 239 | * changing the vertex and edge values in your program. 
For example, we could remove the vertices 240 | * in a graph with 0 outdegree 241 | * 242 | * {{{ 243 | * graph.filter( 244 | * graph => { 245 | * val degrees: VertexRDD[Int] = graph.outDegrees 246 | * graph.outerJoinVertices(degrees) {(vid, data, deg) => deg.getOrElse(0)} 247 | * }, 248 | * vpred = (vid: VertexId, deg:Int) => deg > 0 249 | * ) 250 | * }}} 251 | * 252 | */ 253 | def filter[VD2: ClassTag, ED2: ClassTag]( 254 | preprocess: Graph[VD, ED] => Graph[VD2, ED2], 255 | epred: (EdgeTriplet[VD2, ED2]) => Boolean = (x: EdgeTriplet[VD2, ED2]) => true, 256 | vpred: (VertexId, VD2) => Boolean = (v: VertexId, d: VD2) => true): Graph[VD, ED] = { 257 | graph.mask(preprocess(graph).subgraph(epred, vpred)) 258 | } 259 | 260 | /** 261 | * Picks a random vertex from the graph and returns its ID. 262 | */ 263 | def pickRandomVertex(): VertexId = { 264 | val probability = 50.0 / graph.numVertices 265 | var found = false 266 | var retVal: VertexId = null.asInstanceOf[VertexId] 267 | while (!found) { 268 | val selectedVertices = graph.vertices.flatMap { vidVvals => 269 | if (Random.nextDouble() < probability) { Some(vidVvals._1) } 270 | else { None } 271 | } 272 | if (selectedVertices.count > 1) { 273 | found = true 274 | val collectedVertices = selectedVertices.collect() 275 | retVal = collectedVertices(Random.nextInt(collectedVertices.size)) 276 | } 277 | } 278 | retVal 279 | } 280 | 281 | /** 282 | * Convert bi-directional edges into uni-directional ones. 283 | * Some graph algorithms (e.g., TriangleCount) assume that an input graph 284 | * has its edges in canonical direction. 285 | * This function rewrites the vertex ids of edges so that srcIds are smaller 286 | * than dstIds, and merges the duplicated edges. 287 | * 288 | * @param mergeFunc the user defined reduce function which should 289 | * be commutative and associative and is used to combine the output 290 | * of the map phase 291 | * 292 | * @return the resulting graph with canonical edges 293 | */ 294 | def convertToCanonicalEdges( 295 | mergeFunc: (ED, ED) => ED = (e1, e2) => e1): Graph[VD, ED] = { 296 | val newEdges = 297 | graph.edges 298 | .map { 299 | case e if e.srcId < e.dstId => ((e.srcId, e.dstId), e.attr) 300 | case e => ((e.dstId, e.srcId), e.attr) 301 | } 302 | .reduceByKey(mergeFunc) 303 | .map(e => new Edge(e._1._1, e._1._2, e._2)) 304 | Graph(graph.vertices, newEdges) 305 | } 306 | 307 | /** 308 | * Execute a Pregel-like iterative vertex-parallel abstraction. The 309 | * user-defined vertex-program `vprog` is executed in parallel on 310 | * each vertex receiving any inbound messages and computing a new 311 | * value for the vertex. The `sendMsg` function is then invoked on 312 | * all out-edges and is used to compute an optional message to the 313 | * destination vertex. The `mergeMsg` function is a commutative 314 | * associative function used to combine messages destined to the 315 | * same vertex. 316 | * 317 | * On the first iteration all vertices receive the `initialMsg` and 318 | * on subsequent iterations if a vertex does not receive a message 319 | * then the vertex-program is not invoked. 320 | * 321 | * This function iterates until there are no remaining messages, or 322 | * for `maxIterations` iterations. 
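   *
   * A single-source shortest-paths sketch (illustrative only; `sourceId` is a hypothetical vertex
   * id, edge attributes are assumed to be Double distances, and `mapVertices` sets the vertex
   * attributes to Double):
   * {{{
   * val init = graph.mapVertices((id, _) => if (id == sourceId) 0.0 else Double.PositiveInfinity)
   * val sssp = init.pregel(Double.PositiveInfinity)(
   *   (id, dist, newDist) => math.min(dist, newDist),            // vprog
   *   triplet =>                                                  // sendMsg
   *     if (triplet.srcAttr + triplet.attr < triplet.dstAttr) {
   *       Iterator((triplet.dstId, triplet.srcAttr + triplet.attr))
   *     } else {
   *       Iterator.empty
   *     },
   *   (a, b) => math.min(a, b))                                   // mergeMsg
   * }}}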
323 | * 324 | * @tparam A the Pregel message type 325 | * 326 | * @param initialMsg the message each vertex will receive at the on 327 | * the first iteration 328 | * 329 | * @param maxIterations the maximum number of iterations to run for 330 | * 331 | * @param activeDirection the direction of edges incident to a vertex that received a message in 332 | * the previous round on which to run `sendMsg`. For example, if this is `EdgeDirection.Out`, only 333 | * out-edges of vertices that received a message in the previous round will run. 334 | * 335 | * @param vprog the user-defined vertex program which runs on each 336 | * vertex and receives the inbound message and computes a new vertex 337 | * value. On the first iteration the vertex program is invoked on 338 | * all vertices and is passed the default message. On subsequent 339 | * iterations the vertex program is only invoked on those vertices 340 | * that receive messages. 341 | * 342 | * @param sendMsg a user supplied function that is applied to out 343 | * edges of vertices that received messages in the current 344 | * iteration 345 | * 346 | * @param mergeMsg a user supplied function that takes two incoming 347 | * messages of type A and merges them into a single message of type 348 | * A. ''This function must be commutative and associative and 349 | * ideally the size of A should not increase.'' 350 | * 351 | * @return the resulting graph at the end of the computation 352 | * 353 | */ 354 | def pregel[A: ClassTag]( 355 | initialMsg: A, 356 | maxIterations: Int = Int.MaxValue, 357 | activeDirection: EdgeDirection = EdgeDirection.Either)( 358 | vprog: (VertexId, VD, A) => VD, 359 | sendMsg: EdgeTriplet[VD, ED] => Iterator[(VertexId, A)], 360 | mergeMsg: (A, A) => A) 361 | : Graph[VD, ED] = { 362 | Pregel(graph, initialMsg, maxIterations, activeDirection)(vprog, sendMsg, mergeMsg) 363 | } 364 | 365 | /** 366 | * Run a dynamic version of PageRank returning a graph with vertex attributes containing the 367 | * PageRank and edge attributes containing the normalized edge weight. 368 | * 369 | * @see [[org.apache.spark.graphx.lib.PageRank$#runUntilConvergence]] 370 | */ 371 | def pageRank(tol: Double, resetProb: Double = 0.15): Graph[Double, Double] = { 372 | PageRank.runUntilConvergence(graph, tol, resetProb) 373 | } 374 | 375 | 376 | /** 377 | * Run personalized PageRank for a given vertex, such that all random walks 378 | * are started relative to the source node. 379 | * 380 | * @see [[org.apache.spark.graphx.lib.PageRank$#runUntilConvergenceWithOptions]] 381 | */ 382 | def personalizedPageRank(src: VertexId, tol: Double, 383 | resetProb: Double = 0.15) : Graph[Double, Double] = { 384 | PageRank.runUntilConvergenceWithOptions(graph, tol, resetProb, Some(src)) 385 | } 386 | 387 | /** 388 | * Run Personalized PageRank for a fixed number of iterations with 389 | * with all iterations originating at the source node 390 | * returning a graph with vertex attributes 391 | * containing the PageRank and edge attributes the normalized edge weight. 392 | * 393 | * @see [[org.apache.spark.graphx.lib.PageRank$#runWithOptions]] 394 | */ 395 | def staticPersonalizedPageRank(src: VertexId, numIter: Int, 396 | resetProb: Double = 0.15) : Graph[Double, Double] = { 397 | PageRank.runWithOptions(graph, numIter, resetProb, Some(src)) 398 | } 399 | 400 | /** 401 | * Run PageRank for a fixed number of iterations returning a graph with vertex attributes 402 | * containing the PageRank and edge attributes the normalized edge weight. 
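   *
   * Usage sketch (illustrative only):
   * {{{
   * val ranks: VertexRDD[Double] = graph.staticPageRank(numIter = 20).vertices
   * }}}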
403 | * 404 | * @see [[org.apache.spark.graphx.lib.PageRank$#run]] 405 | */ 406 | def staticPageRank(numIter: Int, resetProb: Double = 0.15): Graph[Double, Double] = { 407 | PageRank.run(graph, numIter, resetProb) 408 | } 409 | 410 | /** 411 | * Compute the connected component membership of each vertex and return a graph with the vertex 412 | * value containing the lowest vertex id in the connected component containing that vertex. 413 | * 414 | * @see [[org.apache.spark.graphx.lib.ConnectedComponents$#run]] 415 | */ 416 | def connectedComponents(): Graph[VertexId, ED] = { 417 | ConnectedComponents.run(graph) 418 | } 419 | 420 | /** 421 | * Compute the number of triangles passing through each vertex. 422 | * 423 | * @see [[org.apache.spark.graphx.lib.TriangleCount$#run]] 424 | */ 425 | def triangleCount(): Graph[Int, ED] = { 426 | TriangleCount.run(graph) 427 | } 428 | 429 | /** 430 | * Compute the strongly connected component (SCC) of each vertex and return a graph with the 431 | * vertex value containing the lowest vertex id in the SCC containing that vertex. 432 | * 433 | * @see [[org.apache.spark.graphx.lib.StronglyConnectedComponents$#run]] 434 | */ 435 | def stronglyConnectedComponents(numIter: Int): Graph[VertexId, ED] = { 436 | StronglyConnectedComponents.run(graph, numIter) 437 | } 438 | } // end of GraphOps 439 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/GraphXUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx 19 | 20 | import org.apache.spark.SparkConf 21 | 22 | import org.apache.spark.graphx.impl._ 23 | import org.apache.spark.graphx.util.collection.GraphXPrimitiveKeyOpenHashMap 24 | 25 | import org.apache.spark.util.collection.{OpenHashSet, BitSet} 26 | import org.apache.spark.util.BoundedPriorityQueue 27 | 28 | object GraphXUtils { 29 | /** 30 | * Registers classes that GraphX uses with Kryo. 
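   *
   * Typical setup sketch (illustrative only):
   * {{{
   * val conf = new SparkConf()
   *   .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
   * GraphXUtils.registerKryoClasses(conf)
   * val sc = new SparkContext(conf)
   * }}}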
31 | */ 32 | def registerKryoClasses(conf: SparkConf) { 33 | conf.registerKryoClasses(Array( 34 | classOf[Edge[Object]], 35 | classOf[(VertexId, Object)], 36 | classOf[EdgePartition[Object, Object]], 37 | classOf[BitSet], 38 | classOf[VertexIdToIndexMap], 39 | classOf[VertexAttributeBlock[Object]], 40 | classOf[PartitionStrategy], 41 | classOf[BoundedPriorityQueue[Object]], 42 | classOf[EdgeDirection], 43 | classOf[GraphXPrimitiveKeyOpenHashMap[VertexId, Int]], 44 | classOf[OpenHashSet[Int]], 45 | classOf[OpenHashSet[Long]])) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/PartitionStrategy.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx 19 | 20 | /** 21 | * Represents the way edges are assigned to edge partitions based on their source and destination 22 | * vertex IDs. 23 | */ 24 | trait PartitionStrategy extends Serializable { 25 | /** Returns the partition number for a given edge. */ 26 | def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID 27 | } 28 | 29 | /** 30 | * Collection of built-in [[PartitionStrategy]] implementations. 31 | */ 32 | object PartitionStrategy { 33 | /** 34 | * Assigns edges to partitions using a 2D partitioning of the sparse edge adjacency matrix, 35 | * guaranteeing a `2 * sqrt(numParts)` bound on vertex replication. 36 | * 37 | * Suppose we have a graph with 12 vertices that we want to partition 38 | * over 9 machines. We can use the following sparse matrix representation: 39 | * 40 | *
41 | * __________________________________ 42 | * v0 | P0 * | P1 | P2 * | 43 | * v1 | **** | * | | 44 | * v2 | ******* | ** | **** | 45 | * v3 | ***** | * * | * | 46 | * ---------------------------------- 47 | * v4 | P3 * | P4 *** | P5 ** * | 48 | * v5 | * * | * | | 49 | * v6 | * | ** | **** | 50 | * v7 | * * * | * * | * | 51 | * ---------------------------------- 52 | * v8 | P6 * | P7 * | P8 * *| 53 | * v9 | * | * * | | 54 | * v10 | * | ** | * * | 55 | * v11 | * <-E | *** | ** | 56 | * ---------------------------------- 57 | *58 | * 59 | * The edge denoted by `E` connects `v11` with `v1` and is assigned to processor `P6`. To get the 60 | * processor number we divide the matrix into `sqrt(numParts)` by `sqrt(numParts)` blocks. Notice 61 | * that edges adjacent to `v11` can only be in the first column of blocks `(P0, P3, 62 | * P6)` or the last 63 | * row of blocks `(P6, P7, P8)`. As a consequence we can guarantee that `v11` will need to be 64 | * replicated to at most `2 * sqrt(numParts)` machines. 65 | * 66 | * Notice that `P0` has many edges and as a consequence this partitioning would lead to poor work 67 | * balance. To improve balance we first multiply each vertex id by a large prime to shuffle the 68 | * vertex locations. 69 | * 70 | * When the number of partitions requested is not a perfect square we use a slightly different 71 | * method where the last column can have a different number of rows than the others while still 72 | * maintaining the same size per block. 73 | */ 74 | case object EdgePartition2D extends PartitionStrategy { 75 | override def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID = { 76 | val ceilSqrtNumParts: PartitionID = math.ceil(math.sqrt(numParts)).toInt 77 | val mixingPrime: VertexId = 1125899906842597L 78 | if (numParts == ceilSqrtNumParts * ceilSqrtNumParts) { 79 | // Use old method for perfect squared to ensure we get same results 80 | val col: PartitionID = (math.abs(src * mixingPrime) % ceilSqrtNumParts).toInt 81 | val row: PartitionID = (math.abs(dst * mixingPrime) % ceilSqrtNumParts).toInt 82 | (col * ceilSqrtNumParts + row) % numParts 83 | 84 | } else { 85 | // Otherwise use new method 86 | val cols = ceilSqrtNumParts 87 | val rows = (numParts + cols - 1) / cols 88 | val lastColRows = numParts - rows * (cols - 1) 89 | val col = (math.abs(src * mixingPrime) % numParts / rows).toInt 90 | val row = (math.abs(dst * mixingPrime) % (if (col < cols - 1) rows else lastColRows)).toInt 91 | col * rows + row 92 | 93 | } 94 | } 95 | } 96 | 97 | /** 98 | * Assigns edges to partitions using only the source vertex ID, colocating edges with the same 99 | * source. 100 | */ 101 | case object EdgePartition1D extends PartitionStrategy { 102 | override def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID = { 103 | val mixingPrime: VertexId = 1125899906842597L 104 | (math.abs(src * mixingPrime) % numParts).toInt 105 | } 106 | } 107 | 108 | 109 | /** 110 | * Assigns edges to partitions by hashing the source and destination vertex IDs, resulting in a 111 | * random vertex cut that colocates all same-direction edges between two vertices. 
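   *
   * Usage sketch (illustrative only; any of the built-in strategies can be passed the same way):
   * {{{
   * val partitioned = graph.partitionBy(PartitionStrategy.RandomVertexCut)
   * }}}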
112 | */ 113 | case object RandomVertexCut extends PartitionStrategy { 114 | override def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID = { 115 | math.abs((src, dst).hashCode()) % numParts 116 | } 117 | } 118 | 119 | 120 | /** 121 | * Assigns edges to partitions by hashing the source and destination vertex IDs in a canonical 122 | * direction, resulting in a random vertex cut that colocates all edges between two vertices, 123 | * regardless of direction. 124 | */ 125 | case object CanonicalRandomVertexCut extends PartitionStrategy { 126 | override def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID = { 127 | if (src < dst) { 128 | math.abs((src, dst).hashCode()) % numParts 129 | } else { 130 | math.abs((dst, src).hashCode()) % numParts 131 | } 132 | } 133 | } 134 | 135 | /** Returns the PartitionStrategy with the specified name. */ 136 | def fromString(s: String): PartitionStrategy = s match { 137 | case "RandomVertexCut" => RandomVertexCut 138 | case "EdgePartition1D" => EdgePartition1D 139 | case "EdgePartition2D" => EdgePartition2D 140 | case "CanonicalRandomVertexCut" => CanonicalRandomVertexCut 141 | case _ => throw new IllegalArgumentException("Invalid PartitionStrategy: " + s) 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/Pregel.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx 19 | 20 | import scala.reflect.ClassTag 21 | import org.apache.spark.Logging 22 | 23 | 24 | /** 25 | * Implements a Pregel-like bulk-synchronous message-passing API. 26 | * 27 | * Unlike the original Pregel API, the GraphX Pregel API factors the sendMessage computation over 28 | * edges, enables the message sending computation to read both vertex attributes, and constrains 29 | * messages to the graph structure. These changes allow for substantially more efficient 30 | * distributed execution while also exposing greater flexibility for graph-based computation. 
31 | * 32 | * @example We can use the Pregel abstraction to implement PageRank: 33 | * {{{ 34 | * val pagerankGraph: Graph[Double, Double] = graph 35 | * // Associate the degree with each vertex 36 | * .outerJoinVertices(graph.outDegrees) { 37 | * (vid, vdata, deg) => deg.getOrElse(0) 38 | * } 39 | * // Set the weight on the edges based on the degree 40 | * .mapTriplets(e => 1.0 / e.srcAttr) 41 | * // Set the vertex attributes to the initial pagerank values 42 | * .mapVertices((id, attr) => 1.0) 43 | * 44 | * def vertexProgram(id: VertexId, attr: Double, msgSum: Double): Double = 45 | * resetProb + (1.0 - resetProb) * msgSum 46 | * def sendMessage(id: VertexId, edge: EdgeTriplet[Double, Double]): Iterator[(VertexId, Double)] = 47 | * Iterator((edge.dstId, edge.srcAttr * edge.attr)) 48 | * def messageCombiner(a: Double, b: Double): Double = a + b 49 | * val initialMessage = 0.0 50 | * // Execute Pregel for a fixed number of iterations. 51 | * Pregel(pagerankGraph, initialMessage, numIter)( 52 | * vertexProgram, sendMessage, messageCombiner) 53 | * }}} 54 | * 55 | */ 56 | object Pregel extends Logging { 57 | 58 | /** 59 | * Execute a Pregel-like iterative vertex-parallel abstraction. The 60 | * user-defined vertex-program `vprog` is executed in parallel on 61 | * each vertex receiving any inbound messages and computing a new 62 | * value for the vertex. The `sendMsg` function is then invoked on 63 | * all out-edges and is used to compute an optional message to the 64 | * destination vertex. The `mergeMsg` function is a commutative 65 | * associative function used to combine messages destined to the 66 | * same vertex. 67 | * 68 | * On the first iteration all vertices receive the `initialMsg` and 69 | * on subsequent iterations if a vertex does not receive a message 70 | * then the vertex-program is not invoked. 71 | * 72 | * This function iterates until there are no remaining messages, or 73 | * for `maxIterations` iterations. 74 | * 75 | * @tparam VD the vertex data type 76 | * @tparam ED the edge data type 77 | * @tparam A the Pregel message type 78 | * 79 | * @param graph the input graph. 80 | * 81 | * @param initialMsg the message each vertex will receive at the first 82 | * iteration 83 | * 84 | * @param maxIterations the maximum number of iterations to run for 85 | * 86 | * @param activeDirection the direction of edges incident to a vertex that received a message in 87 | * the previous round on which to run `sendMsg`. For example, if this is `EdgeDirection.Out`, only 88 | * out-edges of vertices that received a message in the previous round will run. The default is 89 | * `EdgeDirection.Either`, which will run `sendMsg` on edges where either side received a message 90 | * in the previous round. If this is `EdgeDirection.Both`, `sendMsg` will only run on edges where 91 | * *both* vertices received a message. 92 | * 93 | * @param vprog the user-defined vertex program which runs on each 94 | * vertex and receives the inbound message and computes a new vertex 95 | * value. On the first iteration the vertex program is invoked on 96 | * all vertices and is passed the default message. On subsequent 97 | * iterations the vertex program is only invoked on those vertices 98 | * that receive messages. 
99 | * 100 | * @param sendMsg a user supplied function that is applied to out 101 | * edges of vertices that received messages in the current 102 | * iteration 103 | * 104 | * @param mergeMsg a user supplied function that takes two incoming 105 | * messages of type A and merges them into a single message of type 106 | * A. ''This function must be commutative and associative and 107 | * ideally the size of A should not increase.'' 108 | * 109 | * @return the resulting graph at the end of the computation 110 | * 111 | */ 112 | def apply[VD: ClassTag, ED: ClassTag, A: ClassTag] 113 | (graph: Graph[VD, ED], 114 | initialMsg: A, 115 | maxIterations: Int = Int.MaxValue, 116 | activeDirection: EdgeDirection = EdgeDirection.Either) 117 | (vprog: (VertexId, VD, A) => VD, 118 | sendMsg: EdgeTriplet[VD, ED] => Iterator[(VertexId, A)], 119 | mergeMsg: (A, A) => A) 120 | : Graph[VD, ED] = 121 | { 122 | var g = graph.mapVertices((vid, vdata) => vprog(vid, vdata, initialMsg)).cache() 123 | // compute the messages 124 | var messages = g.mapReduceTriplets(sendMsg, mergeMsg) 125 | var activeMessages = messages.count() 126 | // Loop 127 | var prevG: Graph[VD, ED] = null 128 | var i = 0 129 | while (activeMessages > 0 && i < maxIterations) { 130 | // Receive the messages and update the vertices. 131 | prevG = g 132 | g = g.joinVertices(messages)(vprog).cache() 133 | 134 | val oldMessages = messages 135 | // Send new messages, skipping edges where neither side received a message. We must cache 136 | // messages so it can be materialized on the next line, allowing us to uncache the previous 137 | // iteration. 138 | messages = g.mapReduceTriplets( 139 | sendMsg, mergeMsg, Some((oldMessages, activeDirection))).cache() 140 | // The call to count() materializes `messages` and the vertices of `g`. This hides oldMessages 141 | // (depended on by the vertices of g) and the vertices of prevG (depended on by oldMessages 142 | // and the vertices of g). 143 | activeMessages = messages.count() 144 | 145 | logInfo("Pregel finished iteration " + i) 146 | 147 | // Unpersist the RDDs hidden by newly-materialized RDDs 148 | oldMessages.unpersist(blocking = false) 149 | prevG.unpersistVertices(blocking = false) 150 | prevG.edges.unpersist(blocking = false) 151 | // count the iteration 152 | i += 1 153 | } 154 | 155 | g 156 | } // end of apply 157 | 158 | } // end of class Pregel 159 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/VertexRDD.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.graphx 19 | 20 | import scala.reflect.ClassTag 21 | 22 | import org.apache.spark._ 23 | import org.apache.spark.SparkContext._ 24 | import org.apache.spark.rdd._ 25 | import org.apache.spark.storage.StorageLevel 26 | 27 | import org.apache.spark.graphx.impl.RoutingTablePartition 28 | import org.apache.spark.graphx.impl.ShippableVertexPartition 29 | import org.apache.spark.graphx.impl.VertexAttributeBlock 30 | import org.apache.spark.graphx.impl.VertexRDDImpl 31 | 32 | /** 33 | * Extends `RDD[(VertexId, VD)]` by ensuring that there is only one entry for each vertex and by 34 | * pre-indexing the entries for fast, efficient joins. Two VertexRDDs with the same index can be 35 | * joined efficiently. All operations except [[reindex]] preserve the index. To construct a 36 | * `VertexRDD`, use the [[org.apache.spark.graphx.VertexRDD$ VertexRDD object]]. 37 | * 38 | * Additionally, stores routing information to enable joining the vertex attributes with an 39 | * [[EdgeRDD]]. 40 | * 41 | * @example Construct a `VertexRDD` from a plain RDD: 42 | * {{{ 43 | * // Construct an initial vertex set 44 | * val someData: RDD[(VertexId, SomeType)] = loadData(someFile) 45 | * val vset = VertexRDD(someData) 46 | * // If there were redundant values in someData we would use a reduceFunc 47 | * val vset2 = VertexRDD(someData, reduceFunc) 48 | * // Finally we can use the VertexRDD to index another dataset 49 | * val otherData: RDD[(VertexId, OtherType)] = loadData(otherFile) 50 | * val vset3 = vset2.innerJoin(otherData) { (vid, a, b) => b } 51 | * // Now we can construct very fast joins between the two sets 52 | * val vset4: VertexRDD[(SomeType, OtherType)] = vset.leftJoin(vset3) 53 | * }}} 54 | * 55 | * @tparam VD the vertex attribute associated with each vertex in the set. 56 | */ 57 | abstract class VertexRDD[VD]( 58 | sc: SparkContext, 59 | deps: Seq[Dependency[_]]) extends RDD[(VertexId, VD)](sc, deps) { 60 | 61 | implicit protected def vdTag: ClassTag[VD] 62 | 63 | private[graphx] def partitionsRDD: RDD[ShippableVertexPartition[VD]] 64 | 65 | override protected def getPartitions: Array[Partition] = partitionsRDD.partitions 66 | 67 | /** 68 | * Provides the `RDD[(VertexId, VD)]` equivalent output. 69 | */ 70 | override def compute(part: Partition, context: TaskContext): Iterator[(VertexId, VD)] = { 71 | firstParent[ShippableVertexPartition[VD]].iterator(part, context).next().iterator 72 | } 73 | 74 | /** 75 | * Construct a new VertexRDD that is indexed by only the visible vertices. The resulting 76 | * VertexRDD will be based on a different index and can no longer be quickly joined with this 77 | * RDD. 78 | */ 79 | def reindex(): VertexRDD[VD] 80 | 81 | /** 82 | * Applies a function to each `VertexPartition` of this RDD and returns a new VertexRDD. 83 | * 对当前RDD的每个分区进行函数变换得到一个新的VertexRDD 84 | */ 85 | private[graphx] def mapVertexPartitions[VD2: ClassTag]( 86 | f: ShippableVertexPartition[VD] => ShippableVertexPartition[VD2]) 87 | : VertexRDD[VD2] 88 | 89 | /** 90 | * Restricts the vertex set to the set of vertices satisfying the given predicate. This operation 91 | * preserves the index for efficient joins with the original RDD, and it sets bits in the bitmask 92 | * rather than allocating new memory. 93 | * 94 | * It is declared and defined here to allow refining the return type from `RDD[(VertexId, VD)]` to 95 | * `VertexRDD[VD]`. 
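A quick hypothetical usage sketch of the refined `filter` described above; `vset` is an assumed `VertexRDD[Double]`, not something defined in this file.

```scala
// Because filter is overridden here, the result is still a VertexRDD (same index,
// bitmask-based restriction) rather than a plain RDD[(VertexId, VD)].
val positives: VertexRDD[Double] = vset.filter { case (vid, attr) => attr > 0.0 }
```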
96 | * 97 | * @param pred the user defined predicate, which takes a tuple to conform to the 98 | * `RDD[(VertexId, VD)]` interface 99 | */ 100 | override def filter(pred: Tuple2[VertexId, VD] => Boolean): VertexRDD[VD] = 101 | this.mapVertexPartitions(_.filter(Function.untupled(pred))) 102 | 103 | /** 104 | * Maps each vertex attribute, preserving the index. 105 | * 106 | * @tparam VD2 the type returned by the map function 107 | * 108 | * @param f the function applied to each value in the RDD 109 | * @return a new VertexRDD with values obtained by applying `f` to each of the entries in the 110 | * original VertexRDD 111 | */ 112 | def mapValues[VD2: ClassTag](f: VD => VD2): VertexRDD[VD2] 113 | 114 | /** 115 | * Maps each vertex attribute, additionally supplying the vertex ID. 116 | * 117 | * @tparam VD2 the type returned by the map function 118 | * 119 | * @param f the function applied to each ID-value pair in the RDD 120 | * @return a new VertexRDD with values obtained by applying `f` to each of the entries in the 121 | * original VertexRDD. The resulting VertexRDD retains the same index. 122 | */ 123 | def mapValues[VD2: ClassTag](f: (VertexId, VD) => VD2): VertexRDD[VD2] 124 | 125 | /** 126 | * For each VertexId present in both `this` and `other`, minus will act as a set difference 127 | * operation returning only those unique VertexId's present in `this`. 128 | *根据VertexId做差集, VertexRDD[VD]和RDD[(VertexId, VD)]底层存储是一致的, 129 | * VertexRDD[VD]是RDD[(VertexId, VD)]的子类 130 | * @param other an RDD to run the set operation against 131 | */ 132 | def minus(other: RDD[(VertexId, VD)]): VertexRDD[VD] 133 | 134 | /** 135 | * For each VertexId present in both `this` and `other`, minus will act as a set difference 136 | * operation returning only those unique VertexId's present in `this`. 137 | *根据VertexId做差集 138 | * @param other a VertexRDD to run the set operation against 139 | */ 140 | def minus(other: VertexRDD[VD]): VertexRDD[VD] 141 | 142 | /** 143 | * For each vertex present in both `this` and `other`, `diff` returns only those vertices with 144 | * differing values; for values that are different, keeps the values from `other`. This is 145 | * only guaranteed to work if the VertexRDDs share a common ancestor. 146 | *去掉this和other中有相同值的点。如果冲突留下other的 147 | * @param other the other RDD[(VertexId, VD)] with which to diff against. 148 | */ 149 | def diff(other: RDD[(VertexId, VD)]): VertexRDD[VD] 150 | 151 | /** 152 | * For each vertex present in both `this` and `other`, `diff` returns only those vertices with 153 | * differing values; for values that are different, keeps the values from `other`. This is 154 | * only guaranteed to work if the VertexRDDs share a common ancestor. 155 | *去掉this和other中有相同值的点 156 | * @param other the other VertexRDD with which to diff against. 157 | */ 158 | def diff(other: VertexRDD[VD]): VertexRDD[VD] 159 | 160 | /** 161 | * Left joins this RDD with anotther VertexRDD with the same index. This function will fail if 162 | * both VertexRDDs do not share the same index. The resuling vertex set contains an entry for 163 | * each vertex in `this`. 164 | * 对相同index的元素做连接,左边有而右边没有的,右边返回None 165 | * If `other` is missing any vertex in this VertexRDD, `f` is passed `None`. 166 | * 167 | * @tparam VD2 the attribute type of the other VertexRDD 168 | * other VertexRDD中属性的类型 169 | * @tparam VD3 the attribute type of the resulting VertexRDD 170 | * @param other the other VertexRDD with which to join. 
171 | * @param f the function mapping a vertex id and its attributes in this and the other vertex set 172 | * to a new vertex attribute. 173 | * @return a VertexRDD containing the results of `f` 174 | */ 175 | def leftZipJoin[VD2: ClassTag, VD3: ClassTag] 176 | (other: VertexRDD[VD2])(f: (VertexId, VD, Option[VD2]) => VD3): VertexRDD[VD3] 177 | 178 | /** 179 | * Left joins this VertexRDD with an RDD containing vertex attribute pairs. If the other RDD is 180 | * backed by a VertexRDD with the same index then the efficient [[leftZipJoin]] implementation is 181 | * used. The resulting VertexRDD contains an entry for each vertex in `this`. If `other` is 182 | * missing any vertex in this VertexRDD, `f` is passed `None`. If there are duplicates, 183 | * the vertex is picked arbitrarily.如果有重复会从中随意挑选一个。 184 | * 185 | * @tparam VD2 the attribute type of the other VertexRDD 186 | * @tparam VD3 the attribute type of the resulting VertexRDD 187 | * 188 | * @param other the other VertexRDD with which to join 189 | * @param f the function mapping a vertex id and its attributes in this and the other vertex set 190 | * to a new vertex attribute. 191 | * @return a VertexRDD containing all the vertices in this VertexRDD with the attributes emitted (发出,放出) 192 | * by `f`. 193 | */ 194 | def leftJoin[VD2: ClassTag, VD3: ClassTag] 195 | (other: RDD[(VertexId, VD2)]) 196 | (f: (VertexId, VD, Option[VD2]) => VD3) 197 | : VertexRDD[VD3] 198 | 199 | /** 200 | * Efficiently inner joins this VertexRDD with another VertexRDD sharing the same index. See 201 | * [[innerJoin]] for the behavior of the join. 202 | */ 203 | def innerZipJoin[U: ClassTag, VD2: ClassTag](other: VertexRDD[U]) 204 | (f: (VertexId, VD, U) => VD2): VertexRDD[VD2] 205 | 206 | /** 207 | * Inner joins this VertexRDD with an RDD containing vertex attribute pairs. If the other RDD is 208 | * backed by a VertexRDD with the same index then the efficient [[innerZipJoin]] implementation 209 | * is used. 210 | * 211 | * @param other an RDD containing vertices to join. If there are multiple entries for the same 212 | * vertex, one is picked arbitrarily. Use [[aggregateUsingIndex]] to merge multiple entries. 213 | * @param f the join function applied to corresponding values of `this` and `other` 214 | * @return a VertexRDD co-indexed with `this`, containing only vertices that appear in both 215 | * `this` and `other`, with values supplied by `f` 216 | */ 217 | def innerJoin[U: ClassTag, VD2: ClassTag](other: RDD[(VertexId, U)]) 218 | (f: (VertexId, VD, U) => VD2): VertexRDD[VD2] 219 | 220 | /** 221 | * Aggregates vertices in `messages` that have the same ids using `reduceFunc`, returning a 222 | * VertexRDD co-indexed with `this`. 223 | * 224 | * @param messages an RDD containing messages to aggregate, where each message is a pair of its 225 | * target vertex ID and the message data 226 | * @param reduceFunc the associative aggregation function for merging messages to the same vertex 227 | * @return a VertexRDD co-indexed with `this`, containing only vertices that received messages. 228 | * For those vertices, their values are the result of applying `reduceFunc` to all received 229 | * messages. 230 | */ 231 | def aggregateUsingIndex[VD2: ClassTag]( 232 | messages: RDD[(VertexId, VD2)], reduceFunc: (VD2, VD2) => VD2): VertexRDD[VD2] 233 | 234 | /** 235 | * Returns a new `VertexRDD` reflecting a reversal of all edge directions in the corresponding 236 | * [[EdgeRDD]]. 
237 | * 把相应的EdgeRDD进行反向得到的新的VertexRDD 238 | */ 239 | def reverseRoutingTables(): VertexRDD[VD] 240 | 241 | /** Prepares this VertexRDD for efficient joins with the given EdgeRDD. */ 242 | def withEdges(edges: EdgeRDD[_]): VertexRDD[VD] 243 | 244 | /** Replaces the vertex partitions while preserving all other properties of the VertexRDD. */ 245 | private[graphx] def withPartitionsRDD[VD2: ClassTag]( 246 | partitionsRDD: RDD[ShippableVertexPartition[VD2]]): VertexRDD[VD2] 247 | 248 | /** 249 | * Changes the target storage level while preserving all other properties of the 250 | * VertexRDD. Operations on the returned VertexRDD will preserve this storage level. 251 | *改变存储级别。 252 | * This does not actually trigger a cache; to do this, call 253 | * [[org.apache.spark.graphx.VertexRDD#cache]] on the returned VertexRDD. 254 | */ 255 | private[graphx] def withTargetStorageLevel( 256 | targetStorageLevel: StorageLevel): VertexRDD[VD] 257 | 258 | /** Generates an RDD of vertex attributes suitable for shipping to the edge partitions. */ 259 | private[graphx] def shipVertexAttributes( 260 | shipSrc: Boolean, shipDst: Boolean): RDD[(PartitionID, VertexAttributeBlock[VD])] 261 | 262 | /** Generates an RDD of vertex IDs suitable for shipping to the edge partitions. */ 263 | private[graphx] def shipVertexIds(): RDD[(PartitionID, Array[VertexId])] 264 | 265 | } // end of VertexRDD 266 | 267 | 268 | /** 269 | * The VertexRDD singleton is used to construct VertexRDDs. 270 | */ 271 | object VertexRDD { 272 | 273 | /** 274 | * Constructs a standalone `VertexRDD` (one that is not set up for efficient joins with an 275 | * [[EdgeRDD]]) from an RDD of vertex-attribute pairs. Duplicate entries are removed arbitrarily. 276 | *赋值操作 277 | * @tparam VD the vertex attribute type 278 | * 279 | * @param vertices the collection of vertex-attribute pairs 280 | */ 281 | def apply[VD: ClassTag](vertices: RDD[(VertexId, VD)]): VertexRDD[VD] = { 282 | val vPartitioned: RDD[(VertexId, VD)] = vertices.partitioner match { 283 | case Some(p) => vertices 284 | case None => vertices.partitionBy(new HashPartitioner(vertices.partitions.size)) 285 | } 286 | val vertexPartitions = vPartitioned.mapPartitions( 287 | iter => Iterator(ShippableVertexPartition(iter)), 288 | preservesPartitioning = true) 289 | new VertexRDDImpl(vertexPartitions) 290 | } 291 | 292 | /** 293 | * Constructs a `VertexRDD` from an RDD of vertex-attribute pairs. Duplicate vertex entries are 294 | * removed arbitrarily. The resulting `VertexRDD` will be joinable with `edges`, and any missing 295 | * vertices referred to by `edges` will be created with the attribute `defaultVal`. 296 | * 297 | * @tparam VD the vertex attribute type 298 | * 299 | * @param vertices the collection of vertex-attribute pairs 300 | * @param edges the [[EdgeRDD]] that these vertices may be joined with 301 | * @param defaultVal the vertex attribute to use when creating missing vertices 302 | */ 303 | def apply[VD: ClassTag]( 304 | vertices: RDD[(VertexId, VD)], edges: EdgeRDD[_], defaultVal: VD): VertexRDD[VD] = { 305 | VertexRDD(vertices, edges, defaultVal, (a, b) => a) 306 | } 307 | 308 | /** 309 | * Constructs a `VertexRDD` from an RDD of vertex-attribute pairs. Duplicate vertex entries are 310 | * merged using `mergeFunc`. The resulting `VertexRDD` will be joinable with `edges`, and any 311 | * missing vertices referred to by `edges` will be created with the attribute `defaultVal`. 
312 | * 313 | * @tparam VD the vertex attribute type 314 | * 315 | * @param vertices the collection of vertex-attribute pairs 316 | * @param edges the [[EdgeRDD]] that these vertices may be joined with 317 | * @param defaultVal the vertex attribute to use when creating missing vertices 318 | * @param mergeFunc the commutative, associative duplicate vertex attribute merge function 319 | */ 320 | def apply[VD: ClassTag]( 321 | vertices: RDD[(VertexId, VD)], edges: EdgeRDD[_], defaultVal: VD, mergeFunc: (VD, VD) => VD 322 | ): VertexRDD[VD] = { 323 | val vPartitioned: RDD[(VertexId, VD)] = vertices.partitioner match { 324 | case Some(p) => vertices 325 | case None => vertices.partitionBy(new HashPartitioner(vertices.partitions.size)) 326 | } 327 | val routingTables = createRoutingTables(edges, vPartitioned.partitioner.get) 328 | val vertexPartitions = vPartitioned.zipPartitions(routingTables, preservesPartitioning = true) { 329 | (vertexIter, routingTableIter) => 330 | val routingTable = 331 | if (routingTableIter.hasNext) routingTableIter.next() else RoutingTablePartition.empty 332 | Iterator(ShippableVertexPartition(vertexIter, routingTable, defaultVal, mergeFunc)) 333 | } 334 | new VertexRDDImpl(vertexPartitions) 335 | } 336 | 337 | /** 338 | * Constructs a `VertexRDD` containing all vertices referred to in `edges`. The vertices will be 339 | * created with the attribute `defaultVal`. The resulting `VertexRDD` will be joinable with 340 | * `edges`. 341 | *从边生成VertexRDD 342 | * @tparam VD the vertex attribute type 343 | * 344 | * @param edges the [[EdgeRDD]] referring to the vertices to create 345 | * @param numPartitions the desired number of partitions for the resulting `VertexRDD` 346 | * @param defaultVal the vertex attribute to use when creating missing vertices 347 | */ 348 | def fromEdges[VD: ClassTag]( 349 | edges: EdgeRDD[_], numPartitions: Int, defaultVal: VD): VertexRDD[VD] = { 350 | val routingTables = createRoutingTables(edges, new HashPartitioner(numPartitions)) 351 | val vertexPartitions = routingTables.mapPartitions({ routingTableIter => 352 | val routingTable = 353 | if (routingTableIter.hasNext) routingTableIter.next() else RoutingTablePartition.empty 354 | Iterator(ShippableVertexPartition(Iterator.empty, routingTable, defaultVal)) 355 | }, preservesPartitioning = true) 356 | new VertexRDDImpl(vertexPartitions) 357 | } 358 | 359 | private[graphx] def createRoutingTables( 360 | edges: EdgeRDD[_], vertexPartitioner: Partitioner): RDD[RoutingTablePartition] = { 361 | // Determine which vertices each edge partition needs by creating a mapping from vid to pid. 362 | val vid2pid = edges.partitionsRDD.mapPartitions(_.flatMap( 363 | Function.tupled(RoutingTablePartition.edgePartitionToMsgs))) 364 | .setName("VertexRDD.createRoutingTables - vid2pid (aggregation)") 365 | 366 | val numEdgePartitions = edges.partitions.size 367 | vid2pid.partitionBy(vertexPartitioner).mapPartitions( 368 | iter => Iterator(RoutingTablePartition.fromMsgs(numEdgePartitions, iter)), 369 | preservesPartitioning = true) 370 | } 371 | } 372 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/impl/EdgePartitionBuilder.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.impl 19 | 20 | import scala.reflect.ClassTag 21 | 22 | import org.apache.spark.graphx._ 23 | import org.apache.spark.graphx.util.collection.GraphXPrimitiveKeyOpenHashMap 24 | import org.apache.spark.util.collection.{SortDataFormat, Sorter, PrimitiveVector} 25 | 26 | /** Constructs an EdgePartition from scratch. */ 27 | private[graphx] 28 | class EdgePartitionBuilder[@specialized(Long, Int, Double) ED: ClassTag, VD: ClassTag]( 29 | size: Int = 64) { 30 | private[this] val edges = new PrimitiveVector[Edge[ED]](size) 31 | 32 | /** Add a new edge to the partition. */ 33 | def add(src: VertexId, dst: VertexId, d: ED) { 34 | edges += Edge(src, dst, d) 35 | } 36 | 37 | def toEdgePartition: EdgePartition[ED, VD] = { 38 | val edgeArray = edges.trim().array 39 | new Sorter(Edge.edgeArraySortDataFormat[ED]) 40 | .sort(edgeArray, 0, edgeArray.length, Edge.lexicographicOrdering) 41 | val localSrcIds = new Array[Int](edgeArray.size) 42 | val localDstIds = new Array[Int](edgeArray.size) 43 | val data = new Array[ED](edgeArray.size) 44 | val index = new GraphXPrimitiveKeyOpenHashMap[VertexId, Int] 45 | val global2local = new GraphXPrimitiveKeyOpenHashMap[VertexId, Int] 46 | val local2global = new PrimitiveVector[VertexId] 47 | var vertexAttrs = Array.empty[VD] 48 | // Copy edges into columnar structures, tracking the beginnings of source vertex id clusters and 49 | // adding them to the index. Also populate a map from vertex id to a sequential local offset. 50 | if (edgeArray.length > 0) { 51 | index.update(edgeArray(0).srcId, 0) 52 | var currSrcId: VertexId = edgeArray(0).srcId 53 | var currLocalId = -1 54 | var i = 0 55 | while (i < edgeArray.size) { 56 | val srcId = edgeArray(i).srcId 57 | val dstId = edgeArray(i).dstId 58 | localSrcIds(i) = global2local.changeValue(srcId, 59 | { currLocalId += 1; local2global += srcId; currLocalId }, identity) 60 | localDstIds(i) = global2local.changeValue(dstId, 61 | { currLocalId += 1; local2global += dstId; currLocalId }, identity) 62 | data(i) = edgeArray(i).attr 63 | if (srcId != currSrcId) { 64 | currSrcId = srcId 65 | index.update(currSrcId, i) 66 | } 67 | 68 | i += 1 69 | } 70 | vertexAttrs = new Array[VD](currLocalId + 1) 71 | } 72 | new EdgePartition( 73 | localSrcIds, localDstIds, data, index, global2local, local2global.trim().array, vertexAttrs, 74 | None) 75 | } 76 | } 77 | 78 | /** 79 | * Constructs an EdgePartition from an existing EdgePartition with the same vertex set. This enables 80 | * reuse of the local vertex ids. Intended for internal use in EdgePartition only. 
81 | */ 82 | private[impl] 83 | class ExistingEdgePartitionBuilder[ 84 | @specialized(Long, Int, Double) ED: ClassTag, VD: ClassTag]( 85 | global2local: GraphXPrimitiveKeyOpenHashMap[VertexId, Int], 86 | local2global: Array[VertexId], 87 | vertexAttrs: Array[VD], 88 | activeSet: Option[VertexSet], 89 | size: Int = 64) { 90 | private[this] val edges = new PrimitiveVector[EdgeWithLocalIds[ED]](size) 91 | 92 | /** Add a new edge to the partition. */ 93 | def add(src: VertexId, dst: VertexId, localSrc: Int, localDst: Int, d: ED) { 94 | edges += EdgeWithLocalIds(src, dst, localSrc, localDst, d) 95 | } 96 | 97 | def toEdgePartition: EdgePartition[ED, VD] = { 98 | val edgeArray = edges.trim().array 99 | new Sorter(EdgeWithLocalIds.edgeArraySortDataFormat[ED]) 100 | .sort(edgeArray, 0, edgeArray.length, EdgeWithLocalIds.lexicographicOrdering) 101 | val localSrcIds = new Array[Int](edgeArray.size) 102 | val localDstIds = new Array[Int](edgeArray.size) 103 | val data = new Array[ED](edgeArray.size) 104 | val index = new GraphXPrimitiveKeyOpenHashMap[VertexId, Int] 105 | // Copy edges into columnar structures, tracking the beginnings of source vertex id clusters and 106 | // adding them to the index 107 | if (edgeArray.length > 0) { 108 | index.update(edgeArray(0).srcId, 0) 109 | var currSrcId: VertexId = edgeArray(0).srcId 110 | var i = 0 111 | while (i < edgeArray.size) { 112 | localSrcIds(i) = edgeArray(i).localSrcId 113 | localDstIds(i) = edgeArray(i).localDstId 114 | data(i) = edgeArray(i).attr 115 | if (edgeArray(i).srcId != currSrcId) { 116 | currSrcId = edgeArray(i).srcId 117 | index.update(currSrcId, i) 118 | } 119 | i += 1 120 | } 121 | } 122 | 123 | new EdgePartition( 124 | localSrcIds, localDstIds, data, index, global2local, local2global, vertexAttrs, activeSet) 125 | } 126 | } 127 | 128 | private[impl] case class EdgeWithLocalIds[@specialized ED]( 129 | srcId: VertexId, dstId: VertexId, localSrcId: Int, localDstId: Int, attr: ED) 130 | 131 | private[impl] object EdgeWithLocalIds { 132 | implicit def lexicographicOrdering[ED]: Ordering[EdgeWithLocalIds[ED]] = 133 | new Ordering[EdgeWithLocalIds[ED]] { 134 | override def compare(a: EdgeWithLocalIds[ED], b: EdgeWithLocalIds[ED]): Int = { 135 | if (a.srcId == b.srcId) { 136 | if (a.dstId == b.dstId) 0 137 | else if (a.dstId < b.dstId) -1 138 | else 1 139 | } else if (a.srcId < b.srcId) -1 140 | else 1 141 | } 142 | } 143 | 144 | private[graphx] def edgeArraySortDataFormat[ED] = { 145 | new SortDataFormat[EdgeWithLocalIds[ED], Array[EdgeWithLocalIds[ED]]] { 146 | override def getKey(data: Array[EdgeWithLocalIds[ED]], pos: Int): EdgeWithLocalIds[ED] = { 147 | data(pos) 148 | } 149 | 150 | override def swap(data: Array[EdgeWithLocalIds[ED]], pos0: Int, pos1: Int): Unit = { 151 | val tmp = data(pos0) 152 | data(pos0) = data(pos1) 153 | data(pos1) = tmp 154 | } 155 | 156 | override def copyElement( 157 | src: Array[EdgeWithLocalIds[ED]], srcPos: Int, 158 | dst: Array[EdgeWithLocalIds[ED]], dstPos: Int) { 159 | dst(dstPos) = src(srcPos) 160 | } 161 | 162 | override def copyRange( 163 | src: Array[EdgeWithLocalIds[ED]], srcPos: Int, 164 | dst: Array[EdgeWithLocalIds[ED]], dstPos: Int, length: Int) { 165 | System.arraycopy(src, srcPos, dst, dstPos, length) 166 | } 167 | 168 | override def allocate(length: Int): Array[EdgeWithLocalIds[ED]] = { 169 | new Array[EdgeWithLocalIds[ED]](length) 170 | } 171 | } 172 | } 173 | } 174 | -------------------------------------------------------------------------------- 
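Before moving on to EdgeRDDImpl, a hedged sketch of how the builder above is driven. `EdgePartitionBuilder` is `private[graphx]`, so outside that package treat this as pseudocode for the data flow rather than compilable user code.

```scala
import org.apache.spark.graphx.impl.EdgePartitionBuilder

// Collect edges in any order, then freeze them into a sorted, columnar
// EdgePartition with local vertex ids and a clustered source-vertex index.
val builder = new EdgePartitionBuilder[Double, Int]()   // ED = Double, VD = Int
builder.add(3L, 7L, 1.0)
builder.add(11L, 1L, 0.5)
builder.add(3L, 2L, 2.0)                                // insertion order is irrelevant
val part = builder.toEdgePartition                      // sorts by (srcId, dstId)
assert(part.size == 3)                                  // number of edges in the partition
```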
/src/main/scala/org/apache/spark/graphx/impl/EdgeRDDImpl.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.impl 19 | 20 | import scala.reflect.{classTag, ClassTag} 21 | 22 | import org.apache.spark.{OneToOneDependency, HashPartitioner} 23 | import org.apache.spark.rdd.RDD 24 | import org.apache.spark.storage.StorageLevel 25 | 26 | import org.apache.spark.graphx._ 27 | 28 | class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( 29 | @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], 30 | val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) 31 | extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { 32 | 33 | override def setName(_name: String): this.type = { 34 | if (partitionsRDD.name != null) { 35 | partitionsRDD.setName(partitionsRDD.name + ", " + _name) 36 | } else { 37 | partitionsRDD.setName(_name) 38 | } 39 | this 40 | } 41 | setName("EdgeRDD") 42 | 43 | /** 44 | * If `partitionsRDD` already has a partitioner, use it. Otherwise assume that the 45 | * [[PartitionID]]s in `partitionsRDD` correspond to the actual partitions and create a new 46 | * partitioner that allows co-partitioning with `partitionsRDD`. 47 | */ 48 | override val partitioner = 49 | partitionsRDD.partitioner.orElse(Some(new HashPartitioner(partitions.size))) 50 | 51 | override def collect(): Array[Edge[ED]] = this.map(_.copy()).collect() 52 | 53 | /** 54 | * Persists the edge partitions at the specified storage level, ignoring any existing target 55 | * storage level. 56 | */ 57 | override def persist(newLevel: StorageLevel): this.type = { 58 | partitionsRDD.persist(newLevel) 59 | this 60 | } 61 | 62 | override def unpersist(blocking: Boolean = true): this.type = { 63 | partitionsRDD.unpersist(blocking) 64 | this 65 | } 66 | 67 | /** Persists the edge partitions using `targetStorageLevel`, which defaults to MEMORY_ONLY. */ 68 | override def cache(): this.type = { 69 | partitionsRDD.persist(targetStorageLevel) 70 | this 71 | } 72 | 73 | override def getStorageLevel: StorageLevel = partitionsRDD.getStorageLevel 74 | 75 | override def checkpoint(): Unit = { 76 | partitionsRDD.checkpoint() 77 | } 78 | 79 | override def isCheckpointed: Boolean = { 80 | firstParent[(PartitionID, EdgePartition[ED, VD])].isCheckpointed 81 | } 82 | 83 | override def getCheckpointFile: Option[String] = { 84 | partitionsRDD.getCheckpointFile 85 | } 86 | 87 | /** The number of edges in the RDD. 
*/ 88 | override def count(): Long = { 89 | partitionsRDD.map(_._2.size.toLong).reduce(_ + _) 90 | } 91 | 92 | override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = 93 | mapEdgePartitions((pid, part) => part.map(f)) 94 | 95 | override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) 96 | 97 | def filter( 98 | epred: EdgeTriplet[VD, ED] => Boolean, 99 | vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { 100 | mapEdgePartitions((pid, part) => part.filter(epred, vpred)) 101 | } 102 | 103 | override def innerJoin[ED2: ClassTag, ED3: ClassTag] 104 | (other: EdgeRDD[ED2]) 105 | (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { 106 | val ed2Tag = classTag[ED2] 107 | val ed3Tag = classTag[ED3] 108 | this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { 109 | (thisIter, otherIter) => 110 | val (pid, thisEPart) = thisIter.next() 111 | val (_, otherEPart) = otherIter.next() 112 | Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) 113 | }) 114 | } 115 | 116 | def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( 117 | f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { 118 | this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => 119 | if (iter.hasNext) { 120 | val (pid, ep) = iter.next() 121 | Iterator(Tuple2(pid, f(pid, ep))) 122 | } else { 123 | Iterator.empty 124 | } 125 | }, preservesPartitioning = true)) 126 | } 127 | 128 | private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( 129 | partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { 130 | new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) 131 | } 132 | 133 | override private[graphx] def withTargetStorageLevel( 134 | targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { 135 | new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) 136 | } 137 | 138 | } 139 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/impl/GraphImpl.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.graphx.impl 19 | 20 | import scala.reflect.{classTag, ClassTag} 21 | 22 | import org.apache.spark.HashPartitioner 23 | import org.apache.spark.SparkContext._ 24 | import org.apache.spark.rdd.{RDD, ShuffledRDD} 25 | import org.apache.spark.storage.StorageLevel 26 | import org.apache.spark.graphx._ 27 | import org.apache.spark.graphx.impl.GraphImpl._ 28 | import org.apache.spark.graphx.util.BytecodeUtils 29 | 30 | 31 | /** 32 | * An implementation of [[org.apache.spark.graphx.Graph]] to support computation on graphs. 33 | * 34 | * Graphs are represented using two RDDs: `vertices`, which contains vertex attributes and the 35 | * routing information for shipping vertex attributes to edge partitions, and 36 | * `replicatedVertexView`, which contains edges and the vertex attributes mentioned by each edge. 37 | */ 38 | class GraphImpl[VD: ClassTag, ED: ClassTag] protected ( 39 | @transient val vertices: VertexRDD[VD], 40 | @transient val replicatedVertexView: ReplicatedVertexView[VD, ED]) 41 | extends Graph[VD, ED] with Serializable { 42 | 43 | /** Default constructor is provided to support serialization */ 44 | protected def this() = this(null, null) 45 | 46 | @transient override val edges: EdgeRDDImpl[ED, VD] = replicatedVertexView.edges 47 | 48 | /** Return a RDD that brings edges together with their source and destination vertices. */ 49 | @transient override lazy val triplets: RDD[EdgeTriplet[VD, ED]] = { 50 | replicatedVertexView.upgrade(vertices, true, true) 51 | replicatedVertexView.edges.partitionsRDD.mapPartitions(_.flatMap { 52 | case (pid, part) => part.tripletIterator() 53 | }) 54 | } 55 | 56 | override def persist(newLevel: StorageLevel): Graph[VD, ED] = { 57 | vertices.persist(newLevel) 58 | replicatedVertexView.edges.persist(newLevel) 59 | this 60 | } 61 | 62 | override def cache(): Graph[VD, ED] = { 63 | vertices.cache() 64 | replicatedVertexView.edges.cache() 65 | this 66 | } 67 | 68 | override def checkpoint(): Unit = { 69 | vertices.checkpoint() 70 | replicatedVertexView.edges.checkpoint() 71 | } 72 | 73 | override def isCheckpointed: Boolean = { 74 | vertices.isCheckpointed && replicatedVertexView.edges.isCheckpointed 75 | } 76 | 77 | override def getCheckpointFiles: Seq[String] = { 78 | Seq(vertices.getCheckpointFile, replicatedVertexView.edges.getCheckpointFile).flatMap { 79 | case Some(path) => Seq(path) 80 | case None => Seq() 81 | } 82 | } 83 | 84 | override def unpersist(blocking: Boolean = true): Graph[VD, ED] = { 85 | unpersistVertices(blocking) 86 | replicatedVertexView.edges.unpersist(blocking) 87 | this 88 | } 89 | 90 | override def unpersistVertices(blocking: Boolean = true): Graph[VD, ED] = { 91 | vertices.unpersist(blocking) 92 | // TODO: unpersist the replicated vertices in `replicatedVertexView` but leave the edges alone 93 | this 94 | } 95 | 96 | override def partitionBy(partitionStrategy: PartitionStrategy): Graph[VD, ED] = { 97 | partitionBy(partitionStrategy, edges.partitions.size) 98 | } 99 | 100 | override def partitionBy( 101 | partitionStrategy: PartitionStrategy, numPartitions: Int): Graph[VD, ED] = { 102 | val edTag = classTag[ED] 103 | val vdTag = classTag[VD] 104 | val newEdges = edges.withPartitionsRDD(edges.map { e => 105 | val part: PartitionID = partitionStrategy.getPartition(e.srcId, e.dstId, numPartitions) 106 | (part, (e.srcId, e.dstId, e.attr)) 107 | } 108 | .partitionBy(new HashPartitioner(numPartitions)) 109 | .mapPartitionsWithIndex( { (pid, iter) => 110 | val builder = new 
EdgePartitionBuilder[ED, VD]()(edTag, vdTag) 111 | iter.foreach { message => 112 | val data = message._2 113 | builder.add(data._1, data._2, data._3) 114 | } 115 | val edgePartition = builder.toEdgePartition 116 | Iterator((pid, edgePartition)) 117 | }, preservesPartitioning = true)).cache() 118 | GraphImpl.fromExistingRDDs(vertices.withEdges(newEdges), newEdges) 119 | } 120 | 121 | override def reverse: Graph[VD, ED] = { 122 | new GraphImpl(vertices.reverseRoutingTables(), replicatedVertexView.reverse()) 123 | } 124 | 125 | override def mapVertices[VD2: ClassTag] 126 | (f: (VertexId, VD) => VD2)(implicit eq: VD =:= VD2 = null): Graph[VD2, ED] = { 127 | // The implicit parameter eq will be populated by the compiler if VD and VD2 are equal, and left 128 | // null if not 129 | if (eq != null) { 130 | vertices.cache() 131 | // The map preserves type, so we can use incremental replication 132 | val newVerts = vertices.mapVertexPartitions(_.map(f)).cache() 133 | val changedVerts = vertices.asInstanceOf[VertexRDD[VD2]].diff(newVerts) 134 | val newReplicatedVertexView = replicatedVertexView.asInstanceOf[ReplicatedVertexView[VD2, ED]] 135 | .updateVertices(changedVerts) 136 | new GraphImpl(newVerts, newReplicatedVertexView) 137 | } else { 138 | // The map does not preserve type, so we must re-replicate all vertices 139 | GraphImpl(vertices.mapVertexPartitions(_.map(f)), replicatedVertexView.edges) 140 | } 141 | } 142 | 143 | override def mapEdges[ED2: ClassTag]( 144 | f: (PartitionID, Iterator[Edge[ED]]) => Iterator[ED2]): Graph[VD, ED2] = { 145 | val newEdges = replicatedVertexView.edges 146 | .mapEdgePartitions((pid, part) => part.map(f(pid, part.iterator))) 147 | new GraphImpl(vertices, replicatedVertexView.withEdges(newEdges)) 148 | } 149 | 150 | override def mapTriplets[ED2: ClassTag]( 151 | f: (PartitionID, Iterator[EdgeTriplet[VD, ED]]) => Iterator[ED2], 152 | tripletFields: TripletFields): Graph[VD, ED2] = { 153 | vertices.cache() 154 | replicatedVertexView.upgrade(vertices, tripletFields.useSrc, tripletFields.useDst) 155 | val newEdges = replicatedVertexView.edges.mapEdgePartitions { (pid, part) => 156 | part.map(f(pid, part.tripletIterator(tripletFields.useSrc, tripletFields.useDst))) 157 | } 158 | new GraphImpl(vertices, replicatedVertexView.withEdges(newEdges)) 159 | } 160 | 161 | override def subgraph( 162 | epred: EdgeTriplet[VD, ED] => Boolean = x => true, 163 | vpred: (VertexId, VD) => Boolean = (a, b) => true): Graph[VD, ED] = { 164 | vertices.cache() 165 | // Filter the vertices, reusing the partitioner and the index from this graph 166 | val newVerts = vertices.mapVertexPartitions(_.filter(vpred)) 167 | // Filter the triplets. 
We must always upgrade the triplet view fully because vpred always runs 168 | // on both src and dst vertices 169 | replicatedVertexView.upgrade(vertices, true, true) 170 | val newEdges = replicatedVertexView.edges.filter(epred, vpred) 171 | new GraphImpl(newVerts, replicatedVertexView.withEdges(newEdges)) 172 | } 173 | 174 | override def mask[VD2: ClassTag, ED2: ClassTag] ( 175 | other: Graph[VD2, ED2]): Graph[VD, ED] = { 176 | val newVerts = vertices.innerJoin(other.vertices) { (vid, v, w) => v } 177 | val newEdges = replicatedVertexView.edges.innerJoin(other.edges) { (src, dst, v, w) => v } 178 | new GraphImpl(newVerts, replicatedVertexView.withEdges(newEdges)) 179 | } 180 | 181 | override def groupEdges(merge: (ED, ED) => ED): Graph[VD, ED] = { 182 | val newEdges = replicatedVertexView.edges.mapEdgePartitions( 183 | (pid, part) => part.groupEdges(merge)) 184 | new GraphImpl(vertices, replicatedVertexView.withEdges(newEdges)) 185 | } 186 | 187 | // /////////////////////////////////////////////////////////////////////////////////////////////// 188 | // Lower level transformation methods 189 | // /////////////////////////////////////////////////////////////////////////////////////////////// 190 | 191 | override def mapReduceTriplets[A: ClassTag]( 192 | mapFunc: EdgeTriplet[VD, ED] => Iterator[(VertexId, A)], 193 | reduceFunc: (A, A) => A, 194 | activeSetOpt: Option[(VertexRDD[_], EdgeDirection)]): VertexRDD[A] = { 195 | 196 | def sendMsg(ctx: EdgeContext[VD, ED, A]) { 197 | mapFunc(ctx.toEdgeTriplet).foreach { kv => 198 | val id = kv._1 199 | val msg = kv._2 200 | if (id == ctx.srcId) { 201 | ctx.sendToSrc(msg) 202 | } else { 203 | assert(id == ctx.dstId) 204 | ctx.sendToDst(msg) 205 | } 206 | } 207 | } 208 | 209 | val mapUsesSrcAttr = accessesVertexAttr(mapFunc, "srcAttr") 210 | val mapUsesDstAttr = accessesVertexAttr(mapFunc, "dstAttr") 211 | val tripletFields = new TripletFields(mapUsesSrcAttr, mapUsesDstAttr, true) 212 | 213 | aggregateMessagesWithActiveSet(sendMsg, reduceFunc, tripletFields, activeSetOpt) 214 | } 215 | 216 | override def aggregateMessagesWithActiveSet[A: ClassTag]( 217 | sendMsg: EdgeContext[VD, ED, A] => Unit, 218 | mergeMsg: (A, A) => A, 219 | tripletFields: TripletFields, 220 | activeSetOpt: Option[(VertexRDD[_], EdgeDirection)]): VertexRDD[A] = { 221 | 222 | vertices.cache() 223 | // For each vertex, replicate its attribute only to partitions where it is 224 | // in the relevant position in an edge. 225 | replicatedVertexView.upgrade(vertices, tripletFields.useSrc, tripletFields.useDst) 226 | val view = activeSetOpt match { 227 | case Some((activeSet, _)) => 228 | replicatedVertexView.withActiveSet(activeSet) 229 | case None => 230 | replicatedVertexView 231 | } 232 | val activeDirectionOpt = activeSetOpt.map(_._2) 233 | 234 | // Map and combine. 
235 | val preAgg = view.edges.partitionsRDD.mapPartitions(_.flatMap { 236 | case (pid, edgePartition) => 237 | // Choose scan method 238 | val activeFraction = edgePartition.numActives.getOrElse(0) / edgePartition.indexSize.toFloat 239 | activeDirectionOpt match { 240 | case Some(EdgeDirection.Both) => 241 | if (activeFraction < 0.8) { 242 | edgePartition.aggregateMessagesIndexScan(sendMsg, mergeMsg, tripletFields, 243 | EdgeActiveness.Both) 244 | } else { 245 | edgePartition.aggregateMessagesEdgeScan(sendMsg, mergeMsg, tripletFields, 246 | EdgeActiveness.Both) 247 | } 248 | case Some(EdgeDirection.Either) => 249 | // TODO: Because we only have a clustered index on the source vertex ID, we can't filter 250 | // the index here. Instead we have to scan all edges and then do the filter. 251 | edgePartition.aggregateMessagesEdgeScan(sendMsg, mergeMsg, tripletFields, 252 | EdgeActiveness.Either) 253 | case Some(EdgeDirection.Out) => 254 | if (activeFraction < 0.8) { 255 | edgePartition.aggregateMessagesIndexScan(sendMsg, mergeMsg, tripletFields, 256 | EdgeActiveness.SrcOnly) 257 | } else { 258 | edgePartition.aggregateMessagesEdgeScan(sendMsg, mergeMsg, tripletFields, 259 | EdgeActiveness.SrcOnly) 260 | } 261 | case Some(EdgeDirection.In) => 262 | edgePartition.aggregateMessagesEdgeScan(sendMsg, mergeMsg, tripletFields, 263 | EdgeActiveness.DstOnly) 264 | case _ => // None 265 | edgePartition.aggregateMessagesEdgeScan(sendMsg, mergeMsg, tripletFields, 266 | EdgeActiveness.Neither) 267 | } 268 | }).setName("GraphImpl.aggregateMessages - preAgg") 269 | 270 | // do the final reduction reusing the index map 271 | vertices.aggregateUsingIndex(preAgg, mergeMsg) 272 | } 273 | 274 | override def outerJoinVertices[U: ClassTag, VD2: ClassTag] 275 | (other: RDD[(VertexId, U)]) 276 | (updateF: (VertexId, VD, Option[U]) => VD2) 277 | (implicit eq: VD =:= VD2 = null): Graph[VD2, ED] = { 278 | // The implicit parameter eq will be populated by the compiler if VD and VD2 are equal, and left 279 | // null if not 280 | if (eq != null) { 281 | vertices.cache() 282 | // updateF preserves type, so we can use incremental replication 283 | val newVerts = vertices.leftJoin(other)(updateF).cache() 284 | val changedVerts = vertices.asInstanceOf[VertexRDD[VD2]].diff(newVerts) 285 | val newReplicatedVertexView = replicatedVertexView.asInstanceOf[ReplicatedVertexView[VD2, ED]] 286 | .updateVertices(changedVerts) 287 | new GraphImpl(newVerts, newReplicatedVertexView) 288 | } else { 289 | // updateF does not preserve type, so we must re-replicate all vertices 290 | val newVerts = vertices.leftJoin(other)(updateF) 291 | GraphImpl(newVerts, replicatedVertexView.edges) 292 | } 293 | } 294 | 295 | /** Test whether the closure accesses the the attribute with name `attrName`. */ 296 | private def accessesVertexAttr(closure: AnyRef, attrName: String): Boolean = { 297 | try { 298 | BytecodeUtils.invokedMethod(closure, classOf[EdgeTriplet[VD, ED]], attrName) 299 | } catch { 300 | case _: ClassNotFoundException => true // if we don't know, be conservative 301 | } 302 | } 303 | } // end of class GraphImpl 304 | 305 | 306 | object GraphImpl { 307 | 308 | /** Create a graph from edges, setting referenced vertices to `defaultVertexAttr`. 
*/ 309 | def apply[VD: ClassTag, ED: ClassTag]( 310 | edges: RDD[Edge[ED]], 311 | defaultVertexAttr: VD, 312 | edgeStorageLevel: StorageLevel, 313 | vertexStorageLevel: StorageLevel): GraphImpl[VD, ED] = { 314 | fromEdgeRDD(EdgeRDD.fromEdges(edges), defaultVertexAttr, edgeStorageLevel, vertexStorageLevel) 315 | } 316 | 317 | /** Create a graph from EdgePartitions, setting referenced vertices to `defaultVertexAttr`. */ 318 | def fromEdgePartitions[VD: ClassTag, ED: ClassTag]( 319 | edgePartitions: RDD[(PartitionID, EdgePartition[ED, VD])], 320 | defaultVertexAttr: VD, 321 | edgeStorageLevel: StorageLevel, 322 | vertexStorageLevel: StorageLevel): GraphImpl[VD, ED] = { 323 | fromEdgeRDD(EdgeRDD.fromEdgePartitions(edgePartitions), defaultVertexAttr, edgeStorageLevel, 324 | vertexStorageLevel) 325 | } 326 | 327 | /** Create a graph from vertices and edges, setting missing vertices to `defaultVertexAttr`. */ 328 | def apply[VD: ClassTag, ED: ClassTag]( 329 | vertices: RDD[(VertexId, VD)], 330 | edges: RDD[Edge[ED]], 331 | defaultVertexAttr: VD, 332 | edgeStorageLevel: StorageLevel, 333 | vertexStorageLevel: StorageLevel): GraphImpl[VD, ED] = { 334 | val edgeRDD = EdgeRDD.fromEdges(edges)(classTag[ED], classTag[VD]) 335 | .withTargetStorageLevel(edgeStorageLevel) 336 | val vertexRDD = VertexRDD(vertices, edgeRDD, defaultVertexAttr) 337 | .withTargetStorageLevel(vertexStorageLevel) 338 | GraphImpl(vertexRDD, edgeRDD) 339 | } 340 | 341 | /** 342 | * Create a graph from a VertexRDD and an EdgeRDD with arbitrary replicated vertices. The 343 | * VertexRDD must already be set up for efficient joins with the EdgeRDD by calling 344 | * `VertexRDD.withEdges` or an appropriate VertexRDD constructor. 345 | */ 346 | def apply[VD: ClassTag, ED: ClassTag]( 347 | vertices: VertexRDD[VD], 348 | edges: EdgeRDD[ED]): GraphImpl[VD, ED] = { 349 | 350 | vertices.cache() 351 | 352 | // Convert the vertex partitions in edges to the correct type 353 | val newEdges = edges.asInstanceOf[EdgeRDDImpl[ED, _]] 354 | .mapEdgePartitions((pid, part) => part.withoutVertexAttributes[VD]) 355 | .cache() 356 | 357 | GraphImpl.fromExistingRDDs(vertices, newEdges) 358 | } 359 | 360 | /** 361 | * Create a graph from a VertexRDD and an EdgeRDD with the same replicated vertex type as the 362 | * vertices. The VertexRDD must already be set up for efficient joins with the EdgeRDD by calling 363 | * `VertexRDD.withEdges` or an appropriate VertexRDD constructor. 364 | */ 365 | def fromExistingRDDs[VD: ClassTag, ED: ClassTag]( 366 | vertices: VertexRDD[VD], 367 | edges: EdgeRDD[ED]): GraphImpl[VD, ED] = { 368 | new GraphImpl(vertices, new ReplicatedVertexView(edges.asInstanceOf[EdgeRDDImpl[ED, VD]])) 369 | } 370 | 371 | /** 372 | * Create a graph from an EdgeRDD with the correct vertex type, setting missing vertices to 373 | * `defaultVertexAttr`. The vertices will have the same number of partitions as the EdgeRDD. 
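Stepping back from these constructors: the `aggregateMessagesWithActiveSet` / `mapReduceTriplets` machinery shown earlier in GraphImpl backs the public `Graph.aggregateMessages` API. A hedged usage sketch follows; `sc` is an assumed SparkContext and the public `aggregateMessages` method (Spark 1.2+) is assumed to be present in this snapshot.

```scala
import org.apache.spark.graphx._

// Count in-neighbours of every vertex. No vertex or edge attributes are read,
// so TripletFields.None lets GraphX skip shipping attributes to edge partitions.
// Vertices with no in-edges simply do not appear in the result.
val edges = sc.parallelize(Seq(Edge(1L, 2L, 1.0), Edge(2L, 3L, 1.0), Edge(1L, 3L, 1.0)))
val graph: Graph[Double, Double] = Graph.fromEdges(edges, defaultValue = 0.0)
val inDegrees: VertexRDD[Int] = graph.aggregateMessages[Int](
  ctx => ctx.sendToDst(1),   // one message per edge, sent to the destination
  _ + _,                     // merge messages destined to the same vertex
  TripletFields.None)
```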
374 | */ 375 | private def fromEdgeRDD[VD: ClassTag, ED: ClassTag]( 376 | edges: EdgeRDDImpl[ED, VD], 377 | defaultVertexAttr: VD, 378 | edgeStorageLevel: StorageLevel, 379 | vertexStorageLevel: StorageLevel): GraphImpl[VD, ED] = { 380 | val edgesCached = edges.withTargetStorageLevel(edgeStorageLevel).cache() 381 | val vertices = VertexRDD.fromEdges(edgesCached, edgesCached.partitions.size, defaultVertexAttr) 382 | .withTargetStorageLevel(vertexStorageLevel) 383 | fromExistingRDDs(vertices, edgesCached) 384 | } 385 | 386 | } // end of object GraphImpl 387 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/impl/ReplicatedVertexView.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.impl 19 | 20 | import scala.reflect.{classTag, ClassTag} 21 | 22 | import org.apache.spark.SparkContext._ 23 | import org.apache.spark.rdd.RDD 24 | 25 | import org.apache.spark.graphx._ 26 | 27 | /** 28 | * Manages shipping vertex attributes to the edge partitions of an 29 | * [[org.apache.spark.graphx.EdgeRDD]]. Vertex attributes may be partially shipped to construct a 30 | * triplet view with vertex attributes on only one side, and they may be updated. An active vertex 31 | * set may additionally be shipped to the edge partitions. Be careful not to store a reference to 32 | * `edges`, since it may be modified when the attribute shipping level is upgraded. 33 | */ 34 | private[impl] 35 | class ReplicatedVertexView[VD: ClassTag, ED: ClassTag]( 36 | var edges: EdgeRDDImpl[ED, VD], 37 | var hasSrcId: Boolean = false, 38 | var hasDstId: Boolean = false) { 39 | 40 | /** 41 | * Return a new `ReplicatedVertexView` with the specified `EdgeRDD`, which must have the same 42 | * shipping level. 43 | */ 44 | def withEdges[VD2: ClassTag, ED2: ClassTag]( 45 | edges_ : EdgeRDDImpl[ED2, VD2]): ReplicatedVertexView[VD2, ED2] = { 46 | new ReplicatedVertexView(edges_, hasSrcId, hasDstId) 47 | } 48 | 49 | /** 50 | * Return a new `ReplicatedVertexView` where edges are reversed and shipping levels are swapped to 51 | * match. 52 | */ 53 | def reverse(): ReplicatedVertexView[VD, ED] = { 54 | val newEdges = edges.mapEdgePartitions((pid, part) => part.reverse) 55 | new ReplicatedVertexView(newEdges, hasDstId, hasSrcId) 56 | } 57 | 58 | /** 59 | * Upgrade the shipping level in-place to the specified levels by shipping vertex attributes from 60 | * `vertices`. This operation modifies the `ReplicatedVertexView`, and callers can access `edges` 61 | * afterwards to obtain the upgraded view. 
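As a hedged illustration of what these shipping levels mean at the public API level (using an assumed `graph: Graph[Double, Double]`): reading full triplets forces both attribute sides to be shipped, while declaring a narrower `TripletFields` lets `upgrade` replicate less.

```scala
// GraphImpl.triplets (shown earlier) calls upgrade(vertices, true, true),
// so both source and destination attributes are shipped to edge partitions.
val trips = graph.triplets

// A map that only reads srcAttr can say so; upgrade then ships only source
// attributes (upgrade(vertices, true, false)).
val srcWeights = graph.mapTriplets(t => t.srcAttr, TripletFields.Src)
```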
62 | */ 63 | def upgrade(vertices: VertexRDD[VD], includeSrc: Boolean, includeDst: Boolean) { 64 | val shipSrc = includeSrc && !hasSrcId 65 | val shipDst = includeDst && !hasDstId 66 | if (shipSrc || shipDst) { 67 | val shippedVerts: RDD[(Int, VertexAttributeBlock[VD])] = 68 | vertices.shipVertexAttributes(shipSrc, shipDst) 69 | .setName("ReplicatedVertexView.upgrade(%s, %s) - shippedVerts %s %s (broadcast)".format( 70 | includeSrc, includeDst, shipSrc, shipDst)) 71 | .partitionBy(edges.partitioner.get) 72 | val newEdges = edges.withPartitionsRDD(edges.partitionsRDD.zipPartitions(shippedVerts) { 73 | (ePartIter, shippedVertsIter) => ePartIter.map { 74 | case (pid, edgePartition) => 75 | (pid, edgePartition.updateVertices(shippedVertsIter.flatMap(_._2.iterator))) 76 | } 77 | }) 78 | edges = newEdges 79 | hasSrcId = includeSrc 80 | hasDstId = includeDst 81 | } 82 | } 83 | 84 | /** 85 | * Return a new `ReplicatedVertexView` where the `activeSet` in each edge partition contains only 86 | * vertex ids present in `actives`. This ships a vertex id to all edge partitions where it is 87 | * referenced, ignoring the attribute shipping level. 88 | */ 89 | def withActiveSet(actives: VertexRDD[_]): ReplicatedVertexView[VD, ED] = { 90 | val shippedActives = actives.shipVertexIds() 91 | .setName("ReplicatedVertexView.withActiveSet - shippedActives (broadcast)") 92 | .partitionBy(edges.partitioner.get) 93 | 94 | val newEdges = edges.withPartitionsRDD(edges.partitionsRDD.zipPartitions(shippedActives) { 95 | (ePartIter, shippedActivesIter) => ePartIter.map { 96 | case (pid, edgePartition) => 97 | (pid, edgePartition.withActiveSet(shippedActivesIter.flatMap(_._2.iterator))) 98 | } 99 | }) 100 | new ReplicatedVertexView(newEdges, hasSrcId, hasDstId) 101 | } 102 | 103 | /** 104 | * Return a new `ReplicatedVertexView` where vertex attributes in edge partition are updated using 105 | * `updates`. This ships a vertex attribute only to the edge partitions where it is in the 106 | * position(s) specified by the attribute shipping level. 107 | */ 108 | def updateVertices(updates: VertexRDD[VD]): ReplicatedVertexView[VD, ED] = { 109 | val shippedVerts = updates.shipVertexAttributes(hasSrcId, hasDstId) 110 | .setName("ReplicatedVertexView.updateVertices - shippedVerts %s %s (broadcast)".format( 111 | hasSrcId, hasDstId)) 112 | .partitionBy(edges.partitioner.get) 113 | 114 | val newEdges = edges.withPartitionsRDD(edges.partitionsRDD.zipPartitions(shippedVerts) { 115 | (ePartIter, shippedVertsIter) => ePartIter.map { 116 | case (pid, edgePartition) => 117 | (pid, edgePartition.updateVertices(shippedVertsIter.flatMap(_._2.iterator))) 118 | } 119 | }) 120 | new ReplicatedVertexView(newEdges, hasSrcId, hasDstId) 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/impl/RoutingTablePartition.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.impl 19 | 20 | import scala.reflect.ClassTag 21 | 22 | import org.apache.spark.Partitioner 23 | import org.apache.spark.rdd.RDD 24 | import org.apache.spark.rdd.ShuffledRDD 25 | import org.apache.spark.util.collection.{BitSet, PrimitiveVector} 26 | 27 | import org.apache.spark.graphx._ 28 | import org.apache.spark.graphx.util.collection.GraphXPrimitiveKeyOpenHashMap 29 | 30 | import org.apache.spark.graphx.impl.RoutingTablePartition.RoutingTableMessage 31 | 32 | private[graphx] 33 | object RoutingTablePartition { 34 | /** 35 | * A message from an edge partition to a vertex specifying the position in which the edge 36 | * partition references the vertex (src, dst, or both). The edge partition is encoded in the lower 37 | * 30 bits of the Int, and the position is encoded in the upper 2 bits of the Int. 38 | */ 39 | type RoutingTableMessage = (VertexId, Int) 40 | 41 | private def toMessage(vid: VertexId, pid: PartitionID, position: Byte): RoutingTableMessage = { 42 | val positionUpper2 = position << 30 43 | val pidLower30 = pid & 0x3FFFFFFF 44 | (vid, positionUpper2 | pidLower30) 45 | } 46 | 47 | private def vidFromMessage(msg: RoutingTableMessage): VertexId = msg._1 48 | private def pidFromMessage(msg: RoutingTableMessage): PartitionID = msg._2 & 0x3FFFFFFF 49 | private def positionFromMessage(msg: RoutingTableMessage): Byte = (msg._2 >> 30).toByte 50 | 51 | val empty: RoutingTablePartition = new RoutingTablePartition(Array.empty) 52 | 53 | /** Generate a `RoutingTableMessage` for each vertex referenced in `edgePartition`. */ 54 | def edgePartitionToMsgs(pid: PartitionID, edgePartition: EdgePartition[_, _]) 55 | : Iterator[RoutingTableMessage] = { 56 | // Determine which positions each vertex id appears in using a map where the low 2 bits 57 | // represent src and dst 58 | val map = new GraphXPrimitiveKeyOpenHashMap[VertexId, Byte] 59 | edgePartition.iterator.foreach { e => 60 | map.changeValue(e.srcId, 0x1, (b: Byte) => (b | 0x1).toByte) 61 | map.changeValue(e.dstId, 0x2, (b: Byte) => (b | 0x2).toByte) 62 | } 63 | map.iterator.map { vidAndPosition => 64 | val vid = vidAndPosition._1 65 | val position = vidAndPosition._2 66 | toMessage(vid, pid, position) 67 | } 68 | } 69 | 70 | /** Build a `RoutingTablePartition` from `RoutingTableMessage`s. 
*/ 71 | def fromMsgs(numEdgePartitions: Int, iter: Iterator[RoutingTableMessage]) 72 | : RoutingTablePartition = { 73 | val pid2vid = Array.fill(numEdgePartitions)(new PrimitiveVector[VertexId]) 74 | val srcFlags = Array.fill(numEdgePartitions)(new PrimitiveVector[Boolean]) 75 | val dstFlags = Array.fill(numEdgePartitions)(new PrimitiveVector[Boolean]) 76 | for (msg <- iter) { 77 | val vid = vidFromMessage(msg) 78 | val pid = pidFromMessage(msg) 79 | val position = positionFromMessage(msg) 80 | pid2vid(pid) += vid 81 | srcFlags(pid) += (position & 0x1) != 0 82 | dstFlags(pid) += (position & 0x2) != 0 83 | } 84 | 85 | new RoutingTablePartition(pid2vid.zipWithIndex.map { 86 | case (vids, pid) => (vids.trim().array, toBitSet(srcFlags(pid)), toBitSet(dstFlags(pid))) 87 | }) 88 | } 89 | 90 | /** Compact the given vector of Booleans into a BitSet. */ 91 | private def toBitSet(flags: PrimitiveVector[Boolean]): BitSet = { 92 | val bitset = new BitSet(flags.size) 93 | var i = 0 94 | while (i < flags.size) { 95 | if (flags(i)) { 96 | bitset.set(i) 97 | } 98 | i += 1 99 | } 100 | bitset 101 | } 102 | } 103 | 104 | /** 105 | * Stores the locations of edge-partition join sites for each vertex attribute in a particular 106 | * vertex partition. This provides routing information for shipping vertex attributes to edge 107 | * partitions. 108 | */ 109 | private[graphx] 110 | class RoutingTablePartition( 111 | private val routingTable: Array[(Array[VertexId], BitSet, BitSet)]) extends Serializable { 112 | /** The maximum number of edge partitions this `RoutingTablePartition` is built to join with. */ 113 | val numEdgePartitions: Int = routingTable.size 114 | 115 | /** Returns the number of vertices that will be sent to the specified edge partition. */ 116 | def partitionSize(pid: PartitionID): Int = routingTable(pid)._1.size 117 | 118 | /** Returns an iterator over all vertex ids stored in this `RoutingTablePartition`. */ 119 | def iterator: Iterator[VertexId] = routingTable.iterator.flatMap(_._1.iterator) 120 | 121 | /** Returns a new RoutingTablePartition reflecting a reversal of all edge directions. */ 122 | def reverse: RoutingTablePartition = { 123 | new RoutingTablePartition(routingTable.map { 124 | case (vids, srcVids, dstVids) => (vids, dstVids, srcVids) 125 | }) 126 | } 127 | 128 | /** 129 | * Runs `f` on each vertex id to be sent to the specified edge partition. Vertex ids can be 130 | * filtered by the position they have in the edge partition. 131 | */ 132 | def foreachWithinEdgePartition 133 | (pid: PartitionID, includeSrc: Boolean, includeDst: Boolean) 134 | (f: VertexId => Unit) { 135 | val (vidsCandidate, srcVids, dstVids) = routingTable(pid) 136 | val size = vidsCandidate.length 137 | if (includeSrc && includeDst) { 138 | // Avoid checks for performance 139 | vidsCandidate.iterator.foreach(f) 140 | } else if (!includeSrc && !includeDst) { 141 | // Do nothing 142 | } else { 143 | val relevantVids = if (includeSrc) srcVids else dstVids 144 | relevantVids.iterator.foreach { i => f(vidsCandidate(i)) } 145 | } 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/impl/ShippableVertexPartition.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
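The routing-table message above packs two values into one `Int`: the edge-partition id in the lower 30 bits and the src/dst position flags in the upper 2 bits. The following standalone sketch mirrors `toMessage`, `pidFromMessage` and `positionFromMessage` so the encoding can be tried outside Spark; the object name is made up.

```scala
// Standalone sketch of the RoutingTableMessage encoding: the position byte
// (bit 0 = referenced as src, bit 1 = referenced as dst) goes into the upper
// 2 bits of an Int, the edge-partition id into the lower 30 bits.
object RoutingMessageDemo extends App {
  type VertexId = Long
  type PartitionID = Int

  def toMessage(vid: VertexId, pid: PartitionID, position: Byte): (VertexId, Int) =
    (vid, (position << 30) | (pid & 0x3FFFFFFF))

  def pidFromMessage(msg: (VertexId, Int)): PartitionID = msg._2 & 0x3FFFFFFF
  def positionFromMessage(msg: (VertexId, Int)): Byte = (msg._2 >> 30).toByte

  // Vertex 42 is referenced as both src and dst (0x1 | 0x2 = 0x3) in partition 7.
  val msg = toMessage(42L, 7, 0x3.toByte)
  assert(pidFromMessage(msg) == 7)
  // Only the low two bits of the recovered position are ever inspected
  // (src flag and dst flag), exactly as fromMsgs does above.
  val pos = positionFromMessage(msg)
  assert((pos & 0x1) != 0 && (pos & 0x2) != 0)
}
```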
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.impl 19 | 20 | import scala.reflect.ClassTag 21 | 22 | import org.apache.spark.util.collection.{BitSet, PrimitiveVector} 23 | 24 | import org.apache.spark.graphx._ 25 | import org.apache.spark.graphx.util.collection.GraphXPrimitiveKeyOpenHashMap 26 | 27 | /** Stores vertex attributes to ship to an edge partition. */ 28 | private[graphx] 29 | class VertexAttributeBlock[VD: ClassTag](val vids: Array[VertexId], val attrs: Array[VD]) 30 | extends Serializable { 31 | def iterator: Iterator[(VertexId, VD)] = 32 | (0 until vids.size).iterator.map { i => (vids(i), attrs(i)) } 33 | } 34 | 35 | private[graphx] 36 | object ShippableVertexPartition { 37 | /** Construct a `ShippableVertexPartition` from the given vertices without any routing table. */ 38 | def apply[VD: ClassTag](iter: Iterator[(VertexId, VD)]): ShippableVertexPartition[VD] = 39 | apply(iter, RoutingTablePartition.empty, null.asInstanceOf[VD], (a, b) => a) 40 | 41 | /** 42 | * Construct a `ShippableVertexPartition` from the given vertices with the specified routing 43 | * table, filling in missing vertices mentioned in the routing table using `defaultVal`. 44 | */ 45 | def apply[VD: ClassTag]( 46 | iter: Iterator[(VertexId, VD)], routingTable: RoutingTablePartition, defaultVal: VD) 47 | : ShippableVertexPartition[VD] = 48 | apply(iter, routingTable, defaultVal, (a, b) => a) 49 | 50 | /** 51 | * Construct a `ShippableVertexPartition` from the given vertices with the specified routing 52 | * table, filling in missing vertices mentioned in the routing table using `defaultVal`, 53 | * and merging duplicate vertex atrribute with mergeFunc. 54 | */ 55 | def apply[VD: ClassTag]( 56 | iter: Iterator[(VertexId, VD)], routingTable: RoutingTablePartition, defaultVal: VD, 57 | mergeFunc: (VD, VD) => VD): ShippableVertexPartition[VD] = { 58 | val map = new GraphXPrimitiveKeyOpenHashMap[VertexId, VD] 59 | // Merge the given vertices using mergeFunc 60 | iter.foreach { pair => 61 | map.setMerge(pair._1, pair._2, mergeFunc) 62 | } 63 | // Fill in missing vertices mentioned in the routing table 64 | routingTable.iterator.foreach { vid => 65 | map.changeValue(vid, defaultVal, identity) 66 | } 67 | 68 | new ShippableVertexPartition(map.keySet, map._values, map.keySet.getBitSet, routingTable) 69 | } 70 | 71 | import scala.language.implicitConversions 72 | 73 | /** 74 | * Implicit conversion to allow invoking `VertexPartitionBase` operations directly on a 75 | * `ShippableVertexPartition`. 76 | */ 77 | implicit def shippablePartitionToOps[VD: ClassTag](partition: ShippableVertexPartition[VD]) 78 | : ShippableVertexPartitionOps[VD] = new ShippableVertexPartitionOps(partition) 79 | 80 | /** 81 | * Implicit evidence that `ShippableVertexPartition` is a member of the 82 | * `VertexPartitionBaseOpsConstructor` typeclass. 
This enables invoking `VertexPartitionBase` 83 | * operations on a `ShippableVertexPartition` via an evidence parameter, as in 84 | * [[VertexPartitionBaseOps]]. 85 | */ 86 | implicit object ShippableVertexPartitionOpsConstructor 87 | extends VertexPartitionBaseOpsConstructor[ShippableVertexPartition] { 88 | def toOps[VD: ClassTag](partition: ShippableVertexPartition[VD]) 89 | : VertexPartitionBaseOps[VD, ShippableVertexPartition] = shippablePartitionToOps(partition) 90 | } 91 | } 92 | 93 | /** 94 | * A map from vertex id to vertex attribute that additionally stores edge partition join sites for 95 | * each vertex attribute, enabling joining with an [[org.apache.spark.graphx.EdgeRDD]]. 96 | */ 97 | private[graphx] 98 | class ShippableVertexPartition[VD: ClassTag]( 99 | val index: VertexIdToIndexMap, 100 | val values: Array[VD], 101 | val mask: BitSet, 102 | val routingTable: RoutingTablePartition) 103 | extends VertexPartitionBase[VD] { 104 | 105 | /** Return a new ShippableVertexPartition with the specified routing table. */ 106 | def withRoutingTable(routingTable_ : RoutingTablePartition): ShippableVertexPartition[VD] = { 107 | new ShippableVertexPartition(index, values, mask, routingTable_) 108 | } 109 | 110 | /** 111 | * Generate a `VertexAttributeBlock` for each edge partition keyed on the edge partition ID. The 112 | * `VertexAttributeBlock` contains the vertex attributes from the current partition that are 113 | * referenced in the specified positions in the edge partition. 114 | */ 115 | def shipVertexAttributes( 116 | shipSrc: Boolean, shipDst: Boolean): Iterator[(PartitionID, VertexAttributeBlock[VD])] = { 117 | Iterator.tabulate(routingTable.numEdgePartitions) { pid => 118 | val initialSize = if (shipSrc && shipDst) routingTable.partitionSize(pid) else 64 119 | val vids = new PrimitiveVector[VertexId](initialSize) 120 | val attrs = new PrimitiveVector[VD](initialSize) 121 | var i = 0 122 | routingTable.foreachWithinEdgePartition(pid, shipSrc, shipDst) { vid => 123 | if (isDefined(vid)) { 124 | vids += vid 125 | attrs += this(vid) 126 | } 127 | i += 1 128 | } 129 | (pid, new VertexAttributeBlock(vids.trim().array, attrs.trim().array)) 130 | } 131 | } 132 | 133 | /** 134 | * Generate a `VertexId` array for each edge partition keyed on the edge partition ID. The array 135 | * contains the visible vertex ids from the current partition that are referenced in the edge 136 | * partition. 
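`shipVertexAttributes` emits, for every edge partition, a `VertexAttributeBlock` made of two parallel arrays (vertex ids and attributes) that the receiving edge partition re-pairs via `iterator` and feeds into `updateVertices`. A minimal standalone sketch of that shape (the `MiniAttributeBlock` name is illustrative; the real class is `private[graphx]`):

```scala
// Sketch of the VertexAttributeBlock shape produced by shipVertexAttributes:
// two parallel arrays that the receiving edge partition re-pairs lazily.
class MiniAttributeBlock[VD](val vids: Array[Long], val attrs: Array[VD]) {
  def iterator: Iterator[(Long, VD)] =
    vids.indices.iterator.map(i => (vids(i), attrs(i)))
}

object MiniAttributeBlockDemo extends App {
  val block = new MiniAttributeBlock[String](Array(1L, 4L, 9L), Array("a", "b", "c"))
  // The edge partition would feed this iterator into updateVertices.
  assert(block.iterator.toList == List((1L, "a"), (4L, "b"), (9L, "c")))
}
```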
137 | */ 138 | def shipVertexIds(): Iterator[(PartitionID, Array[VertexId])] = { 139 | Iterator.tabulate(routingTable.numEdgePartitions) { pid => 140 | val vids = new PrimitiveVector[VertexId](routingTable.partitionSize(pid)) 141 | var i = 0 142 | routingTable.foreachWithinEdgePartition(pid, true, true) { vid => 143 | if (isDefined(vid)) { 144 | vids += vid 145 | } 146 | i += 1 147 | } 148 | (pid, vids.trim().array) 149 | } 150 | } 151 | } 152 | 153 | private[graphx] class ShippableVertexPartitionOps[VD: ClassTag](self: ShippableVertexPartition[VD]) 154 | extends VertexPartitionBaseOps[VD, ShippableVertexPartition](self) { 155 | 156 | def withIndex(index: VertexIdToIndexMap): ShippableVertexPartition[VD] = { 157 | new ShippableVertexPartition(index, self.values, self.mask, self.routingTable) 158 | } 159 | 160 | def withValues[VD2: ClassTag](values: Array[VD2]): ShippableVertexPartition[VD2] = { 161 | new ShippableVertexPartition(self.index, values, self.mask, self.routingTable) 162 | } 163 | 164 | def withMask(mask: BitSet): ShippableVertexPartition[VD] = { 165 | new ShippableVertexPartition(self.index, self.values, mask, self.routingTable) 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/impl/VertexPartition.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.impl 19 | 20 | import scala.reflect.ClassTag 21 | 22 | import org.apache.spark.util.collection.BitSet 23 | 24 | import org.apache.spark.graphx._ 25 | import org.apache.spark.graphx.util.collection.GraphXPrimitiveKeyOpenHashMap 26 | 27 | private[graphx] object VertexPartition { 28 | /** Construct a `VertexPartition` from the given vertices. */ 29 | def apply[VD: ClassTag](iter: Iterator[(VertexId, VD)]) 30 | : VertexPartition[VD] = { 31 | val (index, values, mask) = VertexPartitionBase.initFrom(iter) 32 | new VertexPartition(index, values, mask) 33 | } 34 | 35 | import scala.language.implicitConversions 36 | 37 | /** 38 | * Implicit conversion to allow invoking `VertexPartitionBase` operations directly on a 39 | * `VertexPartition`. 40 | */ 41 | implicit def partitionToOps[VD: ClassTag](partition: VertexPartition[VD]) 42 | : VertexPartitionOps[VD] = new VertexPartitionOps(partition) 43 | 44 | /** 45 | * Implicit evidence that `VertexPartition` is a member of the `VertexPartitionBaseOpsConstructor` 46 | * typeclass. This enables invoking `VertexPartitionBase` operations on a `VertexPartition` via an 47 | * evidence parameter, as in [[VertexPartitionBaseOps]]. 
48 | */ 49 | implicit object VertexPartitionOpsConstructor 50 | extends VertexPartitionBaseOpsConstructor[VertexPartition] { 51 | def toOps[VD: ClassTag](partition: VertexPartition[VD]) 52 | : VertexPartitionBaseOps[VD, VertexPartition] = partitionToOps(partition) 53 | } 54 | } 55 | 56 | /** A map from vertex id to vertex attribute. */ 57 | private[graphx] class VertexPartition[VD: ClassTag]( 58 | val index: VertexIdToIndexMap, 59 | val values: Array[VD], 60 | val mask: BitSet) 61 | extends VertexPartitionBase[VD] 62 | 63 | private[graphx] class VertexPartitionOps[VD: ClassTag](self: VertexPartition[VD]) 64 | extends VertexPartitionBaseOps[VD, VertexPartition](self) { 65 | 66 | def withIndex(index: VertexIdToIndexMap): VertexPartition[VD] = { 67 | new VertexPartition(index, self.values, self.mask) 68 | } 69 | 70 | def withValues[VD2: ClassTag](values: Array[VD2]): VertexPartition[VD2] = { 71 | new VertexPartition(self.index, values, self.mask) 72 | } 73 | 74 | def withMask(mask: BitSet): VertexPartition[VD] = { 75 | new VertexPartition(self.index, self.values, mask) 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/impl/VertexPartitionBase.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.impl 19 | 20 | import scala.language.higherKinds 21 | import scala.reflect.ClassTag 22 | 23 | import org.apache.spark.util.collection.BitSet 24 | 25 | import org.apache.spark.graphx._ 26 | import org.apache.spark.graphx.util.collection.GraphXPrimitiveKeyOpenHashMap 27 | 28 | private[graphx] object VertexPartitionBase { 29 | /** 30 | * Construct the constituents of a VertexPartitionBase from the given vertices, merging duplicate 31 | * entries arbitrarily. 32 | */ 33 | def initFrom[VD: ClassTag](iter: Iterator[(VertexId, VD)]) 34 | : (VertexIdToIndexMap, Array[VD], BitSet) = { 35 | val map = new GraphXPrimitiveKeyOpenHashMap[VertexId, VD] 36 | iter.foreach { pair => 37 | map(pair._1) = pair._2 38 | } 39 | (map.keySet, map._values, map.keySet.getBitSet) 40 | } 41 | 42 | /** 43 | * Construct the constituents of a VertexPartitionBase from the given vertices, merging duplicate 44 | * entries using `mergeFunc`. 
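Both `initFrom` variants fold duplicate vertex ids into a single entry: without a merge function one of the duplicate values simply survives ("arbitrarily"), while with `mergeFunc` the duplicates are combined. A plain-Scala sketch of the two behaviours, using a mutable map instead of `GraphXPrimitiveKeyOpenHashMap` purely for illustration:

```scala
// Sketch of the duplicate-vertex handling described above (illustrative only).
import scala.collection.mutable

object InitFromDemo extends App {
  val input = Seq((1L, 10), (2L, 20), (1L, 30))

  // initFrom without a merge function: one duplicate value survives
  // (whichever is written last -- the doc calls this "arbitrary").
  val arbitrary = mutable.Map[Long, Int]()
  input.foreach { case (vid, v) => arbitrary(vid) = v }
  assert(arbitrary(1L) == 30)

  // initFrom with a merge function, e.g. summing duplicate attributes.
  val merged = mutable.Map[Long, Int]()
  input.foreach { case (vid, v) =>
    merged(vid) = merged.get(vid).map(old => old + v).getOrElse(v)
  }
  assert(merged(1L) == 40)
}
```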
45 | */ 46 | def initFrom[VD: ClassTag](iter: Iterator[(VertexId, VD)], mergeFunc: (VD, VD) => VD) 47 | : (VertexIdToIndexMap, Array[VD], BitSet) = { 48 | val map = new GraphXPrimitiveKeyOpenHashMap[VertexId, VD] 49 | iter.foreach { pair => 50 | map.setMerge(pair._1, pair._2, mergeFunc) 51 | } 52 | (map.keySet, map._values, map.keySet.getBitSet) 53 | } 54 | } 55 | 56 | /** 57 | * An abstract map from vertex id to vertex attribute. [[VertexPartition]] is the corresponding 58 | * concrete implementation. [[VertexPartitionBaseOps]] provides a variety of operations for 59 | * VertexPartitionBase and subclasses that provide implicit evidence of membership in the 60 | * `VertexPartitionBaseOpsConstructor` typeclass (for example, 61 | * [[VertexPartition.VertexPartitionOpsConstructor]]). 62 | */ 63 | private[graphx] abstract class VertexPartitionBase[@specialized(Long, Int, Double) VD: ClassTag] 64 | extends Serializable { 65 | 66 | def index: VertexIdToIndexMap 67 | def values: Array[VD] 68 | def mask: BitSet 69 | 70 | val capacity: Int = index.capacity 71 | 72 | def size: Int = mask.cardinality() 73 | 74 | /** Return the vertex attribute for the given vertex ID. */ 75 | def apply(vid: VertexId): VD = values(index.getPos(vid)) 76 | 77 | def isDefined(vid: VertexId): Boolean = { 78 | val pos = index.getPos(vid) 79 | pos >= 0 && mask.get(pos) 80 | } 81 | 82 | def iterator: Iterator[(VertexId, VD)] = 83 | mask.iterator.map(ind => (index.getValue(ind), values(ind))) 84 | } 85 | 86 | /** 87 | * A typeclass for subclasses of `VertexPartitionBase` representing the ability to wrap them in a 88 | * `VertexPartitionBaseOps`. 89 | */ 90 | private[graphx] trait VertexPartitionBaseOpsConstructor[T[X] <: VertexPartitionBase[X]] { 91 | def toOps[VD: ClassTag](partition: T[VD]): VertexPartitionBaseOps[VD, T] 92 | } 93 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/impl/VertexPartitionBaseOps.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
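`VertexPartitionBaseOpsConstructor` is a small typeclass: the operations are written once in `VertexPartitionBaseOps` against an abstract partition type, and an implicit constructor object wraps each concrete subclass (`VertexPartition`, `ShippableVertexPartition`) so those operations return the right concrete type. A minimal standalone sketch of the same pattern with made-up names (`BasePart`, `ConcretePart`, `OpsConstructor`):

```scala
// Sketch of the typeclass pattern behind VertexPartitionBaseOpsConstructor,
// with hypothetical names. Ops are written once against an abstract T[_]
// and an implicit "constructor" wraps each concrete subclass.
import scala.language.higherKinds
import scala.reflect.ClassTag

abstract class BasePart[VD] { def values: Array[VD] }
class ConcretePart[VD](val values: Array[VD]) extends BasePart[VD]

trait OpsConstructor[T[X] <: BasePart[X]] {
  def toOps[VD](p: T[VD]): BaseOps[VD, T]
}

abstract class BaseOps[VD, Self[X] <: BasePart[X]](self: Self[VD]) {
  def withValues[VD2](values: Array[VD2]): Self[VD2]
  // One generic operation shared by every concrete partition type.
  def mapValues[VD2](f: VD => VD2)(implicit ct: ClassTag[VD2]): Self[VD2] =
    withValues(self.values.map(f))
}

object ConcretePart {
  implicit object Constructor extends OpsConstructor[ConcretePart] {
    def toOps[VD](p: ConcretePart[VD]): BaseOps[VD, ConcretePart] =
      new BaseOps[VD, ConcretePart](p) {
        def withValues[VD2](values: Array[VD2]): ConcretePart[VD2] = new ConcretePart(values)
      }
  }
}

object TypeclassDemo extends App {
  val part = new ConcretePart(Array(1, 2, 3))
  val doubled = ConcretePart.Constructor.toOps(part).mapValues(_ * 2)
  assert(doubled.values.toSeq == Seq(2, 4, 6))
}
```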
16 | */ 17 | 18 | package org.apache.spark.graphx.impl 19 | 20 | import scala.language.higherKinds 21 | import scala.language.implicitConversions 22 | import scala.reflect.ClassTag 23 | 24 | import org.apache.spark.Logging 25 | import org.apache.spark.util.collection.BitSet 26 | 27 | import org.apache.spark.graphx._ 28 | import org.apache.spark.graphx.util.collection.GraphXPrimitiveKeyOpenHashMap 29 | 30 | /** 31 | * An class containing additional operations for subclasses of VertexPartitionBase that provide 32 | * implicit evidence of membership in the `VertexPartitionBaseOpsConstructor` typeclass (for 33 | * example, [[VertexPartition.VertexPartitionOpsConstructor]]). 34 | */ 35 | private[graphx] abstract class VertexPartitionBaseOps 36 | [VD: ClassTag, Self[X] <: VertexPartitionBase[X] : VertexPartitionBaseOpsConstructor] 37 | (self: Self[VD]) 38 | extends Serializable with Logging { 39 | 40 | def withIndex(index: VertexIdToIndexMap): Self[VD] 41 | def withValues[VD2: ClassTag](values: Array[VD2]): Self[VD2] 42 | def withMask(mask: BitSet): Self[VD] 43 | 44 | /** 45 | * Pass each vertex attribute along with the vertex id through a map 46 | * function and retain the original RDD's partitioning and index. 47 | * 48 | * @tparam VD2 the type returned by the map function 49 | * 50 | * @param f the function applied to each vertex id and vertex 51 | * attribute in the RDD 52 | * 53 | * @return a new VertexPartition with values obtained by applying `f` to 54 | * each of the entries in the original VertexRDD. The resulting 55 | * VertexPartition retains the same index. 56 | */ 57 | def map[VD2: ClassTag](f: (VertexId, VD) => VD2): Self[VD2] = { 58 | // Construct a view of the map transformation 59 | val newValues = new Array[VD2](self.capacity) 60 | var i = self.mask.nextSetBit(0) 61 | while (i >= 0) { 62 | newValues(i) = f(self.index.getValue(i), self.values(i)) 63 | i = self.mask.nextSetBit(i + 1) 64 | } 65 | this.withValues(newValues) 66 | } 67 | 68 | /** 69 | * Restrict the vertex set to the set of vertices satisfying the given predicate. 70 | * 71 | * @param pred the user defined predicate 72 | * 73 | * @note The vertex set preserves the original index structure which means that the returned 74 | * RDD can be easily joined with the original vertex-set. Furthermore, the filter only 75 | * modifies the bitmap index and so no new values are allocated. 76 | */ 77 | def filter(pred: (VertexId, VD) => Boolean): Self[VD] = { 78 | // Allocate the array to store the results into 79 | val newMask = new BitSet(self.capacity) 80 | // Iterate over the active bits in the old mask and evaluate the predicate 81 | var i = self.mask.nextSetBit(0) 82 | while (i >= 0) { 83 | if (pred(self.index.getValue(i), self.values(i))) { 84 | newMask.set(i) 85 | } 86 | i = self.mask.nextSetBit(i + 1) 87 | } 88 | this.withMask(newMask) 89 | } 90 | 91 | /** Hides the VertexId's that are the same between `this` and `other`. */ 92 | def minus(other: Self[VD]): Self[VD] = { 93 | if (self.index != other.index) { 94 | logWarning("Minus operations on two VertexPartitions with different indexes is slow.") 95 | minus(createUsingIndex(other.iterator)) 96 | } else { 97 | self.withMask(self.mask.andNot(other.mask)) 98 | } 99 | } 100 | 101 | /** Hides the VertexId's that are the same between `this` and `other`. */ 102 | def minus(other: Iterator[(VertexId, VD)]): Self[VD] = { 103 | minus(createUsingIndex(other)) 104 | } 105 | 106 | /** 107 | * Hides vertices that are the same between this and other. 
For vertices that are different, keeps 108 | * the values from `other`. The indices of `this` and `other` must be the same. 109 | */ 110 | def diff(other: Self[VD]): Self[VD] = { 111 | if (self.index != other.index) { 112 | logWarning("Diffing two VertexPartitions with different indexes is slow.") 113 | diff(createUsingIndex(other.iterator)) 114 | } else { 115 | val newMask = self.mask & other.mask 116 | var i = newMask.nextSetBit(0) 117 | while (i >= 0) { 118 | if (self.values(i) == other.values(i)) { 119 | newMask.unset(i) 120 | } 121 | i = newMask.nextSetBit(i + 1) 122 | } 123 | this.withValues(other.values).withMask(newMask) 124 | } 125 | } 126 | 127 | /** Left outer join another VertexPartition. */ 128 | def leftJoin[VD2: ClassTag, VD3: ClassTag] 129 | (other: Self[VD2]) 130 | (f: (VertexId, VD, Option[VD2]) => VD3): Self[VD3] = { 131 | if (self.index != other.index) { 132 | logWarning("Joining two VertexPartitions with different indexes is slow.") 133 | leftJoin(createUsingIndex(other.iterator))(f) 134 | } else { 135 | val newValues = new Array[VD3](self.capacity) 136 | 137 | var i = self.mask.nextSetBit(0) 138 | while (i >= 0) { 139 | val otherV: Option[VD2] = if (other.mask.get(i)) Some(other.values(i)) else None 140 | newValues(i) = f(self.index.getValue(i), self.values(i), otherV) 141 | i = self.mask.nextSetBit(i + 1) 142 | } 143 | this.withValues(newValues) 144 | } 145 | } 146 | 147 | /** Left outer join another iterator of messages. */ 148 | def leftJoin[VD2: ClassTag, VD3: ClassTag] 149 | (other: Iterator[(VertexId, VD2)]) 150 | (f: (VertexId, VD, Option[VD2]) => VD3): Self[VD3] = { 151 | leftJoin(createUsingIndex(other))(f) 152 | } 153 | 154 | /** Inner join another VertexPartition. */ 155 | def innerJoin[U: ClassTag, VD2: ClassTag] 156 | (other: Self[U]) 157 | (f: (VertexId, VD, U) => VD2): Self[VD2] = { 158 | if (self.index != other.index) { 159 | logWarning("Joining two VertexPartitions with different indexes is slow.") 160 | innerJoin(createUsingIndex(other.iterator))(f) 161 | } else { 162 | val newMask = self.mask & other.mask 163 | val newValues = new Array[VD2](self.capacity) 164 | var i = newMask.nextSetBit(0) 165 | while (i >= 0) { 166 | newValues(i) = f(self.index.getValue(i), self.values(i), other.values(i)) 167 | i = newMask.nextSetBit(i + 1) 168 | } 169 | this.withValues(newValues).withMask(newMask) 170 | } 171 | } 172 | 173 | /** 174 | * Inner join an iterator of messages. 175 | */ 176 | def innerJoin[U: ClassTag, VD2: ClassTag] 177 | (iter: Iterator[Product2[VertexId, U]]) 178 | (f: (VertexId, VD, U) => VD2): Self[VD2] = { 179 | innerJoin(createUsingIndex(iter))(f) 180 | } 181 | 182 | /** 183 | * Similar effect as aggregateUsingIndex((a, b) => a) 184 | */ 185 | def createUsingIndex[VD2: ClassTag](iter: Iterator[Product2[VertexId, VD2]]) 186 | : Self[VD2] = { 187 | val newMask = new BitSet(self.capacity) 188 | val newValues = new Array[VD2](self.capacity) 189 | iter.foreach { pair => 190 | val pos = self.index.getPos(pair._1) 191 | if (pos >= 0) { 192 | newMask.set(pos) 193 | newValues(pos) = pair._2 194 | } 195 | } 196 | this.withValues(newValues).withMask(newMask) 197 | } 198 | 199 | /** 200 | * Similar to innerJoin, but vertices from the left side that don't appear in iter will remain in 201 | * the partition, hidden by the bitmask. 
202 | */ 203 | def innerJoinKeepLeft(iter: Iterator[Product2[VertexId, VD]]): Self[VD] = { 204 | val newMask = new BitSet(self.capacity) 205 | val newValues = new Array[VD](self.capacity) 206 | System.arraycopy(self.values, 0, newValues, 0, newValues.length) 207 | iter.foreach { pair => 208 | val pos = self.index.getPos(pair._1) 209 | if (pos >= 0) { 210 | newMask.set(pos) 211 | newValues(pos) = pair._2 212 | } 213 | } 214 | this.withValues(newValues).withMask(newMask) 215 | } 216 | 217 | def aggregateUsingIndex[VD2: ClassTag]( 218 | iter: Iterator[Product2[VertexId, VD2]], 219 | reduceFunc: (VD2, VD2) => VD2): Self[VD2] = { 220 | val newMask = new BitSet(self.capacity) 221 | val newValues = new Array[VD2](self.capacity) 222 | iter.foreach { product => 223 | val vid = product._1 224 | val vdata = product._2 225 | val pos = self.index.getPos(vid) 226 | if (pos >= 0) { 227 | if (newMask.get(pos)) { 228 | newValues(pos) = reduceFunc(newValues(pos), vdata) 229 | } else { // otherwise just store the new value 230 | newMask.set(pos) 231 | newValues(pos) = vdata 232 | } 233 | } 234 | } 235 | this.withValues(newValues).withMask(newMask) 236 | } 237 | 238 | /** 239 | * Construct a new VertexPartition whose index contains only the vertices in the mask. 240 | */ 241 | def reindex(): Self[VD] = { 242 | val hashMap = new GraphXPrimitiveKeyOpenHashMap[VertexId, VD] 243 | val arbitraryMerge = (a: VD, b: VD) => a 244 | for ((k, v) <- self.iterator) { 245 | hashMap.setMerge(k, v, arbitraryMerge) 246 | } 247 | this.withIndex(hashMap.keySet).withValues(hashMap._values).withMask(hashMap.keySet.getBitSet) 248 | } 249 | 250 | /** 251 | * Converts a vertex partition (in particular, one of type `Self`) into a 252 | * `VertexPartitionBaseOps`. Within this class, this allows chaining the methods defined above, 253 | * because these methods return a `Self` and this implicit conversion re-wraps that in a 254 | * `VertexPartitionBaseOps`. This relies on the context bound on `Self`. 255 | */ 256 | private implicit def toOps[VD2: ClassTag](partition: Self[VD2]) 257 | : VertexPartitionBaseOps[VD2, Self] = { 258 | implicitly[VertexPartitionBaseOpsConstructor[Self]].toOps(partition) 259 | } 260 | } 261 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/impl/VertexRDDImpl.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
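A key point in the operations above (`filter`, `diff`, `minus`) is that they never copy the `values` array; they only compute a new bitmask over the shared index. A standalone sketch of that idea, using `java.util.BitSet` in place of Spark's internal `BitSet`:

```scala
// Standalone sketch of the mask trick used by `filter`/`diff`: the values
// array is untouched, only a bitmask selects the visible entries.
import java.util.{BitSet => JBitSet}

object MaskFilterDemo extends App {
  val ids    = Array(1L, 2L, 3L, 4L)
  val values = Array(10,  20,  30,  40)

  // Initially every slot is visible.
  val mask = new JBitSet(ids.length)
  mask.set(0, ids.length)

  // "filter(pred)": build a new mask, never copy `values`.
  val pred: (Long, Int) => Boolean = (_, v) => v >= 25
  val newMask = new JBitSet(ids.length)
  var i = mask.nextSetBit(0)
  while (i >= 0) {
    if (pred(ids(i), values(i))) newMask.set(i)
    i = mask.nextSetBit(i + 1)
  }

  val visible = (0 until ids.length).filter(j => newMask.get(j)).map(j => ids(j) -> values(j))
  assert(visible == Seq(3L -> 30, 4L -> 40))
}
```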
16 | */ 17 | 18 | package org.apache.spark.graphx.impl 19 | 20 | import scala.reflect.ClassTag 21 | 22 | import org.apache.spark._ 23 | import org.apache.spark.SparkContext._ 24 | import org.apache.spark.rdd._ 25 | import org.apache.spark.storage.StorageLevel 26 | 27 | import org.apache.spark.graphx._ 28 | 29 | class VertexRDDImpl[VD] private[graphx] ( 30 | @transient val partitionsRDD: RDD[ShippableVertexPartition[VD]], 31 | val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) 32 | (implicit override protected val vdTag: ClassTag[VD]) 33 | extends VertexRDD[VD](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { 34 | 35 | require(partitionsRDD.partitioner.isDefined) 36 | 37 | override def reindex(): VertexRDD[VD] = this.withPartitionsRDD(partitionsRDD.map(_.reindex())) 38 | 39 | override val partitioner = partitionsRDD.partitioner 40 | 41 | override protected def getPreferredLocations(s: Partition): Seq[String] = 42 | partitionsRDD.preferredLocations(s) 43 | 44 | override def setName(_name: String): this.type = { 45 | if (partitionsRDD.name != null) { 46 | partitionsRDD.setName(partitionsRDD.name + ", " + _name) 47 | } else { 48 | partitionsRDD.setName(_name) 49 | } 50 | this 51 | } 52 | setName("VertexRDD") 53 | 54 | /** 55 | * Persists the vertex partitions at the specified storage level, ignoring any existing target 56 | * storage level. 57 | */ 58 | override def persist(newLevel: StorageLevel): this.type = { 59 | partitionsRDD.persist(newLevel) 60 | this 61 | } 62 | 63 | override def unpersist(blocking: Boolean = true): this.type = { 64 | partitionsRDD.unpersist(blocking) 65 | this 66 | } 67 | 68 | /** Persists the vertex partitions at `targetStorageLevel`, which defaults to MEMORY_ONLY. */ 69 | override def cache(): this.type = { 70 | partitionsRDD.persist(targetStorageLevel) 71 | this 72 | } 73 | 74 | override def getStorageLevel: StorageLevel = partitionsRDD.getStorageLevel 75 | 76 | override def checkpoint(): Unit = { 77 | partitionsRDD.checkpoint() 78 | } 79 | 80 | override def isCheckpointed: Boolean = { 81 | firstParent[ShippableVertexPartition[VD]].isCheckpointed 82 | } 83 | 84 | override def getCheckpointFile: Option[String] = { 85 | partitionsRDD.getCheckpointFile 86 | } 87 | 88 | /** The number of vertices in the RDD. 
*/ 89 | override def count(): Long = { 90 | partitionsRDD.map(_.size.toLong).reduce(_ + _) 91 | } 92 | 93 | override private[graphx] def mapVertexPartitions[VD2: ClassTag]( 94 | f: ShippableVertexPartition[VD] => ShippableVertexPartition[VD2]) 95 | : VertexRDD[VD2] = { 96 | val newPartitionsRDD = partitionsRDD.mapPartitions(_.map(f), preservesPartitioning = true) 97 | this.withPartitionsRDD(newPartitionsRDD) 98 | } 99 | 100 | override def mapValues[VD2: ClassTag](f: VD => VD2): VertexRDD[VD2] = 101 | this.mapVertexPartitions(_.map((vid, attr) => f(attr))) 102 | 103 | override def mapValues[VD2: ClassTag](f: (VertexId, VD) => VD2): VertexRDD[VD2] = 104 | this.mapVertexPartitions(_.map(f)) 105 | 106 | override def minus(other: RDD[(VertexId, VD)]): VertexRDD[VD] = { 107 | minus(this.aggregateUsingIndex(other, (a: VD, b: VD) => a)) 108 | } 109 | 110 | override def minus (other: VertexRDD[VD]): VertexRDD[VD] = { 111 | other match { 112 | case other: VertexRDD[_] if this.partitioner == other.partitioner => 113 | this.withPartitionsRDD[VD]( 114 | partitionsRDD.zipPartitions( 115 | other.partitionsRDD, preservesPartitioning = true) { 116 | (thisIter, otherIter) => 117 | val thisPart = thisIter.next() 118 | val otherPart = otherIter.next() 119 | Iterator(thisPart.minus(otherPart)) 120 | }) 121 | case _ => 122 | this.withPartitionsRDD[VD]( 123 | partitionsRDD.zipPartitions( 124 | other.partitionBy(this.partitioner.get), preservesPartitioning = true) { 125 | (partIter, msgs) => partIter.map(_.minus(msgs)) 126 | } 127 | ) 128 | } 129 | } 130 | 131 | override def diff(other: RDD[(VertexId, VD)]): VertexRDD[VD] = { 132 | diff(this.aggregateUsingIndex(other, (a: VD, b: VD) => a)) 133 | } 134 | 135 | override def diff(other: VertexRDD[VD]): VertexRDD[VD] = { 136 | val otherPartition = other match { 137 | case other: VertexRDD[_] if this.partitioner == other.partitioner => 138 | other.partitionsRDD 139 | case _ => 140 | VertexRDD(other.partitionBy(this.partitioner.get)).partitionsRDD 141 | } 142 | val newPartitionsRDD = partitionsRDD.zipPartitions( 143 | otherPartition, preservesPartitioning = true 144 | ) { (thisIter, otherIter) => 145 | val thisPart = thisIter.next() 146 | val otherPart = otherIter.next() 147 | Iterator(thisPart.diff(otherPart)) 148 | } 149 | this.withPartitionsRDD(newPartitionsRDD) 150 | } 151 | 152 | override def leftZipJoin[VD2: ClassTag, VD3: ClassTag] 153 | (other: VertexRDD[VD2])(f: (VertexId, VD, Option[VD2]) => VD3): VertexRDD[VD3] = { 154 | val newPartitionsRDD = partitionsRDD.zipPartitions( 155 | other.partitionsRDD, preservesPartitioning = true 156 | ) { (thisIter, otherIter) => 157 | val thisPart = thisIter.next() 158 | val otherPart = otherIter.next() 159 | Iterator(thisPart.leftJoin(otherPart)(f)) 160 | } 161 | this.withPartitionsRDD(newPartitionsRDD) 162 | } 163 | 164 | override def leftJoin[VD2: ClassTag, VD3: ClassTag] 165 | (other: RDD[(VertexId, VD2)]) 166 | (f: (VertexId, VD, Option[VD2]) => VD3) 167 | : VertexRDD[VD3] = { 168 | // Test if the other vertex is a VertexRDD to choose the optimal join strategy. 
169 | // If the other set is a VertexRDD then we use the much more efficient leftZipJoin 170 | other match { 171 | case other: VertexRDD[_] if this.partitioner == other.partitioner => 172 | leftZipJoin(other)(f) 173 | case _ => 174 | this.withPartitionsRDD[VD3]( 175 | partitionsRDD.zipPartitions( 176 | other.partitionBy(this.partitioner.get), preservesPartitioning = true) { 177 | (partIter, msgs) => partIter.map(_.leftJoin(msgs)(f)) 178 | } 179 | ) 180 | } 181 | } 182 | 183 | override def innerZipJoin[U: ClassTag, VD2: ClassTag](other: VertexRDD[U]) 184 | (f: (VertexId, VD, U) => VD2): VertexRDD[VD2] = { 185 | val newPartitionsRDD = partitionsRDD.zipPartitions( 186 | other.partitionsRDD, preservesPartitioning = true 187 | ) { (thisIter, otherIter) => 188 | val thisPart = thisIter.next() 189 | val otherPart = otherIter.next() 190 | Iterator(thisPart.innerJoin(otherPart)(f)) 191 | } 192 | this.withPartitionsRDD(newPartitionsRDD) 193 | } 194 | 195 | override def innerJoin[U: ClassTag, VD2: ClassTag](other: RDD[(VertexId, U)]) 196 | (f: (VertexId, VD, U) => VD2): VertexRDD[VD2] = { 197 | // Test if the other vertex is a VertexRDD to choose the optimal join strategy. 198 | // If the other set is a VertexRDD then we use the much more efficient innerZipJoin 199 | other match { 200 | case other: VertexRDD[_] if this.partitioner == other.partitioner => 201 | innerZipJoin(other)(f) 202 | case _ => 203 | this.withPartitionsRDD( 204 | partitionsRDD.zipPartitions( 205 | other.partitionBy(this.partitioner.get), preservesPartitioning = true) { 206 | (partIter, msgs) => partIter.map(_.innerJoin(msgs)(f)) 207 | } 208 | ) 209 | } 210 | } 211 | 212 | override def aggregateUsingIndex[VD2: ClassTag]( 213 | messages: RDD[(VertexId, VD2)], reduceFunc: (VD2, VD2) => VD2): VertexRDD[VD2] = { 214 | val shuffled = messages.partitionBy(this.partitioner.get) 215 | val parts = partitionsRDD.zipPartitions(shuffled, true) { (thisIter, msgIter) => 216 | thisIter.map(_.aggregateUsingIndex(msgIter, reduceFunc)) 217 | } 218 | this.withPartitionsRDD[VD2](parts) 219 | } 220 | 221 | override def reverseRoutingTables(): VertexRDD[VD] = 222 | this.mapVertexPartitions(vPart => vPart.withRoutingTable(vPart.routingTable.reverse)) 223 | 224 | override def withEdges(edges: EdgeRDD[_]): VertexRDD[VD] = { 225 | val routingTables = VertexRDD.createRoutingTables(edges, this.partitioner.get) 226 | val vertexPartitions = partitionsRDD.zipPartitions(routingTables, true) { 227 | (partIter, routingTableIter) => 228 | val routingTable = 229 | if (routingTableIter.hasNext) routingTableIter.next() else RoutingTablePartition.empty 230 | partIter.map(_.withRoutingTable(routingTable)) 231 | } 232 | this.withPartitionsRDD(vertexPartitions) 233 | } 234 | 235 | override private[graphx] def withPartitionsRDD[VD2: ClassTag]( 236 | partitionsRDD: RDD[ShippableVertexPartition[VD2]]): VertexRDD[VD2] = { 237 | new VertexRDDImpl(partitionsRDD, this.targetStorageLevel) 238 | } 239 | 240 | override private[graphx] def withTargetStorageLevel( 241 | targetStorageLevel: StorageLevel): VertexRDD[VD] = { 242 | new VertexRDDImpl(this.partitionsRDD, targetStorageLevel) 243 | } 244 | 245 | override private[graphx] def shipVertexAttributes( 246 | shipSrc: Boolean, shipDst: Boolean): RDD[(PartitionID, VertexAttributeBlock[VD])] = { 247 | partitionsRDD.mapPartitions(_.flatMap(_.shipVertexAttributes(shipSrc, shipDst))) 248 | } 249 | 250 | override private[graphx] def shipVertexIds(): RDD[(PartitionID, Array[VertexId])] = { 251 | 
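The operators implemented here back the public `VertexRDD` API, so they are normally reached through `graph.vertices`. A small usage sketch, assuming a local `SparkContext` named `sc` and made-up vertex/edge data:

```scala
// Usage sketch of the public VertexRDD operators whose implementations appear
// above (leftJoin, innerJoin, aggregateUsingIndex). Assumes a SparkContext `sc`.
import org.apache.spark.SparkContext
import org.apache.spark.graphx._

object VertexRddJoinExamples {
  def run(sc: SparkContext): Unit = {
    val graph = Graph.fromEdges(
      sc.parallelize(Seq(Edge(1L, 2L, "follows"), Edge(2L, 3L, "follows"))), 0)

    val verts: VertexRDD[Int] = graph.vertices          // (VertexId, Int)
    val ages = sc.parallelize(Seq((1L, 35), (3L, 18)))  // extra per-vertex data

    // leftJoin keeps every vertex; missing right-hand values arrive as None.
    val withAges = verts.leftJoin(ages) { (vid, attr, ageOpt) => ageOpt.getOrElse(-1) }

    // innerJoin keeps only vertices present on both sides.
    val onlyKnown = verts.innerJoin(ages) { (vid, attr, age) => age }

    // aggregateUsingIndex ships the messages to the existing vertex partitions
    // and reduces duplicates per vertex id using the existing index.
    val msgs = sc.parallelize(Seq((2L, 1), (2L, 1), (3L, 1)))
    val counts = verts.aggregateUsingIndex[Int](msgs, _ + _)

    println(withAges.collect().toSeq)
    println(onlyKnown.collect().toSeq)
    println(counts.collect().toSeq)
  }
}
```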
partitionsRDD.mapPartitions(_.flatMap(_.shipVertexIds())) 252 | } 253 | 254 | } 255 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/impl/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx 19 | 20 | import org.apache.spark.util.collection.OpenHashSet 21 | 22 | package object impl { 23 | private[graphx] type VertexIdToIndexMap = OpenHashSet[VertexId] 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/lib/ConnectedComponents.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.lib 19 | 20 | import scala.reflect.ClassTag 21 | 22 | import org.apache.spark.graphx._ 23 | 24 | /** Connected components algorithm. */ 25 | object ConnectedComponents { 26 | /** 27 | * Compute the connected component membership of each vertex and return a graph with the vertex 28 | * value containing the lowest vertex id in the connected component containing that vertex. 
29 | * 30 | * @tparam VD the vertex attribute type (discarded in the computation) 31 | * @tparam ED the edge attribute type (preserved in the computation) 32 | * 33 | * @param graph the graph for which to compute the connected components 34 | * 35 | * @return a graph with vertex attributes containing the smallest vertex in each 36 | * connected component 37 | */ 38 | def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Graph[VertexId, ED] = { 39 | val ccGraph = graph.mapVertices { case (vid, _) => vid } 40 | def sendMessage(edge: EdgeTriplet[VertexId, ED]): Iterator[(VertexId, VertexId)] = { 41 | if (edge.srcAttr < edge.dstAttr) { 42 | Iterator((edge.dstId, edge.srcAttr)) 43 | } else if (edge.srcAttr > edge.dstAttr) { 44 | Iterator((edge.srcId, edge.dstAttr)) 45 | } else { 46 | Iterator.empty 47 | } 48 | } 49 | val initialMessage = Long.MaxValue 50 | Pregel(ccGraph, initialMessage, activeDirection = EdgeDirection.Either)( 51 | vprog = (id, attr, msg) => math.min(attr, msg), 52 | sendMsg = sendMessage, 53 | mergeMsg = (a, b) => math.min(a, b)) 54 | } // end of connectedComponents 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/lib/LabelPropagation.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.lib 19 | 20 | import scala.reflect.ClassTag 21 | import org.apache.spark.graphx._ 22 | 23 | /** Label Propagation algorithm. */ 24 | object LabelPropagation { 25 | /** 26 | * Run static Label Propagation for detecting communities in networks. 27 | * 28 | * Each node in the network is initially assigned to its own community. At every superstep, nodes 29 | * send their community affiliation to all neighbors and update their state to the mode community 30 | * affiliation of incoming messages. 31 | * 32 | * LPA is a standard community detection algorithm for graphs. It is very inexpensive 33 | * computationally, although (1) convergence is not guaranteed and (2) one can end up with 34 | * trivial solutions (all nodes are identified into a single community). 35 | * 36 | * @tparam ED the edge attribute type (not used in the computation) 37 | * 38 | * @param graph the graph for which to compute the community affiliation 39 | * @param maxSteps the number of supersteps of LPA to be performed. Because this is a static 40 | * implementation, the algorithm will run for exactly this many supersteps. 
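Typical usage of the connected-components implementation above, either through the `GraphOps` shorthand or by calling the `lib` object directly. The sketch assumes a local `SparkContext` named `sc` and a made-up edge list:

```scala
// Usage sketch for ConnectedComponents; assumes a SparkContext `sc`.
import org.apache.spark.SparkContext
import org.apache.spark.graphx._

object ConnectedComponentsExample {
  def run(sc: SparkContext): Unit = {
    // Two components: {1, 2, 3} and {7, 8}.
    val edges = sc.parallelize(Seq(Edge(1L, 2L, 1), Edge(2L, 3L, 1), Edge(7L, 8L, 1)))
    val graph = Graph.fromEdges(edges, 0)

    // Equivalent calls: the GraphOps shorthand or lib.ConnectedComponents.run(graph).
    val cc = graph.connectedComponents()   // Graph[VertexId, Int]

    // Each vertex now carries the smallest vertex id of its component.
    cc.vertices.collect().sorted.foreach { case (vid, comp) => println(s"$vid -> $comp") }
  }
}
```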
41 | * 42 | * @return a graph with vertex attributes containing the label of community affiliation 43 | */ 44 | def run[VD, ED: ClassTag](graph: Graph[VD, ED], maxSteps: Int): Graph[VertexId, ED] = { 45 | val lpaGraph = graph.mapVertices { case (vid, _) => vid } 46 | def sendMessage(e: EdgeTriplet[VertexId, ED]): Iterator[(VertexId, Map[VertexId, Long])] = { 47 | Iterator((e.srcId, Map(e.dstAttr -> 1L)), (e.dstId, Map(e.srcAttr -> 1L))) 48 | } 49 | def mergeMessage(count1: Map[VertexId, Long], count2: Map[VertexId, Long]) 50 | : Map[VertexId, Long] = { 51 | (count1.keySet ++ count2.keySet).map { i => 52 | val count1Val = count1.getOrElse(i, 0L) 53 | val count2Val = count2.getOrElse(i, 0L) 54 | i -> (count1Val + count2Val) 55 | }.toMap 56 | } 57 | def vertexProgram(vid: VertexId, attr: Long, message: Map[VertexId, Long]): VertexId = { 58 | if (message.isEmpty) attr else message.maxBy(_._2)._1 59 | } 60 | val initialMessage = Map[VertexId, Long]() 61 | Pregel(lpaGraph, initialMessage, maxIterations = maxSteps)( 62 | vprog = vertexProgram, 63 | sendMsg = sendMessage, 64 | mergeMsg = mergeMessage) 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/lib/PageRank.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.lib 19 | 20 | import scala.reflect.ClassTag 21 | import scala.language.postfixOps 22 | 23 | import org.apache.spark.Logging 24 | import org.apache.spark.graphx._ 25 | 26 | /** 27 | * 计算一张图中所有顶点的重要程度,进而对他们进行排名. 28 | * 这是GraphX提供的用Pregel的模型改进后产生的图算法,通常我们在进行使用PageRank的代码编写时并不涉及去改动这份源码, 29 | * 而是直接调用:graph.pageRank(0.0001) 30 | */ 31 | 32 | /** 33 | * PageRank algorithm implementation. There are two implementations of PageRank implemented. 34 | *PageRank的两种实现方式. 
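Typical usage of the label-propagation implementation above. The sketch assumes a local `SparkContext` named `sc`; the two-triangle edge list is made up so that two communities are obvious:

```scala
// Usage sketch for LabelPropagation; assumes a SparkContext `sc`.
import org.apache.spark.SparkContext
import org.apache.spark.graphx._
import org.apache.spark.graphx.lib.LabelPropagation

object LabelPropagationExample {
  def run(sc: SparkContext): Unit = {
    val edges = sc.parallelize(Seq(
      Edge(1L, 2L, 1), Edge(2L, 3L, 1), Edge(1L, 3L, 1),          // triangle 1-2-3
      Edge(10L, 11L, 1), Edge(11L, 12L, 1), Edge(10L, 12L, 1)))   // triangle 10-11-12
    val graph = Graph.fromEdges(edges, 0)

    // Run a fixed number of supersteps; convergence is not guaranteed.
    val communities = LabelPropagation.run(graph, maxSteps = 5)
    communities.vertices.collect().foreach { case (vid, label) =>
      println(s"vertex $vid -> community label $label")
    }
  }
}
```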
35 | * The first implementation uses the standalone [[Graph]] interface and runs PageRank 36 | * for a fixed number of iterations: 37 | * 第一种(静态实现): 使用standalone [[Graph]]接口,在调用时提供一个参数number,用于指定迭代次数, 38 | * 即无论结果如何,该算法在迭代number次后停止计算,返回图结果。 39 | * {{{ 40 | * var PR = Array.fill(n)( 1.0 ) 41 | * val oldPR = Array.fill(n)( 1.0 ) 42 | * for( iter <- 0 until numIter ) { 43 | * swap(oldPR, PR) 44 | * for( i <- 0 until n ) { 45 | * PR[i] = alpha + (1 - alpha) * inNbrs[i].map(j => oldPR[j] / outDeg[j]).sum 46 | * } 47 | * } 48 | * }}} 49 | * 50 | * The second implementation uses the [[Pregel]] interface and runs PageRank until 51 | * convergence: 52 | * 第二种:(动态)在调用时提供一个参数tol,用于指定前后两次迭代的结果差值应小于tol, 53 | * 以达到最终收敛的效果时才停止计算,返回图结果。 54 | * {{{ 55 | * var PR = Array.fill(n)( 1.0 ) 56 | * val oldPR = Array.fill(n)( 0.0 ) 57 | * while( max(abs(PR - oldPr)) > tol ) { 58 | * swap(oldPR, PR) 59 | * for( i <- 0 until n if abs(PR[i] - oldPR[i]) > tol ) { 60 | * PR[i] = alpha + (1 - \alpha) * inNbrs[i].map(j => oldPR[j] / outDeg[j]).sum 61 | * } 62 | * } 63 | * }}} 64 | * 65 | * `alpha` is the random reset probability (typically 0.15), `inNbrs[i]` is the set of 66 | * neighbors which link to `i` and `outDeg[j]` is the out degree of vertex `j`. 67 | * 68 | * Note that this is not the "normalized" PageRank and as a consequence pages that have no 69 | * inlinks will have a PageRank of alpha. 70 | */ 71 | object PageRank extends Logging { 72 | 73 | 74 | /** 75 | * Run PageRank for a fixed number of iterations returning a graph 76 | * with vertex attributes containing the PageRank and edge 77 | * attributes the normalized edge weight. 78 | * 以固定的迭代次数运行PageRank算法,以图的形式返回,包括最终的顶点值(pagerank值) 79 | * 和标准化的边值(权重值),进而得到最终的排名结果. 80 | * @tparam VD the original vertex attribute (not used) 81 | * 顶点的属性类型(不需要用户指定,由你传入的图的属性决定) 82 | * @tparam ED the original edge attribute (not used) 83 | * 边的属性类型(不需要用户指定,由你传入的图的属性决定) 84 | * @param graph the graph on which to compute PageRank 85 | * 进行PageRank计算的图模型 86 | * @param numIter the number of iterations of PageRank to run 87 | * 迭代次数 88 | * @param resetProb the random reset probability (alpha) 89 | * 随机重置的概率,通常都是0.15 90 | * @return the graph containing with each vertex containing the PageRank and each edge 91 | * containing the normalized weight. 92 | * 以图的形式返回,包括最终的顶点值(pagerank值)和标准化的边值(权重值), 93 | * 进而得到最终的排名结果 94 | */ 95 | def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], numIter: Int, 96 | resetProb: Double = 0.15): Graph[Double, Double] = 97 | { 98 | runWithOptions(graph, numIter, resetProb) 99 | } 100 | 101 | /** 102 | * Run PageRank for a fixed number of iterations returning a graph 103 | * with vertex attributes containing the PageRank and edge 104 | * attributes the normalized edge weight. 105 | * 106 | * withOptions : 可以个性化定义srcId 107 | * 108 | * @tparam VD the original vertex attribute (not used) 109 | * @tparam ED the original edge attribute (not used) 110 | * 111 | * @param graph the graph on which to compute PageRank 112 | * 进行PageRank计算的图模型 113 | * @param numIter the number of iterations of PageRank to run 114 | * 迭代次数 115 | * @param resetProb the random reset probability (alpha) 116 | * 随机重置的概率,通常都是0.15 117 | * @param srcId the source vertex for a Personalized Page Rank (optional) 118 | * 个性化的顶点Id值 119 | * @return the graph containing with each vertex containing the PageRank and each edge 120 | * containing the normalized weight. 
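For intuition, here is the fixed-iteration recurrence from the pseudo-code above as a runnable single-machine sketch on a tiny made-up 3-vertex cycle; the distributed implementation below expresses the same update with `aggregateMessages` and `joinVertices`.

```scala
// Single-machine version of the fixed-iteration PageRank recurrence, for
// intuition only. inNbrs(i) lists the in-neighbours of i; outDeg(j) is j's
// out-degree; alpha is the reset probability.
object LocalPageRankSketch extends App {
  val alpha = 0.15
  val numIter = 20

  // Tiny 3-vertex cycle: 0 -> 1 -> 2 -> 0.
  val inNbrs = Array(Seq(2), Seq(0), Seq(1))
  val outDeg = Array(1, 1, 1)
  val n = inNbrs.length

  var pr = Array.fill(n)(1.0)
  for (_ <- 0 until numIter) {
    val oldPR = pr
    pr = Array.tabulate(n) { i =>
      alpha + (1 - alpha) * inNbrs(i).map(j => oldPR(j) / outDeg(j)).sum
    }
  }
  pr.zipWithIndex.foreach { case (rank, i) => println(s"vertex $i rank $rank") }
}
```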
121 | * 以图的形式返回,包括最终的顶点值(pagerank值)和标准化的边值(权重值) 122 | */ 123 | def runWithOptions[VD: ClassTag, ED: ClassTag]( 124 | graph: Graph[VD, ED], numIter: Int, resetProb: Double = 0.15, 125 | srcId: Option[VertexId] = None): Graph[Double, Double] = 126 | { 127 | //srcId是否被定义,即是否个性化,返回Boolean值 128 | val personalized = srcId isDefined 129 | //获取srcId的具体值,如果为空,则赋值为-1L 130 | val src: VertexId = srcId.getOrElse(-1L) 131 | 132 | // Initialize the PageRank graph with each edge attribute having 133 | // weight 1/outDegree and each vertex with attribute resetProb. 134 | // 下面这段代码用于初始化PageRank图模型rankGraph,具体内容是 135 | // 赋予每条边属性为值“1/该边的出度数”,赋予每个顶点属性为resetProb的值。 136 | // When running personalized pagerank, only the source vertex 137 | // has an attribute resetProb. All others are set to 0. 138 | // 当运行personlized PageRank时,仅仅是出发顶点使用resetProb作为属性, 139 | // 其他所有顶点的属性被设置为0. 140 | var rankGraph: Graph[Double, Double] = graph 141 | // Associate the degree with each vertex 142 | // 将出度数与每个顶点的属性值关联(给每个顶点添加出度这个属性) 143 | .outerJoinVertices(graph.outDegrees) { (vid, vdata, deg) => deg.getOrElse(0) } 144 | // Set the weight on the edges based on the degree 145 | //基于度数为边设置权重 146 | .mapTriplets( e => 1.0 / e.srcAttr, TripletFields.Src ) 147 | // Set the vertex attributes to the initial pagerank values 148 | //设置每个顶点的初始属性为initial pagerank values 149 | .mapVertices { (id, attr) => 150 | if (!(id != src && personalized)) resetProb else 0.0 151 | } 152 | 153 | def delta(u: VertexId, v: VertexId): Double = { if (u == v) 1.0 else 0.0 } 154 | 155 | var iteration = 0 156 | var prevRankGraph: Graph[Double, Double] = null 157 | while (iteration < numIter) { 158 | rankGraph.cache() 159 | 160 | // Compute the outgoing rank contributions of each vertex, perform local preaggregation, and 161 | // do the final aggregation at the receiving vertices. Requires a shuffle for aggregation. 162 | val rankUpdates = rankGraph.aggregateMessages[Double]( 163 | ctx => ctx.sendToDst(ctx.srcAttr * ctx.attr), _ + _, TripletFields.Src) 164 | 165 | // Apply the final rank updates to get the new ranks, using join to preserve ranks of vertices 166 | // that didn't receive a message. Requires a shuffle for broadcasting updated ranks to the 167 | // edge partitions. 168 | prevRankGraph = rankGraph 169 | val rPrb = if (personalized) { 170 | (src: VertexId , id: VertexId) => resetProb * delta(src, id) 171 | } else { 172 | (src: VertexId, id: VertexId) => resetProb 173 | } 174 | 175 | rankGraph = rankGraph.joinVertices(rankUpdates) { 176 | (id, oldRank, msgSum) => rPrb(src, id) + (1.0 - resetProb) * msgSum 177 | }.cache() 178 | 179 | rankGraph.edges.foreachPartition(x => {}) // also materializes rankGraph.vertices 180 | logInfo(s"PageRank finished iteration $iteration.") 181 | prevRankGraph.vertices.unpersist(false) 182 | prevRankGraph.edges.unpersist(false) 183 | 184 | iteration += 1 185 | } 186 | 187 | rankGraph 188 | } 189 | 190 | /** 191 | * Run a dynamic version of PageRank returning a graph with vertex attributes containing the 192 | * PageRank and edge attributes containing the normalized edge weight. 193 | * 194 | * @tparam VD the original vertex attribute (not used) 195 | * @tparam ED the original edge attribute (not used) 196 | * 197 | * @param graph the graph on which to compute PageRank 198 | * @param tol the tolerance allowed at convergence (smaller => more accurate). 
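Typical usage of the fixed-iteration PageRank above, either via `PageRank.run` or the `graph.staticPageRank` shorthand. Assumes a local `SparkContext` named `sc` and a made-up edge list:

```scala
// Usage sketch for the fixed-iteration PageRank; assumes a SparkContext `sc`.
import org.apache.spark.SparkContext
import org.apache.spark.graphx._
import org.apache.spark.graphx.lib.PageRank

object StaticPageRankExample {
  def run(sc: SparkContext): Unit = {
    val edges = sc.parallelize(Seq(
      Edge(1L, 2L, 1), Edge(2L, 3L, 1), Edge(3L, 1L, 1), Edge(4L, 1L, 1)))
    val graph = Graph.fromEdges(edges, 0)

    // 20 iterations with the usual reset probability alpha = 0.15.
    val ranks = PageRank.run(graph, numIter = 20, resetProb = 0.15)
    val sameViaOps = graph.staticPageRank(20)   // same computation via GraphOps

    ranks.vertices.collect().sortBy(-_._2).foreach { case (vid, rank) =>
      println(f"vertex $vid rank $rank%.4f")
    }
  }
}
```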
199 | * @param resetProb the random reset probability (alpha) 200 | * 201 | * @return the graph containing with each vertex containing the PageRank and each edge 202 | * containing the normalized weight. 203 | */ 204 | def runUntilConvergence[VD: ClassTag, ED: ClassTag]( 205 | graph: Graph[VD, ED], tol: Double, resetProb: Double = 0.15): Graph[Double, Double] = 206 | { 207 | runUntilConvergenceWithOptions(graph, tol, resetProb) 208 | } 209 | 210 | /** 211 | * Run a dynamic version of PageRank returning a graph with vertex attributes containing the 212 | * PageRank and edge attributes containing the normalized edge weight. 213 | * 214 | * @tparam VD the original vertex attribute (not used) 215 | * @tparam ED the original edge attribute (not used) 216 | * 217 | * @param graph the graph on which to compute PageRank 218 | * @param tol the tolerance allowed at convergence (smaller => more accurate). 219 | * @param resetProb the random reset probability (alpha) 220 | * @param srcId the source vertex for a Personalized Page Rank (optional) 221 | * 222 | * @return the graph containing with each vertex containing the PageRank and each edge 223 | * containing the normalized weight. 224 | */ 225 | def runUntilConvergenceWithOptions[VD: ClassTag, ED: ClassTag]( 226 | graph: Graph[VD, ED], tol: Double, resetProb: Double = 0.15, 227 | srcId: Option[VertexId] = None): Graph[Double, Double] = 228 | { 229 | val personalized = srcId.isDefined 230 | val src: VertexId = srcId.getOrElse(-1L) 231 | 232 | // Initialize the pagerankGraph with each edge attribute 233 | // having weight 1/outDegree and each vertex with attribute 1.0. 234 | val pagerankGraph: Graph[(Double, Double), Double] = graph 235 | // Associate the degree with each vertex 236 | .outerJoinVertices(graph.outDegrees) { 237 | (vid, vdata, deg) => deg.getOrElse(0) 238 | } 239 | // Set the weight on the edges based on the degree 240 | .mapTriplets( e => 1.0 / e.srcAttr ) 241 | // Set the vertex attributes to (initalPR, delta = 0) 242 | .mapVertices { (id, attr) => 243 | if (id == src) (resetProb, Double.NegativeInfinity) else (0.0, 0.0) 244 | } 245 | .cache() 246 | 247 | // Define the three functions needed to implement PageRank in the GraphX 248 | // 以下将定义三个所需函数来完成GraphX对PageRank的算法实现想 249 | // version of Pregel 250 | // 第一个函数用于返回一个考虑“随机事件”发生后的计算结果 251 | def vertexProgram(id: VertexId, attr: (Double, Double), msgSum: Double): (Double, Double) = { 252 | val (oldPR, lastDelta) = attr 253 | val newPR = oldPR + (1.0 - resetProb) * msgSum 254 | (newPR, newPR - oldPR) 255 | } 256 | // 257 | def personalizedVertexProgram(id: VertexId, attr: (Double, Double), 258 | msgSum: Double): (Double, Double) = { 259 | val (oldPR, lastDelta) = attr 260 | var teleport = oldPR 261 | val delta = if (src==id) 1.0 else 0.0 262 | teleport = oldPR*delta 263 | 264 | val newPR = teleport + (1.0 - resetProb) * msgSum 265 | val newDelta = if (lastDelta == Double.NegativeInfinity) newPR else newPR - oldPR 266 | (newPR, newDelta) 267 | } 268 | // 第二个函数用于得到一个迭代器,里面包含了两个信息:该边的目的ID、 269 | // 该边的源属性值和权重的乘积(该边传递的实际PR值) 270 | def sendMessage(edge: EdgeTriplet[(Double, Double), Double]) = { 271 | if (edge.srcAttr._2 > tol) { 272 | Iterator((edge.dstId, edge.srcAttr._2 * edge.attr)) 273 | } else { 274 | Iterator.empty 275 | } 276 | } 277 | // 第三个函数用于将顶点属性值和传递的值进行累加 278 | def messageCombiner(a: Double, b: Double): Double = a + b 279 | 280 | // The initial message received by all vertices in PageRank 281 | // 所有顶点接收到的初始信息 282 | val initialMessage = if (personalized) 0.0 else resetProb / 
(1.0 - resetProb) 283 | 284 | // Execute a dynamic version of Pregel. 285 | val vp = if (personalized) { 286 | (id: VertexId, attr: (Double, Double), msgSum: Double) => 287 | personalizedVertexProgram(id, attr, msgSum) 288 | } else { 289 | (id: VertexId, attr: (Double, Double), msgSum: Double) => 290 | vertexProgram(id, attr, msgSum) 291 | } 292 | 293 | Pregel(pagerankGraph, initialMessage, activeDirection = EdgeDirection.Out)( 294 | vp, sendMessage, messageCombiner) 295 | .mapVertices((vid, attr) => attr._1) 296 | } // end of deltaPageRank 297 | 298 | } 299 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.lib 19 | 20 | import scala.util.Random 21 | 22 | import com.github.fommil.netlib.BLAS.{getInstance => blas} 23 | 24 | import org.apache.spark.rdd._ 25 | import org.apache.spark.graphx._ 26 | 27 | /** Implementation of SVD++ algorithm. */ 28 | object SVDPlusPlus { 29 | 30 | /** Configuration parameters for SVDPlusPlus. */ 31 | class Conf( 32 | var rank: Int, 33 | var maxIters: Int, 34 | var minVal: Double, 35 | var maxVal: Double, 36 | var gamma1: Double, 37 | var gamma2: Double, 38 | var gamma6: Double, 39 | var gamma7: Double) 40 | extends Serializable 41 | 42 | /** 43 | * This method is now replaced by the updated version of `run()` and returns exactly 44 | * the same result. 45 | */ 46 | @deprecated("Call run()", "1.4.0") 47 | def runSVDPlusPlus(edges: RDD[Edge[Double]], conf: Conf) 48 | : (Graph[(Array[Double], Array[Double], Double, Double), Double], Double) = 49 | { 50 | run(edges, conf) 51 | } 52 | 53 | /** 54 | * Implement SVD++ based on "Factorization Meets the Neighborhood: 55 | * a Multifaceted Collaborative Filtering Model", 56 | * available at [[http://public.research.att.com/~volinsky/netflix/kdd08koren.pdf]]. 57 | * 58 | * The prediction rule is rui = u + bu + bi + qi*(pu + |N(u)|^^-0.5^^*sum(y)), 59 | * see the details on page 6. 
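// Editor's illustration (not part of the original source): the prediction rule quoted above,
// rui = u + bu + bi + qi * (pu + |N(u)|^-0.5 * sum(y)), spelled out for a single user/item
// pair. All names below are hypothetical stand-ins for the trained quantities; `ySum` is the
// already-summed vector sum of y_j over the items N(u) rated by this user.
def svdppPredictSketch(
    u: Double, bu: Double, bi: Double,
    q: Array[Double], p: Array[Double], ySum: Array[Double], nU: Int): Double = {
  val norm = 1.0 / math.sqrt(nU.toDouble)                  // |N(u)|^-0.5
  val blended = p.indices.map(k => p(k) + norm * ySum(k))  // pu + |N(u)|^-0.5 * sum(y)
  u + bu + bi + q.indices.map(k => q(k) * blended(k)).sum  // plus the dot product with qi
}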
60 | * 61 | * @param edges edges for constructing the graph 62 | * 63 | * @param conf SVDPlusPlus parameters 64 | * 65 | * @return a graph with vertex attributes containing the trained model 66 | */ 67 | def run(edges: RDD[Edge[Double]], conf: Conf) 68 | : (Graph[(Array[Double], Array[Double], Double, Double), Double], Double) = 69 | { 70 | // Generate default vertex attribute 71 | def defaultF(rank: Int): (Array[Double], Array[Double], Double, Double) = { 72 | // TODO: use a fixed random seed 73 | val v1 = Array.fill(rank)(Random.nextDouble()) 74 | val v2 = Array.fill(rank)(Random.nextDouble()) 75 | (v1, v2, 0.0, 0.0) 76 | } 77 | 78 | // calculate global rating mean 79 | edges.cache() 80 | val (rs, rc) = edges.map(e => (e.attr, 1L)).reduce((a, b) => (a._1 + b._1, a._2 + b._2)) 81 | val u = rs / rc 82 | 83 | // construct graph 84 | var g = Graph.fromEdges(edges, defaultF(conf.rank)).cache() 85 | materialize(g) 86 | edges.unpersist() 87 | 88 | // Calculate initial bias and norm 89 | val t0 = g.aggregateMessages[(Long, Double)]( 90 | ctx => { ctx.sendToSrc((1L, ctx.attr)); ctx.sendToDst((1L, ctx.attr)) }, 91 | (g1, g2) => (g1._1 + g2._1, g1._2 + g2._2)) 92 | 93 | val gJoinT0 = g.outerJoinVertices(t0) { 94 | (vid: VertexId, vd: (Array[Double], Array[Double], Double, Double), 95 | msg: Option[(Long, Double)]) => 96 | (vd._1, vd._2, msg.get._2 / msg.get._1 - u, 1.0 / scala.math.sqrt(msg.get._1)) 97 | }.cache() 98 | materialize(gJoinT0) 99 | g.unpersist() 100 | g = gJoinT0 101 | 102 | def sendMsgTrainF(conf: Conf, u: Double) 103 | (ctx: EdgeContext[ 104 | (Array[Double], Array[Double], Double, Double), 105 | Double, 106 | (Array[Double], Array[Double], Double)]) { 107 | val (usr, itm) = (ctx.srcAttr, ctx.dstAttr) 108 | val (p, q) = (usr._1, itm._1) 109 | val rank = p.length 110 | var pred = u + usr._3 + itm._3 + blas.ddot(rank, q, 1, usr._2, 1) 111 | pred = math.max(pred, conf.minVal) 112 | pred = math.min(pred, conf.maxVal) 113 | val err = ctx.attr - pred 114 | // updateP = (err * q - conf.gamma7 * p) * conf.gamma2 115 | val updateP = q.clone() 116 | blas.dscal(rank, err * conf.gamma2, updateP, 1) 117 | blas.daxpy(rank, -conf.gamma7 * conf.gamma2, p, 1, updateP, 1) 118 | // updateQ = (err * usr._2 - conf.gamma7 * q) * conf.gamma2 119 | val updateQ = usr._2.clone() 120 | blas.dscal(rank, err * conf.gamma2, updateQ, 1) 121 | blas.daxpy(rank, -conf.gamma7 * conf.gamma2, q, 1, updateQ, 1) 122 | // updateY = (err * usr._4 * q - conf.gamma7 * itm._2) * conf.gamma2 123 | val updateY = q.clone() 124 | blas.dscal(rank, err * usr._4 * conf.gamma2, updateY, 1) 125 | blas.daxpy(rank, -conf.gamma7 * conf.gamma2, itm._2, 1, updateY, 1) 126 | ctx.sendToSrc((updateP, updateY, (err - conf.gamma6 * usr._3) * conf.gamma1)) 127 | ctx.sendToDst((updateQ, updateY, (err - conf.gamma6 * itm._3) * conf.gamma1)) 128 | } 129 | 130 | for (i <- 0 until conf.maxIters) { 131 | // Phase 1, calculate pu + |N(u)|^(-0.5)*sum(y) for user nodes 132 | g.cache() 133 | val t1 = g.aggregateMessages[Array[Double]]( 134 | ctx => ctx.sendToSrc(ctx.dstAttr._2), 135 | (g1, g2) => { 136 | val out = g1.clone() 137 | blas.daxpy(out.length, 1.0, g2, 1, out, 1) 138 | out 139 | }) 140 | val gJoinT1 = g.outerJoinVertices(t1) { 141 | (vid: VertexId, vd: (Array[Double], Array[Double], Double, Double), 142 | msg: Option[Array[Double]]) => 143 | if (msg.isDefined) { 144 | val out = vd._1.clone() 145 | blas.daxpy(out.length, vd._4, msg.get, 1, out, 1) 146 | (vd._1, out, vd._3, vd._4) 147 | } else { 148 | vd 149 | } 150 | }.cache() 151 | 
materialize(gJoinT1) 152 | g.unpersist() 153 | g = gJoinT1 154 | 155 | // Phase 2, update p for user nodes and q, y for item nodes 156 | g.cache() 157 | val t2 = g.aggregateMessages( 158 | sendMsgTrainF(conf, u), 159 | (g1: (Array[Double], Array[Double], Double), g2: (Array[Double], Array[Double], Double)) => 160 | { 161 | val out1 = g1._1.clone() 162 | blas.daxpy(out1.length, 1.0, g2._1, 1, out1, 1) 163 | val out2 = g2._2.clone() 164 | blas.daxpy(out2.length, 1.0, g2._2, 1, out2, 1) 165 | (out1, out2, g1._3 + g2._3) 166 | }) 167 | val gJoinT2 = g.outerJoinVertices(t2) { 168 | (vid: VertexId, 169 | vd: (Array[Double], Array[Double], Double, Double), 170 | msg: Option[(Array[Double], Array[Double], Double)]) => { 171 | val out1 = vd._1.clone() 172 | blas.daxpy(out1.length, 1.0, msg.get._1, 1, out1, 1) 173 | val out2 = vd._2.clone() 174 | blas.daxpy(out2.length, 1.0, msg.get._2, 1, out2, 1) 175 | (out1, out2, vd._3 + msg.get._3, vd._4) 176 | } 177 | }.cache() 178 | materialize(gJoinT2) 179 | g.unpersist() 180 | g = gJoinT2 181 | } 182 | 183 | // calculate error on training set 184 | def sendMsgTestF(conf: Conf, u: Double) 185 | (ctx: EdgeContext[(Array[Double], Array[Double], Double, Double), Double, Double]) { 186 | val (usr, itm) = (ctx.srcAttr, ctx.dstAttr) 187 | val (p, q) = (usr._1, itm._1) 188 | var pred = u + usr._3 + itm._3 + blas.ddot(q.length, q, 1, usr._2, 1) 189 | pred = math.max(pred, conf.minVal) 190 | pred = math.min(pred, conf.maxVal) 191 | val err = (ctx.attr - pred) * (ctx.attr - pred) 192 | ctx.sendToDst(err) 193 | } 194 | 195 | g.cache() 196 | val t3 = g.aggregateMessages[Double](sendMsgTestF(conf, u), _ + _) 197 | val gJoinT3 = g.outerJoinVertices(t3) { 198 | (vid: VertexId, vd: (Array[Double], Array[Double], Double, Double), msg: Option[Double]) => 199 | if (msg.isDefined) (vd._1, vd._2, vd._3, msg.get) else vd 200 | }.cache() 201 | materialize(gJoinT3) 202 | g.unpersist() 203 | g = gJoinT3 204 | 205 | // Convert DoubleMatrix to Array[Double]: 206 | val newVertices = g.vertices.mapValues(v => (v._1.toArray, v._2.toArray, v._3, v._4)) 207 | (Graph(newVertices, g.edges), u) 208 | } 209 | 210 | /** 211 | * Forces materialization of a Graph by count()ing its RDDs. 212 | */ 213 | private def materialize(g: Graph[_, _]): Unit = { 214 | g.vertices.count() 215 | g.edges.count() 216 | } 217 | 218 | } 219 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/lib/ShortestPaths.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
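// Editor's note (illustration only; it refers to SVDPlusPlus.run above, not to this file's
// license header): every training phase follows the same cache -> materialize -> unpersist
// pattern, so the previous graph is released only after its replacement has been computed.
// A generic sketch of that pattern, with a hypothetical helper name:
import org.apache.spark.graphx.Graph

def swapCached[VD, ED](prev: Graph[VD, ED], updated: Graph[VD, ED]): Graph[VD, ED] = {
  updated.cache()
  updated.vertices.count()   // force materialization of the new graph,
  updated.edges.count()      // exactly what materialize() above does
  prev.unpersist()           // now it is safe to drop the previous iteration's graph
  updated
}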
16 | */ 17 | 18 | package org.apache.spark.graphx.lib 19 | 20 | import org.apache.spark.graphx._ 21 | import scala.reflect.ClassTag 22 | 23 | /** 24 | * Computes shortest paths to the given set of landmark vertices, returning a graph where each 25 | * vertex attribute is a map containing the shortest-path distance to each reachable landmark. 26 | */ 27 | object ShortestPaths { 28 | /** Stores a map from the vertex id of a landmark to the distance to that landmark. */ 29 | type SPMap = Map[VertexId, Int] 30 | 31 | private def makeMap(x: (VertexId, Int)*) = Map(x: _*) 32 | 33 | private def incrementMap(spmap: SPMap): SPMap = spmap.map { case (v, d) => v -> (d + 1) } 34 | 35 | private def addMaps(spmap1: SPMap, spmap2: SPMap): SPMap = 36 | (spmap1.keySet ++ spmap2.keySet).map { 37 | k => k -> math.min(spmap1.getOrElse(k, Int.MaxValue), spmap2.getOrElse(k, Int.MaxValue)) 38 | }.toMap 39 | 40 | /** 41 | * Computes shortest paths to the given set of landmark vertices. 42 | * 43 | * @tparam ED the edge attribute type (not used in the computation) 44 | * 45 | * @param graph the graph for which to compute the shortest paths 46 | * @param landmarks the list of landmark vertex ids. Shortest paths will be computed to each 47 | * landmark. 48 | * 49 | * @return a graph where each vertex attribute is a map containing the shortest-path distance to 50 | * each reachable landmark vertex. 51 | */ 52 | def run[VD, ED: ClassTag](graph: Graph[VD, ED], landmarks: Seq[VertexId]): Graph[SPMap, ED] = { 53 | val spGraph = graph.mapVertices { (vid, attr) => 54 | if (landmarks.contains(vid)) makeMap(vid -> 0) else makeMap() 55 | } 56 | 57 | val initialMessage = makeMap() 58 | 59 | def vertexProgram(id: VertexId, attr: SPMap, msg: SPMap): SPMap = { 60 | addMaps(attr, msg) 61 | } 62 | 63 | def sendMessage(edge: EdgeTriplet[SPMap, _]): Iterator[(VertexId, SPMap)] = { 64 | val newAttr = incrementMap(edge.dstAttr) 65 | if (edge.srcAttr != addMaps(newAttr, edge.srcAttr)) Iterator((edge.srcId, newAttr)) 66 | else Iterator.empty 67 | } 68 | 69 | Pregel(spGraph, initialMessage)(vertexProgram, sendMessage, addMaps) 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/lib/StronglyConnectedComponents.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.lib 19 | 20 | import scala.reflect.ClassTag 21 | 22 | import org.apache.spark.graphx._ 23 | 24 | /** Strongly connected components algorithm implementation. 
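// Editor's illustration (it refers to ShortestPaths.scala above, not to this file): calling
// run() on a small directed chain. The graph shape and the landmark are made up; distances
// are counted along edge direction towards the landmark, because messages flow dst -> src.
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.lib.ShortestPaths

def shortestPathsExample(sc: SparkContext): Unit = {
  // chain 1 -> 2 -> 3, with vertex 3 as the only landmark
  val graph = Graph.fromEdgeTuples(sc.parallelize(Seq((1L, 2L), (2L, 3L))), 0)
  val result = ShortestPaths.run(graph, landmarks = Seq(3L))
  // expected: (1, Map(3 -> 2)), (2, Map(3 -> 1)), (3, Map(3 -> 0))
  result.vertices.collect().foreach(println)
}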
*/ 25 | object StronglyConnectedComponents { 26 | 27 | /** 28 | * Compute the strongly connected component (SCC) of each vertex and return a graph with the 29 | * vertex value containing the lowest vertex id in the SCC containing that vertex. 30 | * 31 | * @tparam VD the vertex attribute type (discarded in the computation) 32 | * @tparam ED the edge attribute type (preserved in the computation) 33 | * 34 | * @param graph the graph for which to compute the SCC 35 | * 36 | * @return a graph with vertex attributes containing the smallest vertex id in each SCC 37 | */ 38 | def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], numIter: Int): Graph[VertexId, ED] = { 39 | 40 | // the graph we update with final SCC ids, and the graph we return at the end 41 | var sccGraph = graph.mapVertices { case (vid, _) => vid } 42 | // graph we are going to work with in our iterations 43 | var sccWorkGraph = graph.mapVertices { case (vid, _) => (vid, false) }.cache() 44 | 45 | var numVertices = sccWorkGraph.numVertices 46 | var iter = 0 47 | while (sccWorkGraph.numVertices > 0 && iter < numIter) { 48 | iter += 1 49 | do { 50 | numVertices = sccWorkGraph.numVertices 51 | sccWorkGraph = sccWorkGraph.outerJoinVertices(sccWorkGraph.outDegrees) { 52 | (vid, data, degreeOpt) => if (degreeOpt.isDefined) data else (vid, true) 53 | }.outerJoinVertices(sccWorkGraph.inDegrees) { 54 | (vid, data, degreeOpt) => if (degreeOpt.isDefined) data else (vid, true) 55 | }.cache() 56 | 57 | // get all vertices to be removed 58 | val finalVertices = sccWorkGraph.vertices 59 | .filter { case (vid, (scc, isFinal)) => isFinal} 60 | .mapValues { (vid, data) => data._1} 61 | 62 | // write values to sccGraph 63 | sccGraph = sccGraph.outerJoinVertices(finalVertices) { 64 | (vid, scc, opt) => opt.getOrElse(scc) 65 | } 66 | // only keep vertices that are not final 67 | sccWorkGraph = sccWorkGraph.subgraph(vpred = (vid, data) => !data._2).cache() 68 | } while (sccWorkGraph.numVertices < numVertices) 69 | 70 | sccWorkGraph = sccWorkGraph.mapVertices{ case (vid, (color, isFinal)) => (vid, isFinal) } 71 | 72 | // collect min of all my neighbor's scc values, update if it's smaller than mine 73 | // then notify any neighbors with scc values larger than mine 74 | sccWorkGraph = Pregel[(VertexId, Boolean), ED, VertexId]( 75 | sccWorkGraph, Long.MaxValue, activeDirection = EdgeDirection.Out)( 76 | (vid, myScc, neighborScc) => (math.min(myScc._1, neighborScc), myScc._2), 77 | e => { 78 | if (e.srcAttr._1 < e.dstAttr._1) { 79 | Iterator((e.dstId, e.srcAttr._1)) 80 | } else { 81 | Iterator() 82 | } 83 | }, 84 | (vid1, vid2) => math.min(vid1, vid2)) 85 | 86 | // start at root of SCCs. Traverse values in reverse, notify all my neighbors 87 | // do not propagate if colors do not match! 
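// (Editor's note) In other words: the Pregel pass above has already labelled every vertex
// with the smallest vertex id that can reach it, its "color". The pass below runs along
// incoming edges (EdgeDirection.In): the vertex whose id equals its own color is the root of
// its SCC and becomes final, and finality then spreads backwards to predecessors that carry
// the same color. The vertices finalized here form complete SCCs; they are written into
// sccGraph and dropped from the working graph at the start of the next outer iteration.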
88 | sccWorkGraph = Pregel[(VertexId, Boolean), ED, Boolean]( 89 | sccWorkGraph, false, activeDirection = EdgeDirection.In)( 90 | // vertex is final if it is the root of a color 91 | // or it has the same color as a neighbor that is final 92 | (vid, myScc, existsSameColorFinalNeighbor) => { 93 | val isColorRoot = vid == myScc._1 94 | (myScc._1, myScc._2 || isColorRoot || existsSameColorFinalNeighbor) 95 | }, 96 | // activate neighbor if they are not final, you are, and you have the same color 97 | e => { 98 | val sameColor = e.dstAttr._1 == e.srcAttr._1 99 | val onlyDstIsFinal = e.dstAttr._2 && !e.srcAttr._2 100 | if (sameColor && onlyDstIsFinal) { 101 | Iterator((e.srcId, e.dstAttr._2)) 102 | } else { 103 | Iterator() 104 | } 105 | }, 106 | (final1, final2) => final1 || final2) 107 | } 108 | sccGraph 109 | } 110 | 111 | } 112 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/lib/TriangleCount.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.lib 19 | 20 | import scala.reflect.ClassTag 21 | 22 | import org.apache.spark.graphx._ 23 | 24 | /** 25 | * Compute the number of triangles passing through each vertex. 26 | * 27 | * The algorithm is relatively straightforward and can be computed in three steps: 28 | * 29 | *
182 | * 183 | * dst -> 184 | * (x,y) *************** _ 185 | * | | | | 186 | * | a | b | | 187 | * src | | | | 188 | * | *************** | T 189 | * \|/ | | | | 190 | * | c | d | | 191 | * | | | | 192 | * *************** - 193 | *194 | * 195 | * where this represents the subquadrant of the adj matrix currently being 196 | * subdivided. (x,y) represent the upper left hand corner of the subquadrant, 197 | * and T represents the side length (guaranteed to be a power of 2). 198 | * 199 | * After choosing the next level subquadrant, we get the resulting sets 200 | * of parameters: 201 | * {{{ 202 | * quad = a, x'=x, y'=y, T'=T/2 203 | * quad = b, x'=x+T/2, y'=y, T'=T/2 204 | * quad = c, x'=x, y'=y+T/2, T'=T/2 205 | * quad = d, x'=x+T/2, y'=y+T/2, T'=T/2 206 | * }}} 207 | */ 208 | @tailrec 209 | private def chooseCell(x: Int, y: Int, t: Int): (Int, Int) = { 210 | if (t <= 1) { 211 | (x, y) 212 | } else { 213 | val newT = math.round(t.toFloat/2.0).toInt 214 | pickQuadrant(RMATa, RMATb, RMATc, RMATd) match { 215 | case 0 => chooseCell(x, y, newT) 216 | case 1 => chooseCell(x + newT, y, newT) 217 | case 2 => chooseCell(x, y + newT, newT) 218 | case 3 => chooseCell(x + newT, y + newT, newT) 219 | } 220 | } 221 | } 222 | 223 | // TODO(crankshaw) turn result into an enum (or case class for pattern matching} 224 | private def pickQuadrant(a: Double, b: Double, c: Double, d: Double): Int = { 225 | if (a + b + c + d != 1.0) { 226 | throw new IllegalArgumentException("R-MAT probability parameters sum to " + (a + b + c + d) 227 | + ", should sum to 1.0") 228 | } 229 | val rand = new Random() 230 | val result = rand.nextDouble() 231 | result match { 232 | case x if x < a => 0 // 0 corresponds to quadrant a 233 | case x if (x >= a && x < a + b) => 1 // 1 corresponds to b 234 | case x if (x >= a + b && x < a + b + c) => 2 // 2 corresponds to c 235 | case _ => 3 // 3 corresponds to d 236 | } 237 | } 238 | 239 | /** 240 | * Create `rows` by `cols` grid graph with each vertex connected to its 241 | * row+1 and col+1 neighbors. Vertex ids are assigned in row major 242 | * order. 243 | * 244 | * @param sc the spark context in which to construct the graph 245 | * @param rows the number of rows 246 | * @param cols the number of columns 247 | * 248 | * @return A graph containing vertices with the row and column ids 249 | * as their attributes and edge values as 1.0. 250 | */ 251 | def gridGraph(sc: SparkContext, rows: Int, cols: Int): Graph[(Int, Int), Double] = { 252 | // Convert row column address into vertex ids (row major order) 253 | def sub2ind(r: Int, c: Int): VertexId = r * cols + c 254 | 255 | val vertices: RDD[(VertexId, (Int, Int))] = sc.parallelize(0 until rows).flatMap { r => 256 | (0 until cols).map( c => (sub2ind(r, c), (r, c)) ) 257 | } 258 | val edges: RDD[Edge[Double]] = 259 | vertices.flatMap{ case (vid, (r, c)) => 260 | (if (r + 1 < rows) { Seq( (sub2ind(r, c), sub2ind(r + 1, c))) } else { Seq.empty }) ++ 261 | (if (c + 1 < cols) { Seq( (sub2ind(r, c), sub2ind(r, c + 1))) } else { Seq.empty }) 262 | }.map{ case (src, dst) => Edge(src, dst, 1.0) } 263 | Graph(vertices, edges) 264 | } // end of gridGraph 265 | 266 | /** 267 | * Create a star graph with vertex 0 being the center. 268 | * 269 | * @param sc the spark context in which to construct the graph 270 | * @param nverts the number of vertices in the star 271 | * 272 | * @return A star graph containing `nverts` vertices with vertex 0 273 | * being the center vertex. 
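// Editor's illustration (not part of GraphGenerators): the row-major vertex numbering used by
// gridGraph's sub2ind above. For rows = 2 and cols = 3 the grid cells map to vertex ids as
//   (0,0)=0  (0,1)=1  (0,2)=2
//   (1,0)=3  (1,1)=4  (1,2)=5
// and the same formula can be checked standalone:
def sub2indSketch(r: Int, c: Int, cols: Int): Long = r.toLong * cols + c
// sub2indSketch(1, 2, cols = 3) == 5L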
274 | */ 275 | def starGraph(sc: SparkContext, nverts: Int): Graph[Int, Int] = { 276 | val edges: RDD[(VertexId, VertexId)] = sc.parallelize(1 until nverts).map(vid => (vid, 0)) 277 | Graph.fromEdgeTuples(edges, 1) 278 | } // end of starGraph 279 | 280 | } // end of Graph Generators 281 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/util/collection/GraphXPrimitiveKeyOpenHashMap.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.util.collection 19 | 20 | import org.apache.spark.util.collection.OpenHashSet 21 | 22 | import scala.reflect._ 23 | 24 | /** 25 | * A fast hash map implementation for primitive, non-null keys. This hash map supports 26 | * insertions and updates, but not deletions. This map is about an order of magnitude 27 | * faster than java.util.HashMap, while using much less space overhead. 28 | * 29 | * Under the hood, it uses our OpenHashSet implementation. 30 | */ 31 | private[graphx] 32 | class GraphXPrimitiveKeyOpenHashMap[@specialized(Long, Int) K: ClassTag, 33 | @specialized(Long, Int, Double) V: ClassTag]( 34 | val keySet: OpenHashSet[K], var _values: Array[V]) 35 | extends Iterable[(K, V)] 36 | with Serializable { 37 | 38 | /** 39 | * Allocate an OpenHashMap with a fixed initial capacity 40 | */ 41 | def this(initialCapacity: Int) = 42 | this(new OpenHashSet[K](initialCapacity), new Array[V](initialCapacity)) 43 | 44 | /** 45 | * Allocate an OpenHashMap with a default initial capacity, providing a true 46 | * no-argument constructor. 47 | */ 48 | def this() = this(64) 49 | 50 | /** 51 | * Allocate an OpenHashMap with a fixed initial capacity 52 | */ 53 | def this(keySet: OpenHashSet[K]) = this(keySet, new Array[V](keySet.capacity)) 54 | 55 | require(classTag[K] == classTag[Long] || classTag[K] == classTag[Int]) 56 | 57 | private var _oldValues: Array[V] = null 58 | 59 | override def size: Int = keySet.size 60 | 61 | /** Get the value for a given key */ 62 | def apply(k: K): V = { 63 | val pos = keySet.getPos(k) 64 | _values(pos) 65 | } 66 | 67 | /** Get the value for a given key, or returns elseValue if it doesn't exist. 
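// (Editor's note) apply() above assumes the key is already present: for a missing key,
// keySet.getPos(k) returns a negative position and the backing-array access fails. Callers
// that are unsure whether a key exists use getOrElse below; setMerge and changeValue, further
// below, differ in how they treat an existing key. A hypothetical usage sketch:
//   val m = new GraphXPrimitiveKeyOpenHashMap[Long, Int]()
//   m.update(1L, 10)
//   m.getOrElse(1L, 0)          // 10
//   m.getOrElse(2L, 0)          // 0, the key is absent
//   m.setMerge(1L, 5, _ + _)    // key present: stores mergeF(10, 5) = 15
//   m.changeValue(2L, 0, _ + 1) // key absent: stores the default value 0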
*/ 68 | def getOrElse(k: K, elseValue: V): V = { 69 | val pos = keySet.getPos(k) 70 | if (pos >= 0) _values(pos) else elseValue 71 | } 72 | 73 | /** Set the value for a key */ 74 | def update(k: K, v: V) { 75 | val pos = keySet.addWithoutResize(k) & OpenHashSet.POSITION_MASK 76 | _values(pos) = v 77 | keySet.rehashIfNeeded(k, grow, move) 78 | _oldValues = null 79 | } 80 | 81 | 82 | /** Set the value for a key */ 83 | def setMerge(k: K, v: V, mergeF: (V, V) => V) { 84 | val pos = keySet.addWithoutResize(k) 85 | val ind = pos & OpenHashSet.POSITION_MASK 86 | if ((pos & OpenHashSet.NONEXISTENCE_MASK) != 0) { // if first add 87 | _values(ind) = v 88 | } else { 89 | _values(ind) = mergeF(_values(ind), v) 90 | } 91 | keySet.rehashIfNeeded(k, grow, move) 92 | _oldValues = null 93 | } 94 | 95 | 96 | /** 97 | * If the key doesn't exist yet in the hash map, set its value to defaultValue; otherwise, 98 | * set its value to mergeValue(oldValue). 99 | * 100 | * @return the newly updated value. 101 | */ 102 | def changeValue(k: K, defaultValue: => V, mergeValue: (V) => V): V = { 103 | val pos = keySet.addWithoutResize(k) 104 | if ((pos & OpenHashSet.NONEXISTENCE_MASK) != 0) { 105 | val newValue = defaultValue 106 | _values(pos & OpenHashSet.POSITION_MASK) = newValue 107 | keySet.rehashIfNeeded(k, grow, move) 108 | newValue 109 | } else { 110 | _values(pos) = mergeValue(_values(pos)) 111 | _values(pos) 112 | } 113 | } 114 | 115 | override def iterator: Iterator[(K, V)] = new Iterator[(K, V)] { 116 | var pos = 0 117 | var nextPair: (K, V) = computeNextPair() 118 | 119 | /** Get the next value we should return from next(), or null if we're finished iterating */ 120 | def computeNextPair(): (K, V) = { 121 | pos = keySet.nextPos(pos) 122 | if (pos >= 0) { 123 | val ret = (keySet.getValue(pos), _values(pos)) 124 | pos += 1 125 | ret 126 | } else { 127 | null 128 | } 129 | } 130 | 131 | def hasNext: Boolean = nextPair != null 132 | 133 | def next(): (K, V) = { 134 | val pair = nextPair 135 | nextPair = computeNextPair() 136 | pair 137 | } 138 | } 139 | 140 | // The following member variables are declared as protected instead of private for the 141 | // specialization to work (specialized class extends the unspecialized one and needs access 142 | // to the "private" variables). 143 | // They also should have been val's. We use var's because there is a Scala compiler bug that 144 | // would throw illegal access error at runtime if they are declared as val's. 145 | protected var grow = (newCapacity: Int) => { 146 | _oldValues = _values 147 | _values = new Array[V](newCapacity) 148 | } 149 | 150 | protected var move = (oldPos: Int, newPos: Int) => { 151 | _values(newPos) = _oldValues(oldPos) 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/util/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | /** 19 | * Collections of utilities used by graphx. 20 | */ 21 | package org.apache.spark.graphx.util; -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/util/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx 19 | 20 | /** 21 | * Collections of utilities used by graphx. 22 | */ 23 | package object util 24 | --------------------------------------------------------------------------------